diff --git "a/llama3-doc-and-code/trainer_state.json" "b/llama3-doc-and-code/trainer_state.json" new file mode 100644--- /dev/null +++ "b/llama3-doc-and-code/trainer_state.json" @@ -0,0 +1,209862 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 29976, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.51953125, + "learning_rate": 6.671114076050701e-08, + "loss": 2.1846, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.671875, + "learning_rate": 1.3342228152101402e-07, + "loss": 2.0338, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.72265625, + "learning_rate": 2.0013342228152104e-07, + "loss": 2.0493, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.62890625, + "learning_rate": 2.6684456304202804e-07, + "loss": 2.2774, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 0.53125, + "learning_rate": 3.3355570380253503e-07, + "loss": 2.1439, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.73046875, + "learning_rate": 4.002668445630421e-07, + "loss": 2.4853, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 0.6171875, + "learning_rate": 4.669779853235491e-07, + "loss": 2.2895, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 0.7109375, + "learning_rate": 5.336891260840561e-07, + "loss": 2.261, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 0.60546875, + "learning_rate": 6.00400266844563e-07, + "loss": 2.0325, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 0.51171875, + "learning_rate": 6.671114076050701e-07, + "loss": 2.0489, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.671875, + "learning_rate": 7.338225483655771e-07, + "loss": 2.4431, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.72265625, + "learning_rate": 8.005336891260842e-07, + "loss": 2.31, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.65625, + "learning_rate": 8.672448298865911e-07, + "loss": 2.3073, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.57421875, + "learning_rate": 9.339559706470982e-07, + "loss": 2.3018, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 0.7109375, + "learning_rate": 1.0006671114076052e-06, + "loss": 2.1663, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.5703125, + "learning_rate": 1.0673782521681121e-06, + "loss": 2.1059, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 0.54296875, + "learning_rate": 1.134089392928619e-06, + "loss": 2.0635, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 0.6796875, + "learning_rate": 1.200800533689126e-06, + "loss": 2.3074, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 1.2675116744496332e-06, + "loss": 2.0739, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 1.0625, + "learning_rate": 1.3342228152101401e-06, + "loss": 2.6405, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.5703125, + "learning_rate": 1.4009339559706473e-06, + "loss": 2.0617, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 0.6796875, + "learning_rate": 1.4676450967311542e-06, + "loss": 2.274, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 0.6015625, + "learning_rate": 1.5343562374916612e-06, + "loss": 2.2576, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 0.388671875, + "learning_rate": 1.6010673782521683e-06, + "loss": 1.8038, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 1.6677785190126753e-06, + "loss": 2.1478, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.59765625, + "learning_rate": 1.7344896597731822e-06, + "loss": 2.1964, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 0.78125, + "learning_rate": 1.8012008005336891e-06, + "loss": 2.6832, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 0.64453125, + "learning_rate": 1.8679119412941963e-06, + "loss": 2.1656, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 0.462890625, + "learning_rate": 1.9346230820547032e-06, + "loss": 1.816, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 0.5234375, + "learning_rate": 2.0013342228152104e-06, + "loss": 1.9058, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.56640625, + "learning_rate": 2.068045363575717e-06, + "loss": 2.347, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 0.59765625, + "learning_rate": 2.1347565043362243e-06, + "loss": 2.3649, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 0.58203125, + "learning_rate": 2.2014676450967314e-06, + "loss": 2.1872, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 2.268178785857238e-06, + "loss": 1.8424, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 0.546875, + "learning_rate": 2.3348899266177453e-06, + "loss": 1.7329, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 2.401601067378252e-06, + "loss": 1.8277, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 0.435546875, + "learning_rate": 2.468312208138759e-06, + "loss": 1.8008, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 0.63671875, + "learning_rate": 2.5350233488992664e-06, + "loss": 2.2312, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 0.5078125, + "learning_rate": 2.601734489659773e-06, + "loss": 1.8857, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 0.71484375, + "learning_rate": 2.6684456304202803e-06, + "loss": 2.3362, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.5703125, + "learning_rate": 2.7351567711807874e-06, + "loss": 2.0573, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 0.447265625, + "learning_rate": 2.8018679119412946e-06, + "loss": 2.057, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 0.4375, + "learning_rate": 2.8685790527018013e-06, + "loss": 1.7149, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 0.55078125, + "learning_rate": 2.9352901934623084e-06, + "loss": 2.2061, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 0.58203125, + "learning_rate": 3.0020013342228156e-06, + "loss": 2.115, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.75390625, + "learning_rate": 3.0687124749833223e-06, + "loss": 2.1279, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 0.578125, + "learning_rate": 3.1354236157438295e-06, + "loss": 2.0749, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 0.484375, + "learning_rate": 3.2021347565043366e-06, + "loss": 1.903, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 0.466796875, + "learning_rate": 3.2688458972648434e-06, + "loss": 1.9123, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 0.6015625, + "learning_rate": 3.3355570380253505e-06, + "loss": 1.8963, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.83203125, + "learning_rate": 3.4022681787858573e-06, + "loss": 1.7852, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 0.65625, + "learning_rate": 3.4689793195463644e-06, + "loss": 2.2327, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 0.52734375, + "learning_rate": 3.5356904603068716e-06, + "loss": 1.9642, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 0.58984375, + "learning_rate": 3.6024016010673783e-06, + "loss": 1.9863, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.6953125, + "learning_rate": 3.6691127418278855e-06, + "loss": 2.1789, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.578125, + "learning_rate": 3.7358238825883926e-06, + "loss": 1.94, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 0.6484375, + "learning_rate": 3.8025350233488993e-06, + "loss": 2.1139, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 0.458984375, + "learning_rate": 3.8692461641094065e-06, + "loss": 1.9934, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 0.7578125, + "learning_rate": 3.935957304869913e-06, + "loss": 2.1705, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 0.55859375, + "learning_rate": 4.002668445630421e-06, + "loss": 2.0687, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.515625, + "learning_rate": 4.0693795863909275e-06, + "loss": 1.8417, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 0.466796875, + "learning_rate": 4.136090727151434e-06, + "loss": 1.9392, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 0.5078125, + "learning_rate": 4.202801867911942e-06, + "loss": 1.7537, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 0.609375, + "learning_rate": 4.2695130086724486e-06, + "loss": 2.0184, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 0.4609375, + "learning_rate": 4.336224149432955e-06, + "loss": 1.7927, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.54296875, + "learning_rate": 4.402935290193463e-06, + "loss": 1.7636, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.51953125, + "learning_rate": 4.46964643095397e-06, + "loss": 2.2046, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.5703125, + "learning_rate": 4.536357571714476e-06, + "loss": 2.1252, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.498046875, + "learning_rate": 4.603068712474984e-06, + "loss": 1.8847, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 0.484375, + "learning_rate": 4.669779853235491e-06, + "loss": 1.659, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.55078125, + "learning_rate": 4.736490993995997e-06, + "loss": 2.04, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 0.7890625, + "learning_rate": 4.803202134756504e-06, + "loss": 2.4532, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.6796875, + "learning_rate": 4.869913275517012e-06, + "loss": 2.173, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 0.69921875, + "learning_rate": 4.936624416277518e-06, + "loss": 1.6952, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 0.71875, + "learning_rate": 5.003335557038025e-06, + "loss": 2.3963, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.5234375, + "learning_rate": 5.070046697798533e-06, + "loss": 2.0259, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 0.6171875, + "learning_rate": 5.13675783855904e-06, + "loss": 2.0041, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 0.578125, + "learning_rate": 5.203468979319546e-06, + "loss": 1.6951, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 0.490234375, + "learning_rate": 5.270180120080054e-06, + "loss": 1.8363, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.423828125, + "learning_rate": 5.3368912608405605e-06, + "loss": 1.5219, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.59375, + "learning_rate": 5.403602401601067e-06, + "loss": 2.082, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 0.58203125, + "learning_rate": 5.470313542361575e-06, + "loss": 2.1693, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 0.71484375, + "learning_rate": 5.5370246831220815e-06, + "loss": 2.3297, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 0.59375, + "learning_rate": 5.603735823882589e-06, + "loss": 1.8865, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 0.6328125, + "learning_rate": 5.670446964643095e-06, + "loss": 2.3677, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.5, + "learning_rate": 5.737158105403603e-06, + "loss": 1.7696, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.58984375, + "learning_rate": 5.80386924616411e-06, + "loss": 1.9006, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.73046875, + "learning_rate": 5.870580386924617e-06, + "loss": 2.2388, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.66015625, + "learning_rate": 5.937291527685124e-06, + "loss": 2.2569, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.65625, + "learning_rate": 6.004002668445631e-06, + "loss": 1.9932, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.53515625, + "learning_rate": 6.070713809206138e-06, + "loss": 1.8399, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.5625, + "learning_rate": 6.137424949966645e-06, + "loss": 1.859, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.75390625, + "learning_rate": 6.204136090727151e-06, + "loss": 2.2759, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.486328125, + "learning_rate": 6.270847231487659e-06, + "loss": 1.8565, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.58203125, + "learning_rate": 6.337558372248166e-06, + "loss": 2.1077, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.70703125, + "learning_rate": 6.404269513008673e-06, + "loss": 1.9779, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.65234375, + "learning_rate": 6.470980653769179e-06, + "loss": 1.826, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.59375, + "learning_rate": 6.537691794529687e-06, + "loss": 1.8471, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.46875, + "learning_rate": 6.6044029352901935e-06, + "loss": 1.8446, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.65625, + "learning_rate": 6.671114076050701e-06, + "loss": 1.9881, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.60546875, + "learning_rate": 6.737825216811209e-06, + "loss": 1.917, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.671875, + "learning_rate": 6.8045363575717145e-06, + "loss": 1.9377, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.66796875, + "learning_rate": 6.871247498332221e-06, + "loss": 2.0538, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.52734375, + "learning_rate": 6.937958639092729e-06, + "loss": 1.91, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.447265625, + "learning_rate": 7.0046697798532356e-06, + "loss": 1.5841, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.5234375, + "learning_rate": 7.071380920613743e-06, + "loss": 1.9165, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.66015625, + "learning_rate": 7.138092061374251e-06, + "loss": 2.0489, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.63671875, + "learning_rate": 7.204803202134757e-06, + "loss": 2.1238, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.55859375, + "learning_rate": 7.271514342895263e-06, + "loss": 1.5805, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.57421875, + "learning_rate": 7.338225483655771e-06, + "loss": 1.7077, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.5859375, + "learning_rate": 7.4049366244162785e-06, + "loss": 1.9022, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.828125, + "learning_rate": 7.471647765176785e-06, + "loss": 1.8026, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 0.6171875, + "learning_rate": 7.538358905937291e-06, + "loss": 2.0569, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 0.6640625, + "learning_rate": 7.605070046697799e-06, + "loss": 1.9457, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 0.546875, + "learning_rate": 7.671781187458306e-06, + "loss": 1.8382, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 0.482421875, + "learning_rate": 7.738492328218813e-06, + "loss": 1.7255, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 7.80520346897932e-06, + "loss": 1.7986, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 0.609375, + "learning_rate": 7.871914609739826e-06, + "loss": 1.6882, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 0.640625, + "learning_rate": 7.938625750500333e-06, + "loss": 1.8244, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 0.5625, + "learning_rate": 8.005336891260842e-06, + "loss": 1.8374, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.455078125, + "learning_rate": 8.072048032021348e-06, + "loss": 1.8119, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 0.66796875, + "learning_rate": 8.138759172781855e-06, + "loss": 1.8236, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 0.65234375, + "learning_rate": 8.205470313542362e-06, + "loss": 1.878, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 0.64453125, + "learning_rate": 8.272181454302869e-06, + "loss": 1.7091, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 0.625, + "learning_rate": 8.338892595063375e-06, + "loss": 1.881, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 0.671875, + "learning_rate": 8.405603735823884e-06, + "loss": 2.1188, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 0.62890625, + "learning_rate": 8.47231487658439e-06, + "loss": 1.9247, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 0.55859375, + "learning_rate": 8.539026017344897e-06, + "loss": 1.7299, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 0.7265625, + "learning_rate": 8.605737158105404e-06, + "loss": 1.9015, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 0.69140625, + "learning_rate": 8.67244829886591e-06, + "loss": 1.7443, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 0.62109375, + "learning_rate": 8.739159439626417e-06, + "loss": 1.7487, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 0.55078125, + "learning_rate": 8.805870580386926e-06, + "loss": 1.7782, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 0.54296875, + "learning_rate": 8.872581721147432e-06, + "loss": 1.6699, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 8.93929286190794e-06, + "loss": 2.0761, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 0.6015625, + "learning_rate": 9.006004002668446e-06, + "loss": 1.9507, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 0.494140625, + "learning_rate": 9.072715143428953e-06, + "loss": 1.7212, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 0.71875, + "learning_rate": 9.139426284189461e-06, + "loss": 1.7352, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 0.59765625, + "learning_rate": 9.206137424949968e-06, + "loss": 1.7839, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 0.51953125, + "learning_rate": 9.272848565710473e-06, + "loss": 1.6841, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 0.68359375, + "learning_rate": 9.339559706470981e-06, + "loss": 1.9437, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 0.55859375, + "learning_rate": 9.406270847231488e-06, + "loss": 1.8975, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 0.5078125, + "learning_rate": 9.472981987991995e-06, + "loss": 1.6757, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 0.54296875, + "learning_rate": 9.539693128752503e-06, + "loss": 1.7213, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 0.5078125, + "learning_rate": 9.606404269513008e-06, + "loss": 1.7765, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 0.5, + "learning_rate": 9.673115410273517e-06, + "loss": 1.6369, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 0.48046875, + "learning_rate": 9.739826551034023e-06, + "loss": 1.499, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 0.423828125, + "learning_rate": 9.80653769179453e-06, + "loss": 1.5492, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 0.65625, + "learning_rate": 9.873248832555037e-06, + "loss": 1.9582, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 0.4453125, + "learning_rate": 9.939959973315544e-06, + "loss": 1.5191, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 0.5859375, + "learning_rate": 1.000667111407605e-05, + "loss": 1.7804, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 0.51953125, + "learning_rate": 1.0073382254836559e-05, + "loss": 1.625, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 0.66015625, + "learning_rate": 1.0140093395597065e-05, + "loss": 1.603, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 0.45703125, + "learning_rate": 1.0206804536357572e-05, + "loss": 1.5944, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 0.578125, + "learning_rate": 1.027351567711808e-05, + "loss": 1.8212, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 0.486328125, + "learning_rate": 1.0340226817878586e-05, + "loss": 1.5948, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 0.609375, + "learning_rate": 1.0406937958639092e-05, + "loss": 1.6167, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 0.59765625, + "learning_rate": 1.04736490993996e-05, + "loss": 1.7589, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 0.546875, + "learning_rate": 1.0540360240160108e-05, + "loss": 1.6307, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 0.57421875, + "learning_rate": 1.0607071380920614e-05, + "loss": 1.6086, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 0.4921875, + "learning_rate": 1.0673782521681121e-05, + "loss": 1.6852, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 0.353515625, + "learning_rate": 1.0740493662441628e-05, + "loss": 1.4366, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 0.58984375, + "learning_rate": 1.0807204803202134e-05, + "loss": 1.6126, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 0.61328125, + "learning_rate": 1.0873915943962643e-05, + "loss": 1.5979, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 0.466796875, + "learning_rate": 1.094062708472315e-05, + "loss": 1.3379, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 0.55078125, + "learning_rate": 1.1007338225483656e-05, + "loss": 1.7221, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.50390625, + "learning_rate": 1.1074049366244163e-05, + "loss": 1.5828, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 0.5546875, + "learning_rate": 1.114076050700467e-05, + "loss": 1.8847, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 0.73046875, + "learning_rate": 1.1207471647765178e-05, + "loss": 1.3537, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 0.59375, + "learning_rate": 1.1274182788525685e-05, + "loss": 1.4071, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 0.55859375, + "learning_rate": 1.134089392928619e-05, + "loss": 1.4998, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.609375, + "learning_rate": 1.1407605070046698e-05, + "loss": 1.6346, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 0.55078125, + "learning_rate": 1.1474316210807205e-05, + "loss": 1.6293, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 0.58984375, + "learning_rate": 1.1541027351567712e-05, + "loss": 1.5329, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 0.54296875, + "learning_rate": 1.160773849232822e-05, + "loss": 1.5234, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 0.72265625, + "learning_rate": 1.1674449633088725e-05, + "loss": 1.5198, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.59765625, + "learning_rate": 1.1741160773849234e-05, + "loss": 1.7617, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 0.4375, + "learning_rate": 1.180787191460974e-05, + "loss": 1.5289, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 0.7109375, + "learning_rate": 1.1874583055370247e-05, + "loss": 1.5756, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 0.470703125, + "learning_rate": 1.1941294196130754e-05, + "loss": 1.5912, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 0.61328125, + "learning_rate": 1.2008005336891262e-05, + "loss": 1.5373, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.62890625, + "learning_rate": 1.2074716477651767e-05, + "loss": 1.5989, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 0.62109375, + "learning_rate": 1.2141427618412276e-05, + "loss": 1.4633, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 0.671875, + "learning_rate": 1.2208138759172783e-05, + "loss": 1.4276, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 0.515625, + "learning_rate": 1.227484989993329e-05, + "loss": 1.6095, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 0.55078125, + "learning_rate": 1.2341561040693798e-05, + "loss": 1.6588, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.5234375, + "learning_rate": 1.2408272181454303e-05, + "loss": 1.498, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 0.53125, + "learning_rate": 1.247498332221481e-05, + "loss": 1.498, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 0.71875, + "learning_rate": 1.2541694462975318e-05, + "loss": 1.2351, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 0.68359375, + "learning_rate": 1.2608405603735823e-05, + "loss": 1.8943, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 0.6484375, + "learning_rate": 1.2675116744496331e-05, + "loss": 1.2741, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 0.65625, + "learning_rate": 1.2741827885256838e-05, + "loss": 1.7695, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 0.5546875, + "learning_rate": 1.2808539026017347e-05, + "loss": 1.2914, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 0.474609375, + "learning_rate": 1.2875250166777853e-05, + "loss": 1.4294, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 0.5, + "learning_rate": 1.2941961307538358e-05, + "loss": 1.5252, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 0.5390625, + "learning_rate": 1.3008672448298867e-05, + "loss": 1.1783, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 0.51953125, + "learning_rate": 1.3075383589059373e-05, + "loss": 1.1359, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 0.8203125, + "learning_rate": 1.3142094729819882e-05, + "loss": 1.3662, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 0.6796875, + "learning_rate": 1.3208805870580387e-05, + "loss": 1.2258, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 1.0078125, + "learning_rate": 1.3275517011340894e-05, + "loss": 1.6397, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 0.5625, + "learning_rate": 1.3342228152101402e-05, + "loss": 1.3819, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 0.75, + "learning_rate": 1.3408939292861907e-05, + "loss": 1.5494, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 0.69140625, + "learning_rate": 1.3475650433622417e-05, + "loss": 1.5932, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 0.5390625, + "learning_rate": 1.3542361574382922e-05, + "loss": 1.6934, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 1.3609072715143429e-05, + "loss": 1.3147, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 0.5625, + "learning_rate": 1.3675783855903937e-05, + "loss": 1.1035, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 0.59765625, + "learning_rate": 1.3742494996664442e-05, + "loss": 1.3294, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 0.6875, + "learning_rate": 1.3809206137424951e-05, + "loss": 1.4678, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 0.546875, + "learning_rate": 1.3875917278185458e-05, + "loss": 1.3057, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 0.66796875, + "learning_rate": 1.3942628418945963e-05, + "loss": 1.3944, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 0.68359375, + "learning_rate": 1.4009339559706471e-05, + "loss": 1.3095, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 0.75390625, + "learning_rate": 1.4076050700466978e-05, + "loss": 1.1588, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 0.75390625, + "learning_rate": 1.4142761841227486e-05, + "loss": 1.3516, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 0.62109375, + "learning_rate": 1.4209472981987993e-05, + "loss": 1.4045, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 0.64453125, + "learning_rate": 1.4276184122748501e-05, + "loss": 1.4111, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 0.59375, + "learning_rate": 1.4342895263509006e-05, + "loss": 1.3673, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 0.60546875, + "learning_rate": 1.4409606404269513e-05, + "loss": 1.1889, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 0.71875, + "learning_rate": 1.4476317545030022e-05, + "loss": 1.3023, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 0.73046875, + "learning_rate": 1.4543028685790527e-05, + "loss": 1.396, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 1.4609739826551037e-05, + "loss": 1.544, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 0.67578125, + "learning_rate": 1.4676450967311542e-05, + "loss": 1.3233, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 0.58984375, + "learning_rate": 1.4743162108072049e-05, + "loss": 1.1058, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 0.7265625, + "learning_rate": 1.4809873248832557e-05, + "loss": 1.0583, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 0.609375, + "learning_rate": 1.4876584389593062e-05, + "loss": 1.3731, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 0.46484375, + "learning_rate": 1.494329553035357e-05, + "loss": 1.2438, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 0.52734375, + "learning_rate": 1.5010006671114077e-05, + "loss": 1.3514, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 0.5703125, + "learning_rate": 1.5076717811874582e-05, + "loss": 1.7485, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 0.62890625, + "learning_rate": 1.514342895263509e-05, + "loss": 1.1726, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 0.78125, + "learning_rate": 1.5210140093395597e-05, + "loss": 1.181, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 0.640625, + "learning_rate": 1.5276851234156107e-05, + "loss": 0.9971, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 0.59765625, + "learning_rate": 1.5343562374916613e-05, + "loss": 1.5258, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 1.5410273515677118e-05, + "loss": 1.1463, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 0.59765625, + "learning_rate": 1.5476984656437626e-05, + "loss": 1.1307, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 0.59375, + "learning_rate": 1.554369579719813e-05, + "loss": 1.4679, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 0.6328125, + "learning_rate": 1.561040693795864e-05, + "loss": 1.3148, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 0.5234375, + "learning_rate": 1.5677118078719148e-05, + "loss": 1.287, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 0.703125, + "learning_rate": 1.5743829219479653e-05, + "loss": 1.5045, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 0.6640625, + "learning_rate": 1.581054036024016e-05, + "loss": 1.1345, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 0.6171875, + "learning_rate": 1.5877251501000666e-05, + "loss": 1.0755, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 0.6328125, + "learning_rate": 1.5943962641761175e-05, + "loss": 1.2104, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 0.765625, + "learning_rate": 1.6010673782521683e-05, + "loss": 1.5143, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 0.828125, + "learning_rate": 1.6077384923282188e-05, + "loss": 1.3813, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 0.625, + "learning_rate": 1.6144096064042697e-05, + "loss": 1.3537, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 0.7578125, + "learning_rate": 1.6210807204803202e-05, + "loss": 1.3038, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 0.6328125, + "learning_rate": 1.627751834556371e-05, + "loss": 1.1376, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 0.55859375, + "learning_rate": 1.6344229486324215e-05, + "loss": 1.5108, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 0.7421875, + "learning_rate": 1.6410940627084724e-05, + "loss": 1.5556, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 0.4765625, + "learning_rate": 1.6477651767845232e-05, + "loss": 1.3688, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 0.53125, + "learning_rate": 1.6544362908605737e-05, + "loss": 1.2553, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 0.60546875, + "learning_rate": 1.6611074049366245e-05, + "loss": 1.0334, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 0.61328125, + "learning_rate": 1.667778519012675e-05, + "loss": 1.1693, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 0.640625, + "learning_rate": 1.674449633088726e-05, + "loss": 1.2176, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 0.703125, + "learning_rate": 1.6811207471647767e-05, + "loss": 0.9967, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 0.46875, + "learning_rate": 1.6877918612408272e-05, + "loss": 1.0257, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 0.64453125, + "learning_rate": 1.694462975316878e-05, + "loss": 0.9373, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 1.7011340893929286e-05, + "loss": 1.2646, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 0.6015625, + "learning_rate": 1.7078052034689794e-05, + "loss": 1.5778, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 0.73828125, + "learning_rate": 1.7144763175450303e-05, + "loss": 1.568, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 0.78515625, + "learning_rate": 1.7211474316210808e-05, + "loss": 1.0891, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 0.7265625, + "learning_rate": 1.7278185456971316e-05, + "loss": 1.1872, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 0.7578125, + "learning_rate": 1.734489659773182e-05, + "loss": 1.2714, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 0.84375, + "learning_rate": 1.741160773849233e-05, + "loss": 1.1016, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 0.82421875, + "learning_rate": 1.7478318879252835e-05, + "loss": 1.1515, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 0.58984375, + "learning_rate": 1.7545030020013343e-05, + "loss": 1.0196, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 0.734375, + "learning_rate": 1.761174116077385e-05, + "loss": 1.0913, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 0.796875, + "learning_rate": 1.7678452301534357e-05, + "loss": 1.2229, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 0.7265625, + "learning_rate": 1.7745163442294865e-05, + "loss": 1.4499, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 0.734375, + "learning_rate": 1.781187458305537e-05, + "loss": 1.2616, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 0.69140625, + "learning_rate": 1.787858572381588e-05, + "loss": 1.2186, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 1.4609375, + "learning_rate": 1.7945296864576387e-05, + "loss": 1.5288, + "step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 0.6484375, + "learning_rate": 1.8012008005336892e-05, + "loss": 1.6145, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 0.65625, + "learning_rate": 1.80787191460974e-05, + "loss": 1.2234, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 0.7265625, + "learning_rate": 1.8145430286857905e-05, + "loss": 1.1957, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 0.828125, + "learning_rate": 1.8212141427618414e-05, + "loss": 1.228, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 0.74609375, + "learning_rate": 1.8278852568378922e-05, + "loss": 1.2268, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 0.6171875, + "learning_rate": 1.8345563709139427e-05, + "loss": 1.357, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 0.63671875, + "learning_rate": 1.8412274849899936e-05, + "loss": 1.3676, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 0.7109375, + "learning_rate": 1.847898599066044e-05, + "loss": 1.2022, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 0.6171875, + "learning_rate": 1.8545697131420946e-05, + "loss": 1.2238, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 0.5859375, + "learning_rate": 1.8612408272181454e-05, + "loss": 1.1721, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 0.88671875, + "learning_rate": 1.8679119412941963e-05, + "loss": 1.4354, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 0.6640625, + "learning_rate": 1.874583055370247e-05, + "loss": 1.1917, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 0.72265625, + "learning_rate": 1.8812541694462976e-05, + "loss": 1.2854, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 0.73828125, + "learning_rate": 1.887925283522348e-05, + "loss": 1.2767, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 0.5859375, + "learning_rate": 1.894596397598399e-05, + "loss": 1.2017, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 0.8125, + "learning_rate": 1.9012675116744498e-05, + "loss": 1.249, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 0.6875, + "learning_rate": 1.9079386257505006e-05, + "loss": 1.3197, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 0.64453125, + "learning_rate": 1.914609739826551e-05, + "loss": 1.1422, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 0.94921875, + "learning_rate": 1.9212808539026016e-05, + "loss": 1.1284, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 0.66796875, + "learning_rate": 1.9279519679786525e-05, + "loss": 1.2601, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 0.5625, + "learning_rate": 1.9346230820547033e-05, + "loss": 1.1904, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 0.61328125, + "learning_rate": 1.9412941961307542e-05, + "loss": 1.182, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 0.93359375, + "learning_rate": 1.9479653102068047e-05, + "loss": 1.2944, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 0.6875, + "learning_rate": 1.9546364242828552e-05, + "loss": 0.9586, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 0.5546875, + "learning_rate": 1.961307538358906e-05, + "loss": 1.2016, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 0.734375, + "learning_rate": 1.9679786524349565e-05, + "loss": 1.6617, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 0.69921875, + "learning_rate": 1.9746497665110074e-05, + "loss": 1.0987, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 0.5859375, + "learning_rate": 1.9813208805870582e-05, + "loss": 1.3126, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 0.75390625, + "learning_rate": 1.9879919946631087e-05, + "loss": 1.1171, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 0.70703125, + "learning_rate": 1.9946631087391596e-05, + "loss": 0.8614, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 0.51953125, + "learning_rate": 2.00133422281521e-05, + "loss": 1.1097, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 0.56640625, + "learning_rate": 2.008005336891261e-05, + "loss": 1.3701, + "step": 301 + }, + { + "epoch": 0.04, + "grad_norm": 0.67578125, + "learning_rate": 2.0146764509673117e-05, + "loss": 1.2597, + "step": 302 + }, + { + "epoch": 0.04, + "grad_norm": 0.671875, + "learning_rate": 2.0213475650433622e-05, + "loss": 1.2049, + "step": 303 + }, + { + "epoch": 0.04, + "grad_norm": 0.63671875, + "learning_rate": 2.028018679119413e-05, + "loss": 1.4233, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 1.28125, + "learning_rate": 2.0346897931954636e-05, + "loss": 1.5894, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 0.78515625, + "learning_rate": 2.0413609072715144e-05, + "loss": 1.1671, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 0.484375, + "learning_rate": 2.048032021347565e-05, + "loss": 1.2735, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 0.8046875, + "learning_rate": 2.054703135423616e-05, + "loss": 1.3139, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 0.83203125, + "learning_rate": 2.0613742494996666e-05, + "loss": 1.2401, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 1.1484375, + "learning_rate": 2.068045363575717e-05, + "loss": 1.139, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 1.53125, + "learning_rate": 2.074716477651768e-05, + "loss": 1.1792, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 0.96484375, + "learning_rate": 2.0813875917278185e-05, + "loss": 1.322, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 0.8046875, + "learning_rate": 2.0880587058038693e-05, + "loss": 1.0565, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 0.578125, + "learning_rate": 2.09472981987992e-05, + "loss": 1.2777, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 0.75390625, + "learning_rate": 2.1014009339559707e-05, + "loss": 1.2441, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 1.15625, + "learning_rate": 2.1080720480320215e-05, + "loss": 1.218, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 0.609375, + "learning_rate": 2.114743162108072e-05, + "loss": 1.121, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 1.0703125, + "learning_rate": 2.121414276184123e-05, + "loss": 1.1398, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 0.69921875, + "learning_rate": 2.1280853902601737e-05, + "loss": 1.0344, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 0.69921875, + "learning_rate": 2.1347565043362242e-05, + "loss": 1.1499, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 0.8984375, + "learning_rate": 2.141427618412275e-05, + "loss": 1.09, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 1.2890625, + "learning_rate": 2.1480987324883255e-05, + "loss": 1.6944, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 0.875, + "learning_rate": 2.1547698465643764e-05, + "loss": 1.3491, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 0.7265625, + "learning_rate": 2.161440960640427e-05, + "loss": 1.1785, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 1.3828125, + "learning_rate": 2.1681120747164777e-05, + "loss": 1.2379, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 0.84375, + "learning_rate": 2.1747831887925286e-05, + "loss": 1.3795, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 0.64453125, + "learning_rate": 2.181454302868579e-05, + "loss": 1.1874, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 0.58984375, + "learning_rate": 2.18812541694463e-05, + "loss": 1.1157, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 0.765625, + "learning_rate": 2.1947965310206804e-05, + "loss": 0.8658, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 0.51953125, + "learning_rate": 2.2014676450967313e-05, + "loss": 1.0776, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 0.640625, + "learning_rate": 2.208138759172782e-05, + "loss": 1.1268, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 0.890625, + "learning_rate": 2.2148098732488326e-05, + "loss": 1.1045, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 0.66796875, + "learning_rate": 2.2214809873248835e-05, + "loss": 1.2061, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 0.5859375, + "learning_rate": 2.228152101400934e-05, + "loss": 1.1694, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 0.8359375, + "learning_rate": 2.2348232154769848e-05, + "loss": 1.1468, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 0.828125, + "learning_rate": 2.2414943295530356e-05, + "loss": 0.9181, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 0.6484375, + "learning_rate": 2.248165443629086e-05, + "loss": 1.3635, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 0.71484375, + "learning_rate": 2.254836557705137e-05, + "loss": 1.2002, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 0.76171875, + "learning_rate": 2.2615076717811875e-05, + "loss": 0.9788, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 0.734375, + "learning_rate": 2.268178785857238e-05, + "loss": 1.3431, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 0.54296875, + "learning_rate": 2.274849899933289e-05, + "loss": 1.2831, + "step": 341 + }, + { + "epoch": 0.05, + "grad_norm": 0.69921875, + "learning_rate": 2.2815210140093397e-05, + "loss": 1.2555, + "step": 342 + }, + { + "epoch": 0.05, + "grad_norm": 0.74609375, + "learning_rate": 2.2881921280853905e-05, + "loss": 0.9097, + "step": 343 + }, + { + "epoch": 0.05, + "grad_norm": 0.796875, + "learning_rate": 2.294863242161441e-05, + "loss": 1.0723, + "step": 344 + }, + { + "epoch": 0.05, + "grad_norm": 0.7890625, + "learning_rate": 2.3015343562374915e-05, + "loss": 0.9437, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 0.75, + "learning_rate": 2.3082054703135424e-05, + "loss": 1.2348, + "step": 346 + }, + { + "epoch": 0.05, + "grad_norm": 0.69140625, + "learning_rate": 2.3148765843895932e-05, + "loss": 1.077, + "step": 347 + }, + { + "epoch": 0.05, + "grad_norm": 0.71484375, + "learning_rate": 2.321547698465644e-05, + "loss": 1.5016, + "step": 348 + }, + { + "epoch": 0.05, + "grad_norm": 0.984375, + "learning_rate": 2.3282188125416946e-05, + "loss": 1.0669, + "step": 349 + }, + { + "epoch": 0.05, + "grad_norm": 0.84375, + "learning_rate": 2.334889926617745e-05, + "loss": 0.9416, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 0.671875, + "learning_rate": 2.341561040693796e-05, + "loss": 1.2808, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 0.7265625, + "learning_rate": 2.3482321547698468e-05, + "loss": 0.8608, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 0.640625, + "learning_rate": 2.3549032688458976e-05, + "loss": 1.0307, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 0.67578125, + "learning_rate": 2.361574382921948e-05, + "loss": 1.0546, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 0.7109375, + "learning_rate": 2.368245496997999e-05, + "loss": 0.8809, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 1.03125, + "learning_rate": 2.3749166110740494e-05, + "loss": 1.2424, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 0.71484375, + "learning_rate": 2.3815877251501e-05, + "loss": 0.9704, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 0.81640625, + "learning_rate": 2.3882588392261508e-05, + "loss": 1.0094, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 0.90234375, + "learning_rate": 2.3949299533022016e-05, + "loss": 0.9449, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 0.8984375, + "learning_rate": 2.4016010673782525e-05, + "loss": 0.8794, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 0.86328125, + "learning_rate": 2.408272181454303e-05, + "loss": 1.0171, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 0.65625, + "learning_rate": 2.4149432955303535e-05, + "loss": 1.0021, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 0.859375, + "learning_rate": 2.4216144096064043e-05, + "loss": 1.1328, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 0.62890625, + "learning_rate": 2.4282855236824552e-05, + "loss": 1.2008, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 0.80859375, + "learning_rate": 2.434956637758506e-05, + "loss": 1.2869, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 0.609375, + "learning_rate": 2.4416277518345565e-05, + "loss": 1.2345, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 0.87109375, + "learning_rate": 2.448298865910607e-05, + "loss": 1.1726, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 0.8359375, + "learning_rate": 2.454969979986658e-05, + "loss": 0.9706, + "step": 368 + }, + { + "epoch": 0.05, + "grad_norm": 0.69921875, + "learning_rate": 2.4616410940627087e-05, + "loss": 1.1713, + "step": 369 + }, + { + "epoch": 0.05, + "grad_norm": 0.66796875, + "learning_rate": 2.4683122081387595e-05, + "loss": 1.236, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 1.171875, + "learning_rate": 2.47498332221481e-05, + "loss": 1.0645, + "step": 371 + }, + { + "epoch": 0.05, + "grad_norm": 0.78125, + "learning_rate": 2.4816544362908606e-05, + "loss": 0.9474, + "step": 372 + }, + { + "epoch": 0.05, + "grad_norm": 0.6015625, + "learning_rate": 2.4883255503669114e-05, + "loss": 1.1446, + "step": 373 + }, + { + "epoch": 0.05, + "grad_norm": 0.79296875, + "learning_rate": 2.494996664442962e-05, + "loss": 0.9964, + "step": 374 + }, + { + "epoch": 0.05, + "grad_norm": 0.80859375, + "learning_rate": 2.5016677785190127e-05, + "loss": 0.9588, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 0.59375, + "learning_rate": 2.5083388925950636e-05, + "loss": 1.0757, + "step": 376 + }, + { + "epoch": 0.05, + "grad_norm": 0.9140625, + "learning_rate": 2.515010006671114e-05, + "loss": 1.0765, + "step": 377 + }, + { + "epoch": 0.05, + "grad_norm": 1.03125, + "learning_rate": 2.5216811207471646e-05, + "loss": 0.9297, + "step": 378 + }, + { + "epoch": 0.05, + "grad_norm": 0.82421875, + "learning_rate": 2.5283522348232158e-05, + "loss": 1.2334, + "step": 379 + }, + { + "epoch": 0.05, + "grad_norm": 0.66796875, + "learning_rate": 2.5350233488992663e-05, + "loss": 1.4063, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 0.7421875, + "learning_rate": 2.541694462975317e-05, + "loss": 1.0228, + "step": 381 + }, + { + "epoch": 0.05, + "grad_norm": 0.58984375, + "learning_rate": 2.5483655770513676e-05, + "loss": 0.8349, + "step": 382 + }, + { + "epoch": 0.05, + "grad_norm": 0.94140625, + "learning_rate": 2.555036691127418e-05, + "loss": 0.8339, + "step": 383 + }, + { + "epoch": 0.05, + "grad_norm": 0.66015625, + "learning_rate": 2.5617078052034693e-05, + "loss": 1.3143, + "step": 384 + }, + { + "epoch": 0.05, + "grad_norm": 0.94921875, + "learning_rate": 2.5683789192795198e-05, + "loss": 1.2394, + "step": 385 + }, + { + "epoch": 0.05, + "grad_norm": 0.97265625, + "learning_rate": 2.5750500333555707e-05, + "loss": 1.0277, + "step": 386 + }, + { + "epoch": 0.05, + "grad_norm": 0.8671875, + "learning_rate": 2.581721147431621e-05, + "loss": 1.0124, + "step": 387 + }, + { + "epoch": 0.05, + "grad_norm": 0.765625, + "learning_rate": 2.5883922615076717e-05, + "loss": 1.2363, + "step": 388 + }, + { + "epoch": 0.05, + "grad_norm": 0.78125, + "learning_rate": 2.595063375583723e-05, + "loss": 1.0005, + "step": 389 + }, + { + "epoch": 0.05, + "grad_norm": 0.640625, + "learning_rate": 2.6017344896597734e-05, + "loss": 0.9891, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 0.7890625, + "learning_rate": 2.608405603735824e-05, + "loss": 1.2065, + "step": 391 + }, + { + "epoch": 0.05, + "grad_norm": 0.59765625, + "learning_rate": 2.6150767178118747e-05, + "loss": 1.0982, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 1.8125, + "learning_rate": 2.6217478318879252e-05, + "loss": 1.0391, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 0.97265625, + "learning_rate": 2.6284189459639764e-05, + "loss": 1.0854, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 1.0859375, + "learning_rate": 2.635090060040027e-05, + "loss": 1.2175, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 0.75, + "learning_rate": 2.6417611741160774e-05, + "loss": 1.0554, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 0.87890625, + "learning_rate": 2.6484322881921282e-05, + "loss": 1.4038, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 0.921875, + "learning_rate": 2.6551034022681787e-05, + "loss": 1.1695, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 0.8984375, + "learning_rate": 2.66177451634423e-05, + "loss": 1.0923, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 0.75390625, + "learning_rate": 2.6684456304202804e-05, + "loss": 0.9614, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 0.61328125, + "learning_rate": 2.675116744496331e-05, + "loss": 1.0043, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 0.703125, + "learning_rate": 2.6817878585723814e-05, + "loss": 1.1966, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 0.90234375, + "learning_rate": 2.6884589726484323e-05, + "loss": 1.0894, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 0.69140625, + "learning_rate": 2.6951300867244835e-05, + "loss": 1.1961, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 1.5546875, + "learning_rate": 2.701801200800534e-05, + "loss": 1.0546, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 0.79296875, + "learning_rate": 2.7084723148765845e-05, + "loss": 1.3094, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 0.71484375, + "learning_rate": 2.715143428952635e-05, + "loss": 1.1177, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 0.8984375, + "learning_rate": 2.7218145430286858e-05, + "loss": 1.0102, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 0.64453125, + "learning_rate": 2.7284856571047366e-05, + "loss": 1.0116, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 0.67578125, + "learning_rate": 2.7351567711807875e-05, + "loss": 1.2684, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 0.7265625, + "learning_rate": 2.741827885256838e-05, + "loss": 0.9479, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 0.7734375, + "learning_rate": 2.7484989993328885e-05, + "loss": 1.0273, + "step": 412 + }, + { + "epoch": 0.06, + "grad_norm": 0.96484375, + "learning_rate": 2.755170113408939e-05, + "loss": 1.0696, + "step": 413 + }, + { + "epoch": 0.06, + "grad_norm": 0.74609375, + "learning_rate": 2.7618412274849902e-05, + "loss": 0.8535, + "step": 414 + }, + { + "epoch": 0.06, + "grad_norm": 0.77734375, + "learning_rate": 2.768512341561041e-05, + "loss": 0.8429, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 0.875, + "learning_rate": 2.7751834556370915e-05, + "loss": 1.3057, + "step": 416 + }, + { + "epoch": 0.06, + "grad_norm": 0.64453125, + "learning_rate": 2.781854569713142e-05, + "loss": 1.1574, + "step": 417 + }, + { + "epoch": 0.06, + "grad_norm": 0.80078125, + "learning_rate": 2.7885256837891925e-05, + "loss": 0.9367, + "step": 418 + }, + { + "epoch": 0.06, + "grad_norm": 0.93359375, + "learning_rate": 2.7951967978652437e-05, + "loss": 1.0787, + "step": 419 + }, + { + "epoch": 0.06, + "grad_norm": 0.796875, + "learning_rate": 2.8018679119412942e-05, + "loss": 0.8731, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 0.70703125, + "learning_rate": 2.808539026017345e-05, + "loss": 1.098, + "step": 421 + }, + { + "epoch": 0.06, + "grad_norm": 0.7421875, + "learning_rate": 2.8152101400933956e-05, + "loss": 0.8363, + "step": 422 + }, + { + "epoch": 0.06, + "grad_norm": 0.71875, + "learning_rate": 2.8218812541694467e-05, + "loss": 0.9245, + "step": 423 + }, + { + "epoch": 0.06, + "grad_norm": 0.87109375, + "learning_rate": 2.8285523682454973e-05, + "loss": 1.1074, + "step": 424 + }, + { + "epoch": 0.06, + "grad_norm": 0.671875, + "learning_rate": 2.8352234823215478e-05, + "loss": 1.068, + "step": 425 + }, + { + "epoch": 0.06, + "grad_norm": 0.703125, + "learning_rate": 2.8418945963975986e-05, + "loss": 0.8535, + "step": 426 + }, + { + "epoch": 0.06, + "grad_norm": 0.8671875, + "learning_rate": 2.848565710473649e-05, + "loss": 1.2947, + "step": 427 + }, + { + "epoch": 0.06, + "grad_norm": 0.828125, + "learning_rate": 2.8552368245497003e-05, + "loss": 0.9517, + "step": 428 + }, + { + "epoch": 0.06, + "grad_norm": 0.89453125, + "learning_rate": 2.8619079386257508e-05, + "loss": 1.1073, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 0.6953125, + "learning_rate": 2.8685790527018013e-05, + "loss": 0.8538, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 0.734375, + "learning_rate": 2.875250166777852e-05, + "loss": 0.7832, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 0.8828125, + "learning_rate": 2.8819212808539026e-05, + "loss": 0.9534, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 0.7421875, + "learning_rate": 2.8885923949299538e-05, + "loss": 1.3369, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 0.85546875, + "learning_rate": 2.8952635090060043e-05, + "loss": 0.9788, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 0.65234375, + "learning_rate": 2.9019346230820548e-05, + "loss": 1.2445, + "step": 435 + }, + { + "epoch": 0.06, + "grad_norm": 0.7734375, + "learning_rate": 2.9086057371581053e-05, + "loss": 0.765, + "step": 436 + }, + { + "epoch": 0.06, + "grad_norm": 0.62109375, + "learning_rate": 2.9152768512341562e-05, + "loss": 1.0292, + "step": 437 + }, + { + "epoch": 0.06, + "grad_norm": 0.65234375, + "learning_rate": 2.9219479653102074e-05, + "loss": 1.0944, + "step": 438 + }, + { + "epoch": 0.06, + "grad_norm": 0.9921875, + "learning_rate": 2.928619079386258e-05, + "loss": 0.899, + "step": 439 + }, + { + "epoch": 0.06, + "grad_norm": 0.96484375, + "learning_rate": 2.9352901934623084e-05, + "loss": 1.1097, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 0.7265625, + "learning_rate": 2.941961307538359e-05, + "loss": 1.3045, + "step": 441 + }, + { + "epoch": 0.06, + "grad_norm": 0.9609375, + "learning_rate": 2.9486324216144097e-05, + "loss": 1.1689, + "step": 442 + }, + { + "epoch": 0.06, + "grad_norm": 0.86328125, + "learning_rate": 2.9553035356904605e-05, + "loss": 1.0692, + "step": 443 + }, + { + "epoch": 0.06, + "grad_norm": 1.0703125, + "learning_rate": 2.9619746497665114e-05, + "loss": 1.2416, + "step": 444 + }, + { + "epoch": 0.06, + "grad_norm": 0.6953125, + "learning_rate": 2.968645763842562e-05, + "loss": 1.1509, + "step": 445 + }, + { + "epoch": 0.06, + "grad_norm": 1.0078125, + "learning_rate": 2.9753168779186124e-05, + "loss": 1.3257, + "step": 446 + }, + { + "epoch": 0.06, + "grad_norm": 0.78515625, + "learning_rate": 2.981987991994663e-05, + "loss": 1.6123, + "step": 447 + }, + { + "epoch": 0.06, + "grad_norm": 0.6953125, + "learning_rate": 2.988659106070714e-05, + "loss": 0.9789, + "step": 448 + }, + { + "epoch": 0.06, + "grad_norm": 0.87890625, + "learning_rate": 2.995330220146765e-05, + "loss": 1.3579, + "step": 449 + }, + { + "epoch": 0.06, + "grad_norm": 0.79296875, + "learning_rate": 3.0020013342228154e-05, + "loss": 1.04, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 0.59375, + "learning_rate": 3.008672448298866e-05, + "loss": 1.3721, + "step": 451 + }, + { + "epoch": 0.06, + "grad_norm": 0.84375, + "learning_rate": 3.0153435623749164e-05, + "loss": 0.9109, + "step": 452 + }, + { + "epoch": 0.06, + "grad_norm": 0.71484375, + "learning_rate": 3.0220146764509676e-05, + "loss": 0.8191, + "step": 453 + }, + { + "epoch": 0.06, + "grad_norm": 0.91796875, + "learning_rate": 3.028685790527018e-05, + "loss": 0.8085, + "step": 454 + }, + { + "epoch": 0.06, + "grad_norm": 0.6953125, + "learning_rate": 3.035356904603069e-05, + "loss": 1.1782, + "step": 455 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 3.0420280186791195e-05, + "loss": 1.1191, + "step": 456 + }, + { + "epoch": 0.06, + "grad_norm": 0.640625, + "learning_rate": 3.04869913275517e-05, + "loss": 1.1137, + "step": 457 + }, + { + "epoch": 0.06, + "grad_norm": 0.8046875, + "learning_rate": 3.0553702468312215e-05, + "loss": 1.1259, + "step": 458 + }, + { + "epoch": 0.06, + "grad_norm": 0.765625, + "learning_rate": 3.062041360907272e-05, + "loss": 1.3593, + "step": 459 + }, + { + "epoch": 0.06, + "grad_norm": 0.85546875, + "learning_rate": 3.0687124749833225e-05, + "loss": 1.0831, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 0.7265625, + "learning_rate": 3.075383589059373e-05, + "loss": 0.9972, + "step": 461 + }, + { + "epoch": 0.06, + "grad_norm": 0.71875, + "learning_rate": 3.0820547031354235e-05, + "loss": 0.9039, + "step": 462 + }, + { + "epoch": 0.06, + "grad_norm": 1.1328125, + "learning_rate": 3.088725817211475e-05, + "loss": 1.1334, + "step": 463 + }, + { + "epoch": 0.06, + "grad_norm": 0.7890625, + "learning_rate": 3.095396931287525e-05, + "loss": 1.1265, + "step": 464 + }, + { + "epoch": 0.06, + "grad_norm": 0.95703125, + "learning_rate": 3.102068045363576e-05, + "loss": 1.2504, + "step": 465 + }, + { + "epoch": 0.06, + "grad_norm": 0.6796875, + "learning_rate": 3.108739159439626e-05, + "loss": 1.0953, + "step": 466 + }, + { + "epoch": 0.06, + "grad_norm": 0.81640625, + "learning_rate": 3.1154102735156774e-05, + "loss": 0.9211, + "step": 467 + }, + { + "epoch": 0.06, + "grad_norm": 0.7890625, + "learning_rate": 3.122081387591728e-05, + "loss": 1.1412, + "step": 468 + }, + { + "epoch": 0.06, + "grad_norm": 0.859375, + "learning_rate": 3.128752501667779e-05, + "loss": 0.8669, + "step": 469 + }, + { + "epoch": 0.06, + "grad_norm": 0.890625, + "learning_rate": 3.1354236157438296e-05, + "loss": 1.3109, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 0.76171875, + "learning_rate": 3.14209472981988e-05, + "loss": 0.9461, + "step": 471 + }, + { + "epoch": 0.06, + "grad_norm": 0.55859375, + "learning_rate": 3.1487658438959306e-05, + "loss": 1.0797, + "step": 472 + }, + { + "epoch": 0.06, + "grad_norm": 0.9609375, + "learning_rate": 3.155436957971982e-05, + "loss": 0.8751, + "step": 473 + }, + { + "epoch": 0.06, + "grad_norm": 0.76171875, + "learning_rate": 3.162108072048032e-05, + "loss": 1.0549, + "step": 474 + }, + { + "epoch": 0.06, + "grad_norm": 0.82421875, + "learning_rate": 3.168779186124083e-05, + "loss": 0.8502, + "step": 475 + }, + { + "epoch": 0.06, + "grad_norm": 0.77734375, + "learning_rate": 3.175450300200133e-05, + "loss": 0.9928, + "step": 476 + }, + { + "epoch": 0.06, + "grad_norm": 0.67578125, + "learning_rate": 3.182121414276184e-05, + "loss": 1.0494, + "step": 477 + }, + { + "epoch": 0.06, + "grad_norm": 0.9765625, + "learning_rate": 3.188792528352235e-05, + "loss": 1.1641, + "step": 478 + }, + { + "epoch": 0.06, + "grad_norm": 0.76171875, + "learning_rate": 3.1954636424282855e-05, + "loss": 0.9275, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 0.79296875, + "learning_rate": 3.2021347565043366e-05, + "loss": 1.0586, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 0.83203125, + "learning_rate": 3.208805870580387e-05, + "loss": 0.9972, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 0.73046875, + "learning_rate": 3.2154769846564376e-05, + "loss": 0.9419, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 1.1015625, + "learning_rate": 3.222148098732489e-05, + "loss": 1.0871, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 1.171875, + "learning_rate": 3.228819212808539e-05, + "loss": 1.3643, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 0.7890625, + "learning_rate": 3.23549032688459e-05, + "loss": 1.4566, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 0.98828125, + "learning_rate": 3.2421614409606403e-05, + "loss": 0.9943, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 0.66796875, + "learning_rate": 3.248832555036691e-05, + "loss": 0.904, + "step": 487 + }, + { + "epoch": 0.07, + "grad_norm": 0.9296875, + "learning_rate": 3.255503669112742e-05, + "loss": 1.2061, + "step": 488 + }, + { + "epoch": 0.07, + "grad_norm": 1.921875, + "learning_rate": 3.2621747831887925e-05, + "loss": 1.2211, + "step": 489 + }, + { + "epoch": 0.07, + "grad_norm": 0.80078125, + "learning_rate": 3.268845897264843e-05, + "loss": 1.0336, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 0.91015625, + "learning_rate": 3.275517011340894e-05, + "loss": 0.9253, + "step": 491 + }, + { + "epoch": 0.07, + "grad_norm": 0.84375, + "learning_rate": 3.282188125416945e-05, + "loss": 0.8956, + "step": 492 + }, + { + "epoch": 0.07, + "grad_norm": 1.0390625, + "learning_rate": 3.288859239492996e-05, + "loss": 1.2526, + "step": 493 + }, + { + "epoch": 0.07, + "grad_norm": 0.72265625, + "learning_rate": 3.2955303535690464e-05, + "loss": 1.1851, + "step": 494 + }, + { + "epoch": 0.07, + "grad_norm": 1.1171875, + "learning_rate": 3.302201467645097e-05, + "loss": 0.9722, + "step": 495 + }, + { + "epoch": 0.07, + "grad_norm": 0.8671875, + "learning_rate": 3.3088725817211474e-05, + "loss": 0.8611, + "step": 496 + }, + { + "epoch": 0.07, + "grad_norm": 0.69140625, + "learning_rate": 3.315543695797198e-05, + "loss": 1.2161, + "step": 497 + }, + { + "epoch": 0.07, + "grad_norm": 1.0703125, + "learning_rate": 3.322214809873249e-05, + "loss": 1.4059, + "step": 498 + }, + { + "epoch": 0.07, + "grad_norm": 0.70703125, + "learning_rate": 3.3288859239492996e-05, + "loss": 1.0321, + "step": 499 + }, + { + "epoch": 0.07, + "grad_norm": 0.7109375, + "learning_rate": 3.33555703802535e-05, + "loss": 1.3008, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 0.859375, + "learning_rate": 3.342228152101401e-05, + "loss": 0.9981, + "step": 501 + }, + { + "epoch": 0.07, + "grad_norm": 0.859375, + "learning_rate": 3.348899266177452e-05, + "loss": 0.9681, + "step": 502 + }, + { + "epoch": 0.07, + "grad_norm": 0.90234375, + "learning_rate": 3.355570380253503e-05, + "loss": 0.9498, + "step": 503 + }, + { + "epoch": 0.07, + "grad_norm": 0.875, + "learning_rate": 3.3622414943295535e-05, + "loss": 1.0536, + "step": 504 + }, + { + "epoch": 0.07, + "grad_norm": 0.76171875, + "learning_rate": 3.368912608405604e-05, + "loss": 1.0306, + "step": 505 + }, + { + "epoch": 0.07, + "grad_norm": 0.9375, + "learning_rate": 3.3755837224816545e-05, + "loss": 1.1125, + "step": 506 + }, + { + "epoch": 0.07, + "grad_norm": 0.62109375, + "learning_rate": 3.382254836557705e-05, + "loss": 1.1324, + "step": 507 + }, + { + "epoch": 0.07, + "grad_norm": 0.8515625, + "learning_rate": 3.388925950633756e-05, + "loss": 1.0125, + "step": 508 + }, + { + "epoch": 0.07, + "grad_norm": 0.78515625, + "learning_rate": 3.395597064709807e-05, + "loss": 0.954, + "step": 509 + }, + { + "epoch": 0.07, + "grad_norm": 0.703125, + "learning_rate": 3.402268178785857e-05, + "loss": 0.839, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 0.76953125, + "learning_rate": 3.408939292861908e-05, + "loss": 0.945, + "step": 511 + }, + { + "epoch": 0.07, + "grad_norm": 0.82421875, + "learning_rate": 3.415610406937959e-05, + "loss": 1.0228, + "step": 512 + }, + { + "epoch": 0.07, + "grad_norm": 0.8671875, + "learning_rate": 3.4222815210140094e-05, + "loss": 1.1981, + "step": 513 + }, + { + "epoch": 0.07, + "grad_norm": 0.8203125, + "learning_rate": 3.4289526350900605e-05, + "loss": 1.1283, + "step": 514 + }, + { + "epoch": 0.07, + "grad_norm": 0.7890625, + "learning_rate": 3.435623749166111e-05, + "loss": 1.0646, + "step": 515 + }, + { + "epoch": 0.07, + "grad_norm": 0.87109375, + "learning_rate": 3.4422948632421615e-05, + "loss": 1.001, + "step": 516 + }, + { + "epoch": 0.07, + "grad_norm": 0.59375, + "learning_rate": 3.448965977318213e-05, + "loss": 1.0601, + "step": 517 + }, + { + "epoch": 0.07, + "grad_norm": 1.171875, + "learning_rate": 3.455637091394263e-05, + "loss": 0.678, + "step": 518 + }, + { + "epoch": 0.07, + "grad_norm": 0.609375, + "learning_rate": 3.462308205470314e-05, + "loss": 1.2515, + "step": 519 + }, + { + "epoch": 0.07, + "grad_norm": 0.76953125, + "learning_rate": 3.468979319546364e-05, + "loss": 1.237, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 1.0078125, + "learning_rate": 3.475650433622415e-05, + "loss": 0.9508, + "step": 521 + }, + { + "epoch": 0.07, + "grad_norm": 0.734375, + "learning_rate": 3.482321547698466e-05, + "loss": 0.8867, + "step": 522 + }, + { + "epoch": 0.07, + "grad_norm": 0.8359375, + "learning_rate": 3.4889926617745164e-05, + "loss": 0.7537, + "step": 523 + }, + { + "epoch": 0.07, + "grad_norm": 0.85546875, + "learning_rate": 3.495663775850567e-05, + "loss": 0.7599, + "step": 524 + }, + { + "epoch": 0.07, + "grad_norm": 0.734375, + "learning_rate": 3.502334889926618e-05, + "loss": 1.0078, + "step": 525 + }, + { + "epoch": 0.07, + "grad_norm": 0.81640625, + "learning_rate": 3.5090060040026686e-05, + "loss": 0.8893, + "step": 526 + }, + { + "epoch": 0.07, + "grad_norm": 0.91796875, + "learning_rate": 3.51567711807872e-05, + "loss": 1.4304, + "step": 527 + }, + { + "epoch": 0.07, + "grad_norm": 0.69921875, + "learning_rate": 3.52234823215477e-05, + "loss": 0.8263, + "step": 528 + }, + { + "epoch": 0.07, + "grad_norm": 1.0546875, + "learning_rate": 3.529019346230821e-05, + "loss": 0.9317, + "step": 529 + }, + { + "epoch": 0.07, + "grad_norm": 1.109375, + "learning_rate": 3.535690460306871e-05, + "loss": 0.948, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 0.7734375, + "learning_rate": 3.542361574382922e-05, + "loss": 0.8784, + "step": 531 + }, + { + "epoch": 0.07, + "grad_norm": 1.09375, + "learning_rate": 3.549032688458973e-05, + "loss": 0.8846, + "step": 532 + }, + { + "epoch": 0.07, + "grad_norm": 0.7265625, + "learning_rate": 3.5557038025350235e-05, + "loss": 1.0468, + "step": 533 + }, + { + "epoch": 0.07, + "grad_norm": 0.99609375, + "learning_rate": 3.562374916611074e-05, + "loss": 1.0854, + "step": 534 + }, + { + "epoch": 0.07, + "grad_norm": 0.9140625, + "learning_rate": 3.5690460306871245e-05, + "loss": 1.011, + "step": 535 + }, + { + "epoch": 0.07, + "grad_norm": 1.1171875, + "learning_rate": 3.575717144763176e-05, + "loss": 0.9337, + "step": 536 + }, + { + "epoch": 0.07, + "grad_norm": 0.76171875, + "learning_rate": 3.582388258839227e-05, + "loss": 0.9783, + "step": 537 + }, + { + "epoch": 0.07, + "grad_norm": 1.0234375, + "learning_rate": 3.5890593729152774e-05, + "loss": 1.1241, + "step": 538 + }, + { + "epoch": 0.07, + "grad_norm": 0.765625, + "learning_rate": 3.595730486991328e-05, + "loss": 1.0288, + "step": 539 + }, + { + "epoch": 0.07, + "grad_norm": 0.74609375, + "learning_rate": 3.6024016010673784e-05, + "loss": 0.8977, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046875, + "learning_rate": 3.609072715143429e-05, + "loss": 0.8951, + "step": 541 + }, + { + "epoch": 0.07, + "grad_norm": 0.91015625, + "learning_rate": 3.61574382921948e-05, + "loss": 0.8616, + "step": 542 + }, + { + "epoch": 0.07, + "grad_norm": 0.74609375, + "learning_rate": 3.6224149432955306e-05, + "loss": 1.1327, + "step": 543 + }, + { + "epoch": 0.07, + "grad_norm": 0.81640625, + "learning_rate": 3.629086057371581e-05, + "loss": 0.8483, + "step": 544 + }, + { + "epoch": 0.07, + "grad_norm": 0.96484375, + "learning_rate": 3.6357571714476316e-05, + "loss": 0.8049, + "step": 545 + }, + { + "epoch": 0.07, + "grad_norm": 0.69921875, + "learning_rate": 3.642428285523683e-05, + "loss": 0.8834, + "step": 546 + }, + { + "epoch": 0.07, + "grad_norm": 0.984375, + "learning_rate": 3.649099399599733e-05, + "loss": 0.9641, + "step": 547 + }, + { + "epoch": 0.07, + "grad_norm": 0.97265625, + "learning_rate": 3.6557705136757844e-05, + "loss": 1.1491, + "step": 548 + }, + { + "epoch": 0.07, + "grad_norm": 0.84375, + "learning_rate": 3.662441627751835e-05, + "loss": 0.977, + "step": 549 + }, + { + "epoch": 0.07, + "grad_norm": 0.63671875, + "learning_rate": 3.6691127418278855e-05, + "loss": 1.0946, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 0.75390625, + "learning_rate": 3.675783855903936e-05, + "loss": 0.9575, + "step": 551 + }, + { + "epoch": 0.07, + "grad_norm": 0.734375, + "learning_rate": 3.682454969979987e-05, + "loss": 1.0073, + "step": 552 + }, + { + "epoch": 0.07, + "grad_norm": 0.81640625, + "learning_rate": 3.6891260840560376e-05, + "loss": 1.2721, + "step": 553 + }, + { + "epoch": 0.07, + "grad_norm": 0.90625, + "learning_rate": 3.695797198132088e-05, + "loss": 0.9454, + "step": 554 + }, + { + "epoch": 0.07, + "grad_norm": 0.65625, + "learning_rate": 3.7024683122081386e-05, + "loss": 1.0553, + "step": 555 + }, + { + "epoch": 0.07, + "grad_norm": 0.69140625, + "learning_rate": 3.709139426284189e-05, + "loss": 0.9666, + "step": 556 + }, + { + "epoch": 0.07, + "grad_norm": 0.69140625, + "learning_rate": 3.71581054036024e-05, + "loss": 1.057, + "step": 557 + }, + { + "epoch": 0.07, + "grad_norm": 0.70703125, + "learning_rate": 3.722481654436291e-05, + "loss": 0.8368, + "step": 558 + }, + { + "epoch": 0.07, + "grad_norm": 0.79296875, + "learning_rate": 3.729152768512342e-05, + "loss": 1.2978, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 0.796875, + "learning_rate": 3.7358238825883925e-05, + "loss": 0.8725, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 0.75390625, + "learning_rate": 3.742494996664443e-05, + "loss": 0.8168, + "step": 561 + }, + { + "epoch": 0.07, + "grad_norm": 0.62890625, + "learning_rate": 3.749166110740494e-05, + "loss": 1.0638, + "step": 562 + }, + { + "epoch": 0.08, + "grad_norm": 1.109375, + "learning_rate": 3.755837224816545e-05, + "loss": 0.8186, + "step": 563 + }, + { + "epoch": 0.08, + "grad_norm": 0.96484375, + "learning_rate": 3.762508338892595e-05, + "loss": 1.2083, + "step": 564 + }, + { + "epoch": 0.08, + "grad_norm": 1.234375, + "learning_rate": 3.769179452968646e-05, + "loss": 0.901, + "step": 565 + }, + { + "epoch": 0.08, + "grad_norm": 0.80078125, + "learning_rate": 3.775850567044696e-05, + "loss": 1.0387, + "step": 566 + }, + { + "epoch": 0.08, + "grad_norm": 0.828125, + "learning_rate": 3.7825216811207474e-05, + "loss": 0.9768, + "step": 567 + }, + { + "epoch": 0.08, + "grad_norm": 0.78125, + "learning_rate": 3.789192795196798e-05, + "loss": 1.0348, + "step": 568 + }, + { + "epoch": 0.08, + "grad_norm": 0.69921875, + "learning_rate": 3.7958639092728484e-05, + "loss": 0.9591, + "step": 569 + }, + { + "epoch": 0.08, + "grad_norm": 0.84375, + "learning_rate": 3.8025350233488996e-05, + "loss": 1.062, + "step": 570 + }, + { + "epoch": 0.08, + "grad_norm": 0.75390625, + "learning_rate": 3.80920613742495e-05, + "loss": 0.8806, + "step": 571 + }, + { + "epoch": 0.08, + "grad_norm": 0.67578125, + "learning_rate": 3.815877251501001e-05, + "loss": 0.8591, + "step": 572 + }, + { + "epoch": 0.08, + "grad_norm": 0.8125, + "learning_rate": 3.822548365577052e-05, + "loss": 0.6587, + "step": 573 + }, + { + "epoch": 0.08, + "grad_norm": 0.6796875, + "learning_rate": 3.829219479653102e-05, + "loss": 0.9169, + "step": 574 + }, + { + "epoch": 0.08, + "grad_norm": 0.90234375, + "learning_rate": 3.835890593729153e-05, + "loss": 0.8298, + "step": 575 + }, + { + "epoch": 0.08, + "grad_norm": 0.76171875, + "learning_rate": 3.842561707805203e-05, + "loss": 0.8593, + "step": 576 + }, + { + "epoch": 0.08, + "grad_norm": 0.7734375, + "learning_rate": 3.8492328218812545e-05, + "loss": 0.9234, + "step": 577 + }, + { + "epoch": 0.08, + "grad_norm": 0.9765625, + "learning_rate": 3.855903935957305e-05, + "loss": 1.0352, + "step": 578 + }, + { + "epoch": 0.08, + "grad_norm": 1.03125, + "learning_rate": 3.8625750500333555e-05, + "loss": 0.7773, + "step": 579 + }, + { + "epoch": 0.08, + "grad_norm": 0.92578125, + "learning_rate": 3.8692461641094067e-05, + "loss": 1.312, + "step": 580 + }, + { + "epoch": 0.08, + "grad_norm": 0.86328125, + "learning_rate": 3.875917278185457e-05, + "loss": 1.1868, + "step": 581 + }, + { + "epoch": 0.08, + "grad_norm": 0.7890625, + "learning_rate": 3.8825883922615083e-05, + "loss": 1.029, + "step": 582 + }, + { + "epoch": 0.08, + "grad_norm": 0.97265625, + "learning_rate": 3.889259506337559e-05, + "loss": 0.7969, + "step": 583 + }, + { + "epoch": 0.08, + "grad_norm": 1.265625, + "learning_rate": 3.8959306204136094e-05, + "loss": 0.8756, + "step": 584 + }, + { + "epoch": 0.08, + "grad_norm": 1.0, + "learning_rate": 3.90260173448966e-05, + "loss": 1.2286, + "step": 585 + }, + { + "epoch": 0.08, + "grad_norm": 0.98828125, + "learning_rate": 3.9092728485657104e-05, + "loss": 0.5999, + "step": 586 + }, + { + "epoch": 0.08, + "grad_norm": 1.59375, + "learning_rate": 3.9159439626417615e-05, + "loss": 1.1073, + "step": 587 + }, + { + "epoch": 0.08, + "grad_norm": 0.69140625, + "learning_rate": 3.922615076717812e-05, + "loss": 1.0085, + "step": 588 + }, + { + "epoch": 0.08, + "grad_norm": 0.90234375, + "learning_rate": 3.9292861907938625e-05, + "loss": 0.9062, + "step": 589 + }, + { + "epoch": 0.08, + "grad_norm": 0.7578125, + "learning_rate": 3.935957304869913e-05, + "loss": 1.0628, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 0.8203125, + "learning_rate": 3.942628418945964e-05, + "loss": 1.1745, + "step": 591 + }, + { + "epoch": 0.08, + "grad_norm": 0.7421875, + "learning_rate": 3.949299533022015e-05, + "loss": 0.9823, + "step": 592 + }, + { + "epoch": 0.08, + "grad_norm": 0.79296875, + "learning_rate": 3.955970647098066e-05, + "loss": 0.9636, + "step": 593 + }, + { + "epoch": 0.08, + "grad_norm": 1.296875, + "learning_rate": 3.9626417611741164e-05, + "loss": 1.0196, + "step": 594 + }, + { + "epoch": 0.08, + "grad_norm": 0.90625, + "learning_rate": 3.969312875250167e-05, + "loss": 0.9245, + "step": 595 + }, + { + "epoch": 0.08, + "grad_norm": 0.8203125, + "learning_rate": 3.9759839893262174e-05, + "loss": 1.0824, + "step": 596 + }, + { + "epoch": 0.08, + "grad_norm": 1.109375, + "learning_rate": 3.9826551034022686e-05, + "loss": 1.4418, + "step": 597 + }, + { + "epoch": 0.08, + "grad_norm": 0.859375, + "learning_rate": 3.989326217478319e-05, + "loss": 1.1553, + "step": 598 + }, + { + "epoch": 0.08, + "grad_norm": 0.92578125, + "learning_rate": 3.9959973315543696e-05, + "loss": 1.0627, + "step": 599 + }, + { + "epoch": 0.08, + "grad_norm": 0.77734375, + "learning_rate": 4.00266844563042e-05, + "loss": 0.9896, + "step": 600 + }, + { + "epoch": 0.08, + "grad_norm": 0.71875, + "learning_rate": 4.0093395597064706e-05, + "loss": 0.8194, + "step": 601 + }, + { + "epoch": 0.08, + "grad_norm": 0.875, + "learning_rate": 4.016010673782522e-05, + "loss": 0.9796, + "step": 602 + }, + { + "epoch": 0.08, + "grad_norm": 0.796875, + "learning_rate": 4.022681787858572e-05, + "loss": 0.8761, + "step": 603 + }, + { + "epoch": 0.08, + "grad_norm": 0.80078125, + "learning_rate": 4.0293529019346235e-05, + "loss": 0.7736, + "step": 604 + }, + { + "epoch": 0.08, + "grad_norm": 0.7734375, + "learning_rate": 4.036024016010674e-05, + "loss": 1.2751, + "step": 605 + }, + { + "epoch": 0.08, + "grad_norm": 0.84375, + "learning_rate": 4.0426951300867245e-05, + "loss": 0.9506, + "step": 606 + }, + { + "epoch": 0.08, + "grad_norm": 0.796875, + "learning_rate": 4.049366244162776e-05, + "loss": 0.9022, + "step": 607 + }, + { + "epoch": 0.08, + "grad_norm": 1.171875, + "learning_rate": 4.056037358238826e-05, + "loss": 1.1103, + "step": 608 + }, + { + "epoch": 0.08, + "grad_norm": 0.7109375, + "learning_rate": 4.062708472314877e-05, + "loss": 0.9041, + "step": 609 + }, + { + "epoch": 0.08, + "grad_norm": 1.25, + "learning_rate": 4.069379586390927e-05, + "loss": 1.1495, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 0.92578125, + "learning_rate": 4.0760507004669784e-05, + "loss": 1.1835, + "step": 611 + }, + { + "epoch": 0.08, + "grad_norm": 0.63671875, + "learning_rate": 4.082721814543029e-05, + "loss": 1.1115, + "step": 612 + }, + { + "epoch": 0.08, + "grad_norm": 0.921875, + "learning_rate": 4.0893929286190794e-05, + "loss": 1.1, + "step": 613 + }, + { + "epoch": 0.08, + "grad_norm": 0.88671875, + "learning_rate": 4.09606404269513e-05, + "loss": 0.7937, + "step": 614 + }, + { + "epoch": 0.08, + "grad_norm": 1.046875, + "learning_rate": 4.102735156771181e-05, + "loss": 0.9753, + "step": 615 + }, + { + "epoch": 0.08, + "grad_norm": 1.15625, + "learning_rate": 4.109406270847232e-05, + "loss": 1.0982, + "step": 616 + }, + { + "epoch": 0.08, + "grad_norm": 0.84375, + "learning_rate": 4.116077384923283e-05, + "loss": 0.8303, + "step": 617 + }, + { + "epoch": 0.08, + "grad_norm": 0.9375, + "learning_rate": 4.122748498999333e-05, + "loss": 0.971, + "step": 618 + }, + { + "epoch": 0.08, + "grad_norm": 0.75, + "learning_rate": 4.129419613075384e-05, + "loss": 0.8104, + "step": 619 + }, + { + "epoch": 0.08, + "grad_norm": 0.85546875, + "learning_rate": 4.136090727151434e-05, + "loss": 1.0032, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 0.84375, + "learning_rate": 4.1427618412274854e-05, + "loss": 0.846, + "step": 621 + }, + { + "epoch": 0.08, + "grad_norm": 0.69921875, + "learning_rate": 4.149432955303536e-05, + "loss": 1.1394, + "step": 622 + }, + { + "epoch": 0.08, + "grad_norm": 0.76171875, + "learning_rate": 4.1561040693795865e-05, + "loss": 0.8926, + "step": 623 + }, + { + "epoch": 0.08, + "grad_norm": 0.9765625, + "learning_rate": 4.162775183455637e-05, + "loss": 0.7852, + "step": 624 + }, + { + "epoch": 0.08, + "grad_norm": 0.890625, + "learning_rate": 4.169446297531688e-05, + "loss": 0.8402, + "step": 625 + }, + { + "epoch": 0.08, + "grad_norm": 0.82421875, + "learning_rate": 4.1761174116077386e-05, + "loss": 0.697, + "step": 626 + }, + { + "epoch": 0.08, + "grad_norm": 0.8125, + "learning_rate": 4.18278852568379e-05, + "loss": 1.1239, + "step": 627 + }, + { + "epoch": 0.08, + "grad_norm": 0.66015625, + "learning_rate": 4.18945963975984e-05, + "loss": 0.9261, + "step": 628 + }, + { + "epoch": 0.08, + "grad_norm": 1.2109375, + "learning_rate": 4.196130753835891e-05, + "loss": 0.9673, + "step": 629 + }, + { + "epoch": 0.08, + "grad_norm": 0.8203125, + "learning_rate": 4.202801867911941e-05, + "loss": 0.989, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 0.75390625, + "learning_rate": 4.2094729819879925e-05, + "loss": 0.8449, + "step": 631 + }, + { + "epoch": 0.08, + "grad_norm": 0.8125, + "learning_rate": 4.216144096064043e-05, + "loss": 0.9594, + "step": 632 + }, + { + "epoch": 0.08, + "grad_norm": 1.0, + "learning_rate": 4.2228152101400935e-05, + "loss": 0.766, + "step": 633 + }, + { + "epoch": 0.08, + "grad_norm": 0.7734375, + "learning_rate": 4.229486324216144e-05, + "loss": 0.6407, + "step": 634 + }, + { + "epoch": 0.08, + "grad_norm": 0.71875, + "learning_rate": 4.2361574382921945e-05, + "loss": 1.0874, + "step": 635 + }, + { + "epoch": 0.08, + "grad_norm": 0.73046875, + "learning_rate": 4.242828552368246e-05, + "loss": 1.1712, + "step": 636 + }, + { + "epoch": 0.09, + "grad_norm": 0.875, + "learning_rate": 4.249499666444296e-05, + "loss": 0.8735, + "step": 637 + }, + { + "epoch": 0.09, + "grad_norm": 0.71484375, + "learning_rate": 4.2561707805203474e-05, + "loss": 1.0167, + "step": 638 + }, + { + "epoch": 0.09, + "grad_norm": 0.859375, + "learning_rate": 4.262841894596398e-05, + "loss": 0.7336, + "step": 639 + }, + { + "epoch": 0.09, + "grad_norm": 0.87109375, + "learning_rate": 4.2695130086724484e-05, + "loss": 0.8086, + "step": 640 + }, + { + "epoch": 0.09, + "grad_norm": 0.7578125, + "learning_rate": 4.2761841227484996e-05, + "loss": 1.1047, + "step": 641 + }, + { + "epoch": 0.09, + "grad_norm": 0.7890625, + "learning_rate": 4.28285523682455e-05, + "loss": 0.9262, + "step": 642 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 4.2895263509006006e-05, + "loss": 0.9155, + "step": 643 + }, + { + "epoch": 0.09, + "grad_norm": 0.9140625, + "learning_rate": 4.296197464976651e-05, + "loss": 0.9079, + "step": 644 + }, + { + "epoch": 0.09, + "grad_norm": 0.91796875, + "learning_rate": 4.3028685790527016e-05, + "loss": 0.9922, + "step": 645 + }, + { + "epoch": 0.09, + "grad_norm": 0.875, + "learning_rate": 4.309539693128753e-05, + "loss": 0.921, + "step": 646 + }, + { + "epoch": 0.09, + "grad_norm": 0.86328125, + "learning_rate": 4.316210807204803e-05, + "loss": 0.6631, + "step": 647 + }, + { + "epoch": 0.09, + "grad_norm": 0.703125, + "learning_rate": 4.322881921280854e-05, + "loss": 0.6981, + "step": 648 + }, + { + "epoch": 0.09, + "grad_norm": 0.87890625, + "learning_rate": 4.329553035356905e-05, + "loss": 0.8416, + "step": 649 + }, + { + "epoch": 0.09, + "grad_norm": 0.89453125, + "learning_rate": 4.3362241494329555e-05, + "loss": 0.8942, + "step": 650 + }, + { + "epoch": 0.09, + "grad_norm": 0.60546875, + "learning_rate": 4.3428952635090067e-05, + "loss": 0.7441, + "step": 651 + }, + { + "epoch": 0.09, + "grad_norm": 1.15625, + "learning_rate": 4.349566377585057e-05, + "loss": 1.4454, + "step": 652 + }, + { + "epoch": 0.09, + "grad_norm": 0.65625, + "learning_rate": 4.3562374916611077e-05, + "loss": 0.8253, + "step": 653 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 4.362908605737158e-05, + "loss": 0.9083, + "step": 654 + }, + { + "epoch": 0.09, + "grad_norm": 0.82421875, + "learning_rate": 4.369579719813209e-05, + "loss": 1.0251, + "step": 655 + }, + { + "epoch": 0.09, + "grad_norm": 0.8515625, + "learning_rate": 4.37625083388926e-05, + "loss": 0.9287, + "step": 656 + }, + { + "epoch": 0.09, + "grad_norm": 0.7578125, + "learning_rate": 4.3829219479653104e-05, + "loss": 1.1044, + "step": 657 + }, + { + "epoch": 0.09, + "grad_norm": 0.87109375, + "learning_rate": 4.389593062041361e-05, + "loss": 0.7651, + "step": 658 + }, + { + "epoch": 0.09, + "grad_norm": 1.2265625, + "learning_rate": 4.396264176117412e-05, + "loss": 0.8787, + "step": 659 + }, + { + "epoch": 0.09, + "grad_norm": 0.7421875, + "learning_rate": 4.4029352901934625e-05, + "loss": 0.9437, + "step": 660 + }, + { + "epoch": 0.09, + "grad_norm": 1.1171875, + "learning_rate": 4.409606404269514e-05, + "loss": 1.2113, + "step": 661 + }, + { + "epoch": 0.09, + "grad_norm": 0.7890625, + "learning_rate": 4.416277518345564e-05, + "loss": 1.1219, + "step": 662 + }, + { + "epoch": 0.09, + "grad_norm": 0.99609375, + "learning_rate": 4.422948632421615e-05, + "loss": 0.6696, + "step": 663 + }, + { + "epoch": 0.09, + "grad_norm": 0.75390625, + "learning_rate": 4.429619746497665e-05, + "loss": 1.0734, + "step": 664 + }, + { + "epoch": 0.09, + "grad_norm": 0.8046875, + "learning_rate": 4.436290860573716e-05, + "loss": 1.1799, + "step": 665 + }, + { + "epoch": 0.09, + "grad_norm": 0.67578125, + "learning_rate": 4.442961974649767e-05, + "loss": 1.1277, + "step": 666 + }, + { + "epoch": 0.09, + "grad_norm": 0.62890625, + "learning_rate": 4.4496330887258174e-05, + "loss": 1.1239, + "step": 667 + }, + { + "epoch": 0.09, + "grad_norm": 0.98046875, + "learning_rate": 4.456304202801868e-05, + "loss": 0.8499, + "step": 668 + }, + { + "epoch": 0.09, + "grad_norm": 0.89453125, + "learning_rate": 4.4629753168779184e-05, + "loss": 0.9228, + "step": 669 + }, + { + "epoch": 0.09, + "grad_norm": 0.7265625, + "learning_rate": 4.4696464309539696e-05, + "loss": 1.0243, + "step": 670 + }, + { + "epoch": 0.09, + "grad_norm": 1.3515625, + "learning_rate": 4.47631754503002e-05, + "loss": 0.6358, + "step": 671 + }, + { + "epoch": 0.09, + "grad_norm": 1.359375, + "learning_rate": 4.482988659106071e-05, + "loss": 0.9669, + "step": 672 + }, + { + "epoch": 0.09, + "grad_norm": 0.6484375, + "learning_rate": 4.489659773182122e-05, + "loss": 0.565, + "step": 673 + }, + { + "epoch": 0.09, + "grad_norm": 0.77734375, + "learning_rate": 4.496330887258172e-05, + "loss": 1.0221, + "step": 674 + }, + { + "epoch": 0.09, + "grad_norm": 0.80078125, + "learning_rate": 4.503002001334223e-05, + "loss": 0.7651, + "step": 675 + }, + { + "epoch": 0.09, + "grad_norm": 0.80078125, + "learning_rate": 4.509673115410274e-05, + "loss": 1.0311, + "step": 676 + }, + { + "epoch": 0.09, + "grad_norm": 0.6796875, + "learning_rate": 4.5163442294863245e-05, + "loss": 0.9347, + "step": 677 + }, + { + "epoch": 0.09, + "grad_norm": 0.72265625, + "learning_rate": 4.523015343562375e-05, + "loss": 1.1545, + "step": 678 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 4.5296864576384255e-05, + "loss": 1.2031, + "step": 679 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 4.536357571714476e-05, + "loss": 0.8375, + "step": 680 + }, + { + "epoch": 0.09, + "grad_norm": 0.890625, + "learning_rate": 4.543028685790527e-05, + "loss": 1.0221, + "step": 681 + }, + { + "epoch": 0.09, + "grad_norm": 1.015625, + "learning_rate": 4.549699799866578e-05, + "loss": 0.5959, + "step": 682 + }, + { + "epoch": 0.09, + "grad_norm": 0.8671875, + "learning_rate": 4.556370913942629e-05, + "loss": 0.6554, + "step": 683 + }, + { + "epoch": 0.09, + "grad_norm": 0.84765625, + "learning_rate": 4.5630420280186794e-05, + "loss": 1.1484, + "step": 684 + }, + { + "epoch": 0.09, + "grad_norm": 1.4140625, + "learning_rate": 4.56971314209473e-05, + "loss": 1.2137, + "step": 685 + }, + { + "epoch": 0.09, + "grad_norm": 0.6796875, + "learning_rate": 4.576384256170781e-05, + "loss": 0.7493, + "step": 686 + }, + { + "epoch": 0.09, + "grad_norm": 0.890625, + "learning_rate": 4.5830553702468316e-05, + "loss": 0.8463, + "step": 687 + }, + { + "epoch": 0.09, + "grad_norm": 0.8828125, + "learning_rate": 4.589726484322882e-05, + "loss": 1.0789, + "step": 688 + }, + { + "epoch": 0.09, + "grad_norm": 0.78125, + "learning_rate": 4.5963975983989326e-05, + "loss": 0.9833, + "step": 689 + }, + { + "epoch": 0.09, + "grad_norm": 0.81640625, + "learning_rate": 4.603068712474983e-05, + "loss": 1.3791, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 1.0546875, + "learning_rate": 4.609739826551034e-05, + "loss": 0.8216, + "step": 691 + }, + { + "epoch": 0.09, + "grad_norm": 0.65625, + "learning_rate": 4.616410940627085e-05, + "loss": 0.7319, + "step": 692 + }, + { + "epoch": 0.09, + "grad_norm": 0.75, + "learning_rate": 4.623082054703136e-05, + "loss": 1.1291, + "step": 693 + }, + { + "epoch": 0.09, + "grad_norm": 0.84375, + "learning_rate": 4.6297531687791864e-05, + "loss": 1.1586, + "step": 694 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 4.636424282855237e-05, + "loss": 1.1304, + "step": 695 + }, + { + "epoch": 0.09, + "grad_norm": 0.8515625, + "learning_rate": 4.643095396931288e-05, + "loss": 1.251, + "step": 696 + }, + { + "epoch": 0.09, + "grad_norm": 0.79296875, + "learning_rate": 4.6497665110073386e-05, + "loss": 0.7543, + "step": 697 + }, + { + "epoch": 0.09, + "grad_norm": 0.6875, + "learning_rate": 4.656437625083389e-05, + "loss": 1.0991, + "step": 698 + }, + { + "epoch": 0.09, + "grad_norm": 0.84375, + "learning_rate": 4.6631087391594396e-05, + "loss": 0.8164, + "step": 699 + }, + { + "epoch": 0.09, + "grad_norm": 0.7890625, + "learning_rate": 4.66977985323549e-05, + "loss": 0.7741, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 0.73828125, + "learning_rate": 4.676450967311541e-05, + "loss": 0.8104, + "step": 701 + }, + { + "epoch": 0.09, + "grad_norm": 0.875, + "learning_rate": 4.683122081387592e-05, + "loss": 0.8253, + "step": 702 + }, + { + "epoch": 0.09, + "grad_norm": 0.7890625, + "learning_rate": 4.689793195463642e-05, + "loss": 1.1599, + "step": 703 + }, + { + "epoch": 0.09, + "grad_norm": 0.8359375, + "learning_rate": 4.6964643095396935e-05, + "loss": 1.132, + "step": 704 + }, + { + "epoch": 0.09, + "grad_norm": 0.9609375, + "learning_rate": 4.703135423615744e-05, + "loss": 0.7118, + "step": 705 + }, + { + "epoch": 0.09, + "grad_norm": 0.96484375, + "learning_rate": 4.709806537691795e-05, + "loss": 0.8563, + "step": 706 + }, + { + "epoch": 0.09, + "grad_norm": 0.9296875, + "learning_rate": 4.716477651767846e-05, + "loss": 0.7447, + "step": 707 + }, + { + "epoch": 0.09, + "grad_norm": 0.73828125, + "learning_rate": 4.723148765843896e-05, + "loss": 0.6558, + "step": 708 + }, + { + "epoch": 0.09, + "grad_norm": 0.796875, + "learning_rate": 4.729819879919947e-05, + "loss": 0.897, + "step": 709 + }, + { + "epoch": 0.09, + "grad_norm": 0.859375, + "learning_rate": 4.736490993995998e-05, + "loss": 0.6501, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 1.09375, + "learning_rate": 4.7431621080720484e-05, + "loss": 0.7051, + "step": 711 + }, + { + "epoch": 0.1, + "grad_norm": 0.88671875, + "learning_rate": 4.749833222148099e-05, + "loss": 0.9212, + "step": 712 + }, + { + "epoch": 0.1, + "grad_norm": 1.0546875, + "learning_rate": 4.7565043362241494e-05, + "loss": 0.9982, + "step": 713 + }, + { + "epoch": 0.1, + "grad_norm": 1.0234375, + "learning_rate": 4.7631754503002e-05, + "loss": 1.1458, + "step": 714 + }, + { + "epoch": 0.1, + "grad_norm": 1.0546875, + "learning_rate": 4.769846564376251e-05, + "loss": 0.8549, + "step": 715 + }, + { + "epoch": 0.1, + "grad_norm": 0.84765625, + "learning_rate": 4.7765176784523016e-05, + "loss": 0.6559, + "step": 716 + }, + { + "epoch": 0.1, + "grad_norm": 0.921875, + "learning_rate": 4.783188792528353e-05, + "loss": 0.8634, + "step": 717 + }, + { + "epoch": 0.1, + "grad_norm": 0.7890625, + "learning_rate": 4.789859906604403e-05, + "loss": 0.8392, + "step": 718 + }, + { + "epoch": 0.1, + "grad_norm": 0.86328125, + "learning_rate": 4.796531020680454e-05, + "loss": 0.7065, + "step": 719 + }, + { + "epoch": 0.1, + "grad_norm": 0.80859375, + "learning_rate": 4.803202134756505e-05, + "loss": 0.9927, + "step": 720 + }, + { + "epoch": 0.1, + "grad_norm": 0.99609375, + "learning_rate": 4.8098732488325555e-05, + "loss": 0.9399, + "step": 721 + }, + { + "epoch": 0.1, + "grad_norm": 0.79296875, + "learning_rate": 4.816544362908606e-05, + "loss": 1.0366, + "step": 722 + }, + { + "epoch": 0.1, + "grad_norm": 0.875, + "learning_rate": 4.8232154769846565e-05, + "loss": 1.0417, + "step": 723 + }, + { + "epoch": 0.1, + "grad_norm": 0.80078125, + "learning_rate": 4.829886591060707e-05, + "loss": 0.839, + "step": 724 + }, + { + "epoch": 0.1, + "grad_norm": 1.4140625, + "learning_rate": 4.836557705136758e-05, + "loss": 1.1621, + "step": 725 + }, + { + "epoch": 0.1, + "grad_norm": 0.8203125, + "learning_rate": 4.8432288192128087e-05, + "loss": 0.9516, + "step": 726 + }, + { + "epoch": 0.1, + "grad_norm": 0.77734375, + "learning_rate": 4.849899933288859e-05, + "loss": 0.8307, + "step": 727 + }, + { + "epoch": 0.1, + "grad_norm": 0.83203125, + "learning_rate": 4.8565710473649103e-05, + "loss": 0.9818, + "step": 728 + }, + { + "epoch": 0.1, + "grad_norm": 0.96484375, + "learning_rate": 4.863242161440961e-05, + "loss": 0.9681, + "step": 729 + }, + { + "epoch": 0.1, + "grad_norm": 0.81640625, + "learning_rate": 4.869913275517012e-05, + "loss": 0.7308, + "step": 730 + }, + { + "epoch": 0.1, + "grad_norm": 0.73046875, + "learning_rate": 4.8765843895930625e-05, + "loss": 0.8418, + "step": 731 + }, + { + "epoch": 0.1, + "grad_norm": 0.93359375, + "learning_rate": 4.883255503669113e-05, + "loss": 1.1546, + "step": 732 + }, + { + "epoch": 0.1, + "grad_norm": 0.8515625, + "learning_rate": 4.8899266177451635e-05, + "loss": 0.9922, + "step": 733 + }, + { + "epoch": 0.1, + "grad_norm": 0.76171875, + "learning_rate": 4.896597731821214e-05, + "loss": 0.7983, + "step": 734 + }, + { + "epoch": 0.1, + "grad_norm": 1.0078125, + "learning_rate": 4.903268845897265e-05, + "loss": 1.0594, + "step": 735 + }, + { + "epoch": 0.1, + "grad_norm": 0.8828125, + "learning_rate": 4.909939959973316e-05, + "loss": 0.9874, + "step": 736 + }, + { + "epoch": 0.1, + "grad_norm": 0.83984375, + "learning_rate": 4.916611074049366e-05, + "loss": 0.8619, + "step": 737 + }, + { + "epoch": 0.1, + "grad_norm": 0.90234375, + "learning_rate": 4.9232821881254174e-05, + "loss": 1.1149, + "step": 738 + }, + { + "epoch": 0.1, + "grad_norm": 0.8203125, + "learning_rate": 4.929953302201468e-05, + "loss": 0.7093, + "step": 739 + }, + { + "epoch": 0.1, + "grad_norm": 0.83203125, + "learning_rate": 4.936624416277519e-05, + "loss": 0.9413, + "step": 740 + }, + { + "epoch": 0.1, + "grad_norm": 0.87890625, + "learning_rate": 4.9432955303535696e-05, + "loss": 0.9649, + "step": 741 + }, + { + "epoch": 0.1, + "grad_norm": 1.5859375, + "learning_rate": 4.94996664442962e-05, + "loss": 1.0514, + "step": 742 + }, + { + "epoch": 0.1, + "grad_norm": 0.77734375, + "learning_rate": 4.9566377585056706e-05, + "loss": 0.6111, + "step": 743 + }, + { + "epoch": 0.1, + "grad_norm": 0.83984375, + "learning_rate": 4.963308872581721e-05, + "loss": 0.8769, + "step": 744 + }, + { + "epoch": 0.1, + "grad_norm": 0.73828125, + "learning_rate": 4.969979986657772e-05, + "loss": 0.8708, + "step": 745 + }, + { + "epoch": 0.1, + "grad_norm": 0.77734375, + "learning_rate": 4.976651100733823e-05, + "loss": 0.9109, + "step": 746 + }, + { + "epoch": 0.1, + "grad_norm": 0.71875, + "learning_rate": 4.983322214809873e-05, + "loss": 0.9079, + "step": 747 + }, + { + "epoch": 0.1, + "grad_norm": 0.84765625, + "learning_rate": 4.989993328885924e-05, + "loss": 0.8349, + "step": 748 + }, + { + "epoch": 0.1, + "grad_norm": 0.94921875, + "learning_rate": 4.996664442961975e-05, + "loss": 0.7302, + "step": 749 + }, + { + "epoch": 0.1, + "grad_norm": 0.671875, + "learning_rate": 5.0033355570380255e-05, + "loss": 0.8177, + "step": 750 + }, + { + "epoch": 0.1, + "grad_norm": 0.8828125, + "learning_rate": 5.010006671114076e-05, + "loss": 1.0042, + "step": 751 + }, + { + "epoch": 0.1, + "grad_norm": 1.0703125, + "learning_rate": 5.016677785190127e-05, + "loss": 0.9767, + "step": 752 + }, + { + "epoch": 0.1, + "grad_norm": 0.796875, + "learning_rate": 5.0233488992661784e-05, + "loss": 0.8609, + "step": 753 + }, + { + "epoch": 0.1, + "grad_norm": 0.796875, + "learning_rate": 5.030020013342228e-05, + "loss": 0.7568, + "step": 754 + }, + { + "epoch": 0.1, + "grad_norm": 0.74609375, + "learning_rate": 5.0366911274182794e-05, + "loss": 0.7131, + "step": 755 + }, + { + "epoch": 0.1, + "grad_norm": 0.95703125, + "learning_rate": 5.043362241494329e-05, + "loss": 0.7571, + "step": 756 + }, + { + "epoch": 0.1, + "grad_norm": 0.83203125, + "learning_rate": 5.0500333555703804e-05, + "loss": 0.9339, + "step": 757 + }, + { + "epoch": 0.1, + "grad_norm": 0.8203125, + "learning_rate": 5.0567044696464316e-05, + "loss": 1.2943, + "step": 758 + }, + { + "epoch": 0.1, + "grad_norm": 0.953125, + "learning_rate": 5.0633755837224814e-05, + "loss": 0.9197, + "step": 759 + }, + { + "epoch": 0.1, + "grad_norm": 0.9296875, + "learning_rate": 5.0700466977985326e-05, + "loss": 1.0122, + "step": 760 + }, + { + "epoch": 0.1, + "grad_norm": 1.0078125, + "learning_rate": 5.076717811874583e-05, + "loss": 0.94, + "step": 761 + }, + { + "epoch": 0.1, + "grad_norm": 0.93359375, + "learning_rate": 5.083388925950634e-05, + "loss": 0.8677, + "step": 762 + }, + { + "epoch": 0.1, + "grad_norm": 0.7890625, + "learning_rate": 5.0900600400266854e-05, + "loss": 0.6689, + "step": 763 + }, + { + "epoch": 0.1, + "grad_norm": 0.7421875, + "learning_rate": 5.096731154102735e-05, + "loss": 0.9483, + "step": 764 + }, + { + "epoch": 0.1, + "grad_norm": 0.796875, + "learning_rate": 5.1034022681787864e-05, + "loss": 0.8982, + "step": 765 + }, + { + "epoch": 0.1, + "grad_norm": 0.77734375, + "learning_rate": 5.110073382254836e-05, + "loss": 0.8863, + "step": 766 + }, + { + "epoch": 0.1, + "grad_norm": 0.8125, + "learning_rate": 5.1167444963308874e-05, + "loss": 1.0275, + "step": 767 + }, + { + "epoch": 0.1, + "grad_norm": 0.6875, + "learning_rate": 5.1234156104069386e-05, + "loss": 1.2102, + "step": 768 + }, + { + "epoch": 0.1, + "grad_norm": 0.64453125, + "learning_rate": 5.1300867244829884e-05, + "loss": 0.6437, + "step": 769 + }, + { + "epoch": 0.1, + "grad_norm": 0.91015625, + "learning_rate": 5.1367578385590396e-05, + "loss": 1.1259, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 0.82421875, + "learning_rate": 5.14342895263509e-05, + "loss": 0.8576, + "step": 771 + }, + { + "epoch": 0.1, + "grad_norm": 0.7578125, + "learning_rate": 5.150100066711141e-05, + "loss": 0.7923, + "step": 772 + }, + { + "epoch": 0.1, + "grad_norm": 0.7421875, + "learning_rate": 5.156771180787192e-05, + "loss": 0.9086, + "step": 773 + }, + { + "epoch": 0.1, + "grad_norm": 0.90234375, + "learning_rate": 5.163442294863242e-05, + "loss": 0.8701, + "step": 774 + }, + { + "epoch": 0.1, + "grad_norm": 0.8515625, + "learning_rate": 5.1701134089392935e-05, + "loss": 0.8754, + "step": 775 + }, + { + "epoch": 0.1, + "grad_norm": 0.765625, + "learning_rate": 5.176784523015343e-05, + "loss": 0.7613, + "step": 776 + }, + { + "epoch": 0.1, + "grad_norm": 0.90234375, + "learning_rate": 5.1834556370913945e-05, + "loss": 1.0233, + "step": 777 + }, + { + "epoch": 0.1, + "grad_norm": 0.8203125, + "learning_rate": 5.190126751167446e-05, + "loss": 0.9083, + "step": 778 + }, + { + "epoch": 0.1, + "grad_norm": 0.8828125, + "learning_rate": 5.1967978652434955e-05, + "loss": 0.9211, + "step": 779 + }, + { + "epoch": 0.1, + "grad_norm": 0.92578125, + "learning_rate": 5.203468979319547e-05, + "loss": 0.6993, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 0.9375, + "learning_rate": 5.2101400933955965e-05, + "loss": 0.8784, + "step": 781 + }, + { + "epoch": 0.1, + "grad_norm": 0.66796875, + "learning_rate": 5.216811207471648e-05, + "loss": 1.0714, + "step": 782 + }, + { + "epoch": 0.1, + "grad_norm": 0.73828125, + "learning_rate": 5.223482321547699e-05, + "loss": 0.9687, + "step": 783 + }, + { + "epoch": 0.1, + "grad_norm": 1.046875, + "learning_rate": 5.2301534356237494e-05, + "loss": 0.9102, + "step": 784 + }, + { + "epoch": 0.1, + "grad_norm": 0.65234375, + "learning_rate": 5.2368245496998006e-05, + "loss": 0.9303, + "step": 785 + }, + { + "epoch": 0.1, + "grad_norm": 0.81640625, + "learning_rate": 5.2434956637758504e-05, + "loss": 0.9606, + "step": 786 + }, + { + "epoch": 0.11, + "grad_norm": 0.828125, + "learning_rate": 5.2501667778519016e-05, + "loss": 1.0497, + "step": 787 + }, + { + "epoch": 0.11, + "grad_norm": 0.8671875, + "learning_rate": 5.256837891927953e-05, + "loss": 0.7739, + "step": 788 + }, + { + "epoch": 0.11, + "grad_norm": 0.82421875, + "learning_rate": 5.2635090060040026e-05, + "loss": 0.7344, + "step": 789 + }, + { + "epoch": 0.11, + "grad_norm": 0.83984375, + "learning_rate": 5.270180120080054e-05, + "loss": 1.1619, + "step": 790 + }, + { + "epoch": 0.11, + "grad_norm": 0.84765625, + "learning_rate": 5.2768512341561036e-05, + "loss": 0.9254, + "step": 791 + }, + { + "epoch": 0.11, + "grad_norm": 0.921875, + "learning_rate": 5.283522348232155e-05, + "loss": 1.1469, + "step": 792 + }, + { + "epoch": 0.11, + "grad_norm": 0.97265625, + "learning_rate": 5.290193462308206e-05, + "loss": 1.0912, + "step": 793 + }, + { + "epoch": 0.11, + "grad_norm": 0.66796875, + "learning_rate": 5.2968645763842565e-05, + "loss": 0.6396, + "step": 794 + }, + { + "epoch": 0.11, + "grad_norm": 0.78125, + "learning_rate": 5.303535690460307e-05, + "loss": 0.7895, + "step": 795 + }, + { + "epoch": 0.11, + "grad_norm": 0.9375, + "learning_rate": 5.3102068045363575e-05, + "loss": 1.1935, + "step": 796 + }, + { + "epoch": 0.11, + "grad_norm": 0.84375, + "learning_rate": 5.3168779186124087e-05, + "loss": 0.9893, + "step": 797 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 5.32354903268846e-05, + "loss": 0.8959, + "step": 798 + }, + { + "epoch": 0.11, + "grad_norm": 1.4375, + "learning_rate": 5.3302201467645097e-05, + "loss": 0.6524, + "step": 799 + }, + { + "epoch": 0.11, + "grad_norm": 0.95703125, + "learning_rate": 5.336891260840561e-05, + "loss": 0.8822, + "step": 800 + }, + { + "epoch": 0.11, + "grad_norm": 1.34375, + "learning_rate": 5.343562374916611e-05, + "loss": 0.9289, + "step": 801 + }, + { + "epoch": 0.11, + "grad_norm": 0.8203125, + "learning_rate": 5.350233488992662e-05, + "loss": 0.4537, + "step": 802 + }, + { + "epoch": 0.11, + "grad_norm": 0.90625, + "learning_rate": 5.356904603068713e-05, + "loss": 0.9532, + "step": 803 + }, + { + "epoch": 0.11, + "grad_norm": 0.8046875, + "learning_rate": 5.363575717144763e-05, + "loss": 0.734, + "step": 804 + }, + { + "epoch": 0.11, + "grad_norm": 0.59765625, + "learning_rate": 5.370246831220814e-05, + "loss": 0.7239, + "step": 805 + }, + { + "epoch": 0.11, + "grad_norm": 0.7578125, + "learning_rate": 5.3769179452968645e-05, + "loss": 1.101, + "step": 806 + }, + { + "epoch": 0.11, + "grad_norm": 0.68359375, + "learning_rate": 5.383589059372916e-05, + "loss": 0.7971, + "step": 807 + }, + { + "epoch": 0.11, + "grad_norm": 0.7734375, + "learning_rate": 5.390260173448967e-05, + "loss": 1.1258, + "step": 808 + }, + { + "epoch": 0.11, + "grad_norm": 0.67578125, + "learning_rate": 5.396931287525017e-05, + "loss": 1.0105, + "step": 809 + }, + { + "epoch": 0.11, + "grad_norm": 0.94140625, + "learning_rate": 5.403602401601068e-05, + "loss": 0.9973, + "step": 810 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 5.410273515677118e-05, + "loss": 0.808, + "step": 811 + }, + { + "epoch": 0.11, + "grad_norm": 0.96484375, + "learning_rate": 5.416944629753169e-05, + "loss": 0.7745, + "step": 812 + }, + { + "epoch": 0.11, + "grad_norm": 0.90234375, + "learning_rate": 5.42361574382922e-05, + "loss": 0.9882, + "step": 813 + }, + { + "epoch": 0.11, + "grad_norm": 0.71484375, + "learning_rate": 5.43028685790527e-05, + "loss": 1.1692, + "step": 814 + }, + { + "epoch": 0.11, + "grad_norm": 0.8984375, + "learning_rate": 5.436957971981321e-05, + "loss": 1.2549, + "step": 815 + }, + { + "epoch": 0.11, + "grad_norm": 0.87890625, + "learning_rate": 5.4436290860573716e-05, + "loss": 0.9158, + "step": 816 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 5.450300200133423e-05, + "loss": 0.8204, + "step": 817 + }, + { + "epoch": 0.11, + "grad_norm": 0.78515625, + "learning_rate": 5.456971314209473e-05, + "loss": 0.7165, + "step": 818 + }, + { + "epoch": 0.11, + "grad_norm": 0.94921875, + "learning_rate": 5.463642428285524e-05, + "loss": 0.6684, + "step": 819 + }, + { + "epoch": 0.11, + "grad_norm": 0.80859375, + "learning_rate": 5.470313542361575e-05, + "loss": 0.7109, + "step": 820 + }, + { + "epoch": 0.11, + "grad_norm": 1.0703125, + "learning_rate": 5.476984656437625e-05, + "loss": 1.0845, + "step": 821 + }, + { + "epoch": 0.11, + "grad_norm": 0.90234375, + "learning_rate": 5.483655770513676e-05, + "loss": 0.6374, + "step": 822 + }, + { + "epoch": 0.11, + "grad_norm": 0.72265625, + "learning_rate": 5.490326884589727e-05, + "loss": 0.9493, + "step": 823 + }, + { + "epoch": 0.11, + "grad_norm": 1.109375, + "learning_rate": 5.496997998665777e-05, + "loss": 1.0945, + "step": 824 + }, + { + "epoch": 0.11, + "grad_norm": 0.6640625, + "learning_rate": 5.503669112741828e-05, + "loss": 1.0742, + "step": 825 + }, + { + "epoch": 0.11, + "grad_norm": 1.046875, + "learning_rate": 5.510340226817878e-05, + "loss": 1.0957, + "step": 826 + }, + { + "epoch": 0.11, + "grad_norm": 1.0390625, + "learning_rate": 5.517011340893929e-05, + "loss": 1.1028, + "step": 827 + }, + { + "epoch": 0.11, + "grad_norm": 0.66015625, + "learning_rate": 5.5236824549699804e-05, + "loss": 0.8594, + "step": 828 + }, + { + "epoch": 0.11, + "grad_norm": 0.71484375, + "learning_rate": 5.530353569046031e-05, + "loss": 1.0489, + "step": 829 + }, + { + "epoch": 0.11, + "grad_norm": 0.66796875, + "learning_rate": 5.537024683122082e-05, + "loss": 0.7289, + "step": 830 + }, + { + "epoch": 0.11, + "grad_norm": 0.75390625, + "learning_rate": 5.543695797198132e-05, + "loss": 0.9887, + "step": 831 + }, + { + "epoch": 0.11, + "grad_norm": 0.75390625, + "learning_rate": 5.550366911274183e-05, + "loss": 0.9081, + "step": 832 + }, + { + "epoch": 0.11, + "grad_norm": 0.73046875, + "learning_rate": 5.557038025350234e-05, + "loss": 0.6031, + "step": 833 + }, + { + "epoch": 0.11, + "grad_norm": 0.72265625, + "learning_rate": 5.563709139426284e-05, + "loss": 1.1169, + "step": 834 + }, + { + "epoch": 0.11, + "grad_norm": 0.94921875, + "learning_rate": 5.570380253502335e-05, + "loss": 0.8667, + "step": 835 + }, + { + "epoch": 0.11, + "grad_norm": 0.7109375, + "learning_rate": 5.577051367578385e-05, + "loss": 0.6484, + "step": 836 + }, + { + "epoch": 0.11, + "grad_norm": 1.046875, + "learning_rate": 5.583722481654436e-05, + "loss": 0.83, + "step": 837 + }, + { + "epoch": 0.11, + "grad_norm": 0.8671875, + "learning_rate": 5.5903935957304874e-05, + "loss": 1.2955, + "step": 838 + }, + { + "epoch": 0.11, + "grad_norm": 1.046875, + "learning_rate": 5.597064709806538e-05, + "loss": 1.0221, + "step": 839 + }, + { + "epoch": 0.11, + "grad_norm": 0.6640625, + "learning_rate": 5.6037358238825884e-05, + "loss": 0.893, + "step": 840 + }, + { + "epoch": 0.11, + "grad_norm": 0.84765625, + "learning_rate": 5.610406937958639e-05, + "loss": 0.7546, + "step": 841 + }, + { + "epoch": 0.11, + "grad_norm": 1.0078125, + "learning_rate": 5.61707805203469e-05, + "loss": 1.0533, + "step": 842 + }, + { + "epoch": 0.11, + "grad_norm": 0.97265625, + "learning_rate": 5.623749166110741e-05, + "loss": 0.8825, + "step": 843 + }, + { + "epoch": 0.11, + "grad_norm": 0.89453125, + "learning_rate": 5.630420280186791e-05, + "loss": 0.9528, + "step": 844 + }, + { + "epoch": 0.11, + "grad_norm": 0.80859375, + "learning_rate": 5.637091394262842e-05, + "loss": 0.5464, + "step": 845 + }, + { + "epoch": 0.11, + "grad_norm": 1.015625, + "learning_rate": 5.6437625083388935e-05, + "loss": 0.6212, + "step": 846 + }, + { + "epoch": 0.11, + "grad_norm": 0.87890625, + "learning_rate": 5.650433622414943e-05, + "loss": 0.9476, + "step": 847 + }, + { + "epoch": 0.11, + "grad_norm": 1.109375, + "learning_rate": 5.6571047364909945e-05, + "loss": 0.8495, + "step": 848 + }, + { + "epoch": 0.11, + "grad_norm": 0.84765625, + "learning_rate": 5.663775850567044e-05, + "loss": 0.794, + "step": 849 + }, + { + "epoch": 0.11, + "grad_norm": 0.703125, + "learning_rate": 5.6704469646430955e-05, + "loss": 0.7137, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 0.86328125, + "learning_rate": 5.677118078719147e-05, + "loss": 0.5734, + "step": 851 + }, + { + "epoch": 0.11, + "grad_norm": 0.9296875, + "learning_rate": 5.683789192795197e-05, + "loss": 1.0214, + "step": 852 + }, + { + "epoch": 0.11, + "grad_norm": 0.87109375, + "learning_rate": 5.6904603068712484e-05, + "loss": 0.665, + "step": 853 + }, + { + "epoch": 0.11, + "grad_norm": 0.67578125, + "learning_rate": 5.697131420947298e-05, + "loss": 0.8592, + "step": 854 + }, + { + "epoch": 0.11, + "grad_norm": 0.9375, + "learning_rate": 5.7038025350233494e-05, + "loss": 1.3341, + "step": 855 + }, + { + "epoch": 0.11, + "grad_norm": 0.77734375, + "learning_rate": 5.7104736490994006e-05, + "loss": 0.8593, + "step": 856 + }, + { + "epoch": 0.11, + "grad_norm": 1.15625, + "learning_rate": 5.7171447631754504e-05, + "loss": 1.1195, + "step": 857 + }, + { + "epoch": 0.11, + "grad_norm": 0.796875, + "learning_rate": 5.7238158772515016e-05, + "loss": 0.9628, + "step": 858 + }, + { + "epoch": 0.11, + "grad_norm": 0.94921875, + "learning_rate": 5.7304869913275514e-05, + "loss": 1.0752, + "step": 859 + }, + { + "epoch": 0.11, + "grad_norm": 0.79296875, + "learning_rate": 5.7371581054036026e-05, + "loss": 0.9949, + "step": 860 + }, + { + "epoch": 0.11, + "grad_norm": 0.7578125, + "learning_rate": 5.743829219479654e-05, + "loss": 0.686, + "step": 861 + }, + { + "epoch": 0.12, + "grad_norm": 0.8203125, + "learning_rate": 5.750500333555704e-05, + "loss": 0.938, + "step": 862 + }, + { + "epoch": 0.12, + "grad_norm": 0.80078125, + "learning_rate": 5.757171447631755e-05, + "loss": 0.7119, + "step": 863 + }, + { + "epoch": 0.12, + "grad_norm": 0.7265625, + "learning_rate": 5.763842561707805e-05, + "loss": 0.8771, + "step": 864 + }, + { + "epoch": 0.12, + "grad_norm": 0.96875, + "learning_rate": 5.7705136757838565e-05, + "loss": 1.3691, + "step": 865 + }, + { + "epoch": 0.12, + "grad_norm": 0.7734375, + "learning_rate": 5.7771847898599076e-05, + "loss": 1.0516, + "step": 866 + }, + { + "epoch": 0.12, + "grad_norm": 0.85546875, + "learning_rate": 5.7838559039359575e-05, + "loss": 0.9701, + "step": 867 + }, + { + "epoch": 0.12, + "grad_norm": 0.69921875, + "learning_rate": 5.7905270180120086e-05, + "loss": 0.5343, + "step": 868 + }, + { + "epoch": 0.12, + "grad_norm": 0.625, + "learning_rate": 5.7971981320880585e-05, + "loss": 1.1096, + "step": 869 + }, + { + "epoch": 0.12, + "grad_norm": 0.95703125, + "learning_rate": 5.8038692461641097e-05, + "loss": 1.0292, + "step": 870 + }, + { + "epoch": 0.12, + "grad_norm": 0.71875, + "learning_rate": 5.810540360240161e-05, + "loss": 0.8705, + "step": 871 + }, + { + "epoch": 0.12, + "grad_norm": 0.93359375, + "learning_rate": 5.8172114743162107e-05, + "loss": 0.9186, + "step": 872 + }, + { + "epoch": 0.12, + "grad_norm": 0.94921875, + "learning_rate": 5.823882588392262e-05, + "loss": 0.8774, + "step": 873 + }, + { + "epoch": 0.12, + "grad_norm": 0.85546875, + "learning_rate": 5.8305537024683123e-05, + "loss": 1.3312, + "step": 874 + }, + { + "epoch": 0.12, + "grad_norm": 0.83984375, + "learning_rate": 5.8372248165443635e-05, + "loss": 0.8596, + "step": 875 + }, + { + "epoch": 0.12, + "grad_norm": 0.95703125, + "learning_rate": 5.843895930620415e-05, + "loss": 0.8298, + "step": 876 + }, + { + "epoch": 0.12, + "grad_norm": 0.6640625, + "learning_rate": 5.8505670446964645e-05, + "loss": 1.1038, + "step": 877 + }, + { + "epoch": 0.12, + "grad_norm": 1.0234375, + "learning_rate": 5.857238158772516e-05, + "loss": 1.0113, + "step": 878 + }, + { + "epoch": 0.12, + "grad_norm": 0.8671875, + "learning_rate": 5.8639092728485655e-05, + "loss": 0.7423, + "step": 879 + }, + { + "epoch": 0.12, + "grad_norm": 1.0078125, + "learning_rate": 5.870580386924617e-05, + "loss": 0.915, + "step": 880 + }, + { + "epoch": 0.12, + "grad_norm": 0.97265625, + "learning_rate": 5.877251501000668e-05, + "loss": 0.9781, + "step": 881 + }, + { + "epoch": 0.12, + "grad_norm": 0.8515625, + "learning_rate": 5.883922615076718e-05, + "loss": 0.9536, + "step": 882 + }, + { + "epoch": 0.12, + "grad_norm": 0.7890625, + "learning_rate": 5.890593729152769e-05, + "loss": 1.0927, + "step": 883 + }, + { + "epoch": 0.12, + "grad_norm": 0.92578125, + "learning_rate": 5.8972648432288194e-05, + "loss": 0.7265, + "step": 884 + }, + { + "epoch": 0.12, + "grad_norm": 0.71484375, + "learning_rate": 5.9039359573048706e-05, + "loss": 1.1217, + "step": 885 + }, + { + "epoch": 0.12, + "grad_norm": 0.8125, + "learning_rate": 5.910607071380921e-05, + "loss": 0.7138, + "step": 886 + }, + { + "epoch": 0.12, + "grad_norm": 1.1015625, + "learning_rate": 5.9172781854569716e-05, + "loss": 0.7957, + "step": 887 + }, + { + "epoch": 0.12, + "grad_norm": 0.97265625, + "learning_rate": 5.923949299533023e-05, + "loss": 0.651, + "step": 888 + }, + { + "epoch": 0.12, + "grad_norm": 0.81640625, + "learning_rate": 5.9306204136090726e-05, + "loss": 0.8917, + "step": 889 + }, + { + "epoch": 0.12, + "grad_norm": 0.87890625, + "learning_rate": 5.937291527685124e-05, + "loss": 0.7842, + "step": 890 + }, + { + "epoch": 0.12, + "grad_norm": 1.0078125, + "learning_rate": 5.943962641761175e-05, + "loss": 0.9263, + "step": 891 + }, + { + "epoch": 0.12, + "grad_norm": 0.84375, + "learning_rate": 5.950633755837225e-05, + "loss": 0.6349, + "step": 892 + }, + { + "epoch": 0.12, + "grad_norm": 0.90625, + "learning_rate": 5.957304869913276e-05, + "loss": 0.6143, + "step": 893 + }, + { + "epoch": 0.12, + "grad_norm": 0.875, + "learning_rate": 5.963975983989326e-05, + "loss": 0.9314, + "step": 894 + }, + { + "epoch": 0.12, + "grad_norm": 0.79296875, + "learning_rate": 5.970647098065377e-05, + "loss": 0.5605, + "step": 895 + }, + { + "epoch": 0.12, + "grad_norm": 1.1953125, + "learning_rate": 5.977318212141428e-05, + "loss": 0.7238, + "step": 896 + }, + { + "epoch": 0.12, + "grad_norm": 0.73046875, + "learning_rate": 5.983989326217479e-05, + "loss": 0.7788, + "step": 897 + }, + { + "epoch": 0.12, + "grad_norm": 0.8984375, + "learning_rate": 5.99066044029353e-05, + "loss": 0.7799, + "step": 898 + }, + { + "epoch": 0.12, + "grad_norm": 1.0546875, + "learning_rate": 5.99733155436958e-05, + "loss": 1.1293, + "step": 899 + }, + { + "epoch": 0.12, + "grad_norm": 0.97265625, + "learning_rate": 6.004002668445631e-05, + "loss": 1.0362, + "step": 900 + }, + { + "epoch": 0.12, + "grad_norm": 1.140625, + "learning_rate": 6.010673782521682e-05, + "loss": 0.8729, + "step": 901 + }, + { + "epoch": 0.12, + "grad_norm": 0.94921875, + "learning_rate": 6.017344896597732e-05, + "loss": 0.8574, + "step": 902 + }, + { + "epoch": 0.12, + "grad_norm": 0.84765625, + "learning_rate": 6.024016010673783e-05, + "loss": 1.0961, + "step": 903 + }, + { + "epoch": 0.12, + "grad_norm": 1.1484375, + "learning_rate": 6.030687124749833e-05, + "loss": 1.1428, + "step": 904 + }, + { + "epoch": 0.12, + "grad_norm": 0.91015625, + "learning_rate": 6.037358238825884e-05, + "loss": 0.6373, + "step": 905 + }, + { + "epoch": 0.12, + "grad_norm": 0.984375, + "learning_rate": 6.044029352901935e-05, + "loss": 1.0967, + "step": 906 + }, + { + "epoch": 0.12, + "grad_norm": 0.88671875, + "learning_rate": 6.050700466977986e-05, + "loss": 0.9069, + "step": 907 + }, + { + "epoch": 0.12, + "grad_norm": 1.078125, + "learning_rate": 6.057371581054036e-05, + "loss": 0.9436, + "step": 908 + }, + { + "epoch": 0.12, + "grad_norm": 0.796875, + "learning_rate": 6.064042695130087e-05, + "loss": 1.1259, + "step": 909 + }, + { + "epoch": 0.12, + "grad_norm": 0.74609375, + "learning_rate": 6.070713809206138e-05, + "loss": 0.8026, + "step": 910 + }, + { + "epoch": 0.12, + "grad_norm": 0.77734375, + "learning_rate": 6.077384923282189e-05, + "loss": 0.8198, + "step": 911 + }, + { + "epoch": 0.12, + "grad_norm": 0.74609375, + "learning_rate": 6.084056037358239e-05, + "loss": 1.029, + "step": 912 + }, + { + "epoch": 0.12, + "grad_norm": 0.86328125, + "learning_rate": 6.09072715143429e-05, + "loss": 1.0552, + "step": 913 + }, + { + "epoch": 0.12, + "grad_norm": 0.72265625, + "learning_rate": 6.09739826551034e-05, + "loss": 0.6996, + "step": 914 + }, + { + "epoch": 0.12, + "grad_norm": 0.83203125, + "learning_rate": 6.104069379586391e-05, + "loss": 1.0265, + "step": 915 + }, + { + "epoch": 0.12, + "grad_norm": 0.7109375, + "learning_rate": 6.110740493662443e-05, + "loss": 0.8004, + "step": 916 + }, + { + "epoch": 0.12, + "grad_norm": 0.76171875, + "learning_rate": 6.117411607738492e-05, + "loss": 0.8388, + "step": 917 + }, + { + "epoch": 0.12, + "grad_norm": 1.03125, + "learning_rate": 6.124082721814544e-05, + "loss": 0.7467, + "step": 918 + }, + { + "epoch": 0.12, + "grad_norm": 0.8046875, + "learning_rate": 6.130753835890593e-05, + "loss": 1.0162, + "step": 919 + }, + { + "epoch": 0.12, + "grad_norm": 0.890625, + "learning_rate": 6.137424949966645e-05, + "loss": 1.0474, + "step": 920 + }, + { + "epoch": 0.12, + "grad_norm": 0.91015625, + "learning_rate": 6.144096064042696e-05, + "loss": 0.8117, + "step": 921 + }, + { + "epoch": 0.12, + "grad_norm": 0.71484375, + "learning_rate": 6.150767178118746e-05, + "loss": 0.4926, + "step": 922 + }, + { + "epoch": 0.12, + "grad_norm": 0.76171875, + "learning_rate": 6.157438292194797e-05, + "loss": 0.9637, + "step": 923 + }, + { + "epoch": 0.12, + "grad_norm": 0.9296875, + "learning_rate": 6.164109406270847e-05, + "loss": 0.9528, + "step": 924 + }, + { + "epoch": 0.12, + "grad_norm": 0.92578125, + "learning_rate": 6.170780520346899e-05, + "loss": 0.6825, + "step": 925 + }, + { + "epoch": 0.12, + "grad_norm": 0.7421875, + "learning_rate": 6.17745163442295e-05, + "loss": 0.8561, + "step": 926 + }, + { + "epoch": 0.12, + "grad_norm": 0.84765625, + "learning_rate": 6.184122748499e-05, + "loss": 0.7605, + "step": 927 + }, + { + "epoch": 0.12, + "grad_norm": 0.9609375, + "learning_rate": 6.19079386257505e-05, + "loss": 0.9794, + "step": 928 + }, + { + "epoch": 0.12, + "grad_norm": 0.79296875, + "learning_rate": 6.197464976651101e-05, + "loss": 0.8591, + "step": 929 + }, + { + "epoch": 0.12, + "grad_norm": 0.98828125, + "learning_rate": 6.204136090727151e-05, + "loss": 0.8213, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 1.25, + "learning_rate": 6.210807204803203e-05, + "loss": 1.0282, + "step": 931 + }, + { + "epoch": 0.12, + "grad_norm": 0.8125, + "learning_rate": 6.217478318879252e-05, + "loss": 1.3549, + "step": 932 + }, + { + "epoch": 0.12, + "grad_norm": 0.93359375, + "learning_rate": 6.224149432955304e-05, + "loss": 1.1491, + "step": 933 + }, + { + "epoch": 0.12, + "grad_norm": 0.9765625, + "learning_rate": 6.230820547031355e-05, + "loss": 0.6934, + "step": 934 + }, + { + "epoch": 0.12, + "grad_norm": 0.75390625, + "learning_rate": 6.237491661107405e-05, + "loss": 0.7553, + "step": 935 + }, + { + "epoch": 0.12, + "grad_norm": 1.1953125, + "learning_rate": 6.244162775183456e-05, + "loss": 1.0659, + "step": 936 + }, + { + "epoch": 0.13, + "grad_norm": 0.74609375, + "learning_rate": 6.250833889259506e-05, + "loss": 1.023, + "step": 937 + }, + { + "epoch": 0.13, + "grad_norm": 0.76171875, + "learning_rate": 6.257505003335558e-05, + "loss": 0.9278, + "step": 938 + }, + { + "epoch": 0.13, + "grad_norm": 1.1640625, + "learning_rate": 6.264176117411607e-05, + "loss": 0.8611, + "step": 939 + }, + { + "epoch": 0.13, + "grad_norm": 0.859375, + "learning_rate": 6.270847231487659e-05, + "loss": 1.0097, + "step": 940 + }, + { + "epoch": 0.13, + "grad_norm": 1.1015625, + "learning_rate": 6.27751834556371e-05, + "loss": 1.0062, + "step": 941 + }, + { + "epoch": 0.13, + "grad_norm": 0.88671875, + "learning_rate": 6.28418945963976e-05, + "loss": 0.7633, + "step": 942 + }, + { + "epoch": 0.13, + "grad_norm": 0.578125, + "learning_rate": 6.29086057371581e-05, + "loss": 0.6044, + "step": 943 + }, + { + "epoch": 0.13, + "grad_norm": 0.81640625, + "learning_rate": 6.297531687791861e-05, + "loss": 0.9011, + "step": 944 + }, + { + "epoch": 0.13, + "grad_norm": 1.5, + "learning_rate": 6.304202801867912e-05, + "loss": 0.8793, + "step": 945 + }, + { + "epoch": 0.13, + "grad_norm": 1.015625, + "learning_rate": 6.310873915943964e-05, + "loss": 1.0778, + "step": 946 + }, + { + "epoch": 0.13, + "grad_norm": 1.0, + "learning_rate": 6.317545030020014e-05, + "loss": 0.8185, + "step": 947 + }, + { + "epoch": 0.13, + "grad_norm": 0.796875, + "learning_rate": 6.324216144096065e-05, + "loss": 0.856, + "step": 948 + }, + { + "epoch": 0.13, + "grad_norm": 0.91796875, + "learning_rate": 6.330887258172115e-05, + "loss": 0.7255, + "step": 949 + }, + { + "epoch": 0.13, + "grad_norm": 0.66015625, + "learning_rate": 6.337558372248166e-05, + "loss": 1.1572, + "step": 950 + }, + { + "epoch": 0.13, + "grad_norm": 0.984375, + "learning_rate": 6.344229486324217e-05, + "loss": 0.974, + "step": 951 + }, + { + "epoch": 0.13, + "grad_norm": 0.8203125, + "learning_rate": 6.350900600400267e-05, + "loss": 0.7483, + "step": 952 + }, + { + "epoch": 0.13, + "grad_norm": 0.8515625, + "learning_rate": 6.357571714476318e-05, + "loss": 0.9758, + "step": 953 + }, + { + "epoch": 0.13, + "grad_norm": 0.9453125, + "learning_rate": 6.364242828552368e-05, + "loss": 0.7419, + "step": 954 + }, + { + "epoch": 0.13, + "grad_norm": 1.0234375, + "learning_rate": 6.37091394262842e-05, + "loss": 1.1292, + "step": 955 + }, + { + "epoch": 0.13, + "grad_norm": 1.0, + "learning_rate": 6.37758505670447e-05, + "loss": 1.0008, + "step": 956 + }, + { + "epoch": 0.13, + "grad_norm": 1.203125, + "learning_rate": 6.38425617078052e-05, + "loss": 0.649, + "step": 957 + }, + { + "epoch": 0.13, + "grad_norm": 0.87109375, + "learning_rate": 6.390927284856571e-05, + "loss": 0.901, + "step": 958 + }, + { + "epoch": 0.13, + "grad_norm": 0.77734375, + "learning_rate": 6.397598398932621e-05, + "loss": 1.0032, + "step": 959 + }, + { + "epoch": 0.13, + "grad_norm": 1.09375, + "learning_rate": 6.404269513008673e-05, + "loss": 1.0691, + "step": 960 + }, + { + "epoch": 0.13, + "grad_norm": 0.79296875, + "learning_rate": 6.410940627084724e-05, + "loss": 1.0682, + "step": 961 + }, + { + "epoch": 0.13, + "grad_norm": 0.953125, + "learning_rate": 6.417611741160774e-05, + "loss": 0.9334, + "step": 962 + }, + { + "epoch": 0.13, + "grad_norm": 0.9375, + "learning_rate": 6.424282855236825e-05, + "loss": 0.8351, + "step": 963 + }, + { + "epoch": 0.13, + "grad_norm": 0.94140625, + "learning_rate": 6.430953969312875e-05, + "loss": 0.9272, + "step": 964 + }, + { + "epoch": 0.13, + "grad_norm": 0.95703125, + "learning_rate": 6.437625083388926e-05, + "loss": 0.6819, + "step": 965 + }, + { + "epoch": 0.13, + "grad_norm": 1.0390625, + "learning_rate": 6.444296197464978e-05, + "loss": 1.5416, + "step": 966 + }, + { + "epoch": 0.13, + "grad_norm": 0.9296875, + "learning_rate": 6.450967311541027e-05, + "loss": 0.7363, + "step": 967 + }, + { + "epoch": 0.13, + "grad_norm": 1.109375, + "learning_rate": 6.457638425617079e-05, + "loss": 0.9263, + "step": 968 + }, + { + "epoch": 0.13, + "grad_norm": 0.73046875, + "learning_rate": 6.464309539693129e-05, + "loss": 0.7062, + "step": 969 + }, + { + "epoch": 0.13, + "grad_norm": 0.98046875, + "learning_rate": 6.47098065376918e-05, + "loss": 0.7197, + "step": 970 + }, + { + "epoch": 0.13, + "grad_norm": 0.99609375, + "learning_rate": 6.477651767845232e-05, + "loss": 0.9062, + "step": 971 + }, + { + "epoch": 0.13, + "grad_norm": 0.890625, + "learning_rate": 6.484322881921281e-05, + "loss": 0.9632, + "step": 972 + }, + { + "epoch": 0.13, + "grad_norm": 0.83984375, + "learning_rate": 6.490993995997333e-05, + "loss": 0.8655, + "step": 973 + }, + { + "epoch": 0.13, + "grad_norm": 0.69921875, + "learning_rate": 6.497665110073382e-05, + "loss": 0.6861, + "step": 974 + }, + { + "epoch": 0.13, + "grad_norm": 1.140625, + "learning_rate": 6.504336224149434e-05, + "loss": 0.7996, + "step": 975 + }, + { + "epoch": 0.13, + "grad_norm": 0.66015625, + "learning_rate": 6.511007338225484e-05, + "loss": 0.6798, + "step": 976 + }, + { + "epoch": 0.13, + "grad_norm": 1.1171875, + "learning_rate": 6.517678452301535e-05, + "loss": 1.0638, + "step": 977 + }, + { + "epoch": 0.13, + "grad_norm": 0.80859375, + "learning_rate": 6.524349566377585e-05, + "loss": 0.9395, + "step": 978 + }, + { + "epoch": 0.13, + "grad_norm": 0.9765625, + "learning_rate": 6.531020680453636e-05, + "loss": 0.8119, + "step": 979 + }, + { + "epoch": 0.13, + "grad_norm": 1.015625, + "learning_rate": 6.537691794529686e-05, + "loss": 1.1366, + "step": 980 + }, + { + "epoch": 0.13, + "grad_norm": 1.0078125, + "learning_rate": 6.544362908605738e-05, + "loss": 0.6522, + "step": 981 + }, + { + "epoch": 0.13, + "grad_norm": 0.83984375, + "learning_rate": 6.551034022681788e-05, + "loss": 0.9519, + "step": 982 + }, + { + "epoch": 0.13, + "grad_norm": 0.86328125, + "learning_rate": 6.557705136757839e-05, + "loss": 1.1091, + "step": 983 + }, + { + "epoch": 0.13, + "grad_norm": 1.1015625, + "learning_rate": 6.56437625083389e-05, + "loss": 0.7684, + "step": 984 + }, + { + "epoch": 0.13, + "grad_norm": 0.79296875, + "learning_rate": 6.57104736490994e-05, + "loss": 0.8215, + "step": 985 + }, + { + "epoch": 0.13, + "grad_norm": 0.8046875, + "learning_rate": 6.577718478985992e-05, + "loss": 0.6688, + "step": 986 + }, + { + "epoch": 0.13, + "grad_norm": 0.765625, + "learning_rate": 6.584389593062041e-05, + "loss": 0.7081, + "step": 987 + }, + { + "epoch": 0.13, + "grad_norm": 0.79296875, + "learning_rate": 6.591060707138093e-05, + "loss": 1.0104, + "step": 988 + }, + { + "epoch": 0.13, + "grad_norm": 0.7890625, + "learning_rate": 6.597731821214142e-05, + "loss": 0.951, + "step": 989 + }, + { + "epoch": 0.13, + "grad_norm": 0.8203125, + "learning_rate": 6.604402935290194e-05, + "loss": 1.183, + "step": 990 + }, + { + "epoch": 0.13, + "grad_norm": 1.078125, + "learning_rate": 6.611074049366244e-05, + "loss": 1.0495, + "step": 991 + }, + { + "epoch": 0.13, + "grad_norm": 1.2578125, + "learning_rate": 6.617745163442295e-05, + "loss": 0.8715, + "step": 992 + }, + { + "epoch": 0.13, + "grad_norm": 1.8125, + "learning_rate": 6.624416277518347e-05, + "loss": 0.8534, + "step": 993 + }, + { + "epoch": 0.13, + "grad_norm": 0.8359375, + "learning_rate": 6.631087391594396e-05, + "loss": 0.7531, + "step": 994 + }, + { + "epoch": 0.13, + "grad_norm": 0.89453125, + "learning_rate": 6.637758505670448e-05, + "loss": 0.9426, + "step": 995 + }, + { + "epoch": 0.13, + "grad_norm": 0.90625, + "learning_rate": 6.644429619746498e-05, + "loss": 1.0416, + "step": 996 + }, + { + "epoch": 0.13, + "grad_norm": 0.8046875, + "learning_rate": 6.651100733822549e-05, + "loss": 0.9612, + "step": 997 + }, + { + "epoch": 0.13, + "grad_norm": 0.76953125, + "learning_rate": 6.657771847898599e-05, + "loss": 0.8534, + "step": 998 + }, + { + "epoch": 0.13, + "grad_norm": 0.921875, + "learning_rate": 6.66444296197465e-05, + "loss": 1.0133, + "step": 999 + }, + { + "epoch": 0.13, + "grad_norm": 0.96484375, + "learning_rate": 6.6711140760507e-05, + "loss": 1.035, + "step": 1000 + }, + { + "epoch": 0.13, + "grad_norm": 0.78125, + "learning_rate": 6.677785190126752e-05, + "loss": 0.8234, + "step": 1001 + }, + { + "epoch": 0.13, + "grad_norm": 0.78125, + "learning_rate": 6.684456304202803e-05, + "loss": 1.0226, + "step": 1002 + }, + { + "epoch": 0.13, + "grad_norm": 0.79296875, + "learning_rate": 6.691127418278853e-05, + "loss": 0.8366, + "step": 1003 + }, + { + "epoch": 0.13, + "grad_norm": 0.76953125, + "learning_rate": 6.697798532354904e-05, + "loss": 0.6109, + "step": 1004 + }, + { + "epoch": 0.13, + "grad_norm": 0.734375, + "learning_rate": 6.704469646430954e-05, + "loss": 0.5566, + "step": 1005 + }, + { + "epoch": 0.13, + "grad_norm": 0.84375, + "learning_rate": 6.711140760507006e-05, + "loss": 0.979, + "step": 1006 + }, + { + "epoch": 0.13, + "grad_norm": 0.671875, + "learning_rate": 6.717811874583055e-05, + "loss": 1.0015, + "step": 1007 + }, + { + "epoch": 0.13, + "grad_norm": 0.828125, + "learning_rate": 6.724482988659107e-05, + "loss": 0.5012, + "step": 1008 + }, + { + "epoch": 0.13, + "grad_norm": 0.94140625, + "learning_rate": 6.731154102735156e-05, + "loss": 0.7588, + "step": 1009 + }, + { + "epoch": 0.13, + "grad_norm": 0.72265625, + "learning_rate": 6.737825216811208e-05, + "loss": 0.7841, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 0.92578125, + "learning_rate": 6.744496330887258e-05, + "loss": 0.98, + "step": 1011 + }, + { + "epoch": 0.14, + "grad_norm": 1.0390625, + "learning_rate": 6.751167444963309e-05, + "loss": 0.5854, + "step": 1012 + }, + { + "epoch": 0.14, + "grad_norm": 0.83203125, + "learning_rate": 6.75783855903936e-05, + "loss": 0.8342, + "step": 1013 + }, + { + "epoch": 0.14, + "grad_norm": 0.96484375, + "learning_rate": 6.76450967311541e-05, + "loss": 0.9684, + "step": 1014 + }, + { + "epoch": 0.14, + "grad_norm": 0.76953125, + "learning_rate": 6.771180787191462e-05, + "loss": 1.0379, + "step": 1015 + }, + { + "epoch": 0.14, + "grad_norm": 0.70703125, + "learning_rate": 6.777851901267512e-05, + "loss": 0.7707, + "step": 1016 + }, + { + "epoch": 0.14, + "grad_norm": 0.81640625, + "learning_rate": 6.784523015343563e-05, + "loss": 0.4938, + "step": 1017 + }, + { + "epoch": 0.14, + "grad_norm": 0.72265625, + "learning_rate": 6.791194129419613e-05, + "loss": 0.7213, + "step": 1018 + }, + { + "epoch": 0.14, + "grad_norm": 1.0234375, + "learning_rate": 6.797865243495664e-05, + "loss": 0.7672, + "step": 1019 + }, + { + "epoch": 0.14, + "grad_norm": 0.875, + "learning_rate": 6.804536357571714e-05, + "loss": 0.9702, + "step": 1020 + }, + { + "epoch": 0.14, + "grad_norm": 0.95703125, + "learning_rate": 6.811207471647766e-05, + "loss": 0.7612, + "step": 1021 + }, + { + "epoch": 0.14, + "grad_norm": 0.69921875, + "learning_rate": 6.817878585723815e-05, + "loss": 1.0343, + "step": 1022 + }, + { + "epoch": 0.14, + "grad_norm": 0.97265625, + "learning_rate": 6.824549699799867e-05, + "loss": 0.7846, + "step": 1023 + }, + { + "epoch": 0.14, + "grad_norm": 1.0234375, + "learning_rate": 6.831220813875918e-05, + "loss": 0.9598, + "step": 1024 + }, + { + "epoch": 0.14, + "grad_norm": 0.95703125, + "learning_rate": 6.837891927951968e-05, + "loss": 0.7678, + "step": 1025 + }, + { + "epoch": 0.14, + "grad_norm": 0.98828125, + "learning_rate": 6.844563042028019e-05, + "loss": 0.7673, + "step": 1026 + }, + { + "epoch": 0.14, + "grad_norm": 0.71484375, + "learning_rate": 6.851234156104069e-05, + "loss": 0.7732, + "step": 1027 + }, + { + "epoch": 0.14, + "grad_norm": 0.8671875, + "learning_rate": 6.857905270180121e-05, + "loss": 0.9293, + "step": 1028 + }, + { + "epoch": 0.14, + "grad_norm": 0.9296875, + "learning_rate": 6.86457638425617e-05, + "loss": 0.9665, + "step": 1029 + }, + { + "epoch": 0.14, + "grad_norm": 0.77734375, + "learning_rate": 6.871247498332222e-05, + "loss": 1.0983, + "step": 1030 + }, + { + "epoch": 0.14, + "grad_norm": 0.9609375, + "learning_rate": 6.877918612408273e-05, + "loss": 0.9227, + "step": 1031 + }, + { + "epoch": 0.14, + "grad_norm": 0.7890625, + "learning_rate": 6.884589726484323e-05, + "loss": 0.6671, + "step": 1032 + }, + { + "epoch": 0.14, + "grad_norm": 0.6328125, + "learning_rate": 6.891260840560374e-05, + "loss": 0.9541, + "step": 1033 + }, + { + "epoch": 0.14, + "grad_norm": 0.703125, + "learning_rate": 6.897931954636425e-05, + "loss": 0.912, + "step": 1034 + }, + { + "epoch": 0.14, + "grad_norm": 0.94140625, + "learning_rate": 6.904603068712475e-05, + "loss": 0.7434, + "step": 1035 + }, + { + "epoch": 0.14, + "grad_norm": 0.984375, + "learning_rate": 6.911274182788526e-05, + "loss": 1.0538, + "step": 1036 + }, + { + "epoch": 0.14, + "grad_norm": 0.82421875, + "learning_rate": 6.917945296864577e-05, + "loss": 0.7025, + "step": 1037 + }, + { + "epoch": 0.14, + "grad_norm": 1.09375, + "learning_rate": 6.924616410940627e-05, + "loss": 0.7339, + "step": 1038 + }, + { + "epoch": 0.14, + "grad_norm": 0.9296875, + "learning_rate": 6.93128752501668e-05, + "loss": 0.8016, + "step": 1039 + }, + { + "epoch": 0.14, + "grad_norm": 1.0859375, + "learning_rate": 6.937958639092728e-05, + "loss": 1.0606, + "step": 1040 + }, + { + "epoch": 0.14, + "grad_norm": 0.74609375, + "learning_rate": 6.94462975316878e-05, + "loss": 0.7929, + "step": 1041 + }, + { + "epoch": 0.14, + "grad_norm": 1.078125, + "learning_rate": 6.95130086724483e-05, + "loss": 0.6075, + "step": 1042 + }, + { + "epoch": 0.14, + "grad_norm": 0.96484375, + "learning_rate": 6.957971981320881e-05, + "loss": 1.0767, + "step": 1043 + }, + { + "epoch": 0.14, + "grad_norm": 0.828125, + "learning_rate": 6.964643095396932e-05, + "loss": 0.9811, + "step": 1044 + }, + { + "epoch": 0.14, + "grad_norm": 0.89453125, + "learning_rate": 6.971314209472982e-05, + "loss": 0.6327, + "step": 1045 + }, + { + "epoch": 0.14, + "grad_norm": 0.95703125, + "learning_rate": 6.977985323549033e-05, + "loss": 0.8293, + "step": 1046 + }, + { + "epoch": 0.14, + "grad_norm": 1.2890625, + "learning_rate": 6.984656437625083e-05, + "loss": 1.203, + "step": 1047 + }, + { + "epoch": 0.14, + "grad_norm": 0.92578125, + "learning_rate": 6.991327551701134e-05, + "loss": 0.8429, + "step": 1048 + }, + { + "epoch": 0.14, + "grad_norm": 0.71875, + "learning_rate": 6.997998665777186e-05, + "loss": 0.9663, + "step": 1049 + }, + { + "epoch": 0.14, + "grad_norm": 0.7421875, + "learning_rate": 7.004669779853236e-05, + "loss": 1.049, + "step": 1050 + }, + { + "epoch": 0.14, + "grad_norm": 0.83984375, + "learning_rate": 7.011340893929287e-05, + "loss": 0.8066, + "step": 1051 + }, + { + "epoch": 0.14, + "grad_norm": 0.79296875, + "learning_rate": 7.018012008005337e-05, + "loss": 0.8256, + "step": 1052 + }, + { + "epoch": 0.14, + "grad_norm": 0.93359375, + "learning_rate": 7.024683122081388e-05, + "loss": 0.7977, + "step": 1053 + }, + { + "epoch": 0.14, + "grad_norm": 0.7890625, + "learning_rate": 7.03135423615744e-05, + "loss": 0.9507, + "step": 1054 + }, + { + "epoch": 0.14, + "grad_norm": 1.2890625, + "learning_rate": 7.038025350233489e-05, + "loss": 0.97, + "step": 1055 + }, + { + "epoch": 0.14, + "grad_norm": 0.6484375, + "learning_rate": 7.04469646430954e-05, + "loss": 0.8135, + "step": 1056 + }, + { + "epoch": 0.14, + "grad_norm": 0.91015625, + "learning_rate": 7.05136757838559e-05, + "loss": 0.8539, + "step": 1057 + }, + { + "epoch": 0.14, + "grad_norm": 0.7109375, + "learning_rate": 7.058038692461642e-05, + "loss": 0.8288, + "step": 1058 + }, + { + "epoch": 0.14, + "grad_norm": 0.69921875, + "learning_rate": 7.064709806537692e-05, + "loss": 0.8568, + "step": 1059 + }, + { + "epoch": 0.14, + "grad_norm": 0.7734375, + "learning_rate": 7.071380920613743e-05, + "loss": 1.0093, + "step": 1060 + }, + { + "epoch": 0.14, + "grad_norm": 0.87109375, + "learning_rate": 7.078052034689794e-05, + "loss": 0.9183, + "step": 1061 + }, + { + "epoch": 0.14, + "grad_norm": 0.76171875, + "learning_rate": 7.084723148765844e-05, + "loss": 1.1289, + "step": 1062 + }, + { + "epoch": 0.14, + "grad_norm": 1.40625, + "learning_rate": 7.091394262841895e-05, + "loss": 1.0194, + "step": 1063 + }, + { + "epoch": 0.14, + "grad_norm": 1.09375, + "learning_rate": 7.098065376917946e-05, + "loss": 0.539, + "step": 1064 + }, + { + "epoch": 0.14, + "grad_norm": 1.015625, + "learning_rate": 7.104736490993996e-05, + "loss": 0.9647, + "step": 1065 + }, + { + "epoch": 0.14, + "grad_norm": 0.9609375, + "learning_rate": 7.111407605070047e-05, + "loss": 1.126, + "step": 1066 + }, + { + "epoch": 0.14, + "grad_norm": 0.734375, + "learning_rate": 7.118078719146098e-05, + "loss": 0.8536, + "step": 1067 + }, + { + "epoch": 0.14, + "grad_norm": 0.95703125, + "learning_rate": 7.124749833222148e-05, + "loss": 1.0475, + "step": 1068 + }, + { + "epoch": 0.14, + "grad_norm": 0.97265625, + "learning_rate": 7.1314209472982e-05, + "loss": 0.8899, + "step": 1069 + }, + { + "epoch": 0.14, + "grad_norm": 0.8125, + "learning_rate": 7.138092061374249e-05, + "loss": 0.8447, + "step": 1070 + }, + { + "epoch": 0.14, + "grad_norm": 0.99609375, + "learning_rate": 7.144763175450301e-05, + "loss": 0.6864, + "step": 1071 + }, + { + "epoch": 0.14, + "grad_norm": 0.93359375, + "learning_rate": 7.151434289526351e-05, + "loss": 0.8801, + "step": 1072 + }, + { + "epoch": 0.14, + "grad_norm": 1.0, + "learning_rate": 7.158105403602402e-05, + "loss": 0.6546, + "step": 1073 + }, + { + "epoch": 0.14, + "grad_norm": 0.93359375, + "learning_rate": 7.164776517678454e-05, + "loss": 0.5623, + "step": 1074 + }, + { + "epoch": 0.14, + "grad_norm": 0.80859375, + "learning_rate": 7.171447631754503e-05, + "loss": 0.7849, + "step": 1075 + }, + { + "epoch": 0.14, + "grad_norm": 0.74609375, + "learning_rate": 7.178118745830555e-05, + "loss": 0.9458, + "step": 1076 + }, + { + "epoch": 0.14, + "grad_norm": 0.8515625, + "learning_rate": 7.184789859906604e-05, + "loss": 0.9366, + "step": 1077 + }, + { + "epoch": 0.14, + "grad_norm": 1.0390625, + "learning_rate": 7.191460973982656e-05, + "loss": 0.9385, + "step": 1078 + }, + { + "epoch": 0.14, + "grad_norm": 0.84375, + "learning_rate": 7.198132088058706e-05, + "loss": 0.7306, + "step": 1079 + }, + { + "epoch": 0.14, + "grad_norm": 0.80078125, + "learning_rate": 7.204803202134757e-05, + "loss": 0.6564, + "step": 1080 + }, + { + "epoch": 0.14, + "grad_norm": 0.8203125, + "learning_rate": 7.211474316210807e-05, + "loss": 0.6848, + "step": 1081 + }, + { + "epoch": 0.14, + "grad_norm": 0.7421875, + "learning_rate": 7.218145430286858e-05, + "loss": 0.7737, + "step": 1082 + }, + { + "epoch": 0.14, + "grad_norm": 0.73828125, + "learning_rate": 7.22481654436291e-05, + "loss": 0.989, + "step": 1083 + }, + { + "epoch": 0.14, + "grad_norm": 1.59375, + "learning_rate": 7.23148765843896e-05, + "loss": 0.9863, + "step": 1084 + }, + { + "epoch": 0.14, + "grad_norm": 1.5546875, + "learning_rate": 7.23815877251501e-05, + "loss": 1.0107, + "step": 1085 + }, + { + "epoch": 0.14, + "grad_norm": 0.91015625, + "learning_rate": 7.244829886591061e-05, + "loss": 0.8379, + "step": 1086 + }, + { + "epoch": 0.15, + "grad_norm": 0.859375, + "learning_rate": 7.251501000667112e-05, + "loss": 0.765, + "step": 1087 + }, + { + "epoch": 0.15, + "grad_norm": 1.03125, + "learning_rate": 7.258172114743162e-05, + "loss": 0.7939, + "step": 1088 + }, + { + "epoch": 0.15, + "grad_norm": 0.8203125, + "learning_rate": 7.264843228819214e-05, + "loss": 0.6133, + "step": 1089 + }, + { + "epoch": 0.15, + "grad_norm": 1.0546875, + "learning_rate": 7.271514342895263e-05, + "loss": 0.7577, + "step": 1090 + }, + { + "epoch": 0.15, + "grad_norm": 0.73046875, + "learning_rate": 7.278185456971315e-05, + "loss": 0.8741, + "step": 1091 + }, + { + "epoch": 0.15, + "grad_norm": 1.0390625, + "learning_rate": 7.284856571047366e-05, + "loss": 0.8782, + "step": 1092 + }, + { + "epoch": 0.15, + "grad_norm": 1.1875, + "learning_rate": 7.291527685123416e-05, + "loss": 0.8059, + "step": 1093 + }, + { + "epoch": 0.15, + "grad_norm": 0.70703125, + "learning_rate": 7.298198799199467e-05, + "loss": 0.5395, + "step": 1094 + }, + { + "epoch": 0.15, + "grad_norm": 0.6875, + "learning_rate": 7.304869913275517e-05, + "loss": 0.8685, + "step": 1095 + }, + { + "epoch": 0.15, + "grad_norm": 1.140625, + "learning_rate": 7.311541027351569e-05, + "loss": 1.0782, + "step": 1096 + }, + { + "epoch": 0.15, + "grad_norm": 0.80859375, + "learning_rate": 7.318212141427618e-05, + "loss": 0.8774, + "step": 1097 + }, + { + "epoch": 0.15, + "grad_norm": 0.96484375, + "learning_rate": 7.32488325550367e-05, + "loss": 1.0516, + "step": 1098 + }, + { + "epoch": 0.15, + "grad_norm": 0.9296875, + "learning_rate": 7.33155436957972e-05, + "loss": 0.9155, + "step": 1099 + }, + { + "epoch": 0.15, + "grad_norm": 0.99609375, + "learning_rate": 7.338225483655771e-05, + "loss": 0.8571, + "step": 1100 + }, + { + "epoch": 0.15, + "grad_norm": 0.87890625, + "learning_rate": 7.344896597731821e-05, + "loss": 1.1734, + "step": 1101 + }, + { + "epoch": 0.15, + "grad_norm": 0.58984375, + "learning_rate": 7.351567711807872e-05, + "loss": 0.8494, + "step": 1102 + }, + { + "epoch": 0.15, + "grad_norm": 1.0078125, + "learning_rate": 7.358238825883922e-05, + "loss": 0.9193, + "step": 1103 + }, + { + "epoch": 0.15, + "grad_norm": 0.734375, + "learning_rate": 7.364909939959974e-05, + "loss": 0.7432, + "step": 1104 + }, + { + "epoch": 0.15, + "grad_norm": 0.7109375, + "learning_rate": 7.371581054036025e-05, + "loss": 0.7978, + "step": 1105 + }, + { + "epoch": 0.15, + "grad_norm": 0.75390625, + "learning_rate": 7.378252168112075e-05, + "loss": 0.9106, + "step": 1106 + }, + { + "epoch": 0.15, + "grad_norm": 0.84765625, + "learning_rate": 7.384923282188126e-05, + "loss": 0.9181, + "step": 1107 + }, + { + "epoch": 0.15, + "grad_norm": 0.8359375, + "learning_rate": 7.391594396264176e-05, + "loss": 1.0865, + "step": 1108 + }, + { + "epoch": 0.15, + "grad_norm": 0.734375, + "learning_rate": 7.398265510340228e-05, + "loss": 0.8498, + "step": 1109 + }, + { + "epoch": 0.15, + "grad_norm": 0.8515625, + "learning_rate": 7.404936624416277e-05, + "loss": 1.0243, + "step": 1110 + }, + { + "epoch": 0.15, + "grad_norm": 0.7421875, + "learning_rate": 7.411607738492329e-05, + "loss": 0.5925, + "step": 1111 + }, + { + "epoch": 0.15, + "grad_norm": 0.71875, + "learning_rate": 7.418278852568378e-05, + "loss": 0.8494, + "step": 1112 + }, + { + "epoch": 0.15, + "grad_norm": 0.6796875, + "learning_rate": 7.42494996664443e-05, + "loss": 0.8516, + "step": 1113 + }, + { + "epoch": 0.15, + "grad_norm": 0.84765625, + "learning_rate": 7.43162108072048e-05, + "loss": 0.8986, + "step": 1114 + }, + { + "epoch": 0.15, + "grad_norm": 0.8125, + "learning_rate": 7.438292194796531e-05, + "loss": 0.7019, + "step": 1115 + }, + { + "epoch": 0.15, + "grad_norm": 0.8828125, + "learning_rate": 7.444963308872582e-05, + "loss": 0.8806, + "step": 1116 + }, + { + "epoch": 0.15, + "grad_norm": 0.95703125, + "learning_rate": 7.451634422948632e-05, + "loss": 0.6033, + "step": 1117 + }, + { + "epoch": 0.15, + "grad_norm": 0.76171875, + "learning_rate": 7.458305537024684e-05, + "loss": 0.8387, + "step": 1118 + }, + { + "epoch": 0.15, + "grad_norm": 0.94140625, + "learning_rate": 7.464976651100735e-05, + "loss": 0.9074, + "step": 1119 + }, + { + "epoch": 0.15, + "grad_norm": 1.265625, + "learning_rate": 7.471647765176785e-05, + "loss": 0.7831, + "step": 1120 + }, + { + "epoch": 0.15, + "grad_norm": 1.109375, + "learning_rate": 7.478318879252836e-05, + "loss": 1.0368, + "step": 1121 + }, + { + "epoch": 0.15, + "grad_norm": 0.88671875, + "learning_rate": 7.484989993328886e-05, + "loss": 0.7129, + "step": 1122 + }, + { + "epoch": 0.15, + "grad_norm": 0.78515625, + "learning_rate": 7.491661107404937e-05, + "loss": 0.8432, + "step": 1123 + }, + { + "epoch": 0.15, + "grad_norm": 0.953125, + "learning_rate": 7.498332221480988e-05, + "loss": 0.835, + "step": 1124 + }, + { + "epoch": 0.15, + "grad_norm": 0.8125, + "learning_rate": 7.505003335557038e-05, + "loss": 0.9652, + "step": 1125 + }, + { + "epoch": 0.15, + "grad_norm": 0.87109375, + "learning_rate": 7.51167444963309e-05, + "loss": 0.8715, + "step": 1126 + }, + { + "epoch": 0.15, + "grad_norm": 0.65234375, + "learning_rate": 7.51834556370914e-05, + "loss": 0.6734, + "step": 1127 + }, + { + "epoch": 0.15, + "grad_norm": 0.76171875, + "learning_rate": 7.52501667778519e-05, + "loss": 0.7266, + "step": 1128 + }, + { + "epoch": 0.15, + "grad_norm": 0.8828125, + "learning_rate": 7.531687791861242e-05, + "loss": 0.7918, + "step": 1129 + }, + { + "epoch": 0.15, + "grad_norm": 0.74609375, + "learning_rate": 7.538358905937291e-05, + "loss": 1.1213, + "step": 1130 + }, + { + "epoch": 0.15, + "grad_norm": 0.83203125, + "learning_rate": 7.545030020013343e-05, + "loss": 0.518, + "step": 1131 + }, + { + "epoch": 0.15, + "grad_norm": 0.87109375, + "learning_rate": 7.551701134089392e-05, + "loss": 0.9162, + "step": 1132 + }, + { + "epoch": 0.15, + "grad_norm": 0.78515625, + "learning_rate": 7.558372248165444e-05, + "loss": 0.868, + "step": 1133 + }, + { + "epoch": 0.15, + "grad_norm": 0.8359375, + "learning_rate": 7.565043362241495e-05, + "loss": 0.8914, + "step": 1134 + }, + { + "epoch": 0.15, + "grad_norm": 0.9140625, + "learning_rate": 7.571714476317545e-05, + "loss": 0.5751, + "step": 1135 + }, + { + "epoch": 0.15, + "grad_norm": 0.87890625, + "learning_rate": 7.578385590393596e-05, + "loss": 0.8383, + "step": 1136 + }, + { + "epoch": 0.15, + "grad_norm": 1.5625, + "learning_rate": 7.585056704469646e-05, + "loss": 0.9733, + "step": 1137 + }, + { + "epoch": 0.15, + "grad_norm": 0.90234375, + "learning_rate": 7.591727818545697e-05, + "loss": 1.051, + "step": 1138 + }, + { + "epoch": 0.15, + "grad_norm": 0.890625, + "learning_rate": 7.598398932621749e-05, + "loss": 0.624, + "step": 1139 + }, + { + "epoch": 0.15, + "grad_norm": 0.90625, + "learning_rate": 7.605070046697799e-05, + "loss": 0.9206, + "step": 1140 + }, + { + "epoch": 0.15, + "grad_norm": 0.8671875, + "learning_rate": 7.61174116077385e-05, + "loss": 0.8236, + "step": 1141 + }, + { + "epoch": 0.15, + "grad_norm": 0.83203125, + "learning_rate": 7.6184122748499e-05, + "loss": 1.1501, + "step": 1142 + }, + { + "epoch": 0.15, + "grad_norm": 0.8046875, + "learning_rate": 7.625083388925951e-05, + "loss": 0.9654, + "step": 1143 + }, + { + "epoch": 0.15, + "grad_norm": 0.796875, + "learning_rate": 7.631754503002003e-05, + "loss": 0.9319, + "step": 1144 + }, + { + "epoch": 0.15, + "grad_norm": 0.6953125, + "learning_rate": 7.638425617078052e-05, + "loss": 1.147, + "step": 1145 + }, + { + "epoch": 0.15, + "grad_norm": 0.69140625, + "learning_rate": 7.645096731154104e-05, + "loss": 0.6825, + "step": 1146 + }, + { + "epoch": 0.15, + "grad_norm": 1.0234375, + "learning_rate": 7.651767845230153e-05, + "loss": 0.7406, + "step": 1147 + }, + { + "epoch": 0.15, + "grad_norm": 0.8984375, + "learning_rate": 7.658438959306205e-05, + "loss": 0.8594, + "step": 1148 + }, + { + "epoch": 0.15, + "grad_norm": 0.6953125, + "learning_rate": 7.665110073382255e-05, + "loss": 0.7437, + "step": 1149 + }, + { + "epoch": 0.15, + "grad_norm": 0.75, + "learning_rate": 7.671781187458306e-05, + "loss": 0.777, + "step": 1150 + }, + { + "epoch": 0.15, + "grad_norm": 0.75, + "learning_rate": 7.678452301534357e-05, + "loss": 0.9251, + "step": 1151 + }, + { + "epoch": 0.15, + "grad_norm": 1.3046875, + "learning_rate": 7.685123415610407e-05, + "loss": 0.7129, + "step": 1152 + }, + { + "epoch": 0.15, + "grad_norm": 0.8671875, + "learning_rate": 7.691794529686458e-05, + "loss": 0.7777, + "step": 1153 + }, + { + "epoch": 0.15, + "grad_norm": 0.73828125, + "learning_rate": 7.698465643762509e-05, + "loss": 0.5872, + "step": 1154 + }, + { + "epoch": 0.15, + "grad_norm": 0.81640625, + "learning_rate": 7.70513675783856e-05, + "loss": 0.9663, + "step": 1155 + }, + { + "epoch": 0.15, + "grad_norm": 0.7890625, + "learning_rate": 7.71180787191461e-05, + "loss": 0.6157, + "step": 1156 + }, + { + "epoch": 0.15, + "grad_norm": 0.9375, + "learning_rate": 7.71847898599066e-05, + "loss": 0.7714, + "step": 1157 + }, + { + "epoch": 0.15, + "grad_norm": 1.109375, + "learning_rate": 7.725150100066711e-05, + "loss": 0.8707, + "step": 1158 + }, + { + "epoch": 0.15, + "grad_norm": 0.859375, + "learning_rate": 7.731821214142763e-05, + "loss": 0.8812, + "step": 1159 + }, + { + "epoch": 0.15, + "grad_norm": 0.875, + "learning_rate": 7.738492328218813e-05, + "loss": 0.8493, + "step": 1160 + }, + { + "epoch": 0.15, + "grad_norm": 0.80859375, + "learning_rate": 7.745163442294864e-05, + "loss": 0.771, + "step": 1161 + }, + { + "epoch": 0.16, + "grad_norm": 1.078125, + "learning_rate": 7.751834556370914e-05, + "loss": 0.7799, + "step": 1162 + }, + { + "epoch": 0.16, + "grad_norm": 1.0546875, + "learning_rate": 7.758505670446965e-05, + "loss": 0.7924, + "step": 1163 + }, + { + "epoch": 0.16, + "grad_norm": 0.92578125, + "learning_rate": 7.765176784523017e-05, + "loss": 0.9598, + "step": 1164 + }, + { + "epoch": 0.16, + "grad_norm": 1.1640625, + "learning_rate": 7.771847898599066e-05, + "loss": 0.7236, + "step": 1165 + }, + { + "epoch": 0.16, + "grad_norm": 0.859375, + "learning_rate": 7.778519012675118e-05, + "loss": 1.0738, + "step": 1166 + }, + { + "epoch": 0.16, + "grad_norm": 0.88671875, + "learning_rate": 7.785190126751167e-05, + "loss": 0.6976, + "step": 1167 + }, + { + "epoch": 0.16, + "grad_norm": 0.7890625, + "learning_rate": 7.791861240827219e-05, + "loss": 0.8993, + "step": 1168 + }, + { + "epoch": 0.16, + "grad_norm": 0.93359375, + "learning_rate": 7.798532354903269e-05, + "loss": 0.5729, + "step": 1169 + }, + { + "epoch": 0.16, + "grad_norm": 0.9296875, + "learning_rate": 7.80520346897932e-05, + "loss": 1.097, + "step": 1170 + }, + { + "epoch": 0.16, + "grad_norm": 0.875, + "learning_rate": 7.81187458305537e-05, + "loss": 0.7524, + "step": 1171 + }, + { + "epoch": 0.16, + "grad_norm": 0.7890625, + "learning_rate": 7.818545697131421e-05, + "loss": 0.9188, + "step": 1172 + }, + { + "epoch": 0.16, + "grad_norm": 0.765625, + "learning_rate": 7.825216811207473e-05, + "loss": 0.9011, + "step": 1173 + }, + { + "epoch": 0.16, + "grad_norm": 0.90625, + "learning_rate": 7.831887925283523e-05, + "loss": 0.7059, + "step": 1174 + }, + { + "epoch": 0.16, + "grad_norm": 0.7578125, + "learning_rate": 7.838559039359574e-05, + "loss": 1.0366, + "step": 1175 + }, + { + "epoch": 0.16, + "grad_norm": 0.7578125, + "learning_rate": 7.845230153435624e-05, + "loss": 0.8552, + "step": 1176 + }, + { + "epoch": 0.16, + "grad_norm": 0.91015625, + "learning_rate": 7.851901267511675e-05, + "loss": 1.0531, + "step": 1177 + }, + { + "epoch": 0.16, + "grad_norm": 0.95703125, + "learning_rate": 7.858572381587725e-05, + "loss": 0.8077, + "step": 1178 + }, + { + "epoch": 0.16, + "grad_norm": 0.96484375, + "learning_rate": 7.865243495663777e-05, + "loss": 0.8183, + "step": 1179 + }, + { + "epoch": 0.16, + "grad_norm": 0.9921875, + "learning_rate": 7.871914609739826e-05, + "loss": 1.0385, + "step": 1180 + }, + { + "epoch": 0.16, + "grad_norm": 0.625, + "learning_rate": 7.878585723815878e-05, + "loss": 1.0096, + "step": 1181 + }, + { + "epoch": 0.16, + "grad_norm": 0.76953125, + "learning_rate": 7.885256837891928e-05, + "loss": 0.7488, + "step": 1182 + }, + { + "epoch": 0.16, + "grad_norm": 1.0390625, + "learning_rate": 7.891927951967979e-05, + "loss": 0.7755, + "step": 1183 + }, + { + "epoch": 0.16, + "grad_norm": 1.1875, + "learning_rate": 7.89859906604403e-05, + "loss": 1.0158, + "step": 1184 + }, + { + "epoch": 0.16, + "grad_norm": 0.9375, + "learning_rate": 7.90527018012008e-05, + "loss": 0.7816, + "step": 1185 + }, + { + "epoch": 0.16, + "grad_norm": 0.80859375, + "learning_rate": 7.911941294196132e-05, + "loss": 0.7487, + "step": 1186 + }, + { + "epoch": 0.16, + "grad_norm": 0.97265625, + "learning_rate": 7.918612408272181e-05, + "loss": 0.9624, + "step": 1187 + }, + { + "epoch": 0.16, + "grad_norm": 0.953125, + "learning_rate": 7.925283522348233e-05, + "loss": 1.1425, + "step": 1188 + }, + { + "epoch": 0.16, + "grad_norm": 1.1171875, + "learning_rate": 7.931954636424283e-05, + "loss": 0.7944, + "step": 1189 + }, + { + "epoch": 0.16, + "grad_norm": 0.890625, + "learning_rate": 7.938625750500334e-05, + "loss": 0.9318, + "step": 1190 + }, + { + "epoch": 0.16, + "grad_norm": 0.859375, + "learning_rate": 7.945296864576384e-05, + "loss": 0.7246, + "step": 1191 + }, + { + "epoch": 0.16, + "grad_norm": 0.875, + "learning_rate": 7.951967978652435e-05, + "loss": 1.0425, + "step": 1192 + }, + { + "epoch": 0.16, + "grad_norm": 0.9453125, + "learning_rate": 7.958639092728485e-05, + "loss": 0.6797, + "step": 1193 + }, + { + "epoch": 0.16, + "grad_norm": 0.8671875, + "learning_rate": 7.965310206804537e-05, + "loss": 0.9391, + "step": 1194 + }, + { + "epoch": 0.16, + "grad_norm": 0.7109375, + "learning_rate": 7.971981320880588e-05, + "loss": 0.8538, + "step": 1195 + }, + { + "epoch": 0.16, + "grad_norm": 0.92578125, + "learning_rate": 7.978652434956638e-05, + "loss": 1.0067, + "step": 1196 + }, + { + "epoch": 0.16, + "grad_norm": 0.734375, + "learning_rate": 7.985323549032689e-05, + "loss": 0.8298, + "step": 1197 + }, + { + "epoch": 0.16, + "grad_norm": 0.65234375, + "learning_rate": 7.991994663108739e-05, + "loss": 0.6079, + "step": 1198 + }, + { + "epoch": 0.16, + "grad_norm": 0.76953125, + "learning_rate": 7.998665777184791e-05, + "loss": 0.6557, + "step": 1199 + }, + { + "epoch": 0.16, + "grad_norm": 0.96484375, + "learning_rate": 8.00533689126084e-05, + "loss": 0.6425, + "step": 1200 + }, + { + "epoch": 0.16, + "grad_norm": 0.70703125, + "learning_rate": 8.012008005336892e-05, + "loss": 0.7544, + "step": 1201 + }, + { + "epoch": 0.16, + "grad_norm": 0.83203125, + "learning_rate": 8.018679119412941e-05, + "loss": 0.5654, + "step": 1202 + }, + { + "epoch": 0.16, + "grad_norm": 1.6015625, + "learning_rate": 8.025350233488993e-05, + "loss": 1.0013, + "step": 1203 + }, + { + "epoch": 0.16, + "grad_norm": 0.84375, + "learning_rate": 8.032021347565044e-05, + "loss": 0.6012, + "step": 1204 + }, + { + "epoch": 0.16, + "grad_norm": 1.109375, + "learning_rate": 8.038692461641094e-05, + "loss": 0.8255, + "step": 1205 + }, + { + "epoch": 0.16, + "grad_norm": 0.828125, + "learning_rate": 8.045363575717145e-05, + "loss": 0.6913, + "step": 1206 + }, + { + "epoch": 0.16, + "grad_norm": 0.953125, + "learning_rate": 8.052034689793195e-05, + "loss": 0.8901, + "step": 1207 + }, + { + "epoch": 0.16, + "grad_norm": 1.0546875, + "learning_rate": 8.058705803869247e-05, + "loss": 0.9427, + "step": 1208 + }, + { + "epoch": 0.16, + "grad_norm": 0.98828125, + "learning_rate": 8.065376917945297e-05, + "loss": 1.0314, + "step": 1209 + }, + { + "epoch": 0.16, + "grad_norm": 0.75, + "learning_rate": 8.072048032021348e-05, + "loss": 0.7785, + "step": 1210 + }, + { + "epoch": 0.16, + "grad_norm": 0.8046875, + "learning_rate": 8.078719146097398e-05, + "loss": 0.8256, + "step": 1211 + }, + { + "epoch": 0.16, + "grad_norm": 0.72265625, + "learning_rate": 8.085390260173449e-05, + "loss": 0.8223, + "step": 1212 + }, + { + "epoch": 0.16, + "grad_norm": 1.09375, + "learning_rate": 8.0920613742495e-05, + "loss": 0.7378, + "step": 1213 + }, + { + "epoch": 0.16, + "grad_norm": 0.75390625, + "learning_rate": 8.098732488325551e-05, + "loss": 0.8607, + "step": 1214 + }, + { + "epoch": 0.16, + "grad_norm": 1.0703125, + "learning_rate": 8.1054036024016e-05, + "loss": 0.6689, + "step": 1215 + }, + { + "epoch": 0.16, + "grad_norm": 0.7890625, + "learning_rate": 8.112074716477652e-05, + "loss": 0.627, + "step": 1216 + }, + { + "epoch": 0.16, + "grad_norm": 0.90625, + "learning_rate": 8.118745830553703e-05, + "loss": 0.8455, + "step": 1217 + }, + { + "epoch": 0.16, + "grad_norm": 1.1484375, + "learning_rate": 8.125416944629753e-05, + "loss": 0.8056, + "step": 1218 + }, + { + "epoch": 0.16, + "grad_norm": 0.78125, + "learning_rate": 8.132088058705805e-05, + "loss": 0.6757, + "step": 1219 + }, + { + "epoch": 0.16, + "grad_norm": 0.7578125, + "learning_rate": 8.138759172781854e-05, + "loss": 0.3719, + "step": 1220 + }, + { + "epoch": 0.16, + "grad_norm": 0.765625, + "learning_rate": 8.145430286857906e-05, + "loss": 0.7208, + "step": 1221 + }, + { + "epoch": 0.16, + "grad_norm": 0.9296875, + "learning_rate": 8.152101400933957e-05, + "loss": 1.0012, + "step": 1222 + }, + { + "epoch": 0.16, + "grad_norm": 1.546875, + "learning_rate": 8.158772515010007e-05, + "loss": 0.8011, + "step": 1223 + }, + { + "epoch": 0.16, + "grad_norm": 0.85546875, + "learning_rate": 8.165443629086058e-05, + "loss": 1.2408, + "step": 1224 + }, + { + "epoch": 0.16, + "grad_norm": 0.8984375, + "learning_rate": 8.172114743162108e-05, + "loss": 0.8405, + "step": 1225 + }, + { + "epoch": 0.16, + "grad_norm": 0.921875, + "learning_rate": 8.178785857238159e-05, + "loss": 0.8852, + "step": 1226 + }, + { + "epoch": 0.16, + "grad_norm": 1.03125, + "learning_rate": 8.18545697131421e-05, + "loss": 0.9185, + "step": 1227 + }, + { + "epoch": 0.16, + "grad_norm": 0.765625, + "learning_rate": 8.19212808539026e-05, + "loss": 1.0689, + "step": 1228 + }, + { + "epoch": 0.16, + "grad_norm": 0.81640625, + "learning_rate": 8.198799199466312e-05, + "loss": 0.6222, + "step": 1229 + }, + { + "epoch": 0.16, + "grad_norm": 0.9140625, + "learning_rate": 8.205470313542362e-05, + "loss": 0.5423, + "step": 1230 + }, + { + "epoch": 0.16, + "grad_norm": 0.85546875, + "learning_rate": 8.212141427618413e-05, + "loss": 0.6034, + "step": 1231 + }, + { + "epoch": 0.16, + "grad_norm": 0.76953125, + "learning_rate": 8.218812541694464e-05, + "loss": 1.1255, + "step": 1232 + }, + { + "epoch": 0.16, + "grad_norm": 1.015625, + "learning_rate": 8.225483655770514e-05, + "loss": 0.6806, + "step": 1233 + }, + { + "epoch": 0.16, + "grad_norm": 1.09375, + "learning_rate": 8.232154769846566e-05, + "loss": 0.9443, + "step": 1234 + }, + { + "epoch": 0.16, + "grad_norm": 0.828125, + "learning_rate": 8.238825883922615e-05, + "loss": 1.0352, + "step": 1235 + }, + { + "epoch": 0.16, + "grad_norm": 0.859375, + "learning_rate": 8.245496997998667e-05, + "loss": 0.6199, + "step": 1236 + }, + { + "epoch": 0.17, + "grad_norm": 0.75, + "learning_rate": 8.252168112074717e-05, + "loss": 1.2549, + "step": 1237 + }, + { + "epoch": 0.17, + "grad_norm": 0.79296875, + "learning_rate": 8.258839226150768e-05, + "loss": 0.6965, + "step": 1238 + }, + { + "epoch": 0.17, + "grad_norm": 0.94921875, + "learning_rate": 8.265510340226818e-05, + "loss": 0.7485, + "step": 1239 + }, + { + "epoch": 0.17, + "grad_norm": 0.7421875, + "learning_rate": 8.272181454302869e-05, + "loss": 0.6752, + "step": 1240 + }, + { + "epoch": 0.17, + "grad_norm": 0.66796875, + "learning_rate": 8.27885256837892e-05, + "loss": 0.4465, + "step": 1241 + }, + { + "epoch": 0.17, + "grad_norm": 0.9609375, + "learning_rate": 8.285523682454971e-05, + "loss": 0.7008, + "step": 1242 + }, + { + "epoch": 0.17, + "grad_norm": 0.97265625, + "learning_rate": 8.292194796531021e-05, + "loss": 0.9975, + "step": 1243 + }, + { + "epoch": 0.17, + "grad_norm": 1.1875, + "learning_rate": 8.298865910607072e-05, + "loss": 0.8899, + "step": 1244 + }, + { + "epoch": 0.17, + "grad_norm": 1.0390625, + "learning_rate": 8.305537024683122e-05, + "loss": 1.1432, + "step": 1245 + }, + { + "epoch": 0.17, + "grad_norm": 0.8515625, + "learning_rate": 8.312208138759173e-05, + "loss": 0.688, + "step": 1246 + }, + { + "epoch": 0.17, + "grad_norm": 0.84765625, + "learning_rate": 8.318879252835225e-05, + "loss": 0.6995, + "step": 1247 + }, + { + "epoch": 0.17, + "grad_norm": 0.81640625, + "learning_rate": 8.325550366911274e-05, + "loss": 1.0987, + "step": 1248 + }, + { + "epoch": 0.17, + "grad_norm": 0.890625, + "learning_rate": 8.332221480987326e-05, + "loss": 0.8578, + "step": 1249 + }, + { + "epoch": 0.17, + "grad_norm": 0.89453125, + "learning_rate": 8.338892595063376e-05, + "loss": 1.0011, + "step": 1250 + }, + { + "epoch": 0.17, + "grad_norm": 0.8359375, + "learning_rate": 8.345563709139427e-05, + "loss": 0.8523, + "step": 1251 + }, + { + "epoch": 0.17, + "grad_norm": 0.74609375, + "learning_rate": 8.352234823215477e-05, + "loss": 0.6632, + "step": 1252 + }, + { + "epoch": 0.17, + "grad_norm": 0.953125, + "learning_rate": 8.358905937291528e-05, + "loss": 0.6311, + "step": 1253 + }, + { + "epoch": 0.17, + "grad_norm": 0.81640625, + "learning_rate": 8.36557705136758e-05, + "loss": 0.6206, + "step": 1254 + }, + { + "epoch": 0.17, + "grad_norm": 0.8046875, + "learning_rate": 8.372248165443629e-05, + "loss": 0.8342, + "step": 1255 + }, + { + "epoch": 0.17, + "grad_norm": 0.6953125, + "learning_rate": 8.37891927951968e-05, + "loss": 0.6522, + "step": 1256 + }, + { + "epoch": 0.17, + "grad_norm": 0.671875, + "learning_rate": 8.385590393595731e-05, + "loss": 0.6419, + "step": 1257 + }, + { + "epoch": 0.17, + "grad_norm": 1.2265625, + "learning_rate": 8.392261507671782e-05, + "loss": 0.8814, + "step": 1258 + }, + { + "epoch": 0.17, + "grad_norm": 0.8828125, + "learning_rate": 8.398932621747832e-05, + "loss": 1.0798, + "step": 1259 + }, + { + "epoch": 0.17, + "grad_norm": 1.0703125, + "learning_rate": 8.405603735823883e-05, + "loss": 0.6099, + "step": 1260 + }, + { + "epoch": 0.17, + "grad_norm": 0.7890625, + "learning_rate": 8.412274849899933e-05, + "loss": 0.8957, + "step": 1261 + }, + { + "epoch": 0.17, + "grad_norm": 0.85546875, + "learning_rate": 8.418945963975985e-05, + "loss": 0.8578, + "step": 1262 + }, + { + "epoch": 0.17, + "grad_norm": 0.75390625, + "learning_rate": 8.425617078052036e-05, + "loss": 0.7516, + "step": 1263 + }, + { + "epoch": 0.17, + "grad_norm": 0.7109375, + "learning_rate": 8.432288192128086e-05, + "loss": 0.6969, + "step": 1264 + }, + { + "epoch": 0.17, + "grad_norm": 0.71484375, + "learning_rate": 8.438959306204137e-05, + "loss": 0.5583, + "step": 1265 + }, + { + "epoch": 0.17, + "grad_norm": 1.09375, + "learning_rate": 8.445630420280187e-05, + "loss": 0.7754, + "step": 1266 + }, + { + "epoch": 0.17, + "grad_norm": 0.99609375, + "learning_rate": 8.452301534356239e-05, + "loss": 0.654, + "step": 1267 + }, + { + "epoch": 0.17, + "grad_norm": 0.93359375, + "learning_rate": 8.458972648432288e-05, + "loss": 0.7315, + "step": 1268 + }, + { + "epoch": 0.17, + "grad_norm": 0.94921875, + "learning_rate": 8.46564376250834e-05, + "loss": 0.8021, + "step": 1269 + }, + { + "epoch": 0.17, + "grad_norm": 0.8203125, + "learning_rate": 8.472314876584389e-05, + "loss": 0.8689, + "step": 1270 + }, + { + "epoch": 0.17, + "grad_norm": 0.7890625, + "learning_rate": 8.478985990660441e-05, + "loss": 0.9126, + "step": 1271 + }, + { + "epoch": 0.17, + "grad_norm": 0.73046875, + "learning_rate": 8.485657104736491e-05, + "loss": 0.6471, + "step": 1272 + }, + { + "epoch": 0.17, + "grad_norm": 0.73828125, + "learning_rate": 8.492328218812542e-05, + "loss": 0.9369, + "step": 1273 + }, + { + "epoch": 0.17, + "grad_norm": 0.84375, + "learning_rate": 8.498999332888592e-05, + "loss": 0.9124, + "step": 1274 + }, + { + "epoch": 0.17, + "grad_norm": 0.82421875, + "learning_rate": 8.505670446964643e-05, + "loss": 0.6939, + "step": 1275 + }, + { + "epoch": 0.17, + "grad_norm": 1.0078125, + "learning_rate": 8.512341561040695e-05, + "loss": 0.8669, + "step": 1276 + }, + { + "epoch": 0.17, + "grad_norm": 0.7890625, + "learning_rate": 8.519012675116745e-05, + "loss": 0.7211, + "step": 1277 + }, + { + "epoch": 0.17, + "grad_norm": 0.875, + "learning_rate": 8.525683789192796e-05, + "loss": 0.6569, + "step": 1278 + }, + { + "epoch": 0.17, + "grad_norm": 1.15625, + "learning_rate": 8.532354903268846e-05, + "loss": 0.8716, + "step": 1279 + }, + { + "epoch": 0.17, + "grad_norm": 0.85546875, + "learning_rate": 8.539026017344897e-05, + "loss": 0.7919, + "step": 1280 + }, + { + "epoch": 0.17, + "grad_norm": 1.109375, + "learning_rate": 8.545697131420947e-05, + "loss": 0.8241, + "step": 1281 + }, + { + "epoch": 0.17, + "grad_norm": 0.78125, + "learning_rate": 8.552368245496999e-05, + "loss": 0.9754, + "step": 1282 + }, + { + "epoch": 0.17, + "grad_norm": 0.85546875, + "learning_rate": 8.559039359573048e-05, + "loss": 0.6723, + "step": 1283 + }, + { + "epoch": 0.17, + "grad_norm": 0.71484375, + "learning_rate": 8.5657104736491e-05, + "loss": 0.5687, + "step": 1284 + }, + { + "epoch": 0.17, + "grad_norm": 0.671875, + "learning_rate": 8.57238158772515e-05, + "loss": 0.8536, + "step": 1285 + }, + { + "epoch": 0.17, + "grad_norm": 0.8203125, + "learning_rate": 8.579052701801201e-05, + "loss": 0.5856, + "step": 1286 + }, + { + "epoch": 0.17, + "grad_norm": 0.80859375, + "learning_rate": 8.585723815877253e-05, + "loss": 0.8239, + "step": 1287 + }, + { + "epoch": 0.17, + "grad_norm": 1.0703125, + "learning_rate": 8.592394929953302e-05, + "loss": 0.6672, + "step": 1288 + }, + { + "epoch": 0.17, + "grad_norm": 0.796875, + "learning_rate": 8.599066044029354e-05, + "loss": 0.816, + "step": 1289 + }, + { + "epoch": 0.17, + "grad_norm": 1.046875, + "learning_rate": 8.605737158105403e-05, + "loss": 0.8372, + "step": 1290 + }, + { + "epoch": 0.17, + "grad_norm": 0.82421875, + "learning_rate": 8.612408272181455e-05, + "loss": 0.9088, + "step": 1291 + }, + { + "epoch": 0.17, + "grad_norm": 1.0625, + "learning_rate": 8.619079386257506e-05, + "loss": 0.881, + "step": 1292 + }, + { + "epoch": 0.17, + "grad_norm": 1.5234375, + "learning_rate": 8.625750500333556e-05, + "loss": 0.9926, + "step": 1293 + }, + { + "epoch": 0.17, + "grad_norm": 1.0390625, + "learning_rate": 8.632421614409607e-05, + "loss": 0.8405, + "step": 1294 + }, + { + "epoch": 0.17, + "grad_norm": 0.859375, + "learning_rate": 8.639092728485657e-05, + "loss": 0.779, + "step": 1295 + }, + { + "epoch": 0.17, + "grad_norm": 0.76953125, + "learning_rate": 8.645763842561708e-05, + "loss": 0.7548, + "step": 1296 + }, + { + "epoch": 0.17, + "grad_norm": 0.68359375, + "learning_rate": 8.65243495663776e-05, + "loss": 0.9113, + "step": 1297 + }, + { + "epoch": 0.17, + "grad_norm": 0.734375, + "learning_rate": 8.65910607071381e-05, + "loss": 0.7947, + "step": 1298 + }, + { + "epoch": 0.17, + "grad_norm": 0.81640625, + "learning_rate": 8.66577718478986e-05, + "loss": 0.7748, + "step": 1299 + }, + { + "epoch": 0.17, + "grad_norm": 0.69140625, + "learning_rate": 8.672448298865911e-05, + "loss": 0.887, + "step": 1300 + }, + { + "epoch": 0.17, + "grad_norm": 0.875, + "learning_rate": 8.679119412941961e-05, + "loss": 0.8959, + "step": 1301 + }, + { + "epoch": 0.17, + "grad_norm": 0.71484375, + "learning_rate": 8.685790527018013e-05, + "loss": 0.8286, + "step": 1302 + }, + { + "epoch": 0.17, + "grad_norm": 0.71484375, + "learning_rate": 8.692461641094062e-05, + "loss": 0.6221, + "step": 1303 + }, + { + "epoch": 0.17, + "grad_norm": 0.921875, + "learning_rate": 8.699132755170114e-05, + "loss": 0.8427, + "step": 1304 + }, + { + "epoch": 0.17, + "grad_norm": 0.796875, + "learning_rate": 8.705803869246163e-05, + "loss": 1.0579, + "step": 1305 + }, + { + "epoch": 0.17, + "grad_norm": 0.9375, + "learning_rate": 8.712474983322215e-05, + "loss": 0.8454, + "step": 1306 + }, + { + "epoch": 0.17, + "grad_norm": 0.765625, + "learning_rate": 8.719146097398266e-05, + "loss": 0.7443, + "step": 1307 + }, + { + "epoch": 0.17, + "grad_norm": 0.90625, + "learning_rate": 8.725817211474316e-05, + "loss": 0.9227, + "step": 1308 + }, + { + "epoch": 0.17, + "grad_norm": 0.80078125, + "learning_rate": 8.732488325550368e-05, + "loss": 0.7563, + "step": 1309 + }, + { + "epoch": 0.17, + "grad_norm": 0.73046875, + "learning_rate": 8.739159439626417e-05, + "loss": 0.9052, + "step": 1310 + }, + { + "epoch": 0.17, + "grad_norm": 1.078125, + "learning_rate": 8.745830553702469e-05, + "loss": 1.0238, + "step": 1311 + }, + { + "epoch": 0.18, + "grad_norm": 0.7734375, + "learning_rate": 8.75250166777852e-05, + "loss": 0.9656, + "step": 1312 + }, + { + "epoch": 0.18, + "grad_norm": 0.7109375, + "learning_rate": 8.75917278185457e-05, + "loss": 0.7842, + "step": 1313 + }, + { + "epoch": 0.18, + "grad_norm": 1.0625, + "learning_rate": 8.765843895930621e-05, + "loss": 1.0508, + "step": 1314 + }, + { + "epoch": 0.18, + "grad_norm": 0.85546875, + "learning_rate": 8.772515010006671e-05, + "loss": 0.7647, + "step": 1315 + }, + { + "epoch": 0.18, + "grad_norm": 0.94140625, + "learning_rate": 8.779186124082722e-05, + "loss": 1.0394, + "step": 1316 + }, + { + "epoch": 0.18, + "grad_norm": 0.703125, + "learning_rate": 8.785857238158774e-05, + "loss": 0.9752, + "step": 1317 + }, + { + "epoch": 0.18, + "grad_norm": 0.81640625, + "learning_rate": 8.792528352234824e-05, + "loss": 1.0268, + "step": 1318 + }, + { + "epoch": 0.18, + "grad_norm": 0.67578125, + "learning_rate": 8.799199466310875e-05, + "loss": 0.7894, + "step": 1319 + }, + { + "epoch": 0.18, + "grad_norm": 0.8203125, + "learning_rate": 8.805870580386925e-05, + "loss": 0.8324, + "step": 1320 + }, + { + "epoch": 0.18, + "grad_norm": 0.703125, + "learning_rate": 8.812541694462976e-05, + "loss": 0.6159, + "step": 1321 + }, + { + "epoch": 0.18, + "grad_norm": 0.703125, + "learning_rate": 8.819212808539027e-05, + "loss": 0.8431, + "step": 1322 + }, + { + "epoch": 0.18, + "grad_norm": 0.765625, + "learning_rate": 8.825883922615077e-05, + "loss": 0.7975, + "step": 1323 + }, + { + "epoch": 0.18, + "grad_norm": 0.9140625, + "learning_rate": 8.832555036691128e-05, + "loss": 0.512, + "step": 1324 + }, + { + "epoch": 0.18, + "grad_norm": 0.9140625, + "learning_rate": 8.839226150767178e-05, + "loss": 0.8294, + "step": 1325 + }, + { + "epoch": 0.18, + "grad_norm": 0.99609375, + "learning_rate": 8.84589726484323e-05, + "loss": 0.5952, + "step": 1326 + }, + { + "epoch": 0.18, + "grad_norm": 0.70703125, + "learning_rate": 8.85256837891928e-05, + "loss": 0.8219, + "step": 1327 + }, + { + "epoch": 0.18, + "grad_norm": 1.1953125, + "learning_rate": 8.85923949299533e-05, + "loss": 1.29, + "step": 1328 + }, + { + "epoch": 0.18, + "grad_norm": 0.875, + "learning_rate": 8.865910607071381e-05, + "loss": 0.7574, + "step": 1329 + }, + { + "epoch": 0.18, + "grad_norm": 0.9453125, + "learning_rate": 8.872581721147431e-05, + "loss": 0.7578, + "step": 1330 + }, + { + "epoch": 0.18, + "grad_norm": 0.82421875, + "learning_rate": 8.879252835223483e-05, + "loss": 0.7533, + "step": 1331 + }, + { + "epoch": 0.18, + "grad_norm": 0.80859375, + "learning_rate": 8.885923949299534e-05, + "loss": 0.8032, + "step": 1332 + }, + { + "epoch": 0.18, + "grad_norm": 0.6875, + "learning_rate": 8.892595063375584e-05, + "loss": 0.7726, + "step": 1333 + }, + { + "epoch": 0.18, + "grad_norm": 0.8671875, + "learning_rate": 8.899266177451635e-05, + "loss": 0.985, + "step": 1334 + }, + { + "epoch": 0.18, + "grad_norm": 0.8515625, + "learning_rate": 8.905937291527685e-05, + "loss": 0.7357, + "step": 1335 + }, + { + "epoch": 0.18, + "grad_norm": 0.9609375, + "learning_rate": 8.912608405603736e-05, + "loss": 0.9152, + "step": 1336 + }, + { + "epoch": 0.18, + "grad_norm": 0.8359375, + "learning_rate": 8.919279519679788e-05, + "loss": 0.5814, + "step": 1337 + }, + { + "epoch": 0.18, + "grad_norm": 0.921875, + "learning_rate": 8.925950633755837e-05, + "loss": 0.8126, + "step": 1338 + }, + { + "epoch": 0.18, + "grad_norm": 0.90625, + "learning_rate": 8.932621747831889e-05, + "loss": 0.7539, + "step": 1339 + }, + { + "epoch": 0.18, + "grad_norm": 0.921875, + "learning_rate": 8.939292861907939e-05, + "loss": 0.7339, + "step": 1340 + }, + { + "epoch": 0.18, + "grad_norm": 0.86328125, + "learning_rate": 8.94596397598399e-05, + "loss": 0.5493, + "step": 1341 + }, + { + "epoch": 0.18, + "grad_norm": 0.75390625, + "learning_rate": 8.95263509006004e-05, + "loss": 0.656, + "step": 1342 + }, + { + "epoch": 0.18, + "grad_norm": 0.87890625, + "learning_rate": 8.959306204136091e-05, + "loss": 0.6483, + "step": 1343 + }, + { + "epoch": 0.18, + "grad_norm": 0.96484375, + "learning_rate": 8.965977318212143e-05, + "loss": 0.6586, + "step": 1344 + }, + { + "epoch": 0.18, + "grad_norm": 0.8359375, + "learning_rate": 8.972648432288192e-05, + "loss": 0.7392, + "step": 1345 + }, + { + "epoch": 0.18, + "grad_norm": 0.66796875, + "learning_rate": 8.979319546364244e-05, + "loss": 0.9477, + "step": 1346 + }, + { + "epoch": 0.18, + "grad_norm": 0.8828125, + "learning_rate": 8.985990660440294e-05, + "loss": 0.6344, + "step": 1347 + }, + { + "epoch": 0.18, + "grad_norm": 0.8359375, + "learning_rate": 8.992661774516345e-05, + "loss": 0.8588, + "step": 1348 + }, + { + "epoch": 0.18, + "grad_norm": 0.8515625, + "learning_rate": 8.999332888592395e-05, + "loss": 0.8164, + "step": 1349 + }, + { + "epoch": 0.18, + "grad_norm": 0.8515625, + "learning_rate": 9.006004002668446e-05, + "loss": 0.9482, + "step": 1350 + }, + { + "epoch": 0.18, + "grad_norm": 0.77734375, + "learning_rate": 9.012675116744496e-05, + "loss": 0.8779, + "step": 1351 + }, + { + "epoch": 0.18, + "grad_norm": 1.0, + "learning_rate": 9.019346230820548e-05, + "loss": 0.8374, + "step": 1352 + }, + { + "epoch": 0.18, + "grad_norm": 0.7734375, + "learning_rate": 9.026017344896598e-05, + "loss": 0.7101, + "step": 1353 + }, + { + "epoch": 0.18, + "grad_norm": 0.95703125, + "learning_rate": 9.032688458972649e-05, + "loss": 0.5694, + "step": 1354 + }, + { + "epoch": 0.18, + "grad_norm": 1.0078125, + "learning_rate": 9.0393595730487e-05, + "loss": 0.8657, + "step": 1355 + }, + { + "epoch": 0.18, + "grad_norm": 0.9375, + "learning_rate": 9.04603068712475e-05, + "loss": 0.7285, + "step": 1356 + }, + { + "epoch": 0.18, + "grad_norm": 0.703125, + "learning_rate": 9.052701801200802e-05, + "loss": 0.6435, + "step": 1357 + }, + { + "epoch": 0.18, + "grad_norm": 0.88671875, + "learning_rate": 9.059372915276851e-05, + "loss": 0.6268, + "step": 1358 + }, + { + "epoch": 0.18, + "grad_norm": 1.203125, + "learning_rate": 9.066044029352903e-05, + "loss": 0.6726, + "step": 1359 + }, + { + "epoch": 0.18, + "grad_norm": 0.7734375, + "learning_rate": 9.072715143428952e-05, + "loss": 0.9662, + "step": 1360 + }, + { + "epoch": 0.18, + "grad_norm": 0.765625, + "learning_rate": 9.079386257505004e-05, + "loss": 0.8455, + "step": 1361 + }, + { + "epoch": 0.18, + "grad_norm": 0.87109375, + "learning_rate": 9.086057371581054e-05, + "loss": 0.7774, + "step": 1362 + }, + { + "epoch": 0.18, + "grad_norm": 0.84375, + "learning_rate": 9.092728485657105e-05, + "loss": 1.073, + "step": 1363 + }, + { + "epoch": 0.18, + "grad_norm": 0.7578125, + "learning_rate": 9.099399599733155e-05, + "loss": 0.8334, + "step": 1364 + }, + { + "epoch": 0.18, + "grad_norm": 0.88671875, + "learning_rate": 9.106070713809206e-05, + "loss": 0.8232, + "step": 1365 + }, + { + "epoch": 0.18, + "grad_norm": 1.078125, + "learning_rate": 9.112741827885258e-05, + "loss": 0.6841, + "step": 1366 + }, + { + "epoch": 0.18, + "grad_norm": 0.80078125, + "learning_rate": 9.119412941961308e-05, + "loss": 0.8807, + "step": 1367 + }, + { + "epoch": 0.18, + "grad_norm": 0.70703125, + "learning_rate": 9.126084056037359e-05, + "loss": 0.9032, + "step": 1368 + }, + { + "epoch": 0.18, + "grad_norm": 0.7890625, + "learning_rate": 9.132755170113409e-05, + "loss": 0.8198, + "step": 1369 + }, + { + "epoch": 0.18, + "grad_norm": 0.82421875, + "learning_rate": 9.13942628418946e-05, + "loss": 0.8403, + "step": 1370 + }, + { + "epoch": 0.18, + "grad_norm": 0.7265625, + "learning_rate": 9.14609739826551e-05, + "loss": 1.1633, + "step": 1371 + }, + { + "epoch": 0.18, + "grad_norm": 0.90625, + "learning_rate": 9.152768512341562e-05, + "loss": 0.5757, + "step": 1372 + }, + { + "epoch": 0.18, + "grad_norm": 0.7734375, + "learning_rate": 9.159439626417611e-05, + "loss": 0.7583, + "step": 1373 + }, + { + "epoch": 0.18, + "grad_norm": 0.86328125, + "learning_rate": 9.166110740493663e-05, + "loss": 0.6436, + "step": 1374 + }, + { + "epoch": 0.18, + "grad_norm": 1.2265625, + "learning_rate": 9.172781854569714e-05, + "loss": 0.914, + "step": 1375 + }, + { + "epoch": 0.18, + "grad_norm": 0.99609375, + "learning_rate": 9.179452968645764e-05, + "loss": 1.1867, + "step": 1376 + }, + { + "epoch": 0.18, + "grad_norm": 0.8125, + "learning_rate": 9.186124082721816e-05, + "loss": 1.0058, + "step": 1377 + }, + { + "epoch": 0.18, + "grad_norm": 0.78125, + "learning_rate": 9.192795196797865e-05, + "loss": 0.8712, + "step": 1378 + }, + { + "epoch": 0.18, + "grad_norm": 0.91015625, + "learning_rate": 9.199466310873917e-05, + "loss": 0.9485, + "step": 1379 + }, + { + "epoch": 0.18, + "grad_norm": 0.8359375, + "learning_rate": 9.206137424949966e-05, + "loss": 0.9245, + "step": 1380 + }, + { + "epoch": 0.18, + "grad_norm": 0.7421875, + "learning_rate": 9.212808539026018e-05, + "loss": 0.7997, + "step": 1381 + }, + { + "epoch": 0.18, + "grad_norm": 0.84375, + "learning_rate": 9.219479653102069e-05, + "loss": 0.8351, + "step": 1382 + }, + { + "epoch": 0.18, + "grad_norm": 1.2265625, + "learning_rate": 9.226150767178119e-05, + "loss": 0.7822, + "step": 1383 + }, + { + "epoch": 0.18, + "grad_norm": 0.81640625, + "learning_rate": 9.23282188125417e-05, + "loss": 1.0103, + "step": 1384 + }, + { + "epoch": 0.18, + "grad_norm": 1.015625, + "learning_rate": 9.23949299533022e-05, + "loss": 0.6373, + "step": 1385 + }, + { + "epoch": 0.18, + "grad_norm": 0.78125, + "learning_rate": 9.246164109406272e-05, + "loss": 0.7281, + "step": 1386 + }, + { + "epoch": 0.19, + "grad_norm": 0.984375, + "learning_rate": 9.252835223482322e-05, + "loss": 1.1251, + "step": 1387 + }, + { + "epoch": 0.19, + "grad_norm": 0.66015625, + "learning_rate": 9.259506337558373e-05, + "loss": 0.7029, + "step": 1388 + }, + { + "epoch": 0.19, + "grad_norm": 0.83984375, + "learning_rate": 9.266177451634423e-05, + "loss": 0.8684, + "step": 1389 + }, + { + "epoch": 0.19, + "grad_norm": 0.71875, + "learning_rate": 9.272848565710474e-05, + "loss": 0.6473, + "step": 1390 + }, + { + "epoch": 0.19, + "grad_norm": 0.70703125, + "learning_rate": 9.279519679786524e-05, + "loss": 0.8423, + "step": 1391 + }, + { + "epoch": 0.19, + "grad_norm": 0.90234375, + "learning_rate": 9.286190793862576e-05, + "loss": 0.6332, + "step": 1392 + }, + { + "epoch": 0.19, + "grad_norm": 1.171875, + "learning_rate": 9.292861907938625e-05, + "loss": 1.0014, + "step": 1393 + }, + { + "epoch": 0.19, + "grad_norm": 0.84375, + "learning_rate": 9.299533022014677e-05, + "loss": 0.6182, + "step": 1394 + }, + { + "epoch": 0.19, + "grad_norm": 0.7578125, + "learning_rate": 9.306204136090726e-05, + "loss": 0.6033, + "step": 1395 + }, + { + "epoch": 0.19, + "grad_norm": 0.70703125, + "learning_rate": 9.312875250166778e-05, + "loss": 0.7766, + "step": 1396 + }, + { + "epoch": 0.19, + "grad_norm": 0.94921875, + "learning_rate": 9.319546364242829e-05, + "loss": 0.7988, + "step": 1397 + }, + { + "epoch": 0.19, + "grad_norm": 0.98828125, + "learning_rate": 9.326217478318879e-05, + "loss": 0.7264, + "step": 1398 + }, + { + "epoch": 0.19, + "grad_norm": 0.83984375, + "learning_rate": 9.332888592394931e-05, + "loss": 0.661, + "step": 1399 + }, + { + "epoch": 0.19, + "grad_norm": 0.91015625, + "learning_rate": 9.33955970647098e-05, + "loss": 1.0412, + "step": 1400 + }, + { + "epoch": 0.19, + "grad_norm": 0.828125, + "learning_rate": 9.346230820547032e-05, + "loss": 0.8451, + "step": 1401 + }, + { + "epoch": 0.19, + "grad_norm": 0.80078125, + "learning_rate": 9.352901934623083e-05, + "loss": 0.8753, + "step": 1402 + }, + { + "epoch": 0.19, + "grad_norm": 0.7421875, + "learning_rate": 9.359573048699133e-05, + "loss": 0.9608, + "step": 1403 + }, + { + "epoch": 0.19, + "grad_norm": 0.63671875, + "learning_rate": 9.366244162775184e-05, + "loss": 0.7826, + "step": 1404 + }, + { + "epoch": 0.19, + "grad_norm": 0.8984375, + "learning_rate": 9.372915276851234e-05, + "loss": 0.6541, + "step": 1405 + }, + { + "epoch": 0.19, + "grad_norm": 0.98828125, + "learning_rate": 9.379586390927285e-05, + "loss": 0.6287, + "step": 1406 + }, + { + "epoch": 0.19, + "grad_norm": 0.8671875, + "learning_rate": 9.386257505003337e-05, + "loss": 1.0533, + "step": 1407 + }, + { + "epoch": 0.19, + "grad_norm": 1.2109375, + "learning_rate": 9.392928619079387e-05, + "loss": 0.5335, + "step": 1408 + }, + { + "epoch": 0.19, + "grad_norm": 0.91796875, + "learning_rate": 9.399599733155438e-05, + "loss": 0.7855, + "step": 1409 + }, + { + "epoch": 0.19, + "grad_norm": 1.0390625, + "learning_rate": 9.406270847231488e-05, + "loss": 0.759, + "step": 1410 + }, + { + "epoch": 0.19, + "grad_norm": 0.94921875, + "learning_rate": 9.412941961307539e-05, + "loss": 0.8287, + "step": 1411 + }, + { + "epoch": 0.19, + "grad_norm": 0.89453125, + "learning_rate": 9.41961307538359e-05, + "loss": 0.9073, + "step": 1412 + }, + { + "epoch": 0.19, + "grad_norm": 0.921875, + "learning_rate": 9.42628418945964e-05, + "loss": 0.7335, + "step": 1413 + }, + { + "epoch": 0.19, + "grad_norm": 1.0234375, + "learning_rate": 9.432955303535691e-05, + "loss": 0.8791, + "step": 1414 + }, + { + "epoch": 0.19, + "grad_norm": 0.80859375, + "learning_rate": 9.439626417611742e-05, + "loss": 0.5808, + "step": 1415 + }, + { + "epoch": 0.19, + "grad_norm": 0.76171875, + "learning_rate": 9.446297531687792e-05, + "loss": 0.9679, + "step": 1416 + }, + { + "epoch": 0.19, + "grad_norm": 0.83984375, + "learning_rate": 9.452968645763843e-05, + "loss": 1.0712, + "step": 1417 + }, + { + "epoch": 0.19, + "grad_norm": 1.1328125, + "learning_rate": 9.459639759839893e-05, + "loss": 0.8817, + "step": 1418 + }, + { + "epoch": 0.19, + "grad_norm": 1.015625, + "learning_rate": 9.466310873915944e-05, + "loss": 0.8018, + "step": 1419 + }, + { + "epoch": 0.19, + "grad_norm": 1.1796875, + "learning_rate": 9.472981987991996e-05, + "loss": 0.5115, + "step": 1420 + }, + { + "epoch": 0.19, + "grad_norm": 0.78125, + "learning_rate": 9.479653102068046e-05, + "loss": 1.1526, + "step": 1421 + }, + { + "epoch": 0.19, + "grad_norm": 0.6953125, + "learning_rate": 9.486324216144097e-05, + "loss": 0.8906, + "step": 1422 + }, + { + "epoch": 0.19, + "grad_norm": 0.63671875, + "learning_rate": 9.492995330220147e-05, + "loss": 0.7872, + "step": 1423 + }, + { + "epoch": 0.19, + "grad_norm": 0.796875, + "learning_rate": 9.499666444296198e-05, + "loss": 0.828, + "step": 1424 + }, + { + "epoch": 0.19, + "grad_norm": 1.2109375, + "learning_rate": 9.50633755837225e-05, + "loss": 0.7831, + "step": 1425 + }, + { + "epoch": 0.19, + "grad_norm": 0.83203125, + "learning_rate": 9.513008672448299e-05, + "loss": 1.0651, + "step": 1426 + }, + { + "epoch": 0.19, + "grad_norm": 0.87890625, + "learning_rate": 9.51967978652435e-05, + "loss": 0.6891, + "step": 1427 + }, + { + "epoch": 0.19, + "grad_norm": 0.76953125, + "learning_rate": 9.5263509006004e-05, + "loss": 0.9444, + "step": 1428 + }, + { + "epoch": 0.19, + "grad_norm": 0.72265625, + "learning_rate": 9.533022014676452e-05, + "loss": 1.0597, + "step": 1429 + }, + { + "epoch": 0.19, + "grad_norm": 0.79296875, + "learning_rate": 9.539693128752502e-05, + "loss": 0.8569, + "step": 1430 + }, + { + "epoch": 0.19, + "grad_norm": 0.8828125, + "learning_rate": 9.546364242828553e-05, + "loss": 0.7636, + "step": 1431 + }, + { + "epoch": 0.19, + "grad_norm": 0.875, + "learning_rate": 9.553035356904603e-05, + "loss": 0.7063, + "step": 1432 + }, + { + "epoch": 0.19, + "grad_norm": 0.8359375, + "learning_rate": 9.559706470980654e-05, + "loss": 0.9141, + "step": 1433 + }, + { + "epoch": 0.19, + "grad_norm": 0.90625, + "learning_rate": 9.566377585056706e-05, + "loss": 0.8461, + "step": 1434 + }, + { + "epoch": 0.19, + "grad_norm": 0.91015625, + "learning_rate": 9.573048699132756e-05, + "loss": 0.6476, + "step": 1435 + }, + { + "epoch": 0.19, + "grad_norm": 0.6640625, + "learning_rate": 9.579719813208807e-05, + "loss": 0.6974, + "step": 1436 + }, + { + "epoch": 0.19, + "grad_norm": 0.76953125, + "learning_rate": 9.586390927284857e-05, + "loss": 0.7584, + "step": 1437 + }, + { + "epoch": 0.19, + "grad_norm": 0.71875, + "learning_rate": 9.593062041360908e-05, + "loss": 0.8803, + "step": 1438 + }, + { + "epoch": 0.19, + "grad_norm": 0.71484375, + "learning_rate": 9.599733155436958e-05, + "loss": 0.812, + "step": 1439 + }, + { + "epoch": 0.19, + "grad_norm": 1.0234375, + "learning_rate": 9.60640426951301e-05, + "loss": 0.9237, + "step": 1440 + }, + { + "epoch": 0.19, + "grad_norm": 0.953125, + "learning_rate": 9.613075383589059e-05, + "loss": 0.8534, + "step": 1441 + }, + { + "epoch": 0.19, + "grad_norm": 0.8828125, + "learning_rate": 9.619746497665111e-05, + "loss": 0.6692, + "step": 1442 + }, + { + "epoch": 0.19, + "grad_norm": 0.90625, + "learning_rate": 9.626417611741161e-05, + "loss": 0.8875, + "step": 1443 + }, + { + "epoch": 0.19, + "grad_norm": 0.8359375, + "learning_rate": 9.633088725817212e-05, + "loss": 1.0255, + "step": 1444 + }, + { + "epoch": 0.19, + "grad_norm": 0.99609375, + "learning_rate": 9.639759839893264e-05, + "loss": 0.7583, + "step": 1445 + }, + { + "epoch": 0.19, + "grad_norm": 0.76953125, + "learning_rate": 9.646430953969313e-05, + "loss": 0.8068, + "step": 1446 + }, + { + "epoch": 0.19, + "grad_norm": 0.828125, + "learning_rate": 9.653102068045365e-05, + "loss": 0.6694, + "step": 1447 + }, + { + "epoch": 0.19, + "grad_norm": 0.85546875, + "learning_rate": 9.659773182121414e-05, + "loss": 1.0339, + "step": 1448 + }, + { + "epoch": 0.19, + "grad_norm": 0.85546875, + "learning_rate": 9.666444296197466e-05, + "loss": 0.7402, + "step": 1449 + }, + { + "epoch": 0.19, + "grad_norm": 0.78125, + "learning_rate": 9.673115410273516e-05, + "loss": 0.7428, + "step": 1450 + }, + { + "epoch": 0.19, + "grad_norm": 0.96875, + "learning_rate": 9.679786524349567e-05, + "loss": 0.8305, + "step": 1451 + }, + { + "epoch": 0.19, + "grad_norm": 0.921875, + "learning_rate": 9.686457638425617e-05, + "loss": 0.8569, + "step": 1452 + }, + { + "epoch": 0.19, + "grad_norm": 0.84765625, + "learning_rate": 9.693128752501668e-05, + "loss": 0.7124, + "step": 1453 + }, + { + "epoch": 0.19, + "grad_norm": 0.796875, + "learning_rate": 9.699799866577718e-05, + "loss": 0.4611, + "step": 1454 + }, + { + "epoch": 0.19, + "grad_norm": 0.78515625, + "learning_rate": 9.70647098065377e-05, + "loss": 0.7509, + "step": 1455 + }, + { + "epoch": 0.19, + "grad_norm": 0.76953125, + "learning_rate": 9.713142094729821e-05, + "loss": 0.6017, + "step": 1456 + }, + { + "epoch": 0.19, + "grad_norm": 0.92578125, + "learning_rate": 9.719813208805871e-05, + "loss": 0.7859, + "step": 1457 + }, + { + "epoch": 0.19, + "grad_norm": 0.68359375, + "learning_rate": 9.726484322881922e-05, + "loss": 0.8656, + "step": 1458 + }, + { + "epoch": 0.19, + "grad_norm": 1.0546875, + "learning_rate": 9.733155436957972e-05, + "loss": 1.0778, + "step": 1459 + }, + { + "epoch": 0.19, + "grad_norm": 0.6953125, + "learning_rate": 9.739826551034024e-05, + "loss": 0.7005, + "step": 1460 + }, + { + "epoch": 0.19, + "grad_norm": 0.6796875, + "learning_rate": 9.746497665110073e-05, + "loss": 0.7232, + "step": 1461 + }, + { + "epoch": 0.2, + "grad_norm": 1.375, + "learning_rate": 9.753168779186125e-05, + "loss": 0.9293, + "step": 1462 + }, + { + "epoch": 0.2, + "grad_norm": 0.859375, + "learning_rate": 9.759839893262174e-05, + "loss": 0.6269, + "step": 1463 + }, + { + "epoch": 0.2, + "grad_norm": 0.94921875, + "learning_rate": 9.766511007338226e-05, + "loss": 0.895, + "step": 1464 + }, + { + "epoch": 0.2, + "grad_norm": 0.80859375, + "learning_rate": 9.773182121414277e-05, + "loss": 0.9033, + "step": 1465 + }, + { + "epoch": 0.2, + "grad_norm": 0.71875, + "learning_rate": 9.779853235490327e-05, + "loss": 0.7611, + "step": 1466 + }, + { + "epoch": 0.2, + "grad_norm": 0.70703125, + "learning_rate": 9.786524349566379e-05, + "loss": 0.7913, + "step": 1467 + }, + { + "epoch": 0.2, + "grad_norm": 0.75390625, + "learning_rate": 9.793195463642428e-05, + "loss": 0.8891, + "step": 1468 + }, + { + "epoch": 0.2, + "grad_norm": 0.6875, + "learning_rate": 9.79986657771848e-05, + "loss": 0.9264, + "step": 1469 + }, + { + "epoch": 0.2, + "grad_norm": 1.0, + "learning_rate": 9.80653769179453e-05, + "loss": 0.4757, + "step": 1470 + }, + { + "epoch": 0.2, + "grad_norm": 0.8046875, + "learning_rate": 9.813208805870581e-05, + "loss": 1.0025, + "step": 1471 + }, + { + "epoch": 0.2, + "grad_norm": 0.7890625, + "learning_rate": 9.819879919946631e-05, + "loss": 0.816, + "step": 1472 + }, + { + "epoch": 0.2, + "grad_norm": 0.765625, + "learning_rate": 9.826551034022682e-05, + "loss": 1.1158, + "step": 1473 + }, + { + "epoch": 0.2, + "grad_norm": 0.83984375, + "learning_rate": 9.833222148098732e-05, + "loss": 0.7987, + "step": 1474 + }, + { + "epoch": 0.2, + "grad_norm": 0.85546875, + "learning_rate": 9.839893262174784e-05, + "loss": 0.8933, + "step": 1475 + }, + { + "epoch": 0.2, + "grad_norm": 0.7890625, + "learning_rate": 9.846564376250835e-05, + "loss": 0.8094, + "step": 1476 + }, + { + "epoch": 0.2, + "grad_norm": 0.84375, + "learning_rate": 9.853235490326885e-05, + "loss": 0.9795, + "step": 1477 + }, + { + "epoch": 0.2, + "grad_norm": 1.265625, + "learning_rate": 9.859906604402936e-05, + "loss": 0.8539, + "step": 1478 + }, + { + "epoch": 0.2, + "grad_norm": 0.84375, + "learning_rate": 9.866577718478986e-05, + "loss": 0.9943, + "step": 1479 + }, + { + "epoch": 0.2, + "grad_norm": 0.73828125, + "learning_rate": 9.873248832555038e-05, + "loss": 0.7313, + "step": 1480 + }, + { + "epoch": 0.2, + "grad_norm": 0.87890625, + "learning_rate": 9.879919946631087e-05, + "loss": 0.6955, + "step": 1481 + }, + { + "epoch": 0.2, + "grad_norm": 0.859375, + "learning_rate": 9.886591060707139e-05, + "loss": 1.3029, + "step": 1482 + }, + { + "epoch": 0.2, + "grad_norm": 1.0078125, + "learning_rate": 9.893262174783188e-05, + "loss": 0.6544, + "step": 1483 + }, + { + "epoch": 0.2, + "grad_norm": 0.94140625, + "learning_rate": 9.89993328885924e-05, + "loss": 1.0204, + "step": 1484 + }, + { + "epoch": 0.2, + "grad_norm": 0.9140625, + "learning_rate": 9.906604402935291e-05, + "loss": 0.7041, + "step": 1485 + }, + { + "epoch": 0.2, + "grad_norm": 0.8984375, + "learning_rate": 9.913275517011341e-05, + "loss": 0.9156, + "step": 1486 + }, + { + "epoch": 0.2, + "grad_norm": 1.3984375, + "learning_rate": 9.919946631087392e-05, + "loss": 0.8264, + "step": 1487 + }, + { + "epoch": 0.2, + "grad_norm": 0.6953125, + "learning_rate": 9.926617745163442e-05, + "loss": 0.6242, + "step": 1488 + }, + { + "epoch": 0.2, + "grad_norm": 0.88671875, + "learning_rate": 9.933288859239494e-05, + "loss": 0.9644, + "step": 1489 + }, + { + "epoch": 0.2, + "grad_norm": 1.0390625, + "learning_rate": 9.939959973315545e-05, + "loss": 0.7773, + "step": 1490 + }, + { + "epoch": 0.2, + "grad_norm": 0.765625, + "learning_rate": 9.946631087391595e-05, + "loss": 0.8773, + "step": 1491 + }, + { + "epoch": 0.2, + "grad_norm": 1.125, + "learning_rate": 9.953302201467646e-05, + "loss": 1.0474, + "step": 1492 + }, + { + "epoch": 0.2, + "grad_norm": 0.9140625, + "learning_rate": 9.959973315543696e-05, + "loss": 0.8696, + "step": 1493 + }, + { + "epoch": 0.2, + "grad_norm": 0.7421875, + "learning_rate": 9.966644429619747e-05, + "loss": 0.5974, + "step": 1494 + }, + { + "epoch": 0.2, + "grad_norm": 1.53125, + "learning_rate": 9.973315543695798e-05, + "loss": 1.0203, + "step": 1495 + }, + { + "epoch": 0.2, + "grad_norm": 0.7734375, + "learning_rate": 9.979986657771848e-05, + "loss": 1.2744, + "step": 1496 + }, + { + "epoch": 0.2, + "grad_norm": 0.97265625, + "learning_rate": 9.9866577718479e-05, + "loss": 0.9374, + "step": 1497 + }, + { + "epoch": 0.2, + "grad_norm": 0.77734375, + "learning_rate": 9.99332888592395e-05, + "loss": 0.694, + "step": 1498 + }, + { + "epoch": 0.2, + "grad_norm": 0.77734375, + "learning_rate": 0.0001, + "loss": 0.9647, + "step": 1499 + }, + { + "epoch": 0.2, + "grad_norm": 0.99609375, + "learning_rate": 0.00010006671114076051, + "loss": 0.775, + "step": 1500 + }, + { + "epoch": 0.2, + "grad_norm": 0.890625, + "learning_rate": 0.00010013342228152103, + "loss": 0.7487, + "step": 1501 + }, + { + "epoch": 0.2, + "grad_norm": 0.8828125, + "learning_rate": 0.00010020013342228152, + "loss": 0.7966, + "step": 1502 + }, + { + "epoch": 0.2, + "grad_norm": 0.7734375, + "learning_rate": 0.00010026684456304202, + "loss": 0.7132, + "step": 1503 + }, + { + "epoch": 0.2, + "grad_norm": 0.84375, + "learning_rate": 0.00010033355570380254, + "loss": 0.7612, + "step": 1504 + }, + { + "epoch": 0.2, + "grad_norm": 0.71875, + "learning_rate": 0.00010040026684456305, + "loss": 0.7221, + "step": 1505 + }, + { + "epoch": 0.2, + "grad_norm": 0.80859375, + "learning_rate": 0.00010046697798532357, + "loss": 0.394, + "step": 1506 + }, + { + "epoch": 0.2, + "grad_norm": 0.96875, + "learning_rate": 0.00010053368912608406, + "loss": 0.852, + "step": 1507 + }, + { + "epoch": 0.2, + "grad_norm": 1.0390625, + "learning_rate": 0.00010060040026684456, + "loss": 0.6678, + "step": 1508 + }, + { + "epoch": 0.2, + "grad_norm": 0.98046875, + "learning_rate": 0.00010066711140760507, + "loss": 1.1321, + "step": 1509 + }, + { + "epoch": 0.2, + "grad_norm": 0.85546875, + "learning_rate": 0.00010073382254836559, + "loss": 0.98, + "step": 1510 + }, + { + "epoch": 0.2, + "grad_norm": 0.8203125, + "learning_rate": 0.00010080053368912609, + "loss": 0.9795, + "step": 1511 + }, + { + "epoch": 0.2, + "grad_norm": 0.8203125, + "learning_rate": 0.00010086724482988658, + "loss": 0.7299, + "step": 1512 + }, + { + "epoch": 0.2, + "grad_norm": 0.81640625, + "learning_rate": 0.0001009339559706471, + "loss": 0.6399, + "step": 1513 + }, + { + "epoch": 0.2, + "grad_norm": 0.640625, + "learning_rate": 0.00010100066711140761, + "loss": 0.7466, + "step": 1514 + }, + { + "epoch": 0.2, + "grad_norm": 0.9140625, + "learning_rate": 0.00010106737825216813, + "loss": 0.9945, + "step": 1515 + }, + { + "epoch": 0.2, + "grad_norm": 0.765625, + "learning_rate": 0.00010113408939292863, + "loss": 0.9262, + "step": 1516 + }, + { + "epoch": 0.2, + "grad_norm": 0.734375, + "learning_rate": 0.00010120080053368912, + "loss": 0.7197, + "step": 1517 + }, + { + "epoch": 0.2, + "grad_norm": 0.90234375, + "learning_rate": 0.00010126751167444963, + "loss": 0.5532, + "step": 1518 + }, + { + "epoch": 0.2, + "grad_norm": 1.0, + "learning_rate": 0.00010133422281521015, + "loss": 0.7573, + "step": 1519 + }, + { + "epoch": 0.2, + "grad_norm": 0.81640625, + "learning_rate": 0.00010140093395597065, + "loss": 0.6684, + "step": 1520 + }, + { + "epoch": 0.2, + "grad_norm": 0.92578125, + "learning_rate": 0.00010146764509673117, + "loss": 0.6951, + "step": 1521 + }, + { + "epoch": 0.2, + "grad_norm": 0.734375, + "learning_rate": 0.00010153435623749166, + "loss": 0.7613, + "step": 1522 + }, + { + "epoch": 0.2, + "grad_norm": 0.9140625, + "learning_rate": 0.00010160106737825217, + "loss": 0.8219, + "step": 1523 + }, + { + "epoch": 0.2, + "grad_norm": 0.82421875, + "learning_rate": 0.00010166777851901268, + "loss": 0.8574, + "step": 1524 + }, + { + "epoch": 0.2, + "grad_norm": 0.859375, + "learning_rate": 0.00010173448965977319, + "loss": 0.4179, + "step": 1525 + }, + { + "epoch": 0.2, + "grad_norm": 0.8359375, + "learning_rate": 0.00010180120080053371, + "loss": 1.0226, + "step": 1526 + }, + { + "epoch": 0.2, + "grad_norm": 0.93359375, + "learning_rate": 0.00010186791194129419, + "loss": 1.1061, + "step": 1527 + }, + { + "epoch": 0.2, + "grad_norm": 0.671875, + "learning_rate": 0.0001019346230820547, + "loss": 0.5271, + "step": 1528 + }, + { + "epoch": 0.2, + "grad_norm": 0.99609375, + "learning_rate": 0.00010200133422281521, + "loss": 0.7754, + "step": 1529 + }, + { + "epoch": 0.2, + "grad_norm": 0.9453125, + "learning_rate": 0.00010206804536357573, + "loss": 0.8419, + "step": 1530 + }, + { + "epoch": 0.2, + "grad_norm": 0.83984375, + "learning_rate": 0.00010213475650433623, + "loss": 0.6336, + "step": 1531 + }, + { + "epoch": 0.2, + "grad_norm": 0.85546875, + "learning_rate": 0.00010220146764509673, + "loss": 0.7401, + "step": 1532 + }, + { + "epoch": 0.2, + "grad_norm": 0.67578125, + "learning_rate": 0.00010226817878585724, + "loss": 0.6822, + "step": 1533 + }, + { + "epoch": 0.2, + "grad_norm": 0.62109375, + "learning_rate": 0.00010233488992661775, + "loss": 0.7251, + "step": 1534 + }, + { + "epoch": 0.2, + "grad_norm": 0.765625, + "learning_rate": 0.00010240160106737827, + "loss": 0.6296, + "step": 1535 + }, + { + "epoch": 0.2, + "grad_norm": 0.890625, + "learning_rate": 0.00010246831220813877, + "loss": 1.1286, + "step": 1536 + }, + { + "epoch": 0.21, + "grad_norm": 0.73828125, + "learning_rate": 0.00010253502334889926, + "loss": 0.5024, + "step": 1537 + }, + { + "epoch": 0.21, + "grad_norm": 0.79296875, + "learning_rate": 0.00010260173448965977, + "loss": 0.6893, + "step": 1538 + }, + { + "epoch": 0.21, + "grad_norm": 0.8984375, + "learning_rate": 0.00010266844563042029, + "loss": 0.7655, + "step": 1539 + }, + { + "epoch": 0.21, + "grad_norm": 0.66015625, + "learning_rate": 0.00010273515677118079, + "loss": 0.688, + "step": 1540 + }, + { + "epoch": 0.21, + "grad_norm": 0.96484375, + "learning_rate": 0.00010280186791194131, + "loss": 0.7591, + "step": 1541 + }, + { + "epoch": 0.21, + "grad_norm": 0.82421875, + "learning_rate": 0.0001028685790527018, + "loss": 1.153, + "step": 1542 + }, + { + "epoch": 0.21, + "grad_norm": 0.7890625, + "learning_rate": 0.00010293529019346231, + "loss": 0.9639, + "step": 1543 + }, + { + "epoch": 0.21, + "grad_norm": 0.7421875, + "learning_rate": 0.00010300200133422283, + "loss": 0.9522, + "step": 1544 + }, + { + "epoch": 0.21, + "grad_norm": 0.76953125, + "learning_rate": 0.00010306871247498333, + "loss": 0.9462, + "step": 1545 + }, + { + "epoch": 0.21, + "grad_norm": 0.75, + "learning_rate": 0.00010313542361574384, + "loss": 0.8679, + "step": 1546 + }, + { + "epoch": 0.21, + "grad_norm": 1.0859375, + "learning_rate": 0.00010320213475650433, + "loss": 1.0244, + "step": 1547 + }, + { + "epoch": 0.21, + "grad_norm": 0.78515625, + "learning_rate": 0.00010326884589726485, + "loss": 0.9103, + "step": 1548 + }, + { + "epoch": 0.21, + "grad_norm": 0.8671875, + "learning_rate": 0.00010333555703802535, + "loss": 0.9605, + "step": 1549 + }, + { + "epoch": 0.21, + "grad_norm": 0.875, + "learning_rate": 0.00010340226817878587, + "loss": 0.9036, + "step": 1550 + }, + { + "epoch": 0.21, + "grad_norm": 0.76171875, + "learning_rate": 0.00010346897931954638, + "loss": 0.8852, + "step": 1551 + }, + { + "epoch": 0.21, + "grad_norm": 0.7578125, + "learning_rate": 0.00010353569046030687, + "loss": 0.8079, + "step": 1552 + }, + { + "epoch": 0.21, + "grad_norm": 0.7421875, + "learning_rate": 0.00010360240160106737, + "loss": 0.5998, + "step": 1553 + }, + { + "epoch": 0.21, + "grad_norm": 0.87109375, + "learning_rate": 0.00010366911274182789, + "loss": 0.938, + "step": 1554 + }, + { + "epoch": 0.21, + "grad_norm": 0.84375, + "learning_rate": 0.0001037358238825884, + "loss": 0.7737, + "step": 1555 + }, + { + "epoch": 0.21, + "grad_norm": 0.9609375, + "learning_rate": 0.00010380253502334891, + "loss": 0.5863, + "step": 1556 + }, + { + "epoch": 0.21, + "grad_norm": 0.98046875, + "learning_rate": 0.0001038692461641094, + "loss": 0.6995, + "step": 1557 + }, + { + "epoch": 0.21, + "grad_norm": 0.70703125, + "learning_rate": 0.00010393595730486991, + "loss": 0.6834, + "step": 1558 + }, + { + "epoch": 0.21, + "grad_norm": 0.94921875, + "learning_rate": 0.00010400266844563043, + "loss": 0.7243, + "step": 1559 + }, + { + "epoch": 0.21, + "grad_norm": 0.88671875, + "learning_rate": 0.00010406937958639093, + "loss": 0.661, + "step": 1560 + }, + { + "epoch": 0.21, + "grad_norm": 0.98828125, + "learning_rate": 0.00010413609072715145, + "loss": 0.7501, + "step": 1561 + }, + { + "epoch": 0.21, + "grad_norm": 1.3671875, + "learning_rate": 0.00010420280186791193, + "loss": 0.8189, + "step": 1562 + }, + { + "epoch": 0.21, + "grad_norm": 0.8828125, + "learning_rate": 0.00010426951300867245, + "loss": 0.9425, + "step": 1563 + }, + { + "epoch": 0.21, + "grad_norm": 1.0859375, + "learning_rate": 0.00010433622414943295, + "loss": 0.9644, + "step": 1564 + }, + { + "epoch": 0.21, + "grad_norm": 0.84765625, + "learning_rate": 0.00010440293529019347, + "loss": 0.8183, + "step": 1565 + }, + { + "epoch": 0.21, + "grad_norm": 0.7890625, + "learning_rate": 0.00010446964643095398, + "loss": 0.76, + "step": 1566 + }, + { + "epoch": 0.21, + "grad_norm": 0.703125, + "learning_rate": 0.00010453635757171447, + "loss": 0.8269, + "step": 1567 + }, + { + "epoch": 0.21, + "grad_norm": 0.64453125, + "learning_rate": 0.00010460306871247499, + "loss": 0.9889, + "step": 1568 + }, + { + "epoch": 0.21, + "grad_norm": 0.82421875, + "learning_rate": 0.00010466977985323549, + "loss": 0.7, + "step": 1569 + }, + { + "epoch": 0.21, + "grad_norm": 0.8828125, + "learning_rate": 0.00010473649099399601, + "loss": 0.8472, + "step": 1570 + }, + { + "epoch": 0.21, + "grad_norm": 0.84765625, + "learning_rate": 0.00010480320213475652, + "loss": 0.9324, + "step": 1571 + }, + { + "epoch": 0.21, + "grad_norm": 1.0390625, + "learning_rate": 0.00010486991327551701, + "loss": 0.6243, + "step": 1572 + }, + { + "epoch": 0.21, + "grad_norm": 0.73828125, + "learning_rate": 0.00010493662441627751, + "loss": 0.7709, + "step": 1573 + }, + { + "epoch": 0.21, + "grad_norm": 0.62109375, + "learning_rate": 0.00010500333555703803, + "loss": 0.9352, + "step": 1574 + }, + { + "epoch": 0.21, + "grad_norm": 0.8125, + "learning_rate": 0.00010507004669779854, + "loss": 0.7588, + "step": 1575 + }, + { + "epoch": 0.21, + "grad_norm": 0.8125, + "learning_rate": 0.00010513675783855906, + "loss": 0.8114, + "step": 1576 + }, + { + "epoch": 0.21, + "grad_norm": 0.7265625, + "learning_rate": 0.00010520346897931955, + "loss": 0.6047, + "step": 1577 + }, + { + "epoch": 0.21, + "grad_norm": 0.6171875, + "learning_rate": 0.00010527018012008005, + "loss": 0.9235, + "step": 1578 + }, + { + "epoch": 0.21, + "grad_norm": 0.71484375, + "learning_rate": 0.00010533689126084057, + "loss": 0.7918, + "step": 1579 + }, + { + "epoch": 0.21, + "grad_norm": 0.890625, + "learning_rate": 0.00010540360240160108, + "loss": 0.7981, + "step": 1580 + }, + { + "epoch": 0.21, + "grad_norm": 1.21875, + "learning_rate": 0.0001054703135423616, + "loss": 0.7414, + "step": 1581 + }, + { + "epoch": 0.21, + "grad_norm": 0.96875, + "learning_rate": 0.00010553702468312207, + "loss": 0.4827, + "step": 1582 + }, + { + "epoch": 0.21, + "grad_norm": 1.0234375, + "learning_rate": 0.00010560373582388259, + "loss": 0.7347, + "step": 1583 + }, + { + "epoch": 0.21, + "grad_norm": 0.8359375, + "learning_rate": 0.0001056704469646431, + "loss": 0.7229, + "step": 1584 + }, + { + "epoch": 0.21, + "grad_norm": 0.73828125, + "learning_rate": 0.00010573715810540361, + "loss": 0.6939, + "step": 1585 + }, + { + "epoch": 0.21, + "grad_norm": 0.8515625, + "learning_rate": 0.00010580386924616412, + "loss": 0.4733, + "step": 1586 + }, + { + "epoch": 0.21, + "grad_norm": 0.94921875, + "learning_rate": 0.00010587058038692461, + "loss": 1.1099, + "step": 1587 + }, + { + "epoch": 0.21, + "grad_norm": 0.70703125, + "learning_rate": 0.00010593729152768513, + "loss": 0.6406, + "step": 1588 + }, + { + "epoch": 0.21, + "grad_norm": 0.859375, + "learning_rate": 0.00010600400266844563, + "loss": 0.7322, + "step": 1589 + }, + { + "epoch": 0.21, + "grad_norm": 1.0390625, + "learning_rate": 0.00010607071380920614, + "loss": 0.9022, + "step": 1590 + }, + { + "epoch": 0.21, + "grad_norm": 0.80859375, + "learning_rate": 0.00010613742494996666, + "loss": 1.0461, + "step": 1591 + }, + { + "epoch": 0.21, + "grad_norm": 0.734375, + "learning_rate": 0.00010620413609072715, + "loss": 0.9253, + "step": 1592 + }, + { + "epoch": 0.21, + "grad_norm": 0.96875, + "learning_rate": 0.00010627084723148765, + "loss": 1.1049, + "step": 1593 + }, + { + "epoch": 0.21, + "grad_norm": 0.7734375, + "learning_rate": 0.00010633755837224817, + "loss": 0.822, + "step": 1594 + }, + { + "epoch": 0.21, + "grad_norm": 0.79296875, + "learning_rate": 0.00010640426951300868, + "loss": 0.8176, + "step": 1595 + }, + { + "epoch": 0.21, + "grad_norm": 1.015625, + "learning_rate": 0.0001064709806537692, + "loss": 0.8522, + "step": 1596 + }, + { + "epoch": 0.21, + "grad_norm": 0.84765625, + "learning_rate": 0.00010653769179452969, + "loss": 0.6736, + "step": 1597 + }, + { + "epoch": 0.21, + "grad_norm": 0.84375, + "learning_rate": 0.00010660440293529019, + "loss": 0.9186, + "step": 1598 + }, + { + "epoch": 0.21, + "grad_norm": 0.80859375, + "learning_rate": 0.0001066711140760507, + "loss": 0.9558, + "step": 1599 + }, + { + "epoch": 0.21, + "grad_norm": 0.98046875, + "learning_rate": 0.00010673782521681122, + "loss": 0.8869, + "step": 1600 + }, + { + "epoch": 0.21, + "grad_norm": 1.0, + "learning_rate": 0.00010680453635757172, + "loss": 0.6752, + "step": 1601 + }, + { + "epoch": 0.21, + "grad_norm": 1.125, + "learning_rate": 0.00010687124749833221, + "loss": 0.6732, + "step": 1602 + }, + { + "epoch": 0.21, + "grad_norm": 0.6953125, + "learning_rate": 0.00010693795863909273, + "loss": 0.7307, + "step": 1603 + }, + { + "epoch": 0.21, + "grad_norm": 1.09375, + "learning_rate": 0.00010700466977985324, + "loss": 1.312, + "step": 1604 + }, + { + "epoch": 0.21, + "grad_norm": 0.921875, + "learning_rate": 0.00010707138092061376, + "loss": 1.0878, + "step": 1605 + }, + { + "epoch": 0.21, + "grad_norm": 1.453125, + "learning_rate": 0.00010713809206137426, + "loss": 0.8299, + "step": 1606 + }, + { + "epoch": 0.21, + "grad_norm": 0.9921875, + "learning_rate": 0.00010720480320213475, + "loss": 0.8713, + "step": 1607 + }, + { + "epoch": 0.21, + "grad_norm": 1.2109375, + "learning_rate": 0.00010727151434289526, + "loss": 0.6119, + "step": 1608 + }, + { + "epoch": 0.21, + "grad_norm": 1.1015625, + "learning_rate": 0.00010733822548365578, + "loss": 0.9916, + "step": 1609 + }, + { + "epoch": 0.21, + "grad_norm": 0.96484375, + "learning_rate": 0.00010740493662441628, + "loss": 0.8414, + "step": 1610 + }, + { + "epoch": 0.21, + "grad_norm": 1.0546875, + "learning_rate": 0.0001074716477651768, + "loss": 0.7662, + "step": 1611 + }, + { + "epoch": 0.22, + "grad_norm": 0.92578125, + "learning_rate": 0.00010753835890593729, + "loss": 0.9521, + "step": 1612 + }, + { + "epoch": 0.22, + "grad_norm": 0.8828125, + "learning_rate": 0.0001076050700466978, + "loss": 0.6799, + "step": 1613 + }, + { + "epoch": 0.22, + "grad_norm": 1.0078125, + "learning_rate": 0.00010767178118745831, + "loss": 1.0091, + "step": 1614 + }, + { + "epoch": 0.22, + "grad_norm": 0.6171875, + "learning_rate": 0.00010773849232821882, + "loss": 0.6414, + "step": 1615 + }, + { + "epoch": 0.22, + "grad_norm": 1.0234375, + "learning_rate": 0.00010780520346897934, + "loss": 0.697, + "step": 1616 + }, + { + "epoch": 0.22, + "grad_norm": 0.8359375, + "learning_rate": 0.00010787191460973982, + "loss": 0.8352, + "step": 1617 + }, + { + "epoch": 0.22, + "grad_norm": 0.86328125, + "learning_rate": 0.00010793862575050033, + "loss": 0.8497, + "step": 1618 + }, + { + "epoch": 0.22, + "grad_norm": 0.671875, + "learning_rate": 0.00010800533689126084, + "loss": 0.93, + "step": 1619 + }, + { + "epoch": 0.22, + "grad_norm": 0.76953125, + "learning_rate": 0.00010807204803202136, + "loss": 0.8887, + "step": 1620 + }, + { + "epoch": 0.22, + "grad_norm": 0.921875, + "learning_rate": 0.00010813875917278186, + "loss": 0.9569, + "step": 1621 + }, + { + "epoch": 0.22, + "grad_norm": 0.69921875, + "learning_rate": 0.00010820547031354235, + "loss": 0.5003, + "step": 1622 + }, + { + "epoch": 0.22, + "grad_norm": 0.80078125, + "learning_rate": 0.00010827218145430287, + "loss": 0.7869, + "step": 1623 + }, + { + "epoch": 0.22, + "grad_norm": 0.796875, + "learning_rate": 0.00010833889259506338, + "loss": 0.6134, + "step": 1624 + }, + { + "epoch": 0.22, + "grad_norm": 0.84375, + "learning_rate": 0.0001084056037358239, + "loss": 0.9464, + "step": 1625 + }, + { + "epoch": 0.22, + "grad_norm": 0.890625, + "learning_rate": 0.0001084723148765844, + "loss": 0.3839, + "step": 1626 + }, + { + "epoch": 0.22, + "grad_norm": 0.65625, + "learning_rate": 0.0001085390260173449, + "loss": 0.6264, + "step": 1627 + }, + { + "epoch": 0.22, + "grad_norm": 0.8203125, + "learning_rate": 0.0001086057371581054, + "loss": 0.9481, + "step": 1628 + }, + { + "epoch": 0.22, + "grad_norm": 1.328125, + "learning_rate": 0.00010867244829886592, + "loss": 1.0953, + "step": 1629 + }, + { + "epoch": 0.22, + "grad_norm": 0.99609375, + "learning_rate": 0.00010873915943962642, + "loss": 0.6278, + "step": 1630 + }, + { + "epoch": 0.22, + "grad_norm": 1.578125, + "learning_rate": 0.00010880587058038694, + "loss": 0.9722, + "step": 1631 + }, + { + "epoch": 0.22, + "grad_norm": 0.74609375, + "learning_rate": 0.00010887258172114743, + "loss": 0.4759, + "step": 1632 + }, + { + "epoch": 0.22, + "grad_norm": 0.7578125, + "learning_rate": 0.00010893929286190794, + "loss": 0.5224, + "step": 1633 + }, + { + "epoch": 0.22, + "grad_norm": 0.9609375, + "learning_rate": 0.00010900600400266846, + "loss": 0.6577, + "step": 1634 + }, + { + "epoch": 0.22, + "grad_norm": 0.8671875, + "learning_rate": 0.00010907271514342896, + "loss": 0.4336, + "step": 1635 + }, + { + "epoch": 0.22, + "grad_norm": 1.2109375, + "learning_rate": 0.00010913942628418947, + "loss": 0.9255, + "step": 1636 + }, + { + "epoch": 0.22, + "grad_norm": 0.953125, + "learning_rate": 0.00010920613742494996, + "loss": 0.7967, + "step": 1637 + }, + { + "epoch": 0.22, + "grad_norm": 0.84375, + "learning_rate": 0.00010927284856571048, + "loss": 0.7045, + "step": 1638 + }, + { + "epoch": 0.22, + "grad_norm": 0.81640625, + "learning_rate": 0.00010933955970647098, + "loss": 0.6959, + "step": 1639 + }, + { + "epoch": 0.22, + "grad_norm": 0.85546875, + "learning_rate": 0.0001094062708472315, + "loss": 0.777, + "step": 1640 + }, + { + "epoch": 0.22, + "grad_norm": 0.82421875, + "learning_rate": 0.000109472981987992, + "loss": 0.5346, + "step": 1641 + }, + { + "epoch": 0.22, + "grad_norm": 0.75, + "learning_rate": 0.0001095396931287525, + "loss": 0.8635, + "step": 1642 + }, + { + "epoch": 0.22, + "grad_norm": 0.8984375, + "learning_rate": 0.000109606404269513, + "loss": 0.6793, + "step": 1643 + }, + { + "epoch": 0.22, + "grad_norm": 0.8046875, + "learning_rate": 0.00010967311541027352, + "loss": 0.6, + "step": 1644 + }, + { + "epoch": 0.22, + "grad_norm": 1.3359375, + "learning_rate": 0.00010973982655103402, + "loss": 1.1286, + "step": 1645 + }, + { + "epoch": 0.22, + "grad_norm": 0.84765625, + "learning_rate": 0.00010980653769179454, + "loss": 1.0535, + "step": 1646 + }, + { + "epoch": 0.22, + "grad_norm": 0.70703125, + "learning_rate": 0.00010987324883255503, + "loss": 0.7374, + "step": 1647 + }, + { + "epoch": 0.22, + "grad_norm": 0.90625, + "learning_rate": 0.00010993995997331554, + "loss": 0.8071, + "step": 1648 + }, + { + "epoch": 0.22, + "grad_norm": 1.2265625, + "learning_rate": 0.00011000667111407606, + "loss": 0.8053, + "step": 1649 + }, + { + "epoch": 0.22, + "grad_norm": 1.0, + "learning_rate": 0.00011007338225483656, + "loss": 0.6665, + "step": 1650 + }, + { + "epoch": 0.22, + "grad_norm": 0.82421875, + "learning_rate": 0.00011014009339559708, + "loss": 0.4685, + "step": 1651 + }, + { + "epoch": 0.22, + "grad_norm": 0.83984375, + "learning_rate": 0.00011020680453635756, + "loss": 0.9201, + "step": 1652 + }, + { + "epoch": 0.22, + "grad_norm": 0.96875, + "learning_rate": 0.00011027351567711808, + "loss": 0.7316, + "step": 1653 + }, + { + "epoch": 0.22, + "grad_norm": 1.0234375, + "learning_rate": 0.00011034022681787858, + "loss": 0.9127, + "step": 1654 + }, + { + "epoch": 0.22, + "grad_norm": 0.9765625, + "learning_rate": 0.0001104069379586391, + "loss": 0.6965, + "step": 1655 + }, + { + "epoch": 0.22, + "grad_norm": 0.73046875, + "learning_rate": 0.00011047364909939961, + "loss": 0.4929, + "step": 1656 + }, + { + "epoch": 0.22, + "grad_norm": 0.6640625, + "learning_rate": 0.0001105403602401601, + "loss": 0.7779, + "step": 1657 + }, + { + "epoch": 0.22, + "grad_norm": 0.8515625, + "learning_rate": 0.00011060707138092062, + "loss": 0.7599, + "step": 1658 + }, + { + "epoch": 0.22, + "grad_norm": 0.6875, + "learning_rate": 0.00011067378252168112, + "loss": 0.8968, + "step": 1659 + }, + { + "epoch": 0.22, + "grad_norm": 0.80078125, + "learning_rate": 0.00011074049366244164, + "loss": 0.4712, + "step": 1660 + }, + { + "epoch": 0.22, + "grad_norm": 0.65234375, + "learning_rate": 0.00011080720480320215, + "loss": 1.0424, + "step": 1661 + }, + { + "epoch": 0.22, + "grad_norm": 0.73046875, + "learning_rate": 0.00011087391594396264, + "loss": 1.0375, + "step": 1662 + }, + { + "epoch": 0.22, + "grad_norm": 0.7265625, + "learning_rate": 0.00011094062708472314, + "loss": 0.6213, + "step": 1663 + }, + { + "epoch": 0.22, + "grad_norm": 0.703125, + "learning_rate": 0.00011100733822548366, + "loss": 0.7642, + "step": 1664 + }, + { + "epoch": 0.22, + "grad_norm": 1.109375, + "learning_rate": 0.00011107404936624417, + "loss": 0.8758, + "step": 1665 + }, + { + "epoch": 0.22, + "grad_norm": 0.8125, + "learning_rate": 0.00011114076050700468, + "loss": 1.0629, + "step": 1666 + }, + { + "epoch": 0.22, + "grad_norm": 1.0703125, + "learning_rate": 0.00011120747164776518, + "loss": 0.8565, + "step": 1667 + }, + { + "epoch": 0.22, + "grad_norm": 0.7734375, + "learning_rate": 0.00011127418278852568, + "loss": 0.672, + "step": 1668 + }, + { + "epoch": 0.22, + "grad_norm": 0.66015625, + "learning_rate": 0.0001113408939292862, + "loss": 0.8095, + "step": 1669 + }, + { + "epoch": 0.22, + "grad_norm": 0.67578125, + "learning_rate": 0.0001114076050700467, + "loss": 0.9201, + "step": 1670 + }, + { + "epoch": 0.22, + "grad_norm": 0.93359375, + "learning_rate": 0.00011147431621080722, + "loss": 0.8403, + "step": 1671 + }, + { + "epoch": 0.22, + "grad_norm": 0.75, + "learning_rate": 0.0001115410273515677, + "loss": 0.6696, + "step": 1672 + }, + { + "epoch": 0.22, + "grad_norm": 0.765625, + "learning_rate": 0.00011160773849232822, + "loss": 0.819, + "step": 1673 + }, + { + "epoch": 0.22, + "grad_norm": 0.7578125, + "learning_rate": 0.00011167444963308873, + "loss": 0.829, + "step": 1674 + }, + { + "epoch": 0.22, + "grad_norm": 0.828125, + "learning_rate": 0.00011174116077384924, + "loss": 0.7406, + "step": 1675 + }, + { + "epoch": 0.22, + "grad_norm": 0.9296875, + "learning_rate": 0.00011180787191460975, + "loss": 0.7994, + "step": 1676 + }, + { + "epoch": 0.22, + "grad_norm": 0.83203125, + "learning_rate": 0.00011187458305537024, + "loss": 0.8501, + "step": 1677 + }, + { + "epoch": 0.22, + "grad_norm": 0.625, + "learning_rate": 0.00011194129419613076, + "loss": 0.4738, + "step": 1678 + }, + { + "epoch": 0.22, + "grad_norm": 0.8046875, + "learning_rate": 0.00011200800533689126, + "loss": 1.0002, + "step": 1679 + }, + { + "epoch": 0.22, + "grad_norm": 0.6484375, + "learning_rate": 0.00011207471647765177, + "loss": 0.5663, + "step": 1680 + }, + { + "epoch": 0.22, + "grad_norm": 1.1015625, + "learning_rate": 0.00011214142761841229, + "loss": 0.7841, + "step": 1681 + }, + { + "epoch": 0.22, + "grad_norm": 0.79296875, + "learning_rate": 0.00011220813875917278, + "loss": 0.6763, + "step": 1682 + }, + { + "epoch": 0.22, + "grad_norm": 0.6640625, + "learning_rate": 0.00011227484989993328, + "loss": 0.7886, + "step": 1683 + }, + { + "epoch": 0.22, + "grad_norm": 0.71875, + "learning_rate": 0.0001123415610406938, + "loss": 0.7295, + "step": 1684 + }, + { + "epoch": 0.22, + "grad_norm": 0.671875, + "learning_rate": 0.00011240827218145431, + "loss": 0.8371, + "step": 1685 + }, + { + "epoch": 0.22, + "grad_norm": 0.79296875, + "learning_rate": 0.00011247498332221483, + "loss": 0.8273, + "step": 1686 + }, + { + "epoch": 0.23, + "grad_norm": 0.78515625, + "learning_rate": 0.00011254169446297533, + "loss": 0.8379, + "step": 1687 + }, + { + "epoch": 0.23, + "grad_norm": 1.1015625, + "learning_rate": 0.00011260840560373582, + "loss": 0.6722, + "step": 1688 + }, + { + "epoch": 0.23, + "grad_norm": 0.87890625, + "learning_rate": 0.00011267511674449633, + "loss": 0.8395, + "step": 1689 + }, + { + "epoch": 0.23, + "grad_norm": 0.765625, + "learning_rate": 0.00011274182788525685, + "loss": 0.8, + "step": 1690 + }, + { + "epoch": 0.23, + "grad_norm": 0.8203125, + "learning_rate": 0.00011280853902601735, + "loss": 1.0446, + "step": 1691 + }, + { + "epoch": 0.23, + "grad_norm": 0.72265625, + "learning_rate": 0.00011287525016677787, + "loss": 0.8956, + "step": 1692 + }, + { + "epoch": 0.23, + "grad_norm": 0.66015625, + "learning_rate": 0.00011294196130753836, + "loss": 0.4816, + "step": 1693 + }, + { + "epoch": 0.23, + "grad_norm": 0.859375, + "learning_rate": 0.00011300867244829887, + "loss": 0.5455, + "step": 1694 + }, + { + "epoch": 0.23, + "grad_norm": 0.72265625, + "learning_rate": 0.00011307538358905939, + "loss": 0.6333, + "step": 1695 + }, + { + "epoch": 0.23, + "grad_norm": 0.9921875, + "learning_rate": 0.00011314209472981989, + "loss": 1.1236, + "step": 1696 + }, + { + "epoch": 0.23, + "grad_norm": 1.0, + "learning_rate": 0.00011320880587058041, + "loss": 0.885, + "step": 1697 + }, + { + "epoch": 0.23, + "grad_norm": 0.76171875, + "learning_rate": 0.00011327551701134089, + "loss": 0.9668, + "step": 1698 + }, + { + "epoch": 0.23, + "grad_norm": 1.0546875, + "learning_rate": 0.0001133422281521014, + "loss": 0.8942, + "step": 1699 + }, + { + "epoch": 0.23, + "grad_norm": 1.15625, + "learning_rate": 0.00011340893929286191, + "loss": 0.718, + "step": 1700 + }, + { + "epoch": 0.23, + "grad_norm": 0.75390625, + "learning_rate": 0.00011347565043362243, + "loss": 1.0114, + "step": 1701 + }, + { + "epoch": 0.23, + "grad_norm": 0.75, + "learning_rate": 0.00011354236157438293, + "loss": 1.1333, + "step": 1702 + }, + { + "epoch": 0.23, + "grad_norm": 1.0390625, + "learning_rate": 0.00011360907271514343, + "loss": 0.8972, + "step": 1703 + }, + { + "epoch": 0.23, + "grad_norm": 0.93359375, + "learning_rate": 0.00011367578385590394, + "loss": 0.9488, + "step": 1704 + }, + { + "epoch": 0.23, + "grad_norm": 0.8046875, + "learning_rate": 0.00011374249499666445, + "loss": 0.7008, + "step": 1705 + }, + { + "epoch": 0.23, + "grad_norm": 0.83984375, + "learning_rate": 0.00011380920613742497, + "loss": 1.003, + "step": 1706 + }, + { + "epoch": 0.23, + "grad_norm": 0.90625, + "learning_rate": 0.00011387591727818547, + "loss": 0.4421, + "step": 1707 + }, + { + "epoch": 0.23, + "grad_norm": 0.80859375, + "learning_rate": 0.00011394262841894596, + "loss": 0.4733, + "step": 1708 + }, + { + "epoch": 0.23, + "grad_norm": 0.6875, + "learning_rate": 0.00011400933955970647, + "loss": 0.5772, + "step": 1709 + }, + { + "epoch": 0.23, + "grad_norm": 0.7265625, + "learning_rate": 0.00011407605070046699, + "loss": 0.8363, + "step": 1710 + }, + { + "epoch": 0.23, + "grad_norm": 0.671875, + "learning_rate": 0.00011414276184122749, + "loss": 0.7982, + "step": 1711 + }, + { + "epoch": 0.23, + "grad_norm": 1.0078125, + "learning_rate": 0.00011420947298198801, + "loss": 0.6288, + "step": 1712 + }, + { + "epoch": 0.23, + "grad_norm": 0.77734375, + "learning_rate": 0.0001142761841227485, + "loss": 0.9677, + "step": 1713 + }, + { + "epoch": 0.23, + "grad_norm": 0.78125, + "learning_rate": 0.00011434289526350901, + "loss": 0.7936, + "step": 1714 + }, + { + "epoch": 0.23, + "grad_norm": 0.9609375, + "learning_rate": 0.00011440960640426953, + "loss": 1.0175, + "step": 1715 + }, + { + "epoch": 0.23, + "grad_norm": 0.75390625, + "learning_rate": 0.00011447631754503003, + "loss": 0.6843, + "step": 1716 + }, + { + "epoch": 0.23, + "grad_norm": 0.85546875, + "learning_rate": 0.00011454302868579054, + "loss": 0.9602, + "step": 1717 + }, + { + "epoch": 0.23, + "grad_norm": 1.0859375, + "learning_rate": 0.00011460973982655103, + "loss": 0.8171, + "step": 1718 + }, + { + "epoch": 0.23, + "grad_norm": 0.8046875, + "learning_rate": 0.00011467645096731155, + "loss": 0.5688, + "step": 1719 + }, + { + "epoch": 0.23, + "grad_norm": 0.78515625, + "learning_rate": 0.00011474316210807205, + "loss": 0.8823, + "step": 1720 + }, + { + "epoch": 0.23, + "grad_norm": 0.765625, + "learning_rate": 0.00011480987324883257, + "loss": 0.4823, + "step": 1721 + }, + { + "epoch": 0.23, + "grad_norm": 0.8515625, + "learning_rate": 0.00011487658438959308, + "loss": 0.6739, + "step": 1722 + }, + { + "epoch": 0.23, + "grad_norm": 0.89453125, + "learning_rate": 0.00011494329553035357, + "loss": 0.9974, + "step": 1723 + }, + { + "epoch": 0.23, + "grad_norm": 0.859375, + "learning_rate": 0.00011501000667111409, + "loss": 0.4565, + "step": 1724 + }, + { + "epoch": 0.23, + "grad_norm": 1.09375, + "learning_rate": 0.00011507671781187459, + "loss": 0.78, + "step": 1725 + }, + { + "epoch": 0.23, + "grad_norm": 1.03125, + "learning_rate": 0.0001151434289526351, + "loss": 0.7029, + "step": 1726 + }, + { + "epoch": 0.23, + "grad_norm": 0.8125, + "learning_rate": 0.00011521014009339561, + "loss": 0.6827, + "step": 1727 + }, + { + "epoch": 0.23, + "grad_norm": 0.80859375, + "learning_rate": 0.0001152768512341561, + "loss": 0.8286, + "step": 1728 + }, + { + "epoch": 0.23, + "grad_norm": 0.84375, + "learning_rate": 0.00011534356237491661, + "loss": 1.0058, + "step": 1729 + }, + { + "epoch": 0.23, + "grad_norm": 0.59765625, + "learning_rate": 0.00011541027351567713, + "loss": 0.5446, + "step": 1730 + }, + { + "epoch": 0.23, + "grad_norm": 1.1640625, + "learning_rate": 0.00011547698465643763, + "loss": 0.8983, + "step": 1731 + }, + { + "epoch": 0.23, + "grad_norm": 0.875, + "learning_rate": 0.00011554369579719815, + "loss": 0.8196, + "step": 1732 + }, + { + "epoch": 0.23, + "grad_norm": 0.70703125, + "learning_rate": 0.00011561040693795863, + "loss": 0.5966, + "step": 1733 + }, + { + "epoch": 0.23, + "grad_norm": 0.73828125, + "learning_rate": 0.00011567711807871915, + "loss": 0.7592, + "step": 1734 + }, + { + "epoch": 0.23, + "grad_norm": 0.796875, + "learning_rate": 0.00011574382921947965, + "loss": 0.7281, + "step": 1735 + }, + { + "epoch": 0.23, + "grad_norm": 1.0390625, + "learning_rate": 0.00011581054036024017, + "loss": 0.9145, + "step": 1736 + }, + { + "epoch": 0.23, + "grad_norm": 0.69921875, + "learning_rate": 0.00011587725150100068, + "loss": 0.7372, + "step": 1737 + }, + { + "epoch": 0.23, + "grad_norm": 0.7109375, + "learning_rate": 0.00011594396264176117, + "loss": 0.6967, + "step": 1738 + }, + { + "epoch": 0.23, + "grad_norm": 0.828125, + "learning_rate": 0.00011601067378252169, + "loss": 0.9189, + "step": 1739 + }, + { + "epoch": 0.23, + "grad_norm": 0.96875, + "learning_rate": 0.00011607738492328219, + "loss": 0.6412, + "step": 1740 + }, + { + "epoch": 0.23, + "grad_norm": 0.7578125, + "learning_rate": 0.00011614409606404271, + "loss": 0.5891, + "step": 1741 + }, + { + "epoch": 0.23, + "grad_norm": 0.79296875, + "learning_rate": 0.00011621080720480322, + "loss": 0.9426, + "step": 1742 + }, + { + "epoch": 0.23, + "grad_norm": 0.9140625, + "learning_rate": 0.00011627751834556371, + "loss": 0.635, + "step": 1743 + }, + { + "epoch": 0.23, + "grad_norm": 0.79296875, + "learning_rate": 0.00011634422948632421, + "loss": 0.6862, + "step": 1744 + }, + { + "epoch": 0.23, + "grad_norm": 0.71875, + "learning_rate": 0.00011641094062708473, + "loss": 0.5786, + "step": 1745 + }, + { + "epoch": 0.23, + "grad_norm": 0.72265625, + "learning_rate": 0.00011647765176784524, + "loss": 0.6934, + "step": 1746 + }, + { + "epoch": 0.23, + "grad_norm": 0.91015625, + "learning_rate": 0.00011654436290860576, + "loss": 0.7457, + "step": 1747 + }, + { + "epoch": 0.23, + "grad_norm": 0.703125, + "learning_rate": 0.00011661107404936625, + "loss": 0.9101, + "step": 1748 + }, + { + "epoch": 0.23, + "grad_norm": 0.6953125, + "learning_rate": 0.00011667778519012675, + "loss": 0.5465, + "step": 1749 + }, + { + "epoch": 0.23, + "grad_norm": 0.8515625, + "learning_rate": 0.00011674449633088727, + "loss": 0.8595, + "step": 1750 + }, + { + "epoch": 0.23, + "grad_norm": 0.8984375, + "learning_rate": 0.00011681120747164778, + "loss": 0.8601, + "step": 1751 + }, + { + "epoch": 0.23, + "grad_norm": 0.7578125, + "learning_rate": 0.0001168779186124083, + "loss": 0.7062, + "step": 1752 + }, + { + "epoch": 0.23, + "grad_norm": 0.8828125, + "learning_rate": 0.00011694462975316877, + "loss": 0.8663, + "step": 1753 + }, + { + "epoch": 0.23, + "grad_norm": 0.703125, + "learning_rate": 0.00011701134089392929, + "loss": 0.6496, + "step": 1754 + }, + { + "epoch": 0.23, + "grad_norm": 0.9140625, + "learning_rate": 0.0001170780520346898, + "loss": 0.7817, + "step": 1755 + }, + { + "epoch": 0.23, + "grad_norm": 0.84765625, + "learning_rate": 0.00011714476317545031, + "loss": 0.7224, + "step": 1756 + }, + { + "epoch": 0.23, + "grad_norm": 0.91015625, + "learning_rate": 0.00011721147431621082, + "loss": 0.6928, + "step": 1757 + }, + { + "epoch": 0.23, + "grad_norm": 0.84375, + "learning_rate": 0.00011727818545697131, + "loss": 1.0232, + "step": 1758 + }, + { + "epoch": 0.23, + "grad_norm": 0.8828125, + "learning_rate": 0.00011734489659773183, + "loss": 0.6752, + "step": 1759 + }, + { + "epoch": 0.23, + "grad_norm": 0.8359375, + "learning_rate": 0.00011741160773849233, + "loss": 0.7061, + "step": 1760 + }, + { + "epoch": 0.23, + "grad_norm": 0.8125, + "learning_rate": 0.00011747831887925285, + "loss": 0.7557, + "step": 1761 + }, + { + "epoch": 0.24, + "grad_norm": 1.0859375, + "learning_rate": 0.00011754503002001336, + "loss": 0.7455, + "step": 1762 + }, + { + "epoch": 0.24, + "grad_norm": 0.8984375, + "learning_rate": 0.00011761174116077385, + "loss": 1.0575, + "step": 1763 + }, + { + "epoch": 0.24, + "grad_norm": 0.59375, + "learning_rate": 0.00011767845230153435, + "loss": 0.6334, + "step": 1764 + }, + { + "epoch": 0.24, + "grad_norm": 0.77734375, + "learning_rate": 0.00011774516344229487, + "loss": 0.8038, + "step": 1765 + }, + { + "epoch": 0.24, + "grad_norm": 0.71875, + "learning_rate": 0.00011781187458305538, + "loss": 0.724, + "step": 1766 + }, + { + "epoch": 0.24, + "grad_norm": 0.84375, + "learning_rate": 0.0001178785857238159, + "loss": 0.8052, + "step": 1767 + }, + { + "epoch": 0.24, + "grad_norm": 0.66015625, + "learning_rate": 0.00011794529686457639, + "loss": 0.6042, + "step": 1768 + }, + { + "epoch": 0.24, + "grad_norm": 0.890625, + "learning_rate": 0.0001180120080053369, + "loss": 0.6785, + "step": 1769 + }, + { + "epoch": 0.24, + "grad_norm": 0.765625, + "learning_rate": 0.00011807871914609741, + "loss": 0.713, + "step": 1770 + }, + { + "epoch": 0.24, + "grad_norm": 1.015625, + "learning_rate": 0.00011814543028685792, + "loss": 0.6566, + "step": 1771 + }, + { + "epoch": 0.24, + "grad_norm": 0.86328125, + "learning_rate": 0.00011821214142761842, + "loss": 0.6518, + "step": 1772 + }, + { + "epoch": 0.24, + "grad_norm": 0.72265625, + "learning_rate": 0.00011827885256837891, + "loss": 0.5543, + "step": 1773 + }, + { + "epoch": 0.24, + "grad_norm": 0.78125, + "learning_rate": 0.00011834556370913943, + "loss": 0.6014, + "step": 1774 + }, + { + "epoch": 0.24, + "grad_norm": 1.046875, + "learning_rate": 0.00011841227484989994, + "loss": 0.703, + "step": 1775 + }, + { + "epoch": 0.24, + "grad_norm": 0.79296875, + "learning_rate": 0.00011847898599066046, + "loss": 0.577, + "step": 1776 + }, + { + "epoch": 0.24, + "grad_norm": 0.984375, + "learning_rate": 0.00011854569713142096, + "loss": 0.819, + "step": 1777 + }, + { + "epoch": 0.24, + "grad_norm": 0.89453125, + "learning_rate": 0.00011861240827218145, + "loss": 0.6242, + "step": 1778 + }, + { + "epoch": 0.24, + "grad_norm": 1.0546875, + "learning_rate": 0.00011867911941294196, + "loss": 0.9576, + "step": 1779 + }, + { + "epoch": 0.24, + "grad_norm": 1.1015625, + "learning_rate": 0.00011874583055370248, + "loss": 1.0136, + "step": 1780 + }, + { + "epoch": 0.24, + "grad_norm": 0.8828125, + "learning_rate": 0.00011881254169446298, + "loss": 0.6262, + "step": 1781 + }, + { + "epoch": 0.24, + "grad_norm": 0.9765625, + "learning_rate": 0.0001188792528352235, + "loss": 0.5218, + "step": 1782 + }, + { + "epoch": 0.24, + "grad_norm": 0.85546875, + "learning_rate": 0.00011894596397598399, + "loss": 0.6942, + "step": 1783 + }, + { + "epoch": 0.24, + "grad_norm": 1.1796875, + "learning_rate": 0.0001190126751167445, + "loss": 0.6779, + "step": 1784 + }, + { + "epoch": 0.24, + "grad_norm": 0.80078125, + "learning_rate": 0.00011907938625750501, + "loss": 1.0984, + "step": 1785 + }, + { + "epoch": 0.24, + "grad_norm": 0.98046875, + "learning_rate": 0.00011914609739826552, + "loss": 0.9313, + "step": 1786 + }, + { + "epoch": 0.24, + "grad_norm": 0.76171875, + "learning_rate": 0.00011921280853902604, + "loss": 0.978, + "step": 1787 + }, + { + "epoch": 0.24, + "grad_norm": 0.65625, + "learning_rate": 0.00011927951967978652, + "loss": 0.7866, + "step": 1788 + }, + { + "epoch": 0.24, + "grad_norm": 0.71875, + "learning_rate": 0.00011934623082054703, + "loss": 0.5486, + "step": 1789 + }, + { + "epoch": 0.24, + "grad_norm": 0.83203125, + "learning_rate": 0.00011941294196130754, + "loss": 0.7756, + "step": 1790 + }, + { + "epoch": 0.24, + "grad_norm": 0.87109375, + "learning_rate": 0.00011947965310206806, + "loss": 0.7215, + "step": 1791 + }, + { + "epoch": 0.24, + "grad_norm": 0.59765625, + "learning_rate": 0.00011954636424282856, + "loss": 0.626, + "step": 1792 + }, + { + "epoch": 0.24, + "grad_norm": 0.74609375, + "learning_rate": 0.00011961307538358905, + "loss": 0.8629, + "step": 1793 + }, + { + "epoch": 0.24, + "grad_norm": 0.80078125, + "learning_rate": 0.00011967978652434957, + "loss": 0.7221, + "step": 1794 + }, + { + "epoch": 0.24, + "grad_norm": 0.8828125, + "learning_rate": 0.00011974649766511008, + "loss": 1.1066, + "step": 1795 + }, + { + "epoch": 0.24, + "grad_norm": 0.6875, + "learning_rate": 0.0001198132088058706, + "loss": 0.8803, + "step": 1796 + }, + { + "epoch": 0.24, + "grad_norm": 0.82421875, + "learning_rate": 0.0001198799199466311, + "loss": 0.8508, + "step": 1797 + }, + { + "epoch": 0.24, + "grad_norm": 0.78515625, + "learning_rate": 0.0001199466310873916, + "loss": 0.9199, + "step": 1798 + }, + { + "epoch": 0.24, + "grad_norm": 0.796875, + "learning_rate": 0.0001200133422281521, + "loss": 0.8156, + "step": 1799 + }, + { + "epoch": 0.24, + "grad_norm": 0.73828125, + "learning_rate": 0.00012008005336891262, + "loss": 0.6175, + "step": 1800 + }, + { + "epoch": 0.24, + "grad_norm": 0.984375, + "learning_rate": 0.00012014676450967312, + "loss": 0.8087, + "step": 1801 + }, + { + "epoch": 0.24, + "grad_norm": 0.94140625, + "learning_rate": 0.00012021347565043364, + "loss": 0.6882, + "step": 1802 + }, + { + "epoch": 0.24, + "grad_norm": 0.87109375, + "learning_rate": 0.00012028018679119413, + "loss": 0.7001, + "step": 1803 + }, + { + "epoch": 0.24, + "grad_norm": 0.78125, + "learning_rate": 0.00012034689793195464, + "loss": 0.7446, + "step": 1804 + }, + { + "epoch": 0.24, + "grad_norm": 0.828125, + "learning_rate": 0.00012041360907271516, + "loss": 0.7305, + "step": 1805 + }, + { + "epoch": 0.24, + "grad_norm": 0.6953125, + "learning_rate": 0.00012048032021347566, + "loss": 0.7721, + "step": 1806 + }, + { + "epoch": 0.24, + "grad_norm": 0.9140625, + "learning_rate": 0.00012054703135423618, + "loss": 0.9363, + "step": 1807 + }, + { + "epoch": 0.24, + "grad_norm": 0.69140625, + "learning_rate": 0.00012061374249499666, + "loss": 0.5018, + "step": 1808 + }, + { + "epoch": 0.24, + "grad_norm": 0.77734375, + "learning_rate": 0.00012068045363575718, + "loss": 0.8218, + "step": 1809 + }, + { + "epoch": 0.24, + "grad_norm": 0.7578125, + "learning_rate": 0.00012074716477651768, + "loss": 0.9012, + "step": 1810 + }, + { + "epoch": 0.24, + "grad_norm": 0.69921875, + "learning_rate": 0.0001208138759172782, + "loss": 0.6648, + "step": 1811 + }, + { + "epoch": 0.24, + "grad_norm": 0.9609375, + "learning_rate": 0.0001208805870580387, + "loss": 0.7433, + "step": 1812 + }, + { + "epoch": 0.24, + "grad_norm": 0.7109375, + "learning_rate": 0.0001209472981987992, + "loss": 0.8371, + "step": 1813 + }, + { + "epoch": 0.24, + "grad_norm": 0.828125, + "learning_rate": 0.00012101400933955971, + "loss": 0.7247, + "step": 1814 + }, + { + "epoch": 0.24, + "grad_norm": 0.94921875, + "learning_rate": 0.00012108072048032022, + "loss": 0.7081, + "step": 1815 + }, + { + "epoch": 0.24, + "grad_norm": 0.6796875, + "learning_rate": 0.00012114743162108072, + "loss": 0.909, + "step": 1816 + }, + { + "epoch": 0.24, + "grad_norm": 0.859375, + "learning_rate": 0.00012121414276184124, + "loss": 0.6981, + "step": 1817 + }, + { + "epoch": 0.24, + "grad_norm": 0.90234375, + "learning_rate": 0.00012128085390260173, + "loss": 0.808, + "step": 1818 + }, + { + "epoch": 0.24, + "grad_norm": 0.82421875, + "learning_rate": 0.00012134756504336224, + "loss": 0.7937, + "step": 1819 + }, + { + "epoch": 0.24, + "grad_norm": 0.89453125, + "learning_rate": 0.00012141427618412276, + "loss": 0.4215, + "step": 1820 + }, + { + "epoch": 0.24, + "grad_norm": 1.0703125, + "learning_rate": 0.00012148098732488326, + "loss": 0.7786, + "step": 1821 + }, + { + "epoch": 0.24, + "grad_norm": 0.734375, + "learning_rate": 0.00012154769846564378, + "loss": 0.6342, + "step": 1822 + }, + { + "epoch": 0.24, + "grad_norm": 0.87890625, + "learning_rate": 0.00012161440960640427, + "loss": 0.5614, + "step": 1823 + }, + { + "epoch": 0.24, + "grad_norm": 1.203125, + "learning_rate": 0.00012168112074716478, + "loss": 0.8094, + "step": 1824 + }, + { + "epoch": 0.24, + "grad_norm": 1.0078125, + "learning_rate": 0.00012174783188792528, + "loss": 0.7038, + "step": 1825 + }, + { + "epoch": 0.24, + "grad_norm": 0.9140625, + "learning_rate": 0.0001218145430286858, + "loss": 0.5473, + "step": 1826 + }, + { + "epoch": 0.24, + "grad_norm": 0.72265625, + "learning_rate": 0.00012188125416944631, + "loss": 0.748, + "step": 1827 + }, + { + "epoch": 0.24, + "grad_norm": 0.6640625, + "learning_rate": 0.0001219479653102068, + "loss": 0.6014, + "step": 1828 + }, + { + "epoch": 0.24, + "grad_norm": 1.0234375, + "learning_rate": 0.00012201467645096732, + "loss": 1.0253, + "step": 1829 + }, + { + "epoch": 0.24, + "grad_norm": 0.8046875, + "learning_rate": 0.00012208138759172782, + "loss": 0.9072, + "step": 1830 + }, + { + "epoch": 0.24, + "grad_norm": 1.09375, + "learning_rate": 0.00012214809873248834, + "loss": 0.8998, + "step": 1831 + }, + { + "epoch": 0.24, + "grad_norm": 0.65234375, + "learning_rate": 0.00012221480987324886, + "loss": 0.6769, + "step": 1832 + }, + { + "epoch": 0.24, + "grad_norm": 0.890625, + "learning_rate": 0.00012228152101400932, + "loss": 0.7267, + "step": 1833 + }, + { + "epoch": 0.24, + "grad_norm": 0.80078125, + "learning_rate": 0.00012234823215476984, + "loss": 0.9409, + "step": 1834 + }, + { + "epoch": 0.24, + "grad_norm": 0.828125, + "learning_rate": 0.00012241494329553036, + "loss": 0.7476, + "step": 1835 + }, + { + "epoch": 0.24, + "grad_norm": 0.89453125, + "learning_rate": 0.00012248165443629088, + "loss": 0.7999, + "step": 1836 + }, + { + "epoch": 0.25, + "grad_norm": 0.7421875, + "learning_rate": 0.00012254836557705137, + "loss": 0.7198, + "step": 1837 + }, + { + "epoch": 0.25, + "grad_norm": 0.76953125, + "learning_rate": 0.00012261507671781186, + "loss": 0.3887, + "step": 1838 + }, + { + "epoch": 0.25, + "grad_norm": 0.69921875, + "learning_rate": 0.00012268178785857238, + "loss": 0.7862, + "step": 1839 + }, + { + "epoch": 0.25, + "grad_norm": 0.76953125, + "learning_rate": 0.0001227484989993329, + "loss": 1.0891, + "step": 1840 + }, + { + "epoch": 0.25, + "grad_norm": 1.15625, + "learning_rate": 0.00012281521014009342, + "loss": 0.9213, + "step": 1841 + }, + { + "epoch": 0.25, + "grad_norm": 1.0390625, + "learning_rate": 0.0001228819212808539, + "loss": 1.0295, + "step": 1842 + }, + { + "epoch": 0.25, + "grad_norm": 0.9453125, + "learning_rate": 0.0001229486324216144, + "loss": 0.752, + "step": 1843 + }, + { + "epoch": 0.25, + "grad_norm": 1.015625, + "learning_rate": 0.00012301534356237492, + "loss": 0.7627, + "step": 1844 + }, + { + "epoch": 0.25, + "grad_norm": 0.609375, + "learning_rate": 0.00012308205470313544, + "loss": 0.6213, + "step": 1845 + }, + { + "epoch": 0.25, + "grad_norm": 0.89453125, + "learning_rate": 0.00012314876584389593, + "loss": 0.7637, + "step": 1846 + }, + { + "epoch": 0.25, + "grad_norm": 0.92578125, + "learning_rate": 0.00012321547698465645, + "loss": 0.7267, + "step": 1847 + }, + { + "epoch": 0.25, + "grad_norm": 0.73828125, + "learning_rate": 0.00012328218812541694, + "loss": 0.7092, + "step": 1848 + }, + { + "epoch": 0.25, + "grad_norm": 0.84765625, + "learning_rate": 0.00012334889926617746, + "loss": 0.5256, + "step": 1849 + }, + { + "epoch": 0.25, + "grad_norm": 0.86328125, + "learning_rate": 0.00012341561040693798, + "loss": 0.9965, + "step": 1850 + }, + { + "epoch": 0.25, + "grad_norm": 0.64453125, + "learning_rate": 0.00012348232154769847, + "loss": 0.7505, + "step": 1851 + }, + { + "epoch": 0.25, + "grad_norm": 0.89453125, + "learning_rate": 0.000123549032688459, + "loss": 0.8025, + "step": 1852 + }, + { + "epoch": 0.25, + "grad_norm": 0.65234375, + "learning_rate": 0.00012361574382921948, + "loss": 0.7588, + "step": 1853 + }, + { + "epoch": 0.25, + "grad_norm": 0.82421875, + "learning_rate": 0.00012368245496998, + "loss": 1.0344, + "step": 1854 + }, + { + "epoch": 0.25, + "grad_norm": 0.9765625, + "learning_rate": 0.0001237491661107405, + "loss": 0.6213, + "step": 1855 + }, + { + "epoch": 0.25, + "grad_norm": 1.1796875, + "learning_rate": 0.000123815877251501, + "loss": 0.473, + "step": 1856 + }, + { + "epoch": 0.25, + "grad_norm": 0.921875, + "learning_rate": 0.00012388258839226153, + "loss": 0.7779, + "step": 1857 + }, + { + "epoch": 0.25, + "grad_norm": 0.8671875, + "learning_rate": 0.00012394929953302202, + "loss": 0.6708, + "step": 1858 + }, + { + "epoch": 0.25, + "grad_norm": 1.0234375, + "learning_rate": 0.00012401601067378254, + "loss": 0.5649, + "step": 1859 + }, + { + "epoch": 0.25, + "grad_norm": 1.203125, + "learning_rate": 0.00012408272181454303, + "loss": 0.8803, + "step": 1860 + }, + { + "epoch": 0.25, + "grad_norm": 0.828125, + "learning_rate": 0.00012414943295530355, + "loss": 0.729, + "step": 1861 + }, + { + "epoch": 0.25, + "grad_norm": 0.80859375, + "learning_rate": 0.00012421614409606407, + "loss": 0.8512, + "step": 1862 + }, + { + "epoch": 0.25, + "grad_norm": 0.84375, + "learning_rate": 0.00012428285523682456, + "loss": 0.5793, + "step": 1863 + }, + { + "epoch": 0.25, + "grad_norm": 0.875, + "learning_rate": 0.00012434956637758505, + "loss": 0.795, + "step": 1864 + }, + { + "epoch": 0.25, + "grad_norm": 0.76171875, + "learning_rate": 0.00012441627751834557, + "loss": 0.8158, + "step": 1865 + }, + { + "epoch": 0.25, + "grad_norm": 0.765625, + "learning_rate": 0.00012448298865910609, + "loss": 0.7478, + "step": 1866 + }, + { + "epoch": 0.25, + "grad_norm": 1.0390625, + "learning_rate": 0.0001245496997998666, + "loss": 0.8919, + "step": 1867 + }, + { + "epoch": 0.25, + "grad_norm": 0.98046875, + "learning_rate": 0.0001246164109406271, + "loss": 0.9434, + "step": 1868 + }, + { + "epoch": 0.25, + "grad_norm": 0.7890625, + "learning_rate": 0.0001246831220813876, + "loss": 0.7176, + "step": 1869 + }, + { + "epoch": 0.25, + "grad_norm": 0.8359375, + "learning_rate": 0.0001247498332221481, + "loss": 0.9166, + "step": 1870 + }, + { + "epoch": 0.25, + "grad_norm": 1.03125, + "learning_rate": 0.00012481654436290862, + "loss": 0.6324, + "step": 1871 + }, + { + "epoch": 0.25, + "grad_norm": 1.234375, + "learning_rate": 0.00012488325550366912, + "loss": 0.5626, + "step": 1872 + }, + { + "epoch": 0.25, + "grad_norm": 0.71875, + "learning_rate": 0.0001249499666444296, + "loss": 0.7535, + "step": 1873 + }, + { + "epoch": 0.25, + "grad_norm": 0.70703125, + "learning_rate": 0.00012501667778519013, + "loss": 0.8196, + "step": 1874 + }, + { + "epoch": 0.25, + "grad_norm": 0.86328125, + "learning_rate": 0.00012508338892595064, + "loss": 0.6483, + "step": 1875 + }, + { + "epoch": 0.25, + "grad_norm": 0.85546875, + "learning_rate": 0.00012515010006671116, + "loss": 0.9598, + "step": 1876 + }, + { + "epoch": 0.25, + "grad_norm": 0.8671875, + "learning_rate": 0.00012521681120747165, + "loss": 0.7971, + "step": 1877 + }, + { + "epoch": 0.25, + "grad_norm": 0.8203125, + "learning_rate": 0.00012528352234823215, + "loss": 0.7211, + "step": 1878 + }, + { + "epoch": 0.25, + "grad_norm": 0.72265625, + "learning_rate": 0.00012535023348899266, + "loss": 0.7867, + "step": 1879 + }, + { + "epoch": 0.25, + "grad_norm": 0.7265625, + "learning_rate": 0.00012541694462975318, + "loss": 0.577, + "step": 1880 + }, + { + "epoch": 0.25, + "grad_norm": 0.85546875, + "learning_rate": 0.00012548365577051367, + "loss": 0.7643, + "step": 1881 + }, + { + "epoch": 0.25, + "grad_norm": 0.8671875, + "learning_rate": 0.0001255503669112742, + "loss": 0.7532, + "step": 1882 + }, + { + "epoch": 0.25, + "grad_norm": 0.9140625, + "learning_rate": 0.00012561707805203468, + "loss": 0.9703, + "step": 1883 + }, + { + "epoch": 0.25, + "grad_norm": 0.78125, + "learning_rate": 0.0001256837891927952, + "loss": 0.7669, + "step": 1884 + }, + { + "epoch": 0.25, + "grad_norm": 0.84375, + "learning_rate": 0.00012575050033355572, + "loss": 0.7455, + "step": 1885 + }, + { + "epoch": 0.25, + "grad_norm": 0.6328125, + "learning_rate": 0.0001258172114743162, + "loss": 0.8602, + "step": 1886 + }, + { + "epoch": 0.25, + "grad_norm": 0.74609375, + "learning_rate": 0.00012588392261507673, + "loss": 0.6912, + "step": 1887 + }, + { + "epoch": 0.25, + "grad_norm": 0.7890625, + "learning_rate": 0.00012595063375583722, + "loss": 0.652, + "step": 1888 + }, + { + "epoch": 0.25, + "grad_norm": 0.671875, + "learning_rate": 0.00012601734489659774, + "loss": 0.9731, + "step": 1889 + }, + { + "epoch": 0.25, + "grad_norm": 0.6015625, + "learning_rate": 0.00012608405603735823, + "loss": 0.8472, + "step": 1890 + }, + { + "epoch": 0.25, + "grad_norm": 0.75, + "learning_rate": 0.00012615076717811875, + "loss": 1.1115, + "step": 1891 + }, + { + "epoch": 0.25, + "grad_norm": 0.9140625, + "learning_rate": 0.00012621747831887927, + "loss": 0.7456, + "step": 1892 + }, + { + "epoch": 0.25, + "grad_norm": 0.82421875, + "learning_rate": 0.00012628418945963976, + "loss": 0.9344, + "step": 1893 + }, + { + "epoch": 0.25, + "grad_norm": 0.9140625, + "learning_rate": 0.00012635090060040028, + "loss": 0.6444, + "step": 1894 + }, + { + "epoch": 0.25, + "grad_norm": 0.953125, + "learning_rate": 0.00012641761174116077, + "loss": 0.7733, + "step": 1895 + }, + { + "epoch": 0.25, + "grad_norm": 0.76171875, + "learning_rate": 0.0001264843228819213, + "loss": 0.8494, + "step": 1896 + }, + { + "epoch": 0.25, + "grad_norm": 0.6640625, + "learning_rate": 0.0001265510340226818, + "loss": 0.6682, + "step": 1897 + }, + { + "epoch": 0.25, + "grad_norm": 1.0390625, + "learning_rate": 0.0001266177451634423, + "loss": 0.7612, + "step": 1898 + }, + { + "epoch": 0.25, + "grad_norm": 0.671875, + "learning_rate": 0.0001266844563042028, + "loss": 0.6914, + "step": 1899 + }, + { + "epoch": 0.25, + "grad_norm": 0.67578125, + "learning_rate": 0.0001267511674449633, + "loss": 0.5673, + "step": 1900 + }, + { + "epoch": 0.25, + "grad_norm": 0.92578125, + "learning_rate": 0.00012681787858572383, + "loss": 0.6872, + "step": 1901 + }, + { + "epoch": 0.25, + "grad_norm": 0.73046875, + "learning_rate": 0.00012688458972648435, + "loss": 0.4734, + "step": 1902 + }, + { + "epoch": 0.25, + "grad_norm": 0.73828125, + "learning_rate": 0.00012695130086724484, + "loss": 0.8021, + "step": 1903 + }, + { + "epoch": 0.25, + "grad_norm": 1.015625, + "learning_rate": 0.00012701801200800533, + "loss": 1.0223, + "step": 1904 + }, + { + "epoch": 0.25, + "grad_norm": 0.6796875, + "learning_rate": 0.00012708472314876585, + "loss": 0.7358, + "step": 1905 + }, + { + "epoch": 0.25, + "grad_norm": 0.78125, + "learning_rate": 0.00012715143428952637, + "loss": 0.8303, + "step": 1906 + }, + { + "epoch": 0.25, + "grad_norm": 0.828125, + "learning_rate": 0.00012721814543028686, + "loss": 0.7058, + "step": 1907 + }, + { + "epoch": 0.25, + "grad_norm": 0.953125, + "learning_rate": 0.00012728485657104735, + "loss": 0.7419, + "step": 1908 + }, + { + "epoch": 0.25, + "grad_norm": 0.67578125, + "learning_rate": 0.00012735156771180787, + "loss": 0.4448, + "step": 1909 + }, + { + "epoch": 0.25, + "grad_norm": 0.67578125, + "learning_rate": 0.0001274182788525684, + "loss": 0.7135, + "step": 1910 + }, + { + "epoch": 0.26, + "grad_norm": 0.91015625, + "learning_rate": 0.0001274849899933289, + "loss": 0.9302, + "step": 1911 + }, + { + "epoch": 0.26, + "grad_norm": 1.03125, + "learning_rate": 0.0001275517011340894, + "loss": 1.0426, + "step": 1912 + }, + { + "epoch": 0.26, + "grad_norm": 0.765625, + "learning_rate": 0.0001276184122748499, + "loss": 0.6871, + "step": 1913 + }, + { + "epoch": 0.26, + "grad_norm": 0.890625, + "learning_rate": 0.0001276851234156104, + "loss": 0.6891, + "step": 1914 + }, + { + "epoch": 0.26, + "grad_norm": 0.765625, + "learning_rate": 0.00012775183455637093, + "loss": 0.8417, + "step": 1915 + }, + { + "epoch": 0.26, + "grad_norm": 0.87109375, + "learning_rate": 0.00012781854569713142, + "loss": 0.7444, + "step": 1916 + }, + { + "epoch": 0.26, + "grad_norm": 0.7265625, + "learning_rate": 0.00012788525683789194, + "loss": 0.6363, + "step": 1917 + }, + { + "epoch": 0.26, + "grad_norm": 0.609375, + "learning_rate": 0.00012795196797865243, + "loss": 0.5482, + "step": 1918 + }, + { + "epoch": 0.26, + "grad_norm": 1.0625, + "learning_rate": 0.00012801867911941295, + "loss": 0.8549, + "step": 1919 + }, + { + "epoch": 0.26, + "grad_norm": 1.1328125, + "learning_rate": 0.00012808539026017347, + "loss": 0.9281, + "step": 1920 + }, + { + "epoch": 0.26, + "grad_norm": 0.859375, + "learning_rate": 0.00012815210140093396, + "loss": 0.5513, + "step": 1921 + }, + { + "epoch": 0.26, + "grad_norm": 0.796875, + "learning_rate": 0.00012821881254169448, + "loss": 0.869, + "step": 1922 + }, + { + "epoch": 0.26, + "grad_norm": 0.703125, + "learning_rate": 0.00012828552368245497, + "loss": 0.6359, + "step": 1923 + }, + { + "epoch": 0.26, + "grad_norm": 0.77734375, + "learning_rate": 0.00012835223482321549, + "loss": 0.4928, + "step": 1924 + }, + { + "epoch": 0.26, + "grad_norm": 0.84375, + "learning_rate": 0.00012841894596397598, + "loss": 0.5698, + "step": 1925 + }, + { + "epoch": 0.26, + "grad_norm": 0.73046875, + "learning_rate": 0.0001284856571047365, + "loss": 0.6124, + "step": 1926 + }, + { + "epoch": 0.26, + "grad_norm": 0.9296875, + "learning_rate": 0.00012855236824549701, + "loss": 0.756, + "step": 1927 + }, + { + "epoch": 0.26, + "grad_norm": 0.78125, + "learning_rate": 0.0001286190793862575, + "loss": 0.536, + "step": 1928 + }, + { + "epoch": 0.26, + "grad_norm": 0.8671875, + "learning_rate": 0.00012868579052701802, + "loss": 0.8903, + "step": 1929 + }, + { + "epoch": 0.26, + "grad_norm": 0.76953125, + "learning_rate": 0.00012875250166777852, + "loss": 0.8855, + "step": 1930 + }, + { + "epoch": 0.26, + "grad_norm": 0.76171875, + "learning_rate": 0.00012881921280853903, + "loss": 1.0304, + "step": 1931 + }, + { + "epoch": 0.26, + "grad_norm": 0.91796875, + "learning_rate": 0.00012888592394929955, + "loss": 0.7894, + "step": 1932 + }, + { + "epoch": 0.26, + "grad_norm": 0.921875, + "learning_rate": 0.00012895263509006004, + "loss": 0.7037, + "step": 1933 + }, + { + "epoch": 0.26, + "grad_norm": 0.96484375, + "learning_rate": 0.00012901934623082054, + "loss": 1.0501, + "step": 1934 + }, + { + "epoch": 0.26, + "grad_norm": 0.7890625, + "learning_rate": 0.00012908605737158105, + "loss": 0.7055, + "step": 1935 + }, + { + "epoch": 0.26, + "grad_norm": 0.88671875, + "learning_rate": 0.00012915276851234157, + "loss": 0.7055, + "step": 1936 + }, + { + "epoch": 0.26, + "grad_norm": 0.84375, + "learning_rate": 0.0001292194796531021, + "loss": 0.6458, + "step": 1937 + }, + { + "epoch": 0.26, + "grad_norm": 0.69140625, + "learning_rate": 0.00012928619079386258, + "loss": 0.6128, + "step": 1938 + }, + { + "epoch": 0.26, + "grad_norm": 0.71875, + "learning_rate": 0.00012935290193462307, + "loss": 0.4287, + "step": 1939 + }, + { + "epoch": 0.26, + "grad_norm": 0.7734375, + "learning_rate": 0.0001294196130753836, + "loss": 0.6748, + "step": 1940 + }, + { + "epoch": 0.26, + "grad_norm": 0.94921875, + "learning_rate": 0.0001294863242161441, + "loss": 0.855, + "step": 1941 + }, + { + "epoch": 0.26, + "grad_norm": 0.7578125, + "learning_rate": 0.00012955303535690463, + "loss": 0.6143, + "step": 1942 + }, + { + "epoch": 0.26, + "grad_norm": 0.69140625, + "learning_rate": 0.0001296197464976651, + "loss": 0.5645, + "step": 1943 + }, + { + "epoch": 0.26, + "grad_norm": 0.6953125, + "learning_rate": 0.00012968645763842561, + "loss": 0.8424, + "step": 1944 + }, + { + "epoch": 0.26, + "grad_norm": 0.82421875, + "learning_rate": 0.00012975316877918613, + "loss": 0.7727, + "step": 1945 + }, + { + "epoch": 0.26, + "grad_norm": 0.99609375, + "learning_rate": 0.00012981987991994665, + "loss": 0.7918, + "step": 1946 + }, + { + "epoch": 0.26, + "grad_norm": 0.71484375, + "learning_rate": 0.00012988659106070714, + "loss": 0.7297, + "step": 1947 + }, + { + "epoch": 0.26, + "grad_norm": 0.796875, + "learning_rate": 0.00012995330220146763, + "loss": 0.8738, + "step": 1948 + }, + { + "epoch": 0.26, + "grad_norm": 1.3125, + "learning_rate": 0.00013002001334222815, + "loss": 0.8422, + "step": 1949 + }, + { + "epoch": 0.26, + "grad_norm": 0.703125, + "learning_rate": 0.00013008672448298867, + "loss": 0.5588, + "step": 1950 + }, + { + "epoch": 0.26, + "grad_norm": 0.94140625, + "learning_rate": 0.00013015343562374916, + "loss": 0.9762, + "step": 1951 + }, + { + "epoch": 0.26, + "grad_norm": 0.796875, + "learning_rate": 0.00013022014676450968, + "loss": 0.8019, + "step": 1952 + }, + { + "epoch": 0.26, + "grad_norm": 0.76953125, + "learning_rate": 0.00013028685790527017, + "loss": 0.536, + "step": 1953 + }, + { + "epoch": 0.26, + "grad_norm": 0.9296875, + "learning_rate": 0.0001303535690460307, + "loss": 0.791, + "step": 1954 + }, + { + "epoch": 0.26, + "grad_norm": 0.80859375, + "learning_rate": 0.0001304202801867912, + "loss": 0.7134, + "step": 1955 + }, + { + "epoch": 0.26, + "grad_norm": 0.796875, + "learning_rate": 0.0001304869913275517, + "loss": 0.8589, + "step": 1956 + }, + { + "epoch": 0.26, + "grad_norm": 0.69140625, + "learning_rate": 0.00013055370246831222, + "loss": 0.5456, + "step": 1957 + }, + { + "epoch": 0.26, + "grad_norm": 0.6796875, + "learning_rate": 0.0001306204136090727, + "loss": 0.9595, + "step": 1958 + }, + { + "epoch": 0.26, + "grad_norm": 1.1171875, + "learning_rate": 0.00013068712474983323, + "loss": 0.8115, + "step": 1959 + }, + { + "epoch": 0.26, + "grad_norm": 0.7890625, + "learning_rate": 0.00013075383589059372, + "loss": 1.1915, + "step": 1960 + }, + { + "epoch": 0.26, + "grad_norm": 0.7421875, + "learning_rate": 0.00013082054703135424, + "loss": 0.4704, + "step": 1961 + }, + { + "epoch": 0.26, + "grad_norm": 0.8671875, + "learning_rate": 0.00013088725817211476, + "loss": 0.8236, + "step": 1962 + }, + { + "epoch": 0.26, + "grad_norm": 0.9453125, + "learning_rate": 0.00013095396931287525, + "loss": 0.6653, + "step": 1963 + }, + { + "epoch": 0.26, + "grad_norm": 1.0078125, + "learning_rate": 0.00013102068045363577, + "loss": 1.1326, + "step": 1964 + }, + { + "epoch": 0.26, + "grad_norm": 0.671875, + "learning_rate": 0.00013108739159439626, + "loss": 0.8018, + "step": 1965 + }, + { + "epoch": 0.26, + "grad_norm": 0.734375, + "learning_rate": 0.00013115410273515678, + "loss": 0.7357, + "step": 1966 + }, + { + "epoch": 0.26, + "grad_norm": 0.796875, + "learning_rate": 0.0001312208138759173, + "loss": 0.7012, + "step": 1967 + }, + { + "epoch": 0.26, + "grad_norm": 1.40625, + "learning_rate": 0.0001312875250166778, + "loss": 0.6394, + "step": 1968 + }, + { + "epoch": 0.26, + "grad_norm": 1.0, + "learning_rate": 0.00013135423615743828, + "loss": 0.6811, + "step": 1969 + }, + { + "epoch": 0.26, + "grad_norm": 0.71875, + "learning_rate": 0.0001314209472981988, + "loss": 0.5684, + "step": 1970 + }, + { + "epoch": 0.26, + "grad_norm": 0.78515625, + "learning_rate": 0.00013148765843895932, + "loss": 0.8457, + "step": 1971 + }, + { + "epoch": 0.26, + "grad_norm": 0.69921875, + "learning_rate": 0.00013155436957971984, + "loss": 0.9433, + "step": 1972 + }, + { + "epoch": 0.26, + "grad_norm": 0.65234375, + "learning_rate": 0.00013162108072048033, + "loss": 0.8544, + "step": 1973 + }, + { + "epoch": 0.26, + "grad_norm": 0.72265625, + "learning_rate": 0.00013168779186124082, + "loss": 0.4747, + "step": 1974 + }, + { + "epoch": 0.26, + "grad_norm": 0.65625, + "learning_rate": 0.00013175450300200134, + "loss": 0.6188, + "step": 1975 + }, + { + "epoch": 0.26, + "grad_norm": 0.828125, + "learning_rate": 0.00013182121414276186, + "loss": 0.9186, + "step": 1976 + }, + { + "epoch": 0.26, + "grad_norm": 0.75, + "learning_rate": 0.00013188792528352237, + "loss": 0.6744, + "step": 1977 + }, + { + "epoch": 0.26, + "grad_norm": 1.1484375, + "learning_rate": 0.00013195463642428284, + "loss": 0.6511, + "step": 1978 + }, + { + "epoch": 0.26, + "grad_norm": 1.0703125, + "learning_rate": 0.00013202134756504336, + "loss": 0.847, + "step": 1979 + }, + { + "epoch": 0.26, + "grad_norm": 1.328125, + "learning_rate": 0.00013208805870580388, + "loss": 0.7089, + "step": 1980 + }, + { + "epoch": 0.26, + "grad_norm": 1.1015625, + "learning_rate": 0.0001321547698465644, + "loss": 0.8121, + "step": 1981 + }, + { + "epoch": 0.26, + "grad_norm": 0.72265625, + "learning_rate": 0.00013222148098732489, + "loss": 0.8035, + "step": 1982 + }, + { + "epoch": 0.26, + "grad_norm": 0.85546875, + "learning_rate": 0.00013228819212808538, + "loss": 0.6551, + "step": 1983 + }, + { + "epoch": 0.26, + "grad_norm": 0.7890625, + "learning_rate": 0.0001323549032688459, + "loss": 0.722, + "step": 1984 + }, + { + "epoch": 0.26, + "grad_norm": 0.875, + "learning_rate": 0.00013242161440960641, + "loss": 0.4667, + "step": 1985 + }, + { + "epoch": 0.27, + "grad_norm": 1.0078125, + "learning_rate": 0.00013248832555036693, + "loss": 1.1023, + "step": 1986 + }, + { + "epoch": 0.27, + "grad_norm": 0.75390625, + "learning_rate": 0.00013255503669112743, + "loss": 0.8061, + "step": 1987 + }, + { + "epoch": 0.27, + "grad_norm": 0.89453125, + "learning_rate": 0.00013262174783188792, + "loss": 0.9598, + "step": 1988 + }, + { + "epoch": 0.27, + "grad_norm": 0.87890625, + "learning_rate": 0.00013268845897264844, + "loss": 0.6706, + "step": 1989 + }, + { + "epoch": 0.27, + "grad_norm": 0.9921875, + "learning_rate": 0.00013275517011340895, + "loss": 0.804, + "step": 1990 + }, + { + "epoch": 0.27, + "grad_norm": 0.609375, + "learning_rate": 0.00013282188125416945, + "loss": 0.8535, + "step": 1991 + }, + { + "epoch": 0.27, + "grad_norm": 0.8203125, + "learning_rate": 0.00013288859239492996, + "loss": 0.7031, + "step": 1992 + }, + { + "epoch": 0.27, + "grad_norm": 0.859375, + "learning_rate": 0.00013295530353569046, + "loss": 0.6633, + "step": 1993 + }, + { + "epoch": 0.27, + "grad_norm": 0.72265625, + "learning_rate": 0.00013302201467645097, + "loss": 0.8696, + "step": 1994 + }, + { + "epoch": 0.27, + "grad_norm": 0.703125, + "learning_rate": 0.0001330887258172115, + "loss": 0.6766, + "step": 1995 + }, + { + "epoch": 0.27, + "grad_norm": 0.7578125, + "learning_rate": 0.00013315543695797198, + "loss": 0.6703, + "step": 1996 + }, + { + "epoch": 0.27, + "grad_norm": 0.80859375, + "learning_rate": 0.0001332221480987325, + "loss": 0.6321, + "step": 1997 + }, + { + "epoch": 0.27, + "grad_norm": 0.98046875, + "learning_rate": 0.000133288859239493, + "loss": 0.6984, + "step": 1998 + }, + { + "epoch": 0.27, + "grad_norm": 0.7890625, + "learning_rate": 0.0001333555703802535, + "loss": 0.879, + "step": 1999 + }, + { + "epoch": 0.27, + "grad_norm": 1.0078125, + "learning_rate": 0.000133422281521014, + "loss": 0.5366, + "step": 2000 + }, + { + "epoch": 0.27, + "grad_norm": 0.60546875, + "learning_rate": 0.00013348899266177452, + "loss": 0.4656, + "step": 2001 + }, + { + "epoch": 0.27, + "grad_norm": 0.703125, + "learning_rate": 0.00013355570380253504, + "loss": 0.7375, + "step": 2002 + }, + { + "epoch": 0.27, + "grad_norm": 1.0703125, + "learning_rate": 0.00013362241494329553, + "loss": 0.8313, + "step": 2003 + }, + { + "epoch": 0.27, + "grad_norm": 0.75, + "learning_rate": 0.00013368912608405605, + "loss": 0.6798, + "step": 2004 + }, + { + "epoch": 0.27, + "grad_norm": 0.91796875, + "learning_rate": 0.00013375583722481654, + "loss": 1.0903, + "step": 2005 + }, + { + "epoch": 0.27, + "grad_norm": 0.765625, + "learning_rate": 0.00013382254836557706, + "loss": 0.6915, + "step": 2006 + }, + { + "epoch": 0.27, + "grad_norm": 0.81640625, + "learning_rate": 0.00013388925950633758, + "loss": 0.7384, + "step": 2007 + }, + { + "epoch": 0.27, + "grad_norm": 0.87890625, + "learning_rate": 0.00013395597064709807, + "loss": 0.516, + "step": 2008 + }, + { + "epoch": 0.27, + "grad_norm": 0.83203125, + "learning_rate": 0.00013402268178785856, + "loss": 0.7479, + "step": 2009 + }, + { + "epoch": 0.27, + "grad_norm": 0.8046875, + "learning_rate": 0.00013408939292861908, + "loss": 1.0075, + "step": 2010 + }, + { + "epoch": 0.27, + "grad_norm": 0.69921875, + "learning_rate": 0.0001341561040693796, + "loss": 0.6768, + "step": 2011 + }, + { + "epoch": 0.27, + "grad_norm": 0.6484375, + "learning_rate": 0.00013422281521014012, + "loss": 0.8447, + "step": 2012 + }, + { + "epoch": 0.27, + "grad_norm": 0.82421875, + "learning_rate": 0.00013428952635090058, + "loss": 0.6533, + "step": 2013 + }, + { + "epoch": 0.27, + "grad_norm": 0.69921875, + "learning_rate": 0.0001343562374916611, + "loss": 0.6732, + "step": 2014 + }, + { + "epoch": 0.27, + "grad_norm": 0.73046875, + "learning_rate": 0.00013442294863242162, + "loss": 0.6713, + "step": 2015 + }, + { + "epoch": 0.27, + "grad_norm": 1.015625, + "learning_rate": 0.00013448965977318214, + "loss": 0.7823, + "step": 2016 + }, + { + "epoch": 0.27, + "grad_norm": 0.765625, + "learning_rate": 0.00013455637091394263, + "loss": 0.7089, + "step": 2017 + }, + { + "epoch": 0.27, + "grad_norm": 0.796875, + "learning_rate": 0.00013462308205470312, + "loss": 0.9631, + "step": 2018 + }, + { + "epoch": 0.27, + "grad_norm": 0.9375, + "learning_rate": 0.00013468979319546364, + "loss": 0.6562, + "step": 2019 + }, + { + "epoch": 0.27, + "grad_norm": 0.92578125, + "learning_rate": 0.00013475650433622416, + "loss": 0.958, + "step": 2020 + }, + { + "epoch": 0.27, + "grad_norm": 0.78515625, + "learning_rate": 0.00013482321547698468, + "loss": 0.556, + "step": 2021 + }, + { + "epoch": 0.27, + "grad_norm": 1.109375, + "learning_rate": 0.00013488992661774517, + "loss": 0.7544, + "step": 2022 + }, + { + "epoch": 0.27, + "grad_norm": 0.66015625, + "learning_rate": 0.00013495663775850566, + "loss": 0.6381, + "step": 2023 + }, + { + "epoch": 0.27, + "grad_norm": 0.76171875, + "learning_rate": 0.00013502334889926618, + "loss": 0.5211, + "step": 2024 + }, + { + "epoch": 0.27, + "grad_norm": 0.6328125, + "learning_rate": 0.0001350900600400267, + "loss": 0.4673, + "step": 2025 + }, + { + "epoch": 0.27, + "grad_norm": 0.74609375, + "learning_rate": 0.0001351567711807872, + "loss": 0.9756, + "step": 2026 + }, + { + "epoch": 0.27, + "grad_norm": 0.73046875, + "learning_rate": 0.0001352234823215477, + "loss": 0.7699, + "step": 2027 + }, + { + "epoch": 0.27, + "grad_norm": 0.74609375, + "learning_rate": 0.0001352901934623082, + "loss": 0.4781, + "step": 2028 + }, + { + "epoch": 0.27, + "grad_norm": 0.80078125, + "learning_rate": 0.00013535690460306872, + "loss": 0.7582, + "step": 2029 + }, + { + "epoch": 0.27, + "grad_norm": 0.84765625, + "learning_rate": 0.00013542361574382924, + "loss": 0.5259, + "step": 2030 + }, + { + "epoch": 0.27, + "grad_norm": 0.6640625, + "learning_rate": 0.00013549032688458973, + "loss": 0.5077, + "step": 2031 + }, + { + "epoch": 0.27, + "grad_norm": 0.859375, + "learning_rate": 0.00013555703802535025, + "loss": 0.7103, + "step": 2032 + }, + { + "epoch": 0.27, + "grad_norm": 0.78125, + "learning_rate": 0.00013562374916611074, + "loss": 0.9572, + "step": 2033 + }, + { + "epoch": 0.27, + "grad_norm": 1.0, + "learning_rate": 0.00013569046030687126, + "loss": 0.6365, + "step": 2034 + }, + { + "epoch": 0.27, + "grad_norm": 0.6875, + "learning_rate": 0.00013575717144763175, + "loss": 0.7853, + "step": 2035 + }, + { + "epoch": 0.27, + "grad_norm": 0.71484375, + "learning_rate": 0.00013582388258839227, + "loss": 0.7789, + "step": 2036 + }, + { + "epoch": 0.27, + "grad_norm": 0.828125, + "learning_rate": 0.00013589059372915279, + "loss": 0.7427, + "step": 2037 + }, + { + "epoch": 0.27, + "grad_norm": 0.70703125, + "learning_rate": 0.00013595730486991328, + "loss": 0.8533, + "step": 2038 + }, + { + "epoch": 0.27, + "grad_norm": 0.67578125, + "learning_rate": 0.0001360240160106738, + "loss": 0.6894, + "step": 2039 + }, + { + "epoch": 0.27, + "grad_norm": 0.7109375, + "learning_rate": 0.0001360907271514343, + "loss": 0.732, + "step": 2040 + }, + { + "epoch": 0.27, + "grad_norm": 0.76953125, + "learning_rate": 0.0001361574382921948, + "loss": 0.67, + "step": 2041 + }, + { + "epoch": 0.27, + "grad_norm": 1.21875, + "learning_rate": 0.00013622414943295532, + "loss": 0.6834, + "step": 2042 + }, + { + "epoch": 0.27, + "grad_norm": 0.68359375, + "learning_rate": 0.00013629086057371582, + "loss": 0.615, + "step": 2043 + }, + { + "epoch": 0.27, + "grad_norm": 0.75390625, + "learning_rate": 0.0001363575717144763, + "loss": 0.9512, + "step": 2044 + }, + { + "epoch": 0.27, + "grad_norm": 1.1171875, + "learning_rate": 0.00013642428285523683, + "loss": 0.6372, + "step": 2045 + }, + { + "epoch": 0.27, + "grad_norm": 0.9765625, + "learning_rate": 0.00013649099399599734, + "loss": 0.6457, + "step": 2046 + }, + { + "epoch": 0.27, + "grad_norm": 0.8359375, + "learning_rate": 0.00013655770513675786, + "loss": 0.9498, + "step": 2047 + }, + { + "epoch": 0.27, + "grad_norm": 0.984375, + "learning_rate": 0.00013662441627751835, + "loss": 0.5593, + "step": 2048 + }, + { + "epoch": 0.27, + "grad_norm": 0.8125, + "learning_rate": 0.00013669112741827885, + "loss": 0.9204, + "step": 2049 + }, + { + "epoch": 0.27, + "grad_norm": 0.8359375, + "learning_rate": 0.00013675783855903936, + "loss": 0.7602, + "step": 2050 + }, + { + "epoch": 0.27, + "grad_norm": 0.640625, + "learning_rate": 0.00013682454969979988, + "loss": 0.7589, + "step": 2051 + }, + { + "epoch": 0.27, + "grad_norm": 1.0390625, + "learning_rate": 0.00013689126084056037, + "loss": 0.6498, + "step": 2052 + }, + { + "epoch": 0.27, + "grad_norm": 0.66015625, + "learning_rate": 0.00013695797198132087, + "loss": 0.6173, + "step": 2053 + }, + { + "epoch": 0.27, + "grad_norm": 0.68359375, + "learning_rate": 0.00013702468312208138, + "loss": 0.6911, + "step": 2054 + }, + { + "epoch": 0.27, + "grad_norm": 0.67578125, + "learning_rate": 0.0001370913942628419, + "loss": 0.844, + "step": 2055 + }, + { + "epoch": 0.27, + "grad_norm": 1.1328125, + "learning_rate": 0.00013715810540360242, + "loss": 0.7604, + "step": 2056 + }, + { + "epoch": 0.27, + "grad_norm": 0.70703125, + "learning_rate": 0.0001372248165443629, + "loss": 0.7486, + "step": 2057 + }, + { + "epoch": 0.27, + "grad_norm": 0.73828125, + "learning_rate": 0.0001372915276851234, + "loss": 0.5879, + "step": 2058 + }, + { + "epoch": 0.27, + "grad_norm": 0.88671875, + "learning_rate": 0.00013735823882588392, + "loss": 0.809, + "step": 2059 + }, + { + "epoch": 0.27, + "grad_norm": 0.921875, + "learning_rate": 0.00013742494996664444, + "loss": 0.7651, + "step": 2060 + }, + { + "epoch": 0.28, + "grad_norm": 0.6796875, + "learning_rate": 0.00013749166110740493, + "loss": 0.6102, + "step": 2061 + }, + { + "epoch": 0.28, + "grad_norm": 0.6796875, + "learning_rate": 0.00013755837224816545, + "loss": 0.9437, + "step": 2062 + }, + { + "epoch": 0.28, + "grad_norm": 0.6171875, + "learning_rate": 0.00013762508338892597, + "loss": 0.7819, + "step": 2063 + }, + { + "epoch": 0.28, + "grad_norm": 0.83984375, + "learning_rate": 0.00013769179452968646, + "loss": 0.9454, + "step": 2064 + }, + { + "epoch": 0.28, + "grad_norm": 0.76953125, + "learning_rate": 0.00013775850567044698, + "loss": 0.6558, + "step": 2065 + }, + { + "epoch": 0.28, + "grad_norm": 0.71484375, + "learning_rate": 0.00013782521681120747, + "loss": 0.6198, + "step": 2066 + }, + { + "epoch": 0.28, + "grad_norm": 0.6171875, + "learning_rate": 0.000137891927951968, + "loss": 0.4548, + "step": 2067 + }, + { + "epoch": 0.28, + "grad_norm": 0.7109375, + "learning_rate": 0.0001379586390927285, + "loss": 0.5543, + "step": 2068 + }, + { + "epoch": 0.28, + "grad_norm": 0.71875, + "learning_rate": 0.000138025350233489, + "loss": 0.7685, + "step": 2069 + }, + { + "epoch": 0.28, + "grad_norm": 0.96484375, + "learning_rate": 0.0001380920613742495, + "loss": 0.9998, + "step": 2070 + }, + { + "epoch": 0.28, + "grad_norm": 0.83203125, + "learning_rate": 0.00013815877251501, + "loss": 0.6528, + "step": 2071 + }, + { + "epoch": 0.28, + "grad_norm": 0.76171875, + "learning_rate": 0.00013822548365577053, + "loss": 0.4474, + "step": 2072 + }, + { + "epoch": 0.28, + "grad_norm": 0.94921875, + "learning_rate": 0.00013829219479653105, + "loss": 0.9168, + "step": 2073 + }, + { + "epoch": 0.28, + "grad_norm": 0.8359375, + "learning_rate": 0.00013835890593729154, + "loss": 0.952, + "step": 2074 + }, + { + "epoch": 0.28, + "grad_norm": 0.7109375, + "learning_rate": 0.00013842561707805203, + "loss": 0.7991, + "step": 2075 + }, + { + "epoch": 0.28, + "grad_norm": 0.8125, + "learning_rate": 0.00013849232821881255, + "loss": 0.7031, + "step": 2076 + }, + { + "epoch": 0.28, + "grad_norm": 0.83984375, + "learning_rate": 0.00013855903935957307, + "loss": 0.4657, + "step": 2077 + }, + { + "epoch": 0.28, + "grad_norm": 0.6796875, + "learning_rate": 0.0001386257505003336, + "loss": 0.7354, + "step": 2078 + }, + { + "epoch": 0.28, + "grad_norm": 1.21875, + "learning_rate": 0.00013869246164109405, + "loss": 0.6255, + "step": 2079 + }, + { + "epoch": 0.28, + "grad_norm": 0.83203125, + "learning_rate": 0.00013875917278185457, + "loss": 0.8715, + "step": 2080 + }, + { + "epoch": 0.28, + "grad_norm": 0.89453125, + "learning_rate": 0.0001388258839226151, + "loss": 0.6586, + "step": 2081 + }, + { + "epoch": 0.28, + "grad_norm": 0.796875, + "learning_rate": 0.0001388925950633756, + "loss": 1.1798, + "step": 2082 + }, + { + "epoch": 0.28, + "grad_norm": 0.625, + "learning_rate": 0.0001389593062041361, + "loss": 0.8335, + "step": 2083 + }, + { + "epoch": 0.28, + "grad_norm": 0.78125, + "learning_rate": 0.0001390260173448966, + "loss": 0.7593, + "step": 2084 + }, + { + "epoch": 0.28, + "grad_norm": 1.015625, + "learning_rate": 0.0001390927284856571, + "loss": 0.573, + "step": 2085 + }, + { + "epoch": 0.28, + "grad_norm": 0.5546875, + "learning_rate": 0.00013915943962641763, + "loss": 0.5087, + "step": 2086 + }, + { + "epoch": 0.28, + "grad_norm": 0.765625, + "learning_rate": 0.00013922615076717812, + "loss": 0.6046, + "step": 2087 + }, + { + "epoch": 0.28, + "grad_norm": 0.83984375, + "learning_rate": 0.00013929286190793864, + "loss": 0.3238, + "step": 2088 + }, + { + "epoch": 0.28, + "grad_norm": 0.7734375, + "learning_rate": 0.00013935957304869913, + "loss": 0.7754, + "step": 2089 + }, + { + "epoch": 0.28, + "grad_norm": 0.953125, + "learning_rate": 0.00013942628418945965, + "loss": 0.7256, + "step": 2090 + }, + { + "epoch": 0.28, + "grad_norm": 0.609375, + "learning_rate": 0.00013949299533022017, + "loss": 0.4344, + "step": 2091 + }, + { + "epoch": 0.28, + "grad_norm": 0.90625, + "learning_rate": 0.00013955970647098066, + "loss": 1.0119, + "step": 2092 + }, + { + "epoch": 0.28, + "grad_norm": 1.28125, + "learning_rate": 0.00013962641761174118, + "loss": 0.8886, + "step": 2093 + }, + { + "epoch": 0.28, + "grad_norm": 0.90234375, + "learning_rate": 0.00013969312875250167, + "loss": 0.9071, + "step": 2094 + }, + { + "epoch": 0.28, + "grad_norm": 0.6953125, + "learning_rate": 0.00013975983989326219, + "loss": 0.7736, + "step": 2095 + }, + { + "epoch": 0.28, + "grad_norm": 0.80078125, + "learning_rate": 0.00013982655103402268, + "loss": 0.5987, + "step": 2096 + }, + { + "epoch": 0.28, + "grad_norm": 0.85546875, + "learning_rate": 0.0001398932621747832, + "loss": 0.577, + "step": 2097 + }, + { + "epoch": 0.28, + "grad_norm": 0.83203125, + "learning_rate": 0.00013995997331554371, + "loss": 0.8354, + "step": 2098 + }, + { + "epoch": 0.28, + "grad_norm": 0.82421875, + "learning_rate": 0.0001400266844563042, + "loss": 0.9689, + "step": 2099 + }, + { + "epoch": 0.28, + "grad_norm": 0.68359375, + "learning_rate": 0.00014009339559706472, + "loss": 0.5283, + "step": 2100 + }, + { + "epoch": 0.28, + "grad_norm": 0.8359375, + "learning_rate": 0.00014016010673782522, + "loss": 0.7106, + "step": 2101 + }, + { + "epoch": 0.28, + "grad_norm": 0.7421875, + "learning_rate": 0.00014022681787858573, + "loss": 0.5303, + "step": 2102 + }, + { + "epoch": 0.28, + "grad_norm": 0.7265625, + "learning_rate": 0.00014029352901934625, + "loss": 0.7955, + "step": 2103 + }, + { + "epoch": 0.28, + "grad_norm": 0.82421875, + "learning_rate": 0.00014036024016010674, + "loss": 0.8164, + "step": 2104 + }, + { + "epoch": 0.28, + "grad_norm": 1.0546875, + "learning_rate": 0.00014042695130086724, + "loss": 0.9231, + "step": 2105 + }, + { + "epoch": 0.28, + "grad_norm": 0.921875, + "learning_rate": 0.00014049366244162775, + "loss": 0.7221, + "step": 2106 + }, + { + "epoch": 0.28, + "grad_norm": 0.65234375, + "learning_rate": 0.00014056037358238827, + "loss": 0.7519, + "step": 2107 + }, + { + "epoch": 0.28, + "grad_norm": 0.7734375, + "learning_rate": 0.0001406270847231488, + "loss": 0.9045, + "step": 2108 + }, + { + "epoch": 0.28, + "grad_norm": 0.84765625, + "learning_rate": 0.00014069379586390928, + "loss": 0.6653, + "step": 2109 + }, + { + "epoch": 0.28, + "grad_norm": 0.671875, + "learning_rate": 0.00014076050700466977, + "loss": 0.5892, + "step": 2110 + }, + { + "epoch": 0.28, + "grad_norm": 0.74609375, + "learning_rate": 0.0001408272181454303, + "loss": 0.7628, + "step": 2111 + }, + { + "epoch": 0.28, + "grad_norm": 0.7109375, + "learning_rate": 0.0001408939292861908, + "loss": 0.7211, + "step": 2112 + }, + { + "epoch": 0.28, + "grad_norm": 0.73828125, + "learning_rate": 0.00014096064042695133, + "loss": 0.7328, + "step": 2113 + }, + { + "epoch": 0.28, + "grad_norm": 0.96875, + "learning_rate": 0.0001410273515677118, + "loss": 0.7902, + "step": 2114 + }, + { + "epoch": 0.28, + "grad_norm": 0.70703125, + "learning_rate": 0.00014109406270847231, + "loss": 0.655, + "step": 2115 + }, + { + "epoch": 0.28, + "grad_norm": 0.94921875, + "learning_rate": 0.00014116077384923283, + "loss": 0.5938, + "step": 2116 + }, + { + "epoch": 0.28, + "grad_norm": 0.9375, + "learning_rate": 0.00014122748498999335, + "loss": 0.6429, + "step": 2117 + }, + { + "epoch": 0.28, + "grad_norm": 0.8671875, + "learning_rate": 0.00014129419613075384, + "loss": 0.6079, + "step": 2118 + }, + { + "epoch": 0.28, + "grad_norm": 0.88671875, + "learning_rate": 0.00014136090727151433, + "loss": 0.7344, + "step": 2119 + }, + { + "epoch": 0.28, + "grad_norm": 0.7734375, + "learning_rate": 0.00014142761841227485, + "loss": 0.8461, + "step": 2120 + }, + { + "epoch": 0.28, + "grad_norm": 0.94140625, + "learning_rate": 0.00014149432955303537, + "loss": 0.8156, + "step": 2121 + }, + { + "epoch": 0.28, + "grad_norm": 0.98046875, + "learning_rate": 0.0001415610406937959, + "loss": 0.5934, + "step": 2122 + }, + { + "epoch": 0.28, + "grad_norm": 1.0078125, + "learning_rate": 0.00014162775183455638, + "loss": 0.5638, + "step": 2123 + }, + { + "epoch": 0.28, + "grad_norm": 0.92578125, + "learning_rate": 0.00014169446297531687, + "loss": 0.7352, + "step": 2124 + }, + { + "epoch": 0.28, + "grad_norm": 0.7734375, + "learning_rate": 0.0001417611741160774, + "loss": 0.8256, + "step": 2125 + }, + { + "epoch": 0.28, + "grad_norm": 0.8671875, + "learning_rate": 0.0001418278852568379, + "loss": 0.4417, + "step": 2126 + }, + { + "epoch": 0.28, + "grad_norm": 0.765625, + "learning_rate": 0.0001418945963975984, + "loss": 0.6424, + "step": 2127 + }, + { + "epoch": 0.28, + "grad_norm": 0.53515625, + "learning_rate": 0.00014196130753835892, + "loss": 0.6519, + "step": 2128 + }, + { + "epoch": 0.28, + "grad_norm": 0.70703125, + "learning_rate": 0.0001420280186791194, + "loss": 0.8113, + "step": 2129 + }, + { + "epoch": 0.28, + "grad_norm": 0.72265625, + "learning_rate": 0.00014209472981987993, + "loss": 0.6093, + "step": 2130 + }, + { + "epoch": 0.28, + "grad_norm": 0.5625, + "learning_rate": 0.00014216144096064045, + "loss": 0.4219, + "step": 2131 + }, + { + "epoch": 0.28, + "grad_norm": 0.94921875, + "learning_rate": 0.00014222815210140094, + "loss": 0.6611, + "step": 2132 + }, + { + "epoch": 0.28, + "grad_norm": 1.0234375, + "learning_rate": 0.00014229486324216146, + "loss": 0.8288, + "step": 2133 + }, + { + "epoch": 0.28, + "grad_norm": 0.77734375, + "learning_rate": 0.00014236157438292195, + "loss": 0.7191, + "step": 2134 + }, + { + "epoch": 0.28, + "grad_norm": 0.6484375, + "learning_rate": 0.00014242828552368247, + "loss": 0.781, + "step": 2135 + }, + { + "epoch": 0.29, + "grad_norm": 0.765625, + "learning_rate": 0.00014249499666444296, + "loss": 0.7434, + "step": 2136 + }, + { + "epoch": 0.29, + "grad_norm": 0.73046875, + "learning_rate": 0.00014256170780520348, + "loss": 0.8349, + "step": 2137 + }, + { + "epoch": 0.29, + "grad_norm": 0.640625, + "learning_rate": 0.000142628418945964, + "loss": 0.725, + "step": 2138 + }, + { + "epoch": 0.29, + "grad_norm": 0.8046875, + "learning_rate": 0.0001426951300867245, + "loss": 0.5179, + "step": 2139 + }, + { + "epoch": 0.29, + "grad_norm": 0.9921875, + "learning_rate": 0.00014276184122748498, + "loss": 0.6725, + "step": 2140 + }, + { + "epoch": 0.29, + "grad_norm": 0.73828125, + "learning_rate": 0.0001428285523682455, + "loss": 0.717, + "step": 2141 + }, + { + "epoch": 0.29, + "grad_norm": 0.75390625, + "learning_rate": 0.00014289526350900602, + "loss": 0.5937, + "step": 2142 + }, + { + "epoch": 0.29, + "grad_norm": 1.1640625, + "learning_rate": 0.00014296197464976654, + "loss": 0.7917, + "step": 2143 + }, + { + "epoch": 0.29, + "grad_norm": 1.1875, + "learning_rate": 0.00014302868579052703, + "loss": 0.9668, + "step": 2144 + }, + { + "epoch": 0.29, + "grad_norm": 0.77734375, + "learning_rate": 0.00014309539693128752, + "loss": 0.6031, + "step": 2145 + }, + { + "epoch": 0.29, + "grad_norm": 0.94140625, + "learning_rate": 0.00014316210807204804, + "loss": 0.5106, + "step": 2146 + }, + { + "epoch": 0.29, + "grad_norm": 0.83984375, + "learning_rate": 0.00014322881921280856, + "loss": 0.5753, + "step": 2147 + }, + { + "epoch": 0.29, + "grad_norm": 0.73046875, + "learning_rate": 0.00014329553035356907, + "loss": 0.5242, + "step": 2148 + }, + { + "epoch": 0.29, + "grad_norm": 0.97265625, + "learning_rate": 0.00014336224149432954, + "loss": 0.7612, + "step": 2149 + }, + { + "epoch": 0.29, + "grad_norm": 0.7734375, + "learning_rate": 0.00014342895263509006, + "loss": 0.7867, + "step": 2150 + }, + { + "epoch": 0.29, + "grad_norm": 0.9765625, + "learning_rate": 0.00014349566377585058, + "loss": 1.2402, + "step": 2151 + }, + { + "epoch": 0.29, + "grad_norm": 0.80078125, + "learning_rate": 0.0001435623749166111, + "loss": 0.9007, + "step": 2152 + }, + { + "epoch": 0.29, + "grad_norm": 0.6328125, + "learning_rate": 0.00014362908605737159, + "loss": 0.7427, + "step": 2153 + }, + { + "epoch": 0.29, + "grad_norm": 0.66796875, + "learning_rate": 0.00014369579719813208, + "loss": 0.6941, + "step": 2154 + }, + { + "epoch": 0.29, + "grad_norm": 0.65234375, + "learning_rate": 0.0001437625083388926, + "loss": 0.5657, + "step": 2155 + }, + { + "epoch": 0.29, + "grad_norm": 0.69921875, + "learning_rate": 0.00014382921947965312, + "loss": 0.8065, + "step": 2156 + }, + { + "epoch": 0.29, + "grad_norm": 0.87890625, + "learning_rate": 0.00014389593062041363, + "loss": 0.7801, + "step": 2157 + }, + { + "epoch": 0.29, + "grad_norm": 0.82421875, + "learning_rate": 0.00014396264176117413, + "loss": 0.7636, + "step": 2158 + }, + { + "epoch": 0.29, + "grad_norm": 0.68359375, + "learning_rate": 0.00014402935290193462, + "loss": 0.6884, + "step": 2159 + }, + { + "epoch": 0.29, + "grad_norm": 0.73828125, + "learning_rate": 0.00014409606404269514, + "loss": 0.7761, + "step": 2160 + }, + { + "epoch": 0.29, + "grad_norm": 0.7421875, + "learning_rate": 0.00014416277518345565, + "loss": 0.5947, + "step": 2161 + }, + { + "epoch": 0.29, + "grad_norm": 0.625, + "learning_rate": 0.00014422948632421615, + "loss": 0.5212, + "step": 2162 + }, + { + "epoch": 0.29, + "grad_norm": 0.75, + "learning_rate": 0.00014429619746497666, + "loss": 0.7308, + "step": 2163 + }, + { + "epoch": 0.29, + "grad_norm": 0.94140625, + "learning_rate": 0.00014436290860573716, + "loss": 0.7732, + "step": 2164 + }, + { + "epoch": 0.29, + "grad_norm": 0.62109375, + "learning_rate": 0.00014442961974649767, + "loss": 0.6562, + "step": 2165 + }, + { + "epoch": 0.29, + "grad_norm": 0.90234375, + "learning_rate": 0.0001444963308872582, + "loss": 0.5486, + "step": 2166 + }, + { + "epoch": 0.29, + "grad_norm": 0.68359375, + "learning_rate": 0.00014456304202801868, + "loss": 0.4045, + "step": 2167 + }, + { + "epoch": 0.29, + "grad_norm": 0.96484375, + "learning_rate": 0.0001446297531687792, + "loss": 0.8475, + "step": 2168 + }, + { + "epoch": 0.29, + "grad_norm": 0.8671875, + "learning_rate": 0.0001446964643095397, + "loss": 0.6302, + "step": 2169 + }, + { + "epoch": 0.29, + "grad_norm": 0.9765625, + "learning_rate": 0.0001447631754503002, + "loss": 0.6617, + "step": 2170 + }, + { + "epoch": 0.29, + "grad_norm": 0.80859375, + "learning_rate": 0.0001448298865910607, + "loss": 0.7804, + "step": 2171 + }, + { + "epoch": 0.29, + "grad_norm": 0.80078125, + "learning_rate": 0.00014489659773182122, + "loss": 0.81, + "step": 2172 + }, + { + "epoch": 0.29, + "grad_norm": 0.9765625, + "learning_rate": 0.00014496330887258174, + "loss": 0.7108, + "step": 2173 + }, + { + "epoch": 0.29, + "grad_norm": 0.9765625, + "learning_rate": 0.00014503002001334223, + "loss": 0.8246, + "step": 2174 + }, + { + "epoch": 0.29, + "grad_norm": 0.7890625, + "learning_rate": 0.00014509673115410275, + "loss": 0.7221, + "step": 2175 + }, + { + "epoch": 0.29, + "grad_norm": 0.7265625, + "learning_rate": 0.00014516344229486324, + "loss": 0.5445, + "step": 2176 + }, + { + "epoch": 0.29, + "grad_norm": 0.74609375, + "learning_rate": 0.00014523015343562376, + "loss": 0.5521, + "step": 2177 + }, + { + "epoch": 0.29, + "grad_norm": 0.77734375, + "learning_rate": 0.00014529686457638428, + "loss": 0.9899, + "step": 2178 + }, + { + "epoch": 0.29, + "grad_norm": 1.4609375, + "learning_rate": 0.00014536357571714477, + "loss": 0.7301, + "step": 2179 + }, + { + "epoch": 0.29, + "grad_norm": 0.75, + "learning_rate": 0.00014543028685790526, + "loss": 0.9469, + "step": 2180 + }, + { + "epoch": 0.29, + "grad_norm": 0.72265625, + "learning_rate": 0.00014549699799866578, + "loss": 0.7071, + "step": 2181 + }, + { + "epoch": 0.29, + "grad_norm": 0.890625, + "learning_rate": 0.0001455637091394263, + "loss": 0.4521, + "step": 2182 + }, + { + "epoch": 0.29, + "grad_norm": 0.6796875, + "learning_rate": 0.00014563042028018682, + "loss": 0.7295, + "step": 2183 + }, + { + "epoch": 0.29, + "grad_norm": 0.83203125, + "learning_rate": 0.0001456971314209473, + "loss": 0.7505, + "step": 2184 + }, + { + "epoch": 0.29, + "grad_norm": 0.62890625, + "learning_rate": 0.0001457638425617078, + "loss": 0.9087, + "step": 2185 + }, + { + "epoch": 0.29, + "grad_norm": 0.66796875, + "learning_rate": 0.00014583055370246832, + "loss": 0.8753, + "step": 2186 + }, + { + "epoch": 0.29, + "grad_norm": 0.8984375, + "learning_rate": 0.00014589726484322884, + "loss": 0.6266, + "step": 2187 + }, + { + "epoch": 0.29, + "grad_norm": 0.69140625, + "learning_rate": 0.00014596397598398933, + "loss": 0.3493, + "step": 2188 + }, + { + "epoch": 0.29, + "grad_norm": 0.68359375, + "learning_rate": 0.00014603068712474982, + "loss": 0.6526, + "step": 2189 + }, + { + "epoch": 0.29, + "grad_norm": 0.6328125, + "learning_rate": 0.00014609739826551034, + "loss": 0.4442, + "step": 2190 + }, + { + "epoch": 0.29, + "grad_norm": 0.875, + "learning_rate": 0.00014616410940627086, + "loss": 0.7933, + "step": 2191 + }, + { + "epoch": 0.29, + "grad_norm": 0.8203125, + "learning_rate": 0.00014623082054703138, + "loss": 0.5005, + "step": 2192 + }, + { + "epoch": 0.29, + "grad_norm": 0.796875, + "learning_rate": 0.00014629753168779187, + "loss": 0.5635, + "step": 2193 + }, + { + "epoch": 0.29, + "grad_norm": 0.9140625, + "learning_rate": 0.00014636424282855236, + "loss": 0.4679, + "step": 2194 + }, + { + "epoch": 0.29, + "grad_norm": 0.6953125, + "learning_rate": 0.00014643095396931288, + "loss": 0.881, + "step": 2195 + }, + { + "epoch": 0.29, + "grad_norm": 0.86328125, + "learning_rate": 0.0001464976651100734, + "loss": 0.4339, + "step": 2196 + }, + { + "epoch": 0.29, + "grad_norm": 0.80859375, + "learning_rate": 0.0001465643762508339, + "loss": 0.6438, + "step": 2197 + }, + { + "epoch": 0.29, + "grad_norm": 0.7265625, + "learning_rate": 0.0001466310873915944, + "loss": 0.5374, + "step": 2198 + }, + { + "epoch": 0.29, + "grad_norm": 0.703125, + "learning_rate": 0.0001466977985323549, + "loss": 0.5729, + "step": 2199 + }, + { + "epoch": 0.29, + "grad_norm": 0.71875, + "learning_rate": 0.00014676450967311542, + "loss": 0.8325, + "step": 2200 + }, + { + "epoch": 0.29, + "grad_norm": 0.9609375, + "learning_rate": 0.00014683122081387594, + "loss": 0.6766, + "step": 2201 + }, + { + "epoch": 0.29, + "grad_norm": 0.859375, + "learning_rate": 0.00014689793195463643, + "loss": 0.7715, + "step": 2202 + }, + { + "epoch": 0.29, + "grad_norm": 0.8046875, + "learning_rate": 0.00014696464309539695, + "loss": 0.7035, + "step": 2203 + }, + { + "epoch": 0.29, + "grad_norm": 0.84765625, + "learning_rate": 0.00014703135423615744, + "loss": 0.8596, + "step": 2204 + }, + { + "epoch": 0.29, + "grad_norm": 0.5703125, + "learning_rate": 0.00014709806537691796, + "loss": 0.4082, + "step": 2205 + }, + { + "epoch": 0.29, + "grad_norm": 0.8125, + "learning_rate": 0.00014716477651767845, + "loss": 0.3724, + "step": 2206 + }, + { + "epoch": 0.29, + "grad_norm": 0.88671875, + "learning_rate": 0.00014723148765843897, + "loss": 0.785, + "step": 2207 + }, + { + "epoch": 0.29, + "grad_norm": 1.109375, + "learning_rate": 0.00014729819879919949, + "loss": 0.717, + "step": 2208 + }, + { + "epoch": 0.29, + "grad_norm": 0.953125, + "learning_rate": 0.00014736490993995998, + "loss": 0.6597, + "step": 2209 + }, + { + "epoch": 0.29, + "grad_norm": 0.77734375, + "learning_rate": 0.0001474316210807205, + "loss": 0.6987, + "step": 2210 + }, + { + "epoch": 0.3, + "grad_norm": 0.72265625, + "learning_rate": 0.000147498332221481, + "loss": 1.0191, + "step": 2211 + }, + { + "epoch": 0.3, + "grad_norm": 0.8359375, + "learning_rate": 0.0001475650433622415, + "loss": 0.8675, + "step": 2212 + }, + { + "epoch": 0.3, + "grad_norm": 0.90234375, + "learning_rate": 0.00014763175450300202, + "loss": 0.8075, + "step": 2213 + }, + { + "epoch": 0.3, + "grad_norm": 0.78125, + "learning_rate": 0.00014769846564376252, + "loss": 0.8177, + "step": 2214 + }, + { + "epoch": 0.3, + "grad_norm": 0.81640625, + "learning_rate": 0.000147765176784523, + "loss": 0.7239, + "step": 2215 + }, + { + "epoch": 0.3, + "grad_norm": 0.76953125, + "learning_rate": 0.00014783188792528353, + "loss": 0.9476, + "step": 2216 + }, + { + "epoch": 0.3, + "grad_norm": 0.84375, + "learning_rate": 0.00014789859906604404, + "loss": 0.5872, + "step": 2217 + }, + { + "epoch": 0.3, + "grad_norm": 0.6796875, + "learning_rate": 0.00014796531020680456, + "loss": 0.8052, + "step": 2218 + }, + { + "epoch": 0.3, + "grad_norm": 0.73828125, + "learning_rate": 0.00014803202134756505, + "loss": 0.7716, + "step": 2219 + }, + { + "epoch": 0.3, + "grad_norm": 0.80859375, + "learning_rate": 0.00014809873248832555, + "loss": 0.5893, + "step": 2220 + }, + { + "epoch": 0.3, + "grad_norm": 0.87109375, + "learning_rate": 0.00014816544362908606, + "loss": 0.73, + "step": 2221 + }, + { + "epoch": 0.3, + "grad_norm": 0.66015625, + "learning_rate": 0.00014823215476984658, + "loss": 0.7168, + "step": 2222 + }, + { + "epoch": 0.3, + "grad_norm": 0.6328125, + "learning_rate": 0.00014829886591060707, + "loss": 0.7903, + "step": 2223 + }, + { + "epoch": 0.3, + "grad_norm": 0.62890625, + "learning_rate": 0.00014836557705136757, + "loss": 0.6441, + "step": 2224 + }, + { + "epoch": 0.3, + "grad_norm": 0.78125, + "learning_rate": 0.00014843228819212808, + "loss": 0.817, + "step": 2225 + }, + { + "epoch": 0.3, + "grad_norm": 0.74609375, + "learning_rate": 0.0001484989993328886, + "loss": 0.6954, + "step": 2226 + }, + { + "epoch": 0.3, + "grad_norm": 0.734375, + "learning_rate": 0.00014856571047364912, + "loss": 0.737, + "step": 2227 + }, + { + "epoch": 0.3, + "grad_norm": 0.828125, + "learning_rate": 0.0001486324216144096, + "loss": 0.7433, + "step": 2228 + }, + { + "epoch": 0.3, + "grad_norm": 0.921875, + "learning_rate": 0.0001486991327551701, + "loss": 0.427, + "step": 2229 + }, + { + "epoch": 0.3, + "grad_norm": 0.70703125, + "learning_rate": 0.00014876584389593062, + "loss": 0.729, + "step": 2230 + }, + { + "epoch": 0.3, + "grad_norm": 1.0390625, + "learning_rate": 0.00014883255503669114, + "loss": 0.8939, + "step": 2231 + }, + { + "epoch": 0.3, + "grad_norm": 0.94140625, + "learning_rate": 0.00014889926617745163, + "loss": 0.9977, + "step": 2232 + }, + { + "epoch": 0.3, + "grad_norm": 0.90625, + "learning_rate": 0.00014896597731821215, + "loss": 0.3327, + "step": 2233 + }, + { + "epoch": 0.3, + "grad_norm": 0.77734375, + "learning_rate": 0.00014903268845897264, + "loss": 0.7167, + "step": 2234 + }, + { + "epoch": 0.3, + "grad_norm": 0.921875, + "learning_rate": 0.00014909939959973316, + "loss": 0.6558, + "step": 2235 + }, + { + "epoch": 0.3, + "grad_norm": 0.78515625, + "learning_rate": 0.00014916611074049368, + "loss": 0.6878, + "step": 2236 + }, + { + "epoch": 0.3, + "grad_norm": 0.8359375, + "learning_rate": 0.00014923282188125417, + "loss": 0.5651, + "step": 2237 + }, + { + "epoch": 0.3, + "grad_norm": 0.6953125, + "learning_rate": 0.0001492995330220147, + "loss": 0.8321, + "step": 2238 + }, + { + "epoch": 0.3, + "grad_norm": 0.87890625, + "learning_rate": 0.00014936624416277518, + "loss": 0.6241, + "step": 2239 + }, + { + "epoch": 0.3, + "grad_norm": 0.8828125, + "learning_rate": 0.0001494329553035357, + "loss": 0.7196, + "step": 2240 + }, + { + "epoch": 0.3, + "grad_norm": 1.375, + "learning_rate": 0.0001494996664442962, + "loss": 0.6765, + "step": 2241 + }, + { + "epoch": 0.3, + "grad_norm": 0.7890625, + "learning_rate": 0.0001495663775850567, + "loss": 0.5649, + "step": 2242 + }, + { + "epoch": 0.3, + "grad_norm": 0.80078125, + "learning_rate": 0.00014963308872581723, + "loss": 0.7594, + "step": 2243 + }, + { + "epoch": 0.3, + "grad_norm": 0.6875, + "learning_rate": 0.00014969979986657772, + "loss": 0.8085, + "step": 2244 + }, + { + "epoch": 0.3, + "grad_norm": 1.2109375, + "learning_rate": 0.00014976651100733824, + "loss": 0.6824, + "step": 2245 + }, + { + "epoch": 0.3, + "grad_norm": 0.87890625, + "learning_rate": 0.00014983322214809873, + "loss": 0.7928, + "step": 2246 + }, + { + "epoch": 0.3, + "grad_norm": 0.88671875, + "learning_rate": 0.00014989993328885925, + "loss": 0.6881, + "step": 2247 + }, + { + "epoch": 0.3, + "grad_norm": 0.7265625, + "learning_rate": 0.00014996664442961977, + "loss": 0.7719, + "step": 2248 + }, + { + "epoch": 0.3, + "grad_norm": 0.85546875, + "learning_rate": 0.00015003335557038026, + "loss": 0.5877, + "step": 2249 + }, + { + "epoch": 0.3, + "grad_norm": 0.84375, + "learning_rate": 0.00015010006671114075, + "loss": 0.7613, + "step": 2250 + }, + { + "epoch": 0.3, + "grad_norm": 0.8828125, + "learning_rate": 0.00015016677785190127, + "loss": 0.8742, + "step": 2251 + }, + { + "epoch": 0.3, + "grad_norm": 0.7578125, + "learning_rate": 0.0001502334889926618, + "loss": 0.8587, + "step": 2252 + }, + { + "epoch": 0.3, + "grad_norm": 0.93359375, + "learning_rate": 0.0001503002001334223, + "loss": 0.4926, + "step": 2253 + }, + { + "epoch": 0.3, + "grad_norm": 0.9375, + "learning_rate": 0.0001503669112741828, + "loss": 0.8135, + "step": 2254 + }, + { + "epoch": 0.3, + "grad_norm": 1.1875, + "learning_rate": 0.0001504336224149433, + "loss": 0.7977, + "step": 2255 + }, + { + "epoch": 0.3, + "grad_norm": 0.66796875, + "learning_rate": 0.0001505003335557038, + "loss": 0.6775, + "step": 2256 + }, + { + "epoch": 0.3, + "grad_norm": 0.796875, + "learning_rate": 0.00015056704469646433, + "loss": 0.7743, + "step": 2257 + }, + { + "epoch": 0.3, + "grad_norm": 0.69140625, + "learning_rate": 0.00015063375583722485, + "loss": 0.6905, + "step": 2258 + }, + { + "epoch": 0.3, + "grad_norm": 1.3046875, + "learning_rate": 0.0001507004669779853, + "loss": 0.5471, + "step": 2259 + }, + { + "epoch": 0.3, + "grad_norm": 0.97265625, + "learning_rate": 0.00015076717811874583, + "loss": 0.6646, + "step": 2260 + }, + { + "epoch": 0.3, + "grad_norm": 0.89453125, + "learning_rate": 0.00015083388925950635, + "loss": 1.0641, + "step": 2261 + }, + { + "epoch": 0.3, + "grad_norm": 0.62109375, + "learning_rate": 0.00015090060040026687, + "loss": 0.775, + "step": 2262 + }, + { + "epoch": 0.3, + "grad_norm": 1.0234375, + "learning_rate": 0.00015096731154102736, + "loss": 0.7944, + "step": 2263 + }, + { + "epoch": 0.3, + "grad_norm": 0.671875, + "learning_rate": 0.00015103402268178785, + "loss": 0.661, + "step": 2264 + }, + { + "epoch": 0.3, + "grad_norm": 0.79296875, + "learning_rate": 0.00015110073382254837, + "loss": 0.5543, + "step": 2265 + }, + { + "epoch": 0.3, + "grad_norm": 0.93359375, + "learning_rate": 0.00015116744496330889, + "loss": 0.6341, + "step": 2266 + }, + { + "epoch": 0.3, + "grad_norm": 1.171875, + "learning_rate": 0.0001512341561040694, + "loss": 0.7739, + "step": 2267 + }, + { + "epoch": 0.3, + "grad_norm": 1.046875, + "learning_rate": 0.0001513008672448299, + "loss": 1.0993, + "step": 2268 + }, + { + "epoch": 0.3, + "grad_norm": 1.21875, + "learning_rate": 0.0001513675783855904, + "loss": 0.7639, + "step": 2269 + }, + { + "epoch": 0.3, + "grad_norm": 0.91015625, + "learning_rate": 0.0001514342895263509, + "loss": 0.7762, + "step": 2270 + }, + { + "epoch": 0.3, + "grad_norm": 0.6953125, + "learning_rate": 0.00015150100066711142, + "loss": 0.6494, + "step": 2271 + }, + { + "epoch": 0.3, + "grad_norm": 0.70703125, + "learning_rate": 0.00015156771180787192, + "loss": 0.6228, + "step": 2272 + }, + { + "epoch": 0.3, + "grad_norm": 0.859375, + "learning_rate": 0.00015163442294863243, + "loss": 0.9472, + "step": 2273 + }, + { + "epoch": 0.3, + "grad_norm": 0.69921875, + "learning_rate": 0.00015170113408939293, + "loss": 0.9184, + "step": 2274 + }, + { + "epoch": 0.3, + "grad_norm": 0.68359375, + "learning_rate": 0.00015176784523015344, + "loss": 0.6114, + "step": 2275 + }, + { + "epoch": 0.3, + "grad_norm": 1.015625, + "learning_rate": 0.00015183455637091394, + "loss": 0.7067, + "step": 2276 + }, + { + "epoch": 0.3, + "grad_norm": 0.703125, + "learning_rate": 0.00015190126751167445, + "loss": 0.8885, + "step": 2277 + }, + { + "epoch": 0.3, + "grad_norm": 0.59765625, + "learning_rate": 0.00015196797865243497, + "loss": 0.783, + "step": 2278 + }, + { + "epoch": 0.3, + "grad_norm": 0.7109375, + "learning_rate": 0.00015203468979319547, + "loss": 0.7703, + "step": 2279 + }, + { + "epoch": 0.3, + "grad_norm": 0.79296875, + "learning_rate": 0.00015210140093395598, + "loss": 0.7469, + "step": 2280 + }, + { + "epoch": 0.3, + "grad_norm": 0.69921875, + "learning_rate": 0.00015216811207471648, + "loss": 0.6324, + "step": 2281 + }, + { + "epoch": 0.3, + "grad_norm": 0.703125, + "learning_rate": 0.000152234823215477, + "loss": 0.7949, + "step": 2282 + }, + { + "epoch": 0.3, + "grad_norm": 0.75390625, + "learning_rate": 0.0001523015343562375, + "loss": 0.7566, + "step": 2283 + }, + { + "epoch": 0.3, + "grad_norm": 0.7265625, + "learning_rate": 0.000152368245496998, + "loss": 0.845, + "step": 2284 + }, + { + "epoch": 0.3, + "grad_norm": 0.7734375, + "learning_rate": 0.0001524349566377585, + "loss": 0.5637, + "step": 2285 + }, + { + "epoch": 0.31, + "grad_norm": 0.859375, + "learning_rate": 0.00015250166777851901, + "loss": 0.7675, + "step": 2286 + }, + { + "epoch": 0.31, + "grad_norm": 2.34375, + "learning_rate": 0.00015256837891927953, + "loss": 0.5058, + "step": 2287 + }, + { + "epoch": 0.31, + "grad_norm": 0.74609375, + "learning_rate": 0.00015263509006004005, + "loss": 0.8213, + "step": 2288 + }, + { + "epoch": 0.31, + "grad_norm": 0.84375, + "learning_rate": 0.00015270180120080054, + "loss": 0.6883, + "step": 2289 + }, + { + "epoch": 0.31, + "grad_norm": 0.91796875, + "learning_rate": 0.00015276851234156103, + "loss": 0.7043, + "step": 2290 + }, + { + "epoch": 0.31, + "grad_norm": 0.9609375, + "learning_rate": 0.00015283522348232155, + "loss": 0.8365, + "step": 2291 + }, + { + "epoch": 0.31, + "grad_norm": 0.8984375, + "learning_rate": 0.00015290193462308207, + "loss": 0.7138, + "step": 2292 + }, + { + "epoch": 0.31, + "grad_norm": 0.84375, + "learning_rate": 0.0001529686457638426, + "loss": 0.7261, + "step": 2293 + }, + { + "epoch": 0.31, + "grad_norm": 0.7734375, + "learning_rate": 0.00015303535690460305, + "loss": 0.5221, + "step": 2294 + }, + { + "epoch": 0.31, + "grad_norm": 0.83984375, + "learning_rate": 0.00015310206804536357, + "loss": 0.7704, + "step": 2295 + }, + { + "epoch": 0.31, + "grad_norm": 0.87890625, + "learning_rate": 0.0001531687791861241, + "loss": 0.7338, + "step": 2296 + }, + { + "epoch": 0.31, + "grad_norm": 0.76953125, + "learning_rate": 0.0001532354903268846, + "loss": 0.7453, + "step": 2297 + }, + { + "epoch": 0.31, + "grad_norm": 0.8828125, + "learning_rate": 0.0001533022014676451, + "loss": 0.7212, + "step": 2298 + }, + { + "epoch": 0.31, + "grad_norm": 1.0078125, + "learning_rate": 0.0001533689126084056, + "loss": 0.7434, + "step": 2299 + }, + { + "epoch": 0.31, + "grad_norm": 0.6015625, + "learning_rate": 0.0001534356237491661, + "loss": 0.9524, + "step": 2300 + }, + { + "epoch": 0.31, + "grad_norm": 0.62890625, + "learning_rate": 0.00015350233488992663, + "loss": 0.4125, + "step": 2301 + }, + { + "epoch": 0.31, + "grad_norm": 0.73828125, + "learning_rate": 0.00015356904603068715, + "loss": 0.7716, + "step": 2302 + }, + { + "epoch": 0.31, + "grad_norm": 0.90234375, + "learning_rate": 0.00015363575717144764, + "loss": 0.523, + "step": 2303 + }, + { + "epoch": 0.31, + "grad_norm": 0.984375, + "learning_rate": 0.00015370246831220813, + "loss": 0.8115, + "step": 2304 + }, + { + "epoch": 0.31, + "grad_norm": 0.7890625, + "learning_rate": 0.00015376917945296865, + "loss": 0.8543, + "step": 2305 + }, + { + "epoch": 0.31, + "grad_norm": 0.8671875, + "learning_rate": 0.00015383589059372917, + "loss": 0.7374, + "step": 2306 + }, + { + "epoch": 0.31, + "grad_norm": 0.86328125, + "learning_rate": 0.00015390260173448966, + "loss": 0.7873, + "step": 2307 + }, + { + "epoch": 0.31, + "grad_norm": 0.984375, + "learning_rate": 0.00015396931287525018, + "loss": 0.8782, + "step": 2308 + }, + { + "epoch": 0.31, + "grad_norm": 1.1953125, + "learning_rate": 0.00015403602401601067, + "loss": 0.6765, + "step": 2309 + }, + { + "epoch": 0.31, + "grad_norm": 0.8359375, + "learning_rate": 0.0001541027351567712, + "loss": 0.5797, + "step": 2310 + }, + { + "epoch": 0.31, + "grad_norm": 0.625, + "learning_rate": 0.0001541694462975317, + "loss": 0.5246, + "step": 2311 + }, + { + "epoch": 0.31, + "grad_norm": 0.75, + "learning_rate": 0.0001542361574382922, + "loss": 0.6896, + "step": 2312 + }, + { + "epoch": 0.31, + "grad_norm": 1.09375, + "learning_rate": 0.00015430286857905272, + "loss": 0.9396, + "step": 2313 + }, + { + "epoch": 0.31, + "grad_norm": 0.87109375, + "learning_rate": 0.0001543695797198132, + "loss": 0.8704, + "step": 2314 + }, + { + "epoch": 0.31, + "grad_norm": 0.86328125, + "learning_rate": 0.00015443629086057373, + "loss": 0.9552, + "step": 2315 + }, + { + "epoch": 0.31, + "grad_norm": 1.265625, + "learning_rate": 0.00015450300200133422, + "loss": 0.5964, + "step": 2316 + }, + { + "epoch": 0.31, + "grad_norm": 0.828125, + "learning_rate": 0.00015456971314209474, + "loss": 0.9657, + "step": 2317 + }, + { + "epoch": 0.31, + "grad_norm": 0.64453125, + "learning_rate": 0.00015463642428285526, + "loss": 0.7087, + "step": 2318 + }, + { + "epoch": 0.31, + "grad_norm": 0.86328125, + "learning_rate": 0.00015470313542361575, + "loss": 0.5721, + "step": 2319 + }, + { + "epoch": 0.31, + "grad_norm": 0.82421875, + "learning_rate": 0.00015476984656437627, + "loss": 1.0084, + "step": 2320 + }, + { + "epoch": 0.31, + "grad_norm": 0.83203125, + "learning_rate": 0.00015483655770513676, + "loss": 0.3715, + "step": 2321 + }, + { + "epoch": 0.31, + "grad_norm": 0.61328125, + "learning_rate": 0.00015490326884589728, + "loss": 0.5688, + "step": 2322 + }, + { + "epoch": 0.31, + "grad_norm": 0.68359375, + "learning_rate": 0.0001549699799866578, + "loss": 0.5509, + "step": 2323 + }, + { + "epoch": 0.31, + "grad_norm": 0.98828125, + "learning_rate": 0.0001550366911274183, + "loss": 0.8978, + "step": 2324 + }, + { + "epoch": 0.31, + "grad_norm": 0.703125, + "learning_rate": 0.00015510340226817878, + "loss": 0.367, + "step": 2325 + }, + { + "epoch": 0.31, + "grad_norm": 0.78515625, + "learning_rate": 0.0001551701134089393, + "loss": 0.459, + "step": 2326 + }, + { + "epoch": 0.31, + "grad_norm": 0.6796875, + "learning_rate": 0.00015523682454969982, + "loss": 0.5364, + "step": 2327 + }, + { + "epoch": 0.31, + "grad_norm": 0.79296875, + "learning_rate": 0.00015530353569046033, + "loss": 0.7607, + "step": 2328 + }, + { + "epoch": 0.31, + "grad_norm": 0.734375, + "learning_rate": 0.0001553702468312208, + "loss": 0.849, + "step": 2329 + }, + { + "epoch": 0.31, + "grad_norm": 0.921875, + "learning_rate": 0.00015543695797198132, + "loss": 0.6766, + "step": 2330 + }, + { + "epoch": 0.31, + "grad_norm": 0.8125, + "learning_rate": 0.00015550366911274184, + "loss": 0.5679, + "step": 2331 + }, + { + "epoch": 0.31, + "grad_norm": 1.0546875, + "learning_rate": 0.00015557038025350235, + "loss": 0.6118, + "step": 2332 + }, + { + "epoch": 0.31, + "grad_norm": 0.796875, + "learning_rate": 0.00015563709139426285, + "loss": 0.8682, + "step": 2333 + }, + { + "epoch": 0.31, + "grad_norm": 0.81640625, + "learning_rate": 0.00015570380253502334, + "loss": 0.8483, + "step": 2334 + }, + { + "epoch": 0.31, + "grad_norm": 0.6171875, + "learning_rate": 0.00015577051367578386, + "loss": 0.5412, + "step": 2335 + }, + { + "epoch": 0.31, + "grad_norm": 0.90625, + "learning_rate": 0.00015583722481654437, + "loss": 0.8558, + "step": 2336 + }, + { + "epoch": 0.31, + "grad_norm": 0.69921875, + "learning_rate": 0.0001559039359573049, + "loss": 0.822, + "step": 2337 + }, + { + "epoch": 0.31, + "grad_norm": 0.83203125, + "learning_rate": 0.00015597064709806538, + "loss": 0.4984, + "step": 2338 + }, + { + "epoch": 0.31, + "grad_norm": 0.671875, + "learning_rate": 0.00015603735823882588, + "loss": 0.7399, + "step": 2339 + }, + { + "epoch": 0.31, + "grad_norm": 0.73046875, + "learning_rate": 0.0001561040693795864, + "loss": 0.7033, + "step": 2340 + }, + { + "epoch": 0.31, + "grad_norm": 0.796875, + "learning_rate": 0.0001561707805203469, + "loss": 1.043, + "step": 2341 + }, + { + "epoch": 0.31, + "grad_norm": 0.875, + "learning_rate": 0.0001562374916611074, + "loss": 0.4132, + "step": 2342 + }, + { + "epoch": 0.31, + "grad_norm": 0.7109375, + "learning_rate": 0.00015630420280186792, + "loss": 0.6967, + "step": 2343 + }, + { + "epoch": 0.31, + "grad_norm": 0.6328125, + "learning_rate": 0.00015637091394262841, + "loss": 0.7761, + "step": 2344 + }, + { + "epoch": 0.31, + "grad_norm": 0.66015625, + "learning_rate": 0.00015643762508338893, + "loss": 0.745, + "step": 2345 + }, + { + "epoch": 0.31, + "grad_norm": 0.921875, + "learning_rate": 0.00015650433622414945, + "loss": 0.7481, + "step": 2346 + }, + { + "epoch": 0.31, + "grad_norm": 0.77734375, + "learning_rate": 0.00015657104736490994, + "loss": 0.502, + "step": 2347 + }, + { + "epoch": 0.31, + "grad_norm": 0.83203125, + "learning_rate": 0.00015663775850567046, + "loss": 0.7147, + "step": 2348 + }, + { + "epoch": 0.31, + "grad_norm": 0.80859375, + "learning_rate": 0.00015670446964643095, + "loss": 0.5008, + "step": 2349 + }, + { + "epoch": 0.31, + "grad_norm": 0.91015625, + "learning_rate": 0.00015677118078719147, + "loss": 0.7785, + "step": 2350 + }, + { + "epoch": 0.31, + "grad_norm": 0.828125, + "learning_rate": 0.00015683789192795196, + "loss": 0.5042, + "step": 2351 + }, + { + "epoch": 0.31, + "grad_norm": 0.765625, + "learning_rate": 0.00015690460306871248, + "loss": 0.7206, + "step": 2352 + }, + { + "epoch": 0.31, + "grad_norm": 0.83203125, + "learning_rate": 0.000156971314209473, + "loss": 0.6728, + "step": 2353 + }, + { + "epoch": 0.31, + "grad_norm": 0.890625, + "learning_rate": 0.0001570380253502335, + "loss": 0.6488, + "step": 2354 + }, + { + "epoch": 0.31, + "grad_norm": 0.734375, + "learning_rate": 0.000157104736490994, + "loss": 0.9362, + "step": 2355 + }, + { + "epoch": 0.31, + "grad_norm": 0.97265625, + "learning_rate": 0.0001571714476317545, + "loss": 0.323, + "step": 2356 + }, + { + "epoch": 0.31, + "grad_norm": 0.890625, + "learning_rate": 0.00015723815877251502, + "loss": 0.8163, + "step": 2357 + }, + { + "epoch": 0.31, + "grad_norm": 1.3828125, + "learning_rate": 0.00015730486991327554, + "loss": 0.7512, + "step": 2358 + }, + { + "epoch": 0.31, + "grad_norm": 0.7109375, + "learning_rate": 0.00015737158105403603, + "loss": 1.0313, + "step": 2359 + }, + { + "epoch": 0.31, + "grad_norm": 0.73828125, + "learning_rate": 0.00015743829219479652, + "loss": 0.6106, + "step": 2360 + }, + { + "epoch": 0.32, + "grad_norm": 0.8203125, + "learning_rate": 0.00015750500333555704, + "loss": 1.0137, + "step": 2361 + }, + { + "epoch": 0.32, + "grad_norm": 0.77734375, + "learning_rate": 0.00015757171447631756, + "loss": 0.6847, + "step": 2362 + }, + { + "epoch": 0.32, + "grad_norm": 0.76953125, + "learning_rate": 0.00015763842561707808, + "loss": 0.58, + "step": 2363 + }, + { + "epoch": 0.32, + "grad_norm": 0.8671875, + "learning_rate": 0.00015770513675783857, + "loss": 0.6528, + "step": 2364 + }, + { + "epoch": 0.32, + "grad_norm": 0.75390625, + "learning_rate": 0.00015777184789859906, + "loss": 0.4852, + "step": 2365 + }, + { + "epoch": 0.32, + "grad_norm": 0.7734375, + "learning_rate": 0.00015783855903935958, + "loss": 0.7483, + "step": 2366 + }, + { + "epoch": 0.32, + "grad_norm": 0.7421875, + "learning_rate": 0.0001579052701801201, + "loss": 0.702, + "step": 2367 + }, + { + "epoch": 0.32, + "grad_norm": 0.92578125, + "learning_rate": 0.0001579719813208806, + "loss": 0.8332, + "step": 2368 + }, + { + "epoch": 0.32, + "grad_norm": 0.8203125, + "learning_rate": 0.00015803869246164108, + "loss": 0.6559, + "step": 2369 + }, + { + "epoch": 0.32, + "grad_norm": 1.171875, + "learning_rate": 0.0001581054036024016, + "loss": 0.8303, + "step": 2370 + }, + { + "epoch": 0.32, + "grad_norm": 0.8125, + "learning_rate": 0.00015817211474316212, + "loss": 0.6764, + "step": 2371 + }, + { + "epoch": 0.32, + "grad_norm": 0.73046875, + "learning_rate": 0.00015823882588392264, + "loss": 0.9514, + "step": 2372 + }, + { + "epoch": 0.32, + "grad_norm": 0.7734375, + "learning_rate": 0.00015830553702468313, + "loss": 0.5707, + "step": 2373 + }, + { + "epoch": 0.32, + "grad_norm": 0.75390625, + "learning_rate": 0.00015837224816544362, + "loss": 0.6426, + "step": 2374 + }, + { + "epoch": 0.32, + "grad_norm": 0.9140625, + "learning_rate": 0.00015843895930620414, + "loss": 0.597, + "step": 2375 + }, + { + "epoch": 0.32, + "grad_norm": 0.8046875, + "learning_rate": 0.00015850567044696466, + "loss": 0.8049, + "step": 2376 + }, + { + "epoch": 0.32, + "grad_norm": 0.859375, + "learning_rate": 0.00015857238158772515, + "loss": 0.5722, + "step": 2377 + }, + { + "epoch": 0.32, + "grad_norm": 0.64453125, + "learning_rate": 0.00015863909272848567, + "loss": 0.562, + "step": 2378 + }, + { + "epoch": 0.32, + "grad_norm": 0.70703125, + "learning_rate": 0.00015870580386924616, + "loss": 1.0597, + "step": 2379 + }, + { + "epoch": 0.32, + "grad_norm": 0.77734375, + "learning_rate": 0.00015877251501000668, + "loss": 0.6327, + "step": 2380 + }, + { + "epoch": 0.32, + "grad_norm": 0.6953125, + "learning_rate": 0.0001588392261507672, + "loss": 0.64, + "step": 2381 + }, + { + "epoch": 0.32, + "grad_norm": 0.69140625, + "learning_rate": 0.0001589059372915277, + "loss": 0.861, + "step": 2382 + }, + { + "epoch": 0.32, + "grad_norm": 0.73046875, + "learning_rate": 0.0001589726484322882, + "loss": 0.8431, + "step": 2383 + }, + { + "epoch": 0.32, + "grad_norm": 0.79296875, + "learning_rate": 0.0001590393595730487, + "loss": 0.8588, + "step": 2384 + }, + { + "epoch": 0.32, + "grad_norm": 0.79296875, + "learning_rate": 0.00015910607071380922, + "loss": 0.6068, + "step": 2385 + }, + { + "epoch": 0.32, + "grad_norm": 0.7578125, + "learning_rate": 0.0001591727818545697, + "loss": 0.5201, + "step": 2386 + }, + { + "epoch": 0.32, + "grad_norm": 0.78125, + "learning_rate": 0.00015923949299533023, + "loss": 0.5028, + "step": 2387 + }, + { + "epoch": 0.32, + "grad_norm": 0.765625, + "learning_rate": 0.00015930620413609074, + "loss": 0.6823, + "step": 2388 + }, + { + "epoch": 0.32, + "grad_norm": 0.85546875, + "learning_rate": 0.00015937291527685124, + "loss": 0.9158, + "step": 2389 + }, + { + "epoch": 0.32, + "grad_norm": 0.81640625, + "learning_rate": 0.00015943962641761175, + "loss": 0.5784, + "step": 2390 + }, + { + "epoch": 0.32, + "grad_norm": 0.69140625, + "learning_rate": 0.00015950633755837225, + "loss": 0.9645, + "step": 2391 + }, + { + "epoch": 0.32, + "grad_norm": 0.7109375, + "learning_rate": 0.00015957304869913276, + "loss": 0.448, + "step": 2392 + }, + { + "epoch": 0.32, + "grad_norm": 0.6953125, + "learning_rate": 0.00015963975983989328, + "loss": 0.6786, + "step": 2393 + }, + { + "epoch": 0.32, + "grad_norm": 0.76953125, + "learning_rate": 0.00015970647098065377, + "loss": 0.6132, + "step": 2394 + }, + { + "epoch": 0.32, + "grad_norm": 0.68359375, + "learning_rate": 0.00015977318212141427, + "loss": 0.8859, + "step": 2395 + }, + { + "epoch": 0.32, + "grad_norm": 1.15625, + "learning_rate": 0.00015983989326217478, + "loss": 1.0064, + "step": 2396 + }, + { + "epoch": 0.32, + "grad_norm": 0.6953125, + "learning_rate": 0.0001599066044029353, + "loss": 0.7556, + "step": 2397 + }, + { + "epoch": 0.32, + "grad_norm": 0.91015625, + "learning_rate": 0.00015997331554369582, + "loss": 0.7273, + "step": 2398 + }, + { + "epoch": 0.32, + "grad_norm": 0.79296875, + "learning_rate": 0.0001600400266844563, + "loss": 0.5447, + "step": 2399 + }, + { + "epoch": 0.32, + "grad_norm": 0.7890625, + "learning_rate": 0.0001601067378252168, + "loss": 0.464, + "step": 2400 + }, + { + "epoch": 0.32, + "grad_norm": 0.93359375, + "learning_rate": 0.00016017344896597732, + "loss": 0.5306, + "step": 2401 + }, + { + "epoch": 0.32, + "grad_norm": 0.92578125, + "learning_rate": 0.00016024016010673784, + "loss": 0.9569, + "step": 2402 + }, + { + "epoch": 0.32, + "grad_norm": 0.82421875, + "learning_rate": 0.00016030687124749833, + "loss": 0.4763, + "step": 2403 + }, + { + "epoch": 0.32, + "grad_norm": 0.83984375, + "learning_rate": 0.00016037358238825883, + "loss": 0.9939, + "step": 2404 + }, + { + "epoch": 0.32, + "grad_norm": 0.94140625, + "learning_rate": 0.00016044029352901934, + "loss": 0.6299, + "step": 2405 + }, + { + "epoch": 0.32, + "grad_norm": 0.71484375, + "learning_rate": 0.00016050700466977986, + "loss": 0.4818, + "step": 2406 + }, + { + "epoch": 0.32, + "grad_norm": 0.890625, + "learning_rate": 0.00016057371581054038, + "loss": 0.392, + "step": 2407 + }, + { + "epoch": 0.32, + "grad_norm": 0.80859375, + "learning_rate": 0.00016064042695130087, + "loss": 0.5951, + "step": 2408 + }, + { + "epoch": 0.32, + "grad_norm": 0.93359375, + "learning_rate": 0.00016070713809206136, + "loss": 0.6464, + "step": 2409 + }, + { + "epoch": 0.32, + "grad_norm": 0.69921875, + "learning_rate": 0.00016077384923282188, + "loss": 0.7318, + "step": 2410 + }, + { + "epoch": 0.32, + "grad_norm": 0.7265625, + "learning_rate": 0.0001608405603735824, + "loss": 0.6229, + "step": 2411 + }, + { + "epoch": 0.32, + "grad_norm": 0.80078125, + "learning_rate": 0.0001609072715143429, + "loss": 0.5509, + "step": 2412 + }, + { + "epoch": 0.32, + "grad_norm": 0.62890625, + "learning_rate": 0.0001609739826551034, + "loss": 0.585, + "step": 2413 + }, + { + "epoch": 0.32, + "grad_norm": 0.84375, + "learning_rate": 0.0001610406937958639, + "loss": 0.8104, + "step": 2414 + }, + { + "epoch": 0.32, + "grad_norm": 0.75390625, + "learning_rate": 0.00016110740493662442, + "loss": 0.7991, + "step": 2415 + }, + { + "epoch": 0.32, + "grad_norm": 0.70703125, + "learning_rate": 0.00016117411607738494, + "loss": 0.644, + "step": 2416 + }, + { + "epoch": 0.32, + "grad_norm": 1.0546875, + "learning_rate": 0.00016124082721814543, + "loss": 0.5238, + "step": 2417 + }, + { + "epoch": 0.32, + "grad_norm": 0.81640625, + "learning_rate": 0.00016130753835890595, + "loss": 0.7727, + "step": 2418 + }, + { + "epoch": 0.32, + "grad_norm": 1.0078125, + "learning_rate": 0.00016137424949966644, + "loss": 0.9998, + "step": 2419 + }, + { + "epoch": 0.32, + "grad_norm": 0.74609375, + "learning_rate": 0.00016144096064042696, + "loss": 0.4693, + "step": 2420 + }, + { + "epoch": 0.32, + "grad_norm": 1.0859375, + "learning_rate": 0.00016150767178118745, + "loss": 0.6541, + "step": 2421 + }, + { + "epoch": 0.32, + "grad_norm": 1.0234375, + "learning_rate": 0.00016157438292194797, + "loss": 0.7804, + "step": 2422 + }, + { + "epoch": 0.32, + "grad_norm": 0.9140625, + "learning_rate": 0.0001616410940627085, + "loss": 0.542, + "step": 2423 + }, + { + "epoch": 0.32, + "grad_norm": 0.73046875, + "learning_rate": 0.00016170780520346898, + "loss": 0.5042, + "step": 2424 + }, + { + "epoch": 0.32, + "grad_norm": 0.9375, + "learning_rate": 0.0001617745163442295, + "loss": 0.4411, + "step": 2425 + }, + { + "epoch": 0.32, + "grad_norm": 1.0859375, + "learning_rate": 0.00016184122748499, + "loss": 0.6559, + "step": 2426 + }, + { + "epoch": 0.32, + "grad_norm": 0.7109375, + "learning_rate": 0.0001619079386257505, + "loss": 1.0686, + "step": 2427 + }, + { + "epoch": 0.32, + "grad_norm": 0.8359375, + "learning_rate": 0.00016197464976651103, + "loss": 0.6222, + "step": 2428 + }, + { + "epoch": 0.32, + "grad_norm": 0.6953125, + "learning_rate": 0.00016204136090727152, + "loss": 0.7494, + "step": 2429 + }, + { + "epoch": 0.32, + "grad_norm": 0.65625, + "learning_rate": 0.000162108072048032, + "loss": 0.4358, + "step": 2430 + }, + { + "epoch": 0.32, + "grad_norm": 0.859375, + "learning_rate": 0.00016217478318879253, + "loss": 0.5264, + "step": 2431 + }, + { + "epoch": 0.32, + "grad_norm": 0.9921875, + "learning_rate": 0.00016224149432955305, + "loss": 0.5202, + "step": 2432 + }, + { + "epoch": 0.32, + "grad_norm": 0.625, + "learning_rate": 0.00016230820547031357, + "loss": 0.7147, + "step": 2433 + }, + { + "epoch": 0.32, + "grad_norm": 1.0, + "learning_rate": 0.00016237491661107406, + "loss": 0.6857, + "step": 2434 + }, + { + "epoch": 0.32, + "grad_norm": 0.6875, + "learning_rate": 0.00016244162775183455, + "loss": 0.4965, + "step": 2435 + }, + { + "epoch": 0.33, + "grad_norm": 0.984375, + "learning_rate": 0.00016250833889259507, + "loss": 0.9044, + "step": 2436 + }, + { + "epoch": 0.33, + "grad_norm": 1.2734375, + "learning_rate": 0.00016257505003335559, + "loss": 0.5653, + "step": 2437 + }, + { + "epoch": 0.33, + "grad_norm": 0.8203125, + "learning_rate": 0.0001626417611741161, + "loss": 0.722, + "step": 2438 + }, + { + "epoch": 0.33, + "grad_norm": 1.140625, + "learning_rate": 0.0001627084723148766, + "loss": 0.7058, + "step": 2439 + }, + { + "epoch": 0.33, + "grad_norm": 0.78125, + "learning_rate": 0.0001627751834556371, + "loss": 0.8188, + "step": 2440 + }, + { + "epoch": 0.33, + "grad_norm": 0.796875, + "learning_rate": 0.0001628418945963976, + "loss": 0.5158, + "step": 2441 + }, + { + "epoch": 0.33, + "grad_norm": 0.78515625, + "learning_rate": 0.00016290860573715812, + "loss": 0.5719, + "step": 2442 + }, + { + "epoch": 0.33, + "grad_norm": 0.59765625, + "learning_rate": 0.00016297531687791862, + "loss": 0.8351, + "step": 2443 + }, + { + "epoch": 0.33, + "grad_norm": 0.81640625, + "learning_rate": 0.00016304202801867913, + "loss": 0.6343, + "step": 2444 + }, + { + "epoch": 0.33, + "grad_norm": 0.71484375, + "learning_rate": 0.00016310873915943963, + "loss": 0.6329, + "step": 2445 + }, + { + "epoch": 0.33, + "grad_norm": 0.6796875, + "learning_rate": 0.00016317545030020015, + "loss": 0.4969, + "step": 2446 + }, + { + "epoch": 0.33, + "grad_norm": 0.73046875, + "learning_rate": 0.00016324216144096066, + "loss": 0.8072, + "step": 2447 + }, + { + "epoch": 0.33, + "grad_norm": 1.0859375, + "learning_rate": 0.00016330887258172116, + "loss": 0.7726, + "step": 2448 + }, + { + "epoch": 0.33, + "grad_norm": 0.70703125, + "learning_rate": 0.00016337558372248167, + "loss": 0.8245, + "step": 2449 + }, + { + "epoch": 0.33, + "grad_norm": 0.76171875, + "learning_rate": 0.00016344229486324217, + "loss": 0.6816, + "step": 2450 + }, + { + "epoch": 0.33, + "grad_norm": 0.71875, + "learning_rate": 0.00016350900600400268, + "loss": 0.6773, + "step": 2451 + }, + { + "epoch": 0.33, + "grad_norm": 0.66015625, + "learning_rate": 0.00016357571714476318, + "loss": 0.5291, + "step": 2452 + }, + { + "epoch": 0.33, + "grad_norm": 0.76953125, + "learning_rate": 0.0001636424282855237, + "loss": 0.7353, + "step": 2453 + }, + { + "epoch": 0.33, + "grad_norm": 0.6015625, + "learning_rate": 0.0001637091394262842, + "loss": 0.4725, + "step": 2454 + }, + { + "epoch": 0.33, + "grad_norm": 0.953125, + "learning_rate": 0.0001637758505670447, + "loss": 0.7382, + "step": 2455 + }, + { + "epoch": 0.33, + "grad_norm": 0.91015625, + "learning_rate": 0.0001638425617078052, + "loss": 0.9264, + "step": 2456 + }, + { + "epoch": 0.33, + "grad_norm": 0.765625, + "learning_rate": 0.00016390927284856571, + "loss": 0.6305, + "step": 2457 + }, + { + "epoch": 0.33, + "grad_norm": 1.09375, + "learning_rate": 0.00016397598398932623, + "loss": 0.9758, + "step": 2458 + }, + { + "epoch": 0.33, + "grad_norm": 0.76953125, + "learning_rate": 0.00016404269513008675, + "loss": 0.8077, + "step": 2459 + }, + { + "epoch": 0.33, + "grad_norm": 1.09375, + "learning_rate": 0.00016410940627084724, + "loss": 0.3845, + "step": 2460 + }, + { + "epoch": 0.33, + "grad_norm": 0.75390625, + "learning_rate": 0.00016417611741160773, + "loss": 0.5262, + "step": 2461 + }, + { + "epoch": 0.33, + "grad_norm": 0.7578125, + "learning_rate": 0.00016424282855236825, + "loss": 0.6964, + "step": 2462 + }, + { + "epoch": 0.33, + "grad_norm": 0.6953125, + "learning_rate": 0.00016430953969312877, + "loss": 0.483, + "step": 2463 + }, + { + "epoch": 0.33, + "grad_norm": 0.66015625, + "learning_rate": 0.0001643762508338893, + "loss": 0.7175, + "step": 2464 + }, + { + "epoch": 0.33, + "grad_norm": 0.83203125, + "learning_rate": 0.00016444296197464975, + "loss": 0.7197, + "step": 2465 + }, + { + "epoch": 0.33, + "grad_norm": 0.6796875, + "learning_rate": 0.00016450967311541027, + "loss": 0.5703, + "step": 2466 + }, + { + "epoch": 0.33, + "grad_norm": 0.9140625, + "learning_rate": 0.0001645763842561708, + "loss": 0.7666, + "step": 2467 + }, + { + "epoch": 0.33, + "grad_norm": 0.84375, + "learning_rate": 0.0001646430953969313, + "loss": 0.5458, + "step": 2468 + }, + { + "epoch": 0.33, + "grad_norm": 0.7578125, + "learning_rate": 0.0001647098065376918, + "loss": 0.5998, + "step": 2469 + }, + { + "epoch": 0.33, + "grad_norm": 0.76171875, + "learning_rate": 0.0001647765176784523, + "loss": 0.7397, + "step": 2470 + }, + { + "epoch": 0.33, + "grad_norm": 0.984375, + "learning_rate": 0.0001648432288192128, + "loss": 0.5599, + "step": 2471 + }, + { + "epoch": 0.33, + "grad_norm": 0.8359375, + "learning_rate": 0.00016490993995997333, + "loss": 0.8587, + "step": 2472 + }, + { + "epoch": 0.33, + "grad_norm": 0.8359375, + "learning_rate": 0.00016497665110073385, + "loss": 0.6752, + "step": 2473 + }, + { + "epoch": 0.33, + "grad_norm": 0.625, + "learning_rate": 0.00016504336224149434, + "loss": 0.4706, + "step": 2474 + }, + { + "epoch": 0.33, + "grad_norm": 0.9140625, + "learning_rate": 0.00016511007338225483, + "loss": 0.697, + "step": 2475 + }, + { + "epoch": 0.33, + "grad_norm": 0.94921875, + "learning_rate": 0.00016517678452301535, + "loss": 1.2714, + "step": 2476 + }, + { + "epoch": 0.33, + "grad_norm": 0.65625, + "learning_rate": 0.00016524349566377587, + "loss": 0.6237, + "step": 2477 + }, + { + "epoch": 0.33, + "grad_norm": 0.75, + "learning_rate": 0.00016531020680453636, + "loss": 0.7714, + "step": 2478 + }, + { + "epoch": 0.33, + "grad_norm": 0.69140625, + "learning_rate": 0.00016537691794529688, + "loss": 0.2753, + "step": 2479 + }, + { + "epoch": 0.33, + "grad_norm": 0.984375, + "learning_rate": 0.00016544362908605737, + "loss": 1.0847, + "step": 2480 + }, + { + "epoch": 0.33, + "grad_norm": 0.98046875, + "learning_rate": 0.0001655103402268179, + "loss": 0.9483, + "step": 2481 + }, + { + "epoch": 0.33, + "grad_norm": 0.84375, + "learning_rate": 0.0001655770513675784, + "loss": 0.8332, + "step": 2482 + }, + { + "epoch": 0.33, + "grad_norm": 0.828125, + "learning_rate": 0.0001656437625083389, + "loss": 0.6272, + "step": 2483 + }, + { + "epoch": 0.33, + "grad_norm": 0.7890625, + "learning_rate": 0.00016571047364909942, + "loss": 0.8456, + "step": 2484 + }, + { + "epoch": 0.33, + "grad_norm": 0.7265625, + "learning_rate": 0.0001657771847898599, + "loss": 0.7966, + "step": 2485 + }, + { + "epoch": 0.33, + "grad_norm": 0.73046875, + "learning_rate": 0.00016584389593062043, + "loss": 0.5827, + "step": 2486 + }, + { + "epoch": 0.33, + "grad_norm": 0.84765625, + "learning_rate": 0.00016591060707138092, + "loss": 0.553, + "step": 2487 + }, + { + "epoch": 0.33, + "grad_norm": 0.7421875, + "learning_rate": 0.00016597731821214144, + "loss": 0.4235, + "step": 2488 + }, + { + "epoch": 0.33, + "grad_norm": 0.74609375, + "learning_rate": 0.00016604402935290196, + "loss": 0.4657, + "step": 2489 + }, + { + "epoch": 0.33, + "grad_norm": 0.7734375, + "learning_rate": 0.00016611074049366245, + "loss": 0.9241, + "step": 2490 + }, + { + "epoch": 0.33, + "grad_norm": 0.6875, + "learning_rate": 0.00016617745163442297, + "loss": 0.6808, + "step": 2491 + }, + { + "epoch": 0.33, + "grad_norm": 0.81640625, + "learning_rate": 0.00016624416277518346, + "loss": 0.5755, + "step": 2492 + }, + { + "epoch": 0.33, + "grad_norm": 0.7578125, + "learning_rate": 0.00016631087391594398, + "loss": 0.5782, + "step": 2493 + }, + { + "epoch": 0.33, + "grad_norm": 0.69140625, + "learning_rate": 0.0001663775850567045, + "loss": 0.6362, + "step": 2494 + }, + { + "epoch": 0.33, + "grad_norm": 0.76953125, + "learning_rate": 0.000166444296197465, + "loss": 0.8338, + "step": 2495 + }, + { + "epoch": 0.33, + "grad_norm": 0.7265625, + "learning_rate": 0.00016651100733822548, + "loss": 0.6575, + "step": 2496 + }, + { + "epoch": 0.33, + "grad_norm": 0.73828125, + "learning_rate": 0.000166577718478986, + "loss": 0.6098, + "step": 2497 + }, + { + "epoch": 0.33, + "grad_norm": 1.1171875, + "learning_rate": 0.00016664442961974652, + "loss": 0.4769, + "step": 2498 + }, + { + "epoch": 0.33, + "grad_norm": 0.71484375, + "learning_rate": 0.00016671114076050703, + "loss": 0.5371, + "step": 2499 + }, + { + "epoch": 0.33, + "grad_norm": 0.703125, + "learning_rate": 0.00016677785190126753, + "loss": 0.7066, + "step": 2500 + }, + { + "epoch": 0.33, + "grad_norm": 0.69921875, + "learning_rate": 0.00016684456304202802, + "loss": 0.5039, + "step": 2501 + }, + { + "epoch": 0.33, + "grad_norm": 0.625, + "learning_rate": 0.00016691127418278854, + "loss": 0.5554, + "step": 2502 + }, + { + "epoch": 0.33, + "grad_norm": 0.921875, + "learning_rate": 0.00016697798532354905, + "loss": 0.5039, + "step": 2503 + }, + { + "epoch": 0.33, + "grad_norm": 0.80859375, + "learning_rate": 0.00016704469646430955, + "loss": 0.7965, + "step": 2504 + }, + { + "epoch": 0.33, + "grad_norm": 0.75390625, + "learning_rate": 0.00016711140760507004, + "loss": 0.4405, + "step": 2505 + }, + { + "epoch": 0.33, + "grad_norm": 1.234375, + "learning_rate": 0.00016717811874583056, + "loss": 0.8497, + "step": 2506 + }, + { + "epoch": 0.33, + "grad_norm": 0.8359375, + "learning_rate": 0.00016724482988659107, + "loss": 0.6544, + "step": 2507 + }, + { + "epoch": 0.33, + "grad_norm": 1.015625, + "learning_rate": 0.0001673115410273516, + "loss": 0.7058, + "step": 2508 + }, + { + "epoch": 0.33, + "grad_norm": 0.7421875, + "learning_rate": 0.00016737825216811208, + "loss": 0.6908, + "step": 2509 + }, + { + "epoch": 0.33, + "grad_norm": 0.68359375, + "learning_rate": 0.00016744496330887258, + "loss": 0.7376, + "step": 2510 + }, + { + "epoch": 0.34, + "grad_norm": 0.8203125, + "learning_rate": 0.0001675116744496331, + "loss": 0.8594, + "step": 2511 + }, + { + "epoch": 0.34, + "grad_norm": 0.8125, + "learning_rate": 0.0001675783855903936, + "loss": 0.578, + "step": 2512 + }, + { + "epoch": 0.34, + "grad_norm": 0.671875, + "learning_rate": 0.0001676450967311541, + "loss": 0.6519, + "step": 2513 + }, + { + "epoch": 0.34, + "grad_norm": 0.734375, + "learning_rate": 0.00016771180787191462, + "loss": 0.6964, + "step": 2514 + }, + { + "epoch": 0.34, + "grad_norm": 0.609375, + "learning_rate": 0.00016777851901267511, + "loss": 0.784, + "step": 2515 + }, + { + "epoch": 0.34, + "grad_norm": 0.69921875, + "learning_rate": 0.00016784523015343563, + "loss": 0.6947, + "step": 2516 + }, + { + "epoch": 0.34, + "grad_norm": 0.74609375, + "learning_rate": 0.00016791194129419615, + "loss": 0.595, + "step": 2517 + }, + { + "epoch": 0.34, + "grad_norm": 0.671875, + "learning_rate": 0.00016797865243495664, + "loss": 0.487, + "step": 2518 + }, + { + "epoch": 0.34, + "grad_norm": 0.92578125, + "learning_rate": 0.00016804536357571716, + "loss": 0.9404, + "step": 2519 + }, + { + "epoch": 0.34, + "grad_norm": 0.82421875, + "learning_rate": 0.00016811207471647765, + "loss": 0.537, + "step": 2520 + }, + { + "epoch": 0.34, + "grad_norm": 1.109375, + "learning_rate": 0.00016817878585723817, + "loss": 0.5988, + "step": 2521 + }, + { + "epoch": 0.34, + "grad_norm": 0.84765625, + "learning_rate": 0.00016824549699799866, + "loss": 0.776, + "step": 2522 + }, + { + "epoch": 0.34, + "grad_norm": 0.75, + "learning_rate": 0.00016831220813875918, + "loss": 0.5426, + "step": 2523 + }, + { + "epoch": 0.34, + "grad_norm": 0.90625, + "learning_rate": 0.0001683789192795197, + "loss": 0.6261, + "step": 2524 + }, + { + "epoch": 0.34, + "grad_norm": 0.8046875, + "learning_rate": 0.0001684456304202802, + "loss": 0.5654, + "step": 2525 + }, + { + "epoch": 0.34, + "grad_norm": 0.6953125, + "learning_rate": 0.0001685123415610407, + "loss": 0.6898, + "step": 2526 + }, + { + "epoch": 0.34, + "grad_norm": 0.71875, + "learning_rate": 0.0001685790527018012, + "loss": 0.7394, + "step": 2527 + }, + { + "epoch": 0.34, + "grad_norm": 1.046875, + "learning_rate": 0.00016864576384256172, + "loss": 0.4192, + "step": 2528 + }, + { + "epoch": 0.34, + "grad_norm": 0.83984375, + "learning_rate": 0.00016871247498332224, + "loss": 0.6592, + "step": 2529 + }, + { + "epoch": 0.34, + "grad_norm": 0.90234375, + "learning_rate": 0.00016877918612408273, + "loss": 0.5448, + "step": 2530 + }, + { + "epoch": 0.34, + "grad_norm": 0.79296875, + "learning_rate": 0.00016884589726484322, + "loss": 0.8035, + "step": 2531 + }, + { + "epoch": 0.34, + "grad_norm": 0.9453125, + "learning_rate": 0.00016891260840560374, + "loss": 0.4974, + "step": 2532 + }, + { + "epoch": 0.34, + "grad_norm": 0.72265625, + "learning_rate": 0.00016897931954636426, + "loss": 1.0049, + "step": 2533 + }, + { + "epoch": 0.34, + "grad_norm": 0.91015625, + "learning_rate": 0.00016904603068712478, + "loss": 0.6845, + "step": 2534 + }, + { + "epoch": 0.34, + "grad_norm": 0.66015625, + "learning_rate": 0.00016911274182788527, + "loss": 0.5826, + "step": 2535 + }, + { + "epoch": 0.34, + "grad_norm": 0.9453125, + "learning_rate": 0.00016917945296864576, + "loss": 0.9801, + "step": 2536 + }, + { + "epoch": 0.34, + "grad_norm": 0.7578125, + "learning_rate": 0.00016924616410940628, + "loss": 0.7405, + "step": 2537 + }, + { + "epoch": 0.34, + "grad_norm": 0.70703125, + "learning_rate": 0.0001693128752501668, + "loss": 1.0402, + "step": 2538 + }, + { + "epoch": 0.34, + "grad_norm": 0.94921875, + "learning_rate": 0.0001693795863909273, + "loss": 0.583, + "step": 2539 + }, + { + "epoch": 0.34, + "grad_norm": 0.59375, + "learning_rate": 0.00016944629753168778, + "loss": 0.4913, + "step": 2540 + }, + { + "epoch": 0.34, + "grad_norm": 0.7578125, + "learning_rate": 0.0001695130086724483, + "loss": 0.6637, + "step": 2541 + }, + { + "epoch": 0.34, + "grad_norm": 0.69140625, + "learning_rate": 0.00016957971981320882, + "loss": 0.8699, + "step": 2542 + }, + { + "epoch": 0.34, + "grad_norm": 0.67578125, + "learning_rate": 0.00016964643095396934, + "loss": 0.4549, + "step": 2543 + }, + { + "epoch": 0.34, + "grad_norm": 0.84765625, + "learning_rate": 0.00016971314209472983, + "loss": 1.022, + "step": 2544 + }, + { + "epoch": 0.34, + "grad_norm": 0.8359375, + "learning_rate": 0.00016977985323549032, + "loss": 0.6925, + "step": 2545 + }, + { + "epoch": 0.34, + "grad_norm": 0.72265625, + "learning_rate": 0.00016984656437625084, + "loss": 0.5141, + "step": 2546 + }, + { + "epoch": 0.34, + "grad_norm": 0.83984375, + "learning_rate": 0.00016991327551701136, + "loss": 0.6728, + "step": 2547 + }, + { + "epoch": 0.34, + "grad_norm": 0.84765625, + "learning_rate": 0.00016997998665777185, + "loss": 0.9531, + "step": 2548 + }, + { + "epoch": 0.34, + "grad_norm": 0.66015625, + "learning_rate": 0.00017004669779853237, + "loss": 0.56, + "step": 2549 + }, + { + "epoch": 0.34, + "grad_norm": 0.796875, + "learning_rate": 0.00017011340893929286, + "loss": 0.9517, + "step": 2550 + }, + { + "epoch": 0.34, + "grad_norm": 0.85546875, + "learning_rate": 0.00017018012008005338, + "loss": 0.6365, + "step": 2551 + }, + { + "epoch": 0.34, + "grad_norm": 0.71484375, + "learning_rate": 0.0001702468312208139, + "loss": 0.5921, + "step": 2552 + }, + { + "epoch": 0.34, + "grad_norm": 0.9375, + "learning_rate": 0.0001703135423615744, + "loss": 0.5782, + "step": 2553 + }, + { + "epoch": 0.34, + "grad_norm": 0.92578125, + "learning_rate": 0.0001703802535023349, + "loss": 0.674, + "step": 2554 + }, + { + "epoch": 0.34, + "grad_norm": 0.96875, + "learning_rate": 0.0001704469646430954, + "loss": 0.8908, + "step": 2555 + }, + { + "epoch": 0.34, + "grad_norm": 0.640625, + "learning_rate": 0.00017051367578385592, + "loss": 0.7333, + "step": 2556 + }, + { + "epoch": 0.34, + "grad_norm": 0.66015625, + "learning_rate": 0.0001705803869246164, + "loss": 0.8139, + "step": 2557 + }, + { + "epoch": 0.34, + "grad_norm": 0.921875, + "learning_rate": 0.00017064709806537693, + "loss": 0.8364, + "step": 2558 + }, + { + "epoch": 0.34, + "grad_norm": 0.94921875, + "learning_rate": 0.00017071380920613744, + "loss": 0.7188, + "step": 2559 + }, + { + "epoch": 0.34, + "grad_norm": 0.67578125, + "learning_rate": 0.00017078052034689794, + "loss": 0.6576, + "step": 2560 + }, + { + "epoch": 0.34, + "grad_norm": 0.734375, + "learning_rate": 0.00017084723148765845, + "loss": 0.77, + "step": 2561 + }, + { + "epoch": 0.34, + "grad_norm": 0.859375, + "learning_rate": 0.00017091394262841895, + "loss": 0.4996, + "step": 2562 + }, + { + "epoch": 0.34, + "grad_norm": 0.78515625, + "learning_rate": 0.00017098065376917946, + "loss": 0.9895, + "step": 2563 + }, + { + "epoch": 0.34, + "grad_norm": 0.7890625, + "learning_rate": 0.00017104736490993998, + "loss": 0.4965, + "step": 2564 + }, + { + "epoch": 0.34, + "grad_norm": 0.8203125, + "learning_rate": 0.00017111407605070047, + "loss": 0.3961, + "step": 2565 + }, + { + "epoch": 0.34, + "grad_norm": 0.98828125, + "learning_rate": 0.00017118078719146097, + "loss": 0.7651, + "step": 2566 + }, + { + "epoch": 0.34, + "grad_norm": 0.7109375, + "learning_rate": 0.00017124749833222148, + "loss": 0.43, + "step": 2567 + }, + { + "epoch": 0.34, + "grad_norm": 0.734375, + "learning_rate": 0.000171314209472982, + "loss": 0.6612, + "step": 2568 + }, + { + "epoch": 0.34, + "grad_norm": 0.6875, + "learning_rate": 0.00017138092061374252, + "loss": 0.5199, + "step": 2569 + }, + { + "epoch": 0.34, + "grad_norm": 0.89453125, + "learning_rate": 0.000171447631754503, + "loss": 0.7798, + "step": 2570 + }, + { + "epoch": 0.34, + "grad_norm": 0.86328125, + "learning_rate": 0.0001715143428952635, + "loss": 0.8255, + "step": 2571 + }, + { + "epoch": 0.34, + "grad_norm": 0.81640625, + "learning_rate": 0.00017158105403602402, + "loss": 0.7122, + "step": 2572 + }, + { + "epoch": 0.34, + "grad_norm": 0.7890625, + "learning_rate": 0.00017164776517678454, + "loss": 0.5966, + "step": 2573 + }, + { + "epoch": 0.34, + "grad_norm": 0.7109375, + "learning_rate": 0.00017171447631754506, + "loss": 0.6281, + "step": 2574 + }, + { + "epoch": 0.34, + "grad_norm": 0.86328125, + "learning_rate": 0.00017178118745830553, + "loss": 0.6801, + "step": 2575 + }, + { + "epoch": 0.34, + "grad_norm": 0.80859375, + "learning_rate": 0.00017184789859906604, + "loss": 0.6041, + "step": 2576 + }, + { + "epoch": 0.34, + "grad_norm": 0.76171875, + "learning_rate": 0.00017191460973982656, + "loss": 0.7593, + "step": 2577 + }, + { + "epoch": 0.34, + "grad_norm": 0.81640625, + "learning_rate": 0.00017198132088058708, + "loss": 0.7715, + "step": 2578 + }, + { + "epoch": 0.34, + "grad_norm": 0.859375, + "learning_rate": 0.00017204803202134757, + "loss": 0.6786, + "step": 2579 + }, + { + "epoch": 0.34, + "grad_norm": 0.59765625, + "learning_rate": 0.00017211474316210806, + "loss": 0.9628, + "step": 2580 + }, + { + "epoch": 0.34, + "grad_norm": 0.8046875, + "learning_rate": 0.00017218145430286858, + "loss": 0.5487, + "step": 2581 + }, + { + "epoch": 0.34, + "grad_norm": 0.67578125, + "learning_rate": 0.0001722481654436291, + "loss": 0.766, + "step": 2582 + }, + { + "epoch": 0.34, + "grad_norm": 0.75390625, + "learning_rate": 0.00017231487658438962, + "loss": 0.7232, + "step": 2583 + }, + { + "epoch": 0.34, + "grad_norm": 0.64453125, + "learning_rate": 0.0001723815877251501, + "loss": 0.4935, + "step": 2584 + }, + { + "epoch": 0.34, + "grad_norm": 0.6953125, + "learning_rate": 0.0001724482988659106, + "loss": 0.7436, + "step": 2585 + }, + { + "epoch": 0.35, + "grad_norm": 0.796875, + "learning_rate": 0.00017251501000667112, + "loss": 0.6319, + "step": 2586 + }, + { + "epoch": 0.35, + "grad_norm": 0.70703125, + "learning_rate": 0.00017258172114743164, + "loss": 0.5248, + "step": 2587 + }, + { + "epoch": 0.35, + "grad_norm": 0.734375, + "learning_rate": 0.00017264843228819213, + "loss": 0.5355, + "step": 2588 + }, + { + "epoch": 0.35, + "grad_norm": 0.72265625, + "learning_rate": 0.00017271514342895265, + "loss": 0.9274, + "step": 2589 + }, + { + "epoch": 0.35, + "grad_norm": 0.79296875, + "learning_rate": 0.00017278185456971314, + "loss": 0.4977, + "step": 2590 + }, + { + "epoch": 0.35, + "grad_norm": 0.65234375, + "learning_rate": 0.00017284856571047366, + "loss": 0.6947, + "step": 2591 + }, + { + "epoch": 0.35, + "grad_norm": 0.68359375, + "learning_rate": 0.00017291527685123415, + "loss": 0.413, + "step": 2592 + }, + { + "epoch": 0.35, + "grad_norm": 0.8203125, + "learning_rate": 0.00017298198799199467, + "loss": 1.1319, + "step": 2593 + }, + { + "epoch": 0.35, + "grad_norm": 0.74609375, + "learning_rate": 0.0001730486991327552, + "loss": 0.6037, + "step": 2594 + }, + { + "epoch": 0.35, + "grad_norm": 0.7265625, + "learning_rate": 0.00017311541027351568, + "loss": 0.6674, + "step": 2595 + }, + { + "epoch": 0.35, + "grad_norm": 0.77734375, + "learning_rate": 0.0001731821214142762, + "loss": 0.6832, + "step": 2596 + }, + { + "epoch": 0.35, + "grad_norm": 0.77734375, + "learning_rate": 0.0001732488325550367, + "loss": 0.7076, + "step": 2597 + }, + { + "epoch": 0.35, + "grad_norm": 0.953125, + "learning_rate": 0.0001733155436957972, + "loss": 0.9041, + "step": 2598 + }, + { + "epoch": 0.35, + "grad_norm": 0.67578125, + "learning_rate": 0.00017338225483655773, + "loss": 0.8151, + "step": 2599 + }, + { + "epoch": 0.35, + "grad_norm": 0.9140625, + "learning_rate": 0.00017344896597731822, + "loss": 0.8459, + "step": 2600 + }, + { + "epoch": 0.35, + "grad_norm": 0.671875, + "learning_rate": 0.0001735156771180787, + "loss": 0.6789, + "step": 2601 + }, + { + "epoch": 0.35, + "grad_norm": 0.87109375, + "learning_rate": 0.00017358238825883923, + "loss": 0.5535, + "step": 2602 + }, + { + "epoch": 0.35, + "grad_norm": 0.58203125, + "learning_rate": 0.00017364909939959975, + "loss": 0.4458, + "step": 2603 + }, + { + "epoch": 0.35, + "grad_norm": 0.859375, + "learning_rate": 0.00017371581054036027, + "loss": 0.6854, + "step": 2604 + }, + { + "epoch": 0.35, + "grad_norm": 1.0234375, + "learning_rate": 0.00017378252168112076, + "loss": 0.7178, + "step": 2605 + }, + { + "epoch": 0.35, + "grad_norm": 0.78125, + "learning_rate": 0.00017384923282188125, + "loss": 0.8083, + "step": 2606 + }, + { + "epoch": 0.35, + "grad_norm": 0.6953125, + "learning_rate": 0.00017391594396264177, + "loss": 0.9354, + "step": 2607 + }, + { + "epoch": 0.35, + "grad_norm": 0.67578125, + "learning_rate": 0.00017398265510340229, + "loss": 0.6883, + "step": 2608 + }, + { + "epoch": 0.35, + "grad_norm": 0.73046875, + "learning_rate": 0.0001740493662441628, + "loss": 0.5291, + "step": 2609 + }, + { + "epoch": 0.35, + "grad_norm": 0.6953125, + "learning_rate": 0.00017411607738492327, + "loss": 0.5831, + "step": 2610 + }, + { + "epoch": 0.35, + "grad_norm": 0.69140625, + "learning_rate": 0.0001741827885256838, + "loss": 0.6945, + "step": 2611 + }, + { + "epoch": 0.35, + "grad_norm": 0.80859375, + "learning_rate": 0.0001742494996664443, + "loss": 0.834, + "step": 2612 + }, + { + "epoch": 0.35, + "grad_norm": 0.625, + "learning_rate": 0.00017431621080720482, + "loss": 0.3997, + "step": 2613 + }, + { + "epoch": 0.35, + "grad_norm": 1.0234375, + "learning_rate": 0.00017438292194796532, + "loss": 0.5094, + "step": 2614 + }, + { + "epoch": 0.35, + "grad_norm": 0.69921875, + "learning_rate": 0.0001744496330887258, + "loss": 0.5175, + "step": 2615 + }, + { + "epoch": 0.35, + "grad_norm": 0.8828125, + "learning_rate": 0.00017451634422948633, + "loss": 0.6155, + "step": 2616 + }, + { + "epoch": 0.35, + "grad_norm": 0.77734375, + "learning_rate": 0.00017458305537024685, + "loss": 0.6422, + "step": 2617 + }, + { + "epoch": 0.35, + "grad_norm": 0.828125, + "learning_rate": 0.00017464976651100736, + "loss": 0.4766, + "step": 2618 + }, + { + "epoch": 0.35, + "grad_norm": 0.97265625, + "learning_rate": 0.00017471647765176786, + "loss": 0.8225, + "step": 2619 + }, + { + "epoch": 0.35, + "grad_norm": 0.90234375, + "learning_rate": 0.00017478318879252835, + "loss": 0.6346, + "step": 2620 + }, + { + "epoch": 0.35, + "grad_norm": 0.8671875, + "learning_rate": 0.00017484989993328887, + "loss": 0.7845, + "step": 2621 + }, + { + "epoch": 0.35, + "grad_norm": 0.8515625, + "learning_rate": 0.00017491661107404938, + "loss": 0.7424, + "step": 2622 + }, + { + "epoch": 0.35, + "grad_norm": 0.75, + "learning_rate": 0.00017498332221480988, + "loss": 0.7328, + "step": 2623 + }, + { + "epoch": 0.35, + "grad_norm": 0.63671875, + "learning_rate": 0.0001750500333555704, + "loss": 0.6134, + "step": 2624 + }, + { + "epoch": 0.35, + "grad_norm": 0.70703125, + "learning_rate": 0.00017511674449633089, + "loss": 0.6401, + "step": 2625 + }, + { + "epoch": 0.35, + "grad_norm": 1.0703125, + "learning_rate": 0.0001751834556370914, + "loss": 0.8285, + "step": 2626 + }, + { + "epoch": 0.35, + "grad_norm": 0.80859375, + "learning_rate": 0.00017525016677785192, + "loss": 0.6025, + "step": 2627 + }, + { + "epoch": 0.35, + "grad_norm": 0.6953125, + "learning_rate": 0.00017531687791861241, + "loss": 0.5751, + "step": 2628 + }, + { + "epoch": 0.35, + "grad_norm": 0.66015625, + "learning_rate": 0.00017538358905937293, + "loss": 0.418, + "step": 2629 + }, + { + "epoch": 0.35, + "grad_norm": 0.61328125, + "learning_rate": 0.00017545030020013342, + "loss": 0.6591, + "step": 2630 + }, + { + "epoch": 0.35, + "grad_norm": 0.71484375, + "learning_rate": 0.00017551701134089394, + "loss": 0.6861, + "step": 2631 + }, + { + "epoch": 0.35, + "grad_norm": 0.57421875, + "learning_rate": 0.00017558372248165443, + "loss": 0.5436, + "step": 2632 + }, + { + "epoch": 0.35, + "grad_norm": 0.859375, + "learning_rate": 0.00017565043362241495, + "loss": 0.8992, + "step": 2633 + }, + { + "epoch": 0.35, + "grad_norm": 0.97265625, + "learning_rate": 0.00017571714476317547, + "loss": 0.6205, + "step": 2634 + }, + { + "epoch": 0.35, + "grad_norm": 0.7265625, + "learning_rate": 0.00017578385590393596, + "loss": 0.9339, + "step": 2635 + }, + { + "epoch": 0.35, + "grad_norm": 0.75, + "learning_rate": 0.00017585056704469648, + "loss": 0.2208, + "step": 2636 + }, + { + "epoch": 0.35, + "grad_norm": 0.90234375, + "learning_rate": 0.00017591727818545697, + "loss": 0.8593, + "step": 2637 + }, + { + "epoch": 0.35, + "grad_norm": 0.83203125, + "learning_rate": 0.0001759839893262175, + "loss": 1.0275, + "step": 2638 + }, + { + "epoch": 0.35, + "grad_norm": 0.85546875, + "learning_rate": 0.000176050700466978, + "loss": 0.9764, + "step": 2639 + }, + { + "epoch": 0.35, + "grad_norm": 1.15625, + "learning_rate": 0.0001761174116077385, + "loss": 0.7717, + "step": 2640 + }, + { + "epoch": 0.35, + "grad_norm": 0.85546875, + "learning_rate": 0.000176184122748499, + "loss": 0.8856, + "step": 2641 + }, + { + "epoch": 0.35, + "grad_norm": 0.796875, + "learning_rate": 0.0001762508338892595, + "loss": 0.5747, + "step": 2642 + }, + { + "epoch": 0.35, + "grad_norm": 0.703125, + "learning_rate": 0.00017631754503002003, + "loss": 0.8207, + "step": 2643 + }, + { + "epoch": 0.35, + "grad_norm": 0.7109375, + "learning_rate": 0.00017638425617078055, + "loss": 0.9048, + "step": 2644 + }, + { + "epoch": 0.35, + "grad_norm": 0.71875, + "learning_rate": 0.000176450967311541, + "loss": 0.6731, + "step": 2645 + }, + { + "epoch": 0.35, + "grad_norm": 1.09375, + "learning_rate": 0.00017651767845230153, + "loss": 0.7249, + "step": 2646 + }, + { + "epoch": 0.35, + "grad_norm": 0.7578125, + "learning_rate": 0.00017658438959306205, + "loss": 0.8945, + "step": 2647 + }, + { + "epoch": 0.35, + "grad_norm": 0.8984375, + "learning_rate": 0.00017665110073382257, + "loss": 0.7371, + "step": 2648 + }, + { + "epoch": 0.35, + "grad_norm": 0.875, + "learning_rate": 0.00017671781187458306, + "loss": 0.4241, + "step": 2649 + }, + { + "epoch": 0.35, + "grad_norm": 0.90234375, + "learning_rate": 0.00017678452301534355, + "loss": 0.7129, + "step": 2650 + }, + { + "epoch": 0.35, + "grad_norm": 0.95703125, + "learning_rate": 0.00017685123415610407, + "loss": 0.7439, + "step": 2651 + }, + { + "epoch": 0.35, + "grad_norm": 0.9921875, + "learning_rate": 0.0001769179452968646, + "loss": 0.7077, + "step": 2652 + }, + { + "epoch": 0.35, + "grad_norm": 0.9453125, + "learning_rate": 0.0001769846564376251, + "loss": 0.7173, + "step": 2653 + }, + { + "epoch": 0.35, + "grad_norm": 0.9609375, + "learning_rate": 0.0001770513675783856, + "loss": 0.6467, + "step": 2654 + }, + { + "epoch": 0.35, + "grad_norm": 0.8515625, + "learning_rate": 0.0001771180787191461, + "loss": 0.6983, + "step": 2655 + }, + { + "epoch": 0.35, + "grad_norm": 0.84375, + "learning_rate": 0.0001771847898599066, + "loss": 0.5061, + "step": 2656 + }, + { + "epoch": 0.35, + "grad_norm": 1.1328125, + "learning_rate": 0.00017725150100066713, + "loss": 0.9346, + "step": 2657 + }, + { + "epoch": 0.35, + "grad_norm": 0.71875, + "learning_rate": 0.00017731821214142762, + "loss": 0.7308, + "step": 2658 + }, + { + "epoch": 0.35, + "grad_norm": 0.5390625, + "learning_rate": 0.00017738492328218814, + "loss": 0.5528, + "step": 2659 + }, + { + "epoch": 0.35, + "grad_norm": 0.72265625, + "learning_rate": 0.00017745163442294863, + "loss": 0.7528, + "step": 2660 + }, + { + "epoch": 0.36, + "grad_norm": 0.734375, + "learning_rate": 0.00017751834556370915, + "loss": 0.8441, + "step": 2661 + }, + { + "epoch": 0.36, + "grad_norm": 0.73828125, + "learning_rate": 0.00017758505670446967, + "loss": 0.5773, + "step": 2662 + }, + { + "epoch": 0.36, + "grad_norm": 0.65625, + "learning_rate": 0.00017765176784523016, + "loss": 0.7939, + "step": 2663 + }, + { + "epoch": 0.36, + "grad_norm": 1.078125, + "learning_rate": 0.00017771847898599068, + "loss": 0.7252, + "step": 2664 + }, + { + "epoch": 0.36, + "grad_norm": 0.765625, + "learning_rate": 0.00017778519012675117, + "loss": 0.7336, + "step": 2665 + }, + { + "epoch": 0.36, + "grad_norm": 0.91796875, + "learning_rate": 0.0001778519012675117, + "loss": 0.5237, + "step": 2666 + }, + { + "epoch": 0.36, + "grad_norm": 1.015625, + "learning_rate": 0.00017791861240827218, + "loss": 0.7179, + "step": 2667 + }, + { + "epoch": 0.36, + "grad_norm": 0.76171875, + "learning_rate": 0.0001779853235490327, + "loss": 0.9224, + "step": 2668 + }, + { + "epoch": 0.36, + "grad_norm": 0.8359375, + "learning_rate": 0.00017805203468979322, + "loss": 0.6957, + "step": 2669 + }, + { + "epoch": 0.36, + "grad_norm": 0.796875, + "learning_rate": 0.0001781187458305537, + "loss": 0.4661, + "step": 2670 + }, + { + "epoch": 0.36, + "grad_norm": 0.79296875, + "learning_rate": 0.00017818545697131423, + "loss": 0.7408, + "step": 2671 + }, + { + "epoch": 0.36, + "grad_norm": 0.8828125, + "learning_rate": 0.00017825216811207472, + "loss": 0.7323, + "step": 2672 + }, + { + "epoch": 0.36, + "grad_norm": 0.7265625, + "learning_rate": 0.00017831887925283524, + "loss": 0.7134, + "step": 2673 + }, + { + "epoch": 0.36, + "grad_norm": 0.73046875, + "learning_rate": 0.00017838559039359575, + "loss": 0.6128, + "step": 2674 + }, + { + "epoch": 0.36, + "grad_norm": 0.765625, + "learning_rate": 0.00017845230153435625, + "loss": 0.6945, + "step": 2675 + }, + { + "epoch": 0.36, + "grad_norm": 0.61328125, + "learning_rate": 0.00017851901267511674, + "loss": 0.3045, + "step": 2676 + }, + { + "epoch": 0.36, + "grad_norm": 0.86328125, + "learning_rate": 0.00017858572381587726, + "loss": 0.6782, + "step": 2677 + }, + { + "epoch": 0.36, + "grad_norm": 0.78515625, + "learning_rate": 0.00017865243495663777, + "loss": 0.7471, + "step": 2678 + }, + { + "epoch": 0.36, + "grad_norm": 0.73046875, + "learning_rate": 0.0001787191460973983, + "loss": 0.4739, + "step": 2679 + }, + { + "epoch": 0.36, + "grad_norm": 0.7890625, + "learning_rate": 0.00017878585723815878, + "loss": 0.66, + "step": 2680 + }, + { + "epoch": 0.36, + "grad_norm": 0.8984375, + "learning_rate": 0.00017885256837891928, + "loss": 0.411, + "step": 2681 + }, + { + "epoch": 0.36, + "grad_norm": 1.5, + "learning_rate": 0.0001789192795196798, + "loss": 0.872, + "step": 2682 + }, + { + "epoch": 0.36, + "grad_norm": 0.96875, + "learning_rate": 0.0001789859906604403, + "loss": 0.7629, + "step": 2683 + }, + { + "epoch": 0.36, + "grad_norm": 0.93359375, + "learning_rate": 0.0001790527018012008, + "loss": 0.8081, + "step": 2684 + }, + { + "epoch": 0.36, + "grad_norm": 0.921875, + "learning_rate": 0.0001791194129419613, + "loss": 0.596, + "step": 2685 + }, + { + "epoch": 0.36, + "grad_norm": 0.875, + "learning_rate": 0.00017918612408272181, + "loss": 0.693, + "step": 2686 + }, + { + "epoch": 0.36, + "grad_norm": 0.72265625, + "learning_rate": 0.00017925283522348233, + "loss": 0.5356, + "step": 2687 + }, + { + "epoch": 0.36, + "grad_norm": 0.78125, + "learning_rate": 0.00017931954636424285, + "loss": 0.6772, + "step": 2688 + }, + { + "epoch": 0.36, + "grad_norm": 1.203125, + "learning_rate": 0.00017938625750500334, + "loss": 0.7695, + "step": 2689 + }, + { + "epoch": 0.36, + "grad_norm": 0.8515625, + "learning_rate": 0.00017945296864576383, + "loss": 0.5728, + "step": 2690 + }, + { + "epoch": 0.36, + "grad_norm": 0.78125, + "learning_rate": 0.00017951967978652435, + "loss": 0.7853, + "step": 2691 + }, + { + "epoch": 0.36, + "grad_norm": 0.87109375, + "learning_rate": 0.00017958639092728487, + "loss": 0.4353, + "step": 2692 + }, + { + "epoch": 0.36, + "grad_norm": 0.640625, + "learning_rate": 0.00017965310206804536, + "loss": 0.8105, + "step": 2693 + }, + { + "epoch": 0.36, + "grad_norm": 0.73046875, + "learning_rate": 0.00017971981320880588, + "loss": 0.7805, + "step": 2694 + }, + { + "epoch": 0.36, + "grad_norm": 0.73828125, + "learning_rate": 0.00017978652434956637, + "loss": 0.92, + "step": 2695 + }, + { + "epoch": 0.36, + "grad_norm": 0.69140625, + "learning_rate": 0.0001798532354903269, + "loss": 1.0521, + "step": 2696 + }, + { + "epoch": 0.36, + "grad_norm": 0.66796875, + "learning_rate": 0.0001799199466310874, + "loss": 0.5348, + "step": 2697 + }, + { + "epoch": 0.36, + "grad_norm": 0.66796875, + "learning_rate": 0.0001799866577718479, + "loss": 0.6498, + "step": 2698 + }, + { + "epoch": 0.36, + "grad_norm": 0.76953125, + "learning_rate": 0.00018005336891260842, + "loss": 0.707, + "step": 2699 + }, + { + "epoch": 0.36, + "grad_norm": 0.6015625, + "learning_rate": 0.0001801200800533689, + "loss": 0.5676, + "step": 2700 + }, + { + "epoch": 0.36, + "grad_norm": 0.72265625, + "learning_rate": 0.00018018679119412943, + "loss": 0.6111, + "step": 2701 + }, + { + "epoch": 0.36, + "grad_norm": 0.64453125, + "learning_rate": 0.00018025350233488992, + "loss": 0.7225, + "step": 2702 + }, + { + "epoch": 0.36, + "grad_norm": 0.80859375, + "learning_rate": 0.00018032021347565044, + "loss": 0.5858, + "step": 2703 + }, + { + "epoch": 0.36, + "grad_norm": 0.73046875, + "learning_rate": 0.00018038692461641096, + "loss": 0.5847, + "step": 2704 + }, + { + "epoch": 0.36, + "grad_norm": 0.77734375, + "learning_rate": 0.00018045363575717145, + "loss": 0.7464, + "step": 2705 + }, + { + "epoch": 0.36, + "grad_norm": 0.76953125, + "learning_rate": 0.00018052034689793197, + "loss": 0.761, + "step": 2706 + }, + { + "epoch": 0.36, + "grad_norm": 0.84375, + "learning_rate": 0.00018058705803869246, + "loss": 0.6088, + "step": 2707 + }, + { + "epoch": 0.36, + "grad_norm": 0.66015625, + "learning_rate": 0.00018065376917945298, + "loss": 0.6747, + "step": 2708 + }, + { + "epoch": 0.36, + "grad_norm": 0.890625, + "learning_rate": 0.0001807204803202135, + "loss": 0.7046, + "step": 2709 + }, + { + "epoch": 0.36, + "grad_norm": 0.90625, + "learning_rate": 0.000180787191460974, + "loss": 0.5551, + "step": 2710 + }, + { + "epoch": 0.36, + "grad_norm": 0.94140625, + "learning_rate": 0.00018085390260173448, + "loss": 0.766, + "step": 2711 + }, + { + "epoch": 0.36, + "grad_norm": 0.83984375, + "learning_rate": 0.000180920613742495, + "loss": 0.7895, + "step": 2712 + }, + { + "epoch": 0.36, + "grad_norm": 0.484375, + "learning_rate": 0.00018098732488325552, + "loss": 0.4179, + "step": 2713 + }, + { + "epoch": 0.36, + "grad_norm": 0.9765625, + "learning_rate": 0.00018105403602401604, + "loss": 0.6679, + "step": 2714 + }, + { + "epoch": 0.36, + "grad_norm": 0.8203125, + "learning_rate": 0.00018112074716477653, + "loss": 0.8019, + "step": 2715 + }, + { + "epoch": 0.36, + "grad_norm": 0.7109375, + "learning_rate": 0.00018118745830553702, + "loss": 0.3524, + "step": 2716 + }, + { + "epoch": 0.36, + "grad_norm": 0.8828125, + "learning_rate": 0.00018125416944629754, + "loss": 0.4788, + "step": 2717 + }, + { + "epoch": 0.36, + "grad_norm": 0.86328125, + "learning_rate": 0.00018132088058705806, + "loss": 0.6528, + "step": 2718 + }, + { + "epoch": 0.36, + "grad_norm": 0.9375, + "learning_rate": 0.00018138759172781855, + "loss": 0.952, + "step": 2719 + }, + { + "epoch": 0.36, + "grad_norm": 0.69921875, + "learning_rate": 0.00018145430286857904, + "loss": 0.4545, + "step": 2720 + }, + { + "epoch": 0.36, + "grad_norm": 0.48828125, + "learning_rate": 0.00018152101400933956, + "loss": 0.5278, + "step": 2721 + }, + { + "epoch": 0.36, + "grad_norm": 0.8125, + "learning_rate": 0.00018158772515010008, + "loss": 1.0056, + "step": 2722 + }, + { + "epoch": 0.36, + "grad_norm": 0.578125, + "learning_rate": 0.0001816544362908606, + "loss": 0.4371, + "step": 2723 + }, + { + "epoch": 0.36, + "grad_norm": 0.68359375, + "learning_rate": 0.0001817211474316211, + "loss": 0.6944, + "step": 2724 + }, + { + "epoch": 0.36, + "grad_norm": 0.63671875, + "learning_rate": 0.00018178785857238158, + "loss": 0.691, + "step": 2725 + }, + { + "epoch": 0.36, + "grad_norm": 0.8203125, + "learning_rate": 0.0001818545697131421, + "loss": 0.759, + "step": 2726 + }, + { + "epoch": 0.36, + "grad_norm": 0.67578125, + "learning_rate": 0.00018192128085390262, + "loss": 0.5792, + "step": 2727 + }, + { + "epoch": 0.36, + "grad_norm": 0.58203125, + "learning_rate": 0.0001819879919946631, + "loss": 0.587, + "step": 2728 + }, + { + "epoch": 0.36, + "grad_norm": 0.6328125, + "learning_rate": 0.00018205470313542363, + "loss": 0.5941, + "step": 2729 + }, + { + "epoch": 0.36, + "grad_norm": 0.68359375, + "learning_rate": 0.00018212141427618412, + "loss": 1.0216, + "step": 2730 + }, + { + "epoch": 0.36, + "grad_norm": 0.84765625, + "learning_rate": 0.00018218812541694464, + "loss": 0.8442, + "step": 2731 + }, + { + "epoch": 0.36, + "grad_norm": 0.82421875, + "learning_rate": 0.00018225483655770515, + "loss": 0.7038, + "step": 2732 + }, + { + "epoch": 0.36, + "grad_norm": 0.75390625, + "learning_rate": 0.00018232154769846565, + "loss": 0.6304, + "step": 2733 + }, + { + "epoch": 0.36, + "grad_norm": 0.890625, + "learning_rate": 0.00018238825883922616, + "loss": 0.8261, + "step": 2734 + }, + { + "epoch": 0.36, + "grad_norm": 0.875, + "learning_rate": 0.00018245496997998666, + "loss": 0.6921, + "step": 2735 + }, + { + "epoch": 0.37, + "grad_norm": 0.71484375, + "learning_rate": 0.00018252168112074717, + "loss": 0.6556, + "step": 2736 + }, + { + "epoch": 0.37, + "grad_norm": 1.0703125, + "learning_rate": 0.00018258839226150767, + "loss": 0.7471, + "step": 2737 + }, + { + "epoch": 0.37, + "grad_norm": 0.84375, + "learning_rate": 0.00018265510340226818, + "loss": 0.5999, + "step": 2738 + }, + { + "epoch": 0.37, + "grad_norm": 0.84765625, + "learning_rate": 0.0001827218145430287, + "loss": 0.7345, + "step": 2739 + }, + { + "epoch": 0.37, + "grad_norm": 0.6640625, + "learning_rate": 0.0001827885256837892, + "loss": 0.7819, + "step": 2740 + }, + { + "epoch": 0.37, + "grad_norm": 0.7734375, + "learning_rate": 0.00018285523682454971, + "loss": 0.6316, + "step": 2741 + }, + { + "epoch": 0.37, + "grad_norm": 0.703125, + "learning_rate": 0.0001829219479653102, + "loss": 0.5148, + "step": 2742 + }, + { + "epoch": 0.37, + "grad_norm": 0.66015625, + "learning_rate": 0.00018298865910607072, + "loss": 0.6607, + "step": 2743 + }, + { + "epoch": 0.37, + "grad_norm": 1.0234375, + "learning_rate": 0.00018305537024683124, + "loss": 0.5334, + "step": 2744 + }, + { + "epoch": 0.37, + "grad_norm": 0.66796875, + "learning_rate": 0.00018312208138759173, + "loss": 0.422, + "step": 2745 + }, + { + "epoch": 0.37, + "grad_norm": 0.69921875, + "learning_rate": 0.00018318879252835223, + "loss": 0.741, + "step": 2746 + }, + { + "epoch": 0.37, + "grad_norm": 0.796875, + "learning_rate": 0.00018325550366911274, + "loss": 0.6475, + "step": 2747 + }, + { + "epoch": 0.37, + "grad_norm": 0.8515625, + "learning_rate": 0.00018332221480987326, + "loss": 0.7379, + "step": 2748 + }, + { + "epoch": 0.37, + "grad_norm": 0.953125, + "learning_rate": 0.00018338892595063378, + "loss": 0.6023, + "step": 2749 + }, + { + "epoch": 0.37, + "grad_norm": 0.71875, + "learning_rate": 0.00018345563709139427, + "loss": 0.6737, + "step": 2750 + }, + { + "epoch": 0.37, + "grad_norm": 0.78125, + "learning_rate": 0.00018352234823215476, + "loss": 0.574, + "step": 2751 + }, + { + "epoch": 0.37, + "grad_norm": 0.97265625, + "learning_rate": 0.00018358905937291528, + "loss": 0.3661, + "step": 2752 + }, + { + "epoch": 0.37, + "grad_norm": 0.9296875, + "learning_rate": 0.0001836557705136758, + "loss": 0.6847, + "step": 2753 + }, + { + "epoch": 0.37, + "grad_norm": 0.984375, + "learning_rate": 0.00018372248165443632, + "loss": 0.7918, + "step": 2754 + }, + { + "epoch": 0.37, + "grad_norm": 1.0078125, + "learning_rate": 0.00018378919279519678, + "loss": 0.6461, + "step": 2755 + }, + { + "epoch": 0.37, + "grad_norm": 0.62109375, + "learning_rate": 0.0001838559039359573, + "loss": 0.449, + "step": 2756 + }, + { + "epoch": 0.37, + "grad_norm": 0.80078125, + "learning_rate": 0.00018392261507671782, + "loss": 0.5736, + "step": 2757 + }, + { + "epoch": 0.37, + "grad_norm": 0.70703125, + "learning_rate": 0.00018398932621747834, + "loss": 0.8308, + "step": 2758 + }, + { + "epoch": 0.37, + "grad_norm": 0.6015625, + "learning_rate": 0.00018405603735823883, + "loss": 0.5532, + "step": 2759 + }, + { + "epoch": 0.37, + "grad_norm": 0.70703125, + "learning_rate": 0.00018412274849899932, + "loss": 0.6083, + "step": 2760 + }, + { + "epoch": 0.37, + "grad_norm": 0.7578125, + "learning_rate": 0.00018418945963975984, + "loss": 0.6519, + "step": 2761 + }, + { + "epoch": 0.37, + "grad_norm": 0.8203125, + "learning_rate": 0.00018425617078052036, + "loss": 0.7469, + "step": 2762 + }, + { + "epoch": 0.37, + "grad_norm": 0.859375, + "learning_rate": 0.00018432288192128088, + "loss": 0.6941, + "step": 2763 + }, + { + "epoch": 0.37, + "grad_norm": 0.90234375, + "learning_rate": 0.00018438959306204137, + "loss": 0.5654, + "step": 2764 + }, + { + "epoch": 0.37, + "grad_norm": 0.80859375, + "learning_rate": 0.00018445630420280186, + "loss": 0.3057, + "step": 2765 + }, + { + "epoch": 0.37, + "grad_norm": 1.2109375, + "learning_rate": 0.00018452301534356238, + "loss": 0.852, + "step": 2766 + }, + { + "epoch": 0.37, + "grad_norm": 0.94921875, + "learning_rate": 0.0001845897264843229, + "loss": 0.7056, + "step": 2767 + }, + { + "epoch": 0.37, + "grad_norm": 0.78515625, + "learning_rate": 0.0001846564376250834, + "loss": 0.6297, + "step": 2768 + }, + { + "epoch": 0.37, + "grad_norm": 0.86328125, + "learning_rate": 0.0001847231487658439, + "loss": 0.6702, + "step": 2769 + }, + { + "epoch": 0.37, + "grad_norm": 1.0078125, + "learning_rate": 0.0001847898599066044, + "loss": 0.8302, + "step": 2770 + }, + { + "epoch": 0.37, + "grad_norm": 0.98046875, + "learning_rate": 0.00018485657104736492, + "loss": 0.6255, + "step": 2771 + }, + { + "epoch": 0.37, + "grad_norm": 0.7578125, + "learning_rate": 0.00018492328218812544, + "loss": 0.6051, + "step": 2772 + }, + { + "epoch": 0.37, + "grad_norm": 0.69140625, + "learning_rate": 0.00018498999332888593, + "loss": 0.6058, + "step": 2773 + }, + { + "epoch": 0.37, + "grad_norm": 0.921875, + "learning_rate": 0.00018505670446964645, + "loss": 1.0262, + "step": 2774 + }, + { + "epoch": 0.37, + "grad_norm": 0.69921875, + "learning_rate": 0.00018512341561040694, + "loss": 0.6439, + "step": 2775 + }, + { + "epoch": 0.37, + "grad_norm": 0.7265625, + "learning_rate": 0.00018519012675116746, + "loss": 0.73, + "step": 2776 + }, + { + "epoch": 0.37, + "grad_norm": 0.5859375, + "learning_rate": 0.00018525683789192795, + "loss": 0.4953, + "step": 2777 + }, + { + "epoch": 0.37, + "grad_norm": 0.74609375, + "learning_rate": 0.00018532354903268847, + "loss": 0.5411, + "step": 2778 + }, + { + "epoch": 0.37, + "grad_norm": 0.92578125, + "learning_rate": 0.00018539026017344899, + "loss": 0.6975, + "step": 2779 + }, + { + "epoch": 0.37, + "grad_norm": 1.3046875, + "learning_rate": 0.00018545697131420948, + "loss": 0.6259, + "step": 2780 + }, + { + "epoch": 0.37, + "grad_norm": 0.7578125, + "learning_rate": 0.00018552368245496997, + "loss": 0.5674, + "step": 2781 + }, + { + "epoch": 0.37, + "grad_norm": 0.6484375, + "learning_rate": 0.0001855903935957305, + "loss": 0.7595, + "step": 2782 + }, + { + "epoch": 0.37, + "grad_norm": 1.03125, + "learning_rate": 0.000185657104736491, + "loss": 1.0221, + "step": 2783 + }, + { + "epoch": 0.37, + "grad_norm": 0.7109375, + "learning_rate": 0.00018572381587725153, + "loss": 0.8682, + "step": 2784 + }, + { + "epoch": 0.37, + "grad_norm": 0.95703125, + "learning_rate": 0.00018579052701801202, + "loss": 0.614, + "step": 2785 + }, + { + "epoch": 0.37, + "grad_norm": 1.0234375, + "learning_rate": 0.0001858572381587725, + "loss": 0.7796, + "step": 2786 + }, + { + "epoch": 0.37, + "grad_norm": 0.86328125, + "learning_rate": 0.00018592394929953303, + "loss": 1.0967, + "step": 2787 + }, + { + "epoch": 0.37, + "grad_norm": 0.75, + "learning_rate": 0.00018599066044029355, + "loss": 0.6746, + "step": 2788 + }, + { + "epoch": 0.37, + "grad_norm": 0.60546875, + "learning_rate": 0.00018605737158105406, + "loss": 0.5548, + "step": 2789 + }, + { + "epoch": 0.37, + "grad_norm": 0.70703125, + "learning_rate": 0.00018612408272181453, + "loss": 0.4825, + "step": 2790 + }, + { + "epoch": 0.37, + "grad_norm": 1.0234375, + "learning_rate": 0.00018619079386257505, + "loss": 0.5828, + "step": 2791 + }, + { + "epoch": 0.37, + "grad_norm": 1.0859375, + "learning_rate": 0.00018625750500333557, + "loss": 0.7844, + "step": 2792 + }, + { + "epoch": 0.37, + "grad_norm": 0.7734375, + "learning_rate": 0.00018632421614409608, + "loss": 0.76, + "step": 2793 + }, + { + "epoch": 0.37, + "grad_norm": 0.6875, + "learning_rate": 0.00018639092728485658, + "loss": 0.7947, + "step": 2794 + }, + { + "epoch": 0.37, + "grad_norm": 0.78125, + "learning_rate": 0.00018645763842561707, + "loss": 0.7789, + "step": 2795 + }, + { + "epoch": 0.37, + "grad_norm": 0.9140625, + "learning_rate": 0.00018652434956637759, + "loss": 0.7076, + "step": 2796 + }, + { + "epoch": 0.37, + "grad_norm": 0.80078125, + "learning_rate": 0.0001865910607071381, + "loss": 0.6675, + "step": 2797 + }, + { + "epoch": 0.37, + "grad_norm": 0.72265625, + "learning_rate": 0.00018665777184789862, + "loss": 0.5858, + "step": 2798 + }, + { + "epoch": 0.37, + "grad_norm": 1.140625, + "learning_rate": 0.00018672448298865911, + "loss": 0.804, + "step": 2799 + }, + { + "epoch": 0.37, + "grad_norm": 0.94921875, + "learning_rate": 0.0001867911941294196, + "loss": 0.3067, + "step": 2800 + }, + { + "epoch": 0.37, + "grad_norm": 0.8125, + "learning_rate": 0.00018685790527018012, + "loss": 0.6097, + "step": 2801 + }, + { + "epoch": 0.37, + "grad_norm": 1.015625, + "learning_rate": 0.00018692461641094064, + "loss": 0.6774, + "step": 2802 + }, + { + "epoch": 0.37, + "grad_norm": 0.72265625, + "learning_rate": 0.00018699132755170113, + "loss": 0.5877, + "step": 2803 + }, + { + "epoch": 0.37, + "grad_norm": 0.94140625, + "learning_rate": 0.00018705803869246165, + "loss": 0.5621, + "step": 2804 + }, + { + "epoch": 0.37, + "grad_norm": 0.671875, + "learning_rate": 0.00018712474983322214, + "loss": 0.5363, + "step": 2805 + }, + { + "epoch": 0.37, + "grad_norm": 0.86328125, + "learning_rate": 0.00018719146097398266, + "loss": 0.7807, + "step": 2806 + }, + { + "epoch": 0.37, + "grad_norm": 0.74609375, + "learning_rate": 0.00018725817211474318, + "loss": 0.8144, + "step": 2807 + }, + { + "epoch": 0.37, + "grad_norm": 0.76171875, + "learning_rate": 0.00018732488325550367, + "loss": 0.5663, + "step": 2808 + }, + { + "epoch": 0.37, + "grad_norm": 0.69140625, + "learning_rate": 0.0001873915943962642, + "loss": 0.5385, + "step": 2809 + }, + { + "epoch": 0.37, + "grad_norm": 0.72265625, + "learning_rate": 0.00018745830553702468, + "loss": 0.6357, + "step": 2810 + }, + { + "epoch": 0.38, + "grad_norm": 0.625, + "learning_rate": 0.0001875250166777852, + "loss": 0.7798, + "step": 2811 + }, + { + "epoch": 0.38, + "grad_norm": 0.875, + "learning_rate": 0.0001875917278185457, + "loss": 0.6178, + "step": 2812 + }, + { + "epoch": 0.38, + "grad_norm": 0.5859375, + "learning_rate": 0.0001876584389593062, + "loss": 0.4787, + "step": 2813 + }, + { + "epoch": 0.38, + "grad_norm": 0.8203125, + "learning_rate": 0.00018772515010006673, + "loss": 0.4245, + "step": 2814 + }, + { + "epoch": 0.38, + "grad_norm": 1.0078125, + "learning_rate": 0.00018779186124082725, + "loss": 0.8548, + "step": 2815 + }, + { + "epoch": 0.38, + "grad_norm": 0.7109375, + "learning_rate": 0.00018785857238158774, + "loss": 0.8733, + "step": 2816 + }, + { + "epoch": 0.38, + "grad_norm": 0.91015625, + "learning_rate": 0.00018792528352234823, + "loss": 0.4582, + "step": 2817 + }, + { + "epoch": 0.38, + "grad_norm": 0.6953125, + "learning_rate": 0.00018799199466310875, + "loss": 0.7254, + "step": 2818 + }, + { + "epoch": 0.38, + "grad_norm": 0.91015625, + "learning_rate": 0.00018805870580386927, + "loss": 0.5325, + "step": 2819 + }, + { + "epoch": 0.38, + "grad_norm": 0.6796875, + "learning_rate": 0.00018812541694462976, + "loss": 0.7715, + "step": 2820 + }, + { + "epoch": 0.38, + "grad_norm": 1.109375, + "learning_rate": 0.00018819212808539025, + "loss": 0.6701, + "step": 2821 + }, + { + "epoch": 0.38, + "grad_norm": 0.81640625, + "learning_rate": 0.00018825883922615077, + "loss": 0.8043, + "step": 2822 + }, + { + "epoch": 0.38, + "grad_norm": 0.68359375, + "learning_rate": 0.0001883255503669113, + "loss": 0.3362, + "step": 2823 + }, + { + "epoch": 0.38, + "grad_norm": 0.65234375, + "learning_rate": 0.0001883922615076718, + "loss": 0.5089, + "step": 2824 + }, + { + "epoch": 0.38, + "grad_norm": 1.125, + "learning_rate": 0.0001884589726484323, + "loss": 0.6148, + "step": 2825 + }, + { + "epoch": 0.38, + "grad_norm": 0.734375, + "learning_rate": 0.0001885256837891928, + "loss": 0.7807, + "step": 2826 + }, + { + "epoch": 0.38, + "grad_norm": 0.70703125, + "learning_rate": 0.0001885923949299533, + "loss": 0.9471, + "step": 2827 + }, + { + "epoch": 0.38, + "grad_norm": 0.82421875, + "learning_rate": 0.00018865910607071383, + "loss": 0.549, + "step": 2828 + }, + { + "epoch": 0.38, + "grad_norm": 0.78515625, + "learning_rate": 0.00018872581721147432, + "loss": 0.5952, + "step": 2829 + }, + { + "epoch": 0.38, + "grad_norm": 0.62890625, + "learning_rate": 0.00018879252835223484, + "loss": 0.7226, + "step": 2830 + }, + { + "epoch": 0.38, + "grad_norm": 0.53515625, + "learning_rate": 0.00018885923949299533, + "loss": 0.5521, + "step": 2831 + }, + { + "epoch": 0.38, + "grad_norm": 0.765625, + "learning_rate": 0.00018892595063375585, + "loss": 0.556, + "step": 2832 + }, + { + "epoch": 0.38, + "grad_norm": 0.8046875, + "learning_rate": 0.00018899266177451637, + "loss": 0.7214, + "step": 2833 + }, + { + "epoch": 0.38, + "grad_norm": 1.0546875, + "learning_rate": 0.00018905937291527686, + "loss": 0.7989, + "step": 2834 + }, + { + "epoch": 0.38, + "grad_norm": 1.0546875, + "learning_rate": 0.00018912608405603738, + "loss": 0.6058, + "step": 2835 + }, + { + "epoch": 0.38, + "grad_norm": 0.66796875, + "learning_rate": 0.00018919279519679787, + "loss": 0.5993, + "step": 2836 + }, + { + "epoch": 0.38, + "grad_norm": 0.640625, + "learning_rate": 0.0001892595063375584, + "loss": 0.6336, + "step": 2837 + }, + { + "epoch": 0.38, + "grad_norm": 0.87890625, + "learning_rate": 0.00018932621747831888, + "loss": 0.5306, + "step": 2838 + }, + { + "epoch": 0.38, + "grad_norm": 0.8515625, + "learning_rate": 0.0001893929286190794, + "loss": 0.7093, + "step": 2839 + }, + { + "epoch": 0.38, + "grad_norm": 0.69140625, + "learning_rate": 0.00018945963975983992, + "loss": 0.6587, + "step": 2840 + }, + { + "epoch": 0.38, + "grad_norm": 0.69140625, + "learning_rate": 0.0001895263509006004, + "loss": 0.5183, + "step": 2841 + }, + { + "epoch": 0.38, + "grad_norm": 0.78515625, + "learning_rate": 0.00018959306204136093, + "loss": 0.7727, + "step": 2842 + }, + { + "epoch": 0.38, + "grad_norm": 0.765625, + "learning_rate": 0.00018965977318212142, + "loss": 0.5901, + "step": 2843 + }, + { + "epoch": 0.38, + "grad_norm": 0.7578125, + "learning_rate": 0.00018972648432288194, + "loss": 0.6796, + "step": 2844 + }, + { + "epoch": 0.38, + "grad_norm": 0.6796875, + "learning_rate": 0.00018979319546364245, + "loss": 0.5791, + "step": 2845 + }, + { + "epoch": 0.38, + "grad_norm": 0.71484375, + "learning_rate": 0.00018985990660440295, + "loss": 1.0217, + "step": 2846 + }, + { + "epoch": 0.38, + "grad_norm": 0.671875, + "learning_rate": 0.00018992661774516344, + "loss": 0.7456, + "step": 2847 + }, + { + "epoch": 0.38, + "grad_norm": 0.8515625, + "learning_rate": 0.00018999332888592396, + "loss": 0.6697, + "step": 2848 + }, + { + "epoch": 0.38, + "grad_norm": 0.77734375, + "learning_rate": 0.00019006004002668447, + "loss": 0.7453, + "step": 2849 + }, + { + "epoch": 0.38, + "grad_norm": 0.796875, + "learning_rate": 0.000190126751167445, + "loss": 0.3947, + "step": 2850 + }, + { + "epoch": 0.38, + "grad_norm": 0.79296875, + "learning_rate": 0.00019019346230820548, + "loss": 0.8329, + "step": 2851 + }, + { + "epoch": 0.38, + "grad_norm": 0.75390625, + "learning_rate": 0.00019026017344896598, + "loss": 0.6942, + "step": 2852 + }, + { + "epoch": 0.38, + "grad_norm": 0.79296875, + "learning_rate": 0.0001903268845897265, + "loss": 0.6227, + "step": 2853 + }, + { + "epoch": 0.38, + "grad_norm": 0.66796875, + "learning_rate": 0.000190393595730487, + "loss": 0.8487, + "step": 2854 + }, + { + "epoch": 0.38, + "grad_norm": 0.8671875, + "learning_rate": 0.0001904603068712475, + "loss": 0.5701, + "step": 2855 + }, + { + "epoch": 0.38, + "grad_norm": 1.0625, + "learning_rate": 0.000190527018012008, + "loss": 0.9391, + "step": 2856 + }, + { + "epoch": 0.38, + "grad_norm": 0.67578125, + "learning_rate": 0.00019059372915276851, + "loss": 0.504, + "step": 2857 + }, + { + "epoch": 0.38, + "grad_norm": 0.7109375, + "learning_rate": 0.00019066044029352903, + "loss": 0.621, + "step": 2858 + }, + { + "epoch": 0.38, + "grad_norm": 0.73046875, + "learning_rate": 0.00019072715143428955, + "loss": 0.7559, + "step": 2859 + }, + { + "epoch": 0.38, + "grad_norm": 0.70703125, + "learning_rate": 0.00019079386257505004, + "loss": 0.4692, + "step": 2860 + }, + { + "epoch": 0.38, + "grad_norm": 0.72265625, + "learning_rate": 0.00019086057371581053, + "loss": 0.797, + "step": 2861 + }, + { + "epoch": 0.38, + "grad_norm": 0.8359375, + "learning_rate": 0.00019092728485657105, + "loss": 0.6453, + "step": 2862 + }, + { + "epoch": 0.38, + "grad_norm": 0.8203125, + "learning_rate": 0.00019099399599733157, + "loss": 0.4311, + "step": 2863 + }, + { + "epoch": 0.38, + "grad_norm": 0.71484375, + "learning_rate": 0.00019106070713809206, + "loss": 0.5453, + "step": 2864 + }, + { + "epoch": 0.38, + "grad_norm": 0.9609375, + "learning_rate": 0.00019112741827885258, + "loss": 0.6254, + "step": 2865 + }, + { + "epoch": 0.38, + "grad_norm": 0.6328125, + "learning_rate": 0.00019119412941961307, + "loss": 0.56, + "step": 2866 + }, + { + "epoch": 0.38, + "grad_norm": 0.875, + "learning_rate": 0.0001912608405603736, + "loss": 0.6227, + "step": 2867 + }, + { + "epoch": 0.38, + "grad_norm": 0.66015625, + "learning_rate": 0.0001913275517011341, + "loss": 0.5792, + "step": 2868 + }, + { + "epoch": 0.38, + "grad_norm": 0.70703125, + "learning_rate": 0.0001913942628418946, + "loss": 0.5379, + "step": 2869 + }, + { + "epoch": 0.38, + "grad_norm": 0.875, + "learning_rate": 0.00019146097398265512, + "loss": 1.0046, + "step": 2870 + }, + { + "epoch": 0.38, + "grad_norm": 0.8828125, + "learning_rate": 0.0001915276851234156, + "loss": 0.712, + "step": 2871 + }, + { + "epoch": 0.38, + "grad_norm": 0.94921875, + "learning_rate": 0.00019159439626417613, + "loss": 0.5693, + "step": 2872 + }, + { + "epoch": 0.38, + "grad_norm": 0.87109375, + "learning_rate": 0.00019166110740493662, + "loss": 0.7735, + "step": 2873 + }, + { + "epoch": 0.38, + "grad_norm": 0.7578125, + "learning_rate": 0.00019172781854569714, + "loss": 0.7945, + "step": 2874 + }, + { + "epoch": 0.38, + "grad_norm": 0.703125, + "learning_rate": 0.00019179452968645766, + "loss": 0.4262, + "step": 2875 + }, + { + "epoch": 0.38, + "grad_norm": 0.734375, + "learning_rate": 0.00019186124082721815, + "loss": 0.7489, + "step": 2876 + }, + { + "epoch": 0.38, + "grad_norm": 0.76953125, + "learning_rate": 0.00019192795196797867, + "loss": 1.024, + "step": 2877 + }, + { + "epoch": 0.38, + "grad_norm": 1.28125, + "learning_rate": 0.00019199466310873916, + "loss": 0.6243, + "step": 2878 + }, + { + "epoch": 0.38, + "grad_norm": 0.55078125, + "learning_rate": 0.00019206137424949968, + "loss": 0.6535, + "step": 2879 + }, + { + "epoch": 0.38, + "grad_norm": 0.6953125, + "learning_rate": 0.0001921280853902602, + "loss": 0.6964, + "step": 2880 + }, + { + "epoch": 0.38, + "grad_norm": 0.73828125, + "learning_rate": 0.0001921947965310207, + "loss": 0.4109, + "step": 2881 + }, + { + "epoch": 0.38, + "grad_norm": 0.7265625, + "learning_rate": 0.00019226150767178118, + "loss": 0.5804, + "step": 2882 + }, + { + "epoch": 0.38, + "grad_norm": 0.7421875, + "learning_rate": 0.0001923282188125417, + "loss": 0.8662, + "step": 2883 + }, + { + "epoch": 0.38, + "grad_norm": 0.8125, + "learning_rate": 0.00019239492995330222, + "loss": 0.7128, + "step": 2884 + }, + { + "epoch": 0.38, + "grad_norm": 0.7109375, + "learning_rate": 0.00019246164109406274, + "loss": 0.4966, + "step": 2885 + }, + { + "epoch": 0.39, + "grad_norm": 0.6796875, + "learning_rate": 0.00019252835223482323, + "loss": 0.6643, + "step": 2886 + }, + { + "epoch": 0.39, + "grad_norm": 1.0625, + "learning_rate": 0.00019259506337558372, + "loss": 0.3604, + "step": 2887 + }, + { + "epoch": 0.39, + "grad_norm": 0.7578125, + "learning_rate": 0.00019266177451634424, + "loss": 0.8143, + "step": 2888 + }, + { + "epoch": 0.39, + "grad_norm": 0.75, + "learning_rate": 0.00019272848565710476, + "loss": 0.4039, + "step": 2889 + }, + { + "epoch": 0.39, + "grad_norm": 1.1328125, + "learning_rate": 0.00019279519679786528, + "loss": 0.53, + "step": 2890 + }, + { + "epoch": 0.39, + "grad_norm": 0.625, + "learning_rate": 0.00019286190793862574, + "loss": 0.7027, + "step": 2891 + }, + { + "epoch": 0.39, + "grad_norm": 0.88671875, + "learning_rate": 0.00019292861907938626, + "loss": 0.6236, + "step": 2892 + }, + { + "epoch": 0.39, + "grad_norm": 0.6640625, + "learning_rate": 0.00019299533022014678, + "loss": 0.6098, + "step": 2893 + }, + { + "epoch": 0.39, + "grad_norm": 0.76953125, + "learning_rate": 0.0001930620413609073, + "loss": 0.8134, + "step": 2894 + }, + { + "epoch": 0.39, + "grad_norm": 0.72265625, + "learning_rate": 0.0001931287525016678, + "loss": 0.6992, + "step": 2895 + }, + { + "epoch": 0.39, + "grad_norm": 1.0703125, + "learning_rate": 0.00019319546364242828, + "loss": 0.6003, + "step": 2896 + }, + { + "epoch": 0.39, + "grad_norm": 0.875, + "learning_rate": 0.0001932621747831888, + "loss": 0.4469, + "step": 2897 + }, + { + "epoch": 0.39, + "grad_norm": 1.1875, + "learning_rate": 0.00019332888592394932, + "loss": 0.4703, + "step": 2898 + }, + { + "epoch": 0.39, + "grad_norm": 0.84375, + "learning_rate": 0.00019339559706470983, + "loss": 0.7778, + "step": 2899 + }, + { + "epoch": 0.39, + "grad_norm": 0.75, + "learning_rate": 0.00019346230820547033, + "loss": 0.5451, + "step": 2900 + }, + { + "epoch": 0.39, + "grad_norm": 0.8203125, + "learning_rate": 0.00019352901934623082, + "loss": 0.9207, + "step": 2901 + }, + { + "epoch": 0.39, + "grad_norm": 0.96875, + "learning_rate": 0.00019359573048699134, + "loss": 0.5601, + "step": 2902 + }, + { + "epoch": 0.39, + "grad_norm": 0.83984375, + "learning_rate": 0.00019366244162775185, + "loss": 0.568, + "step": 2903 + }, + { + "epoch": 0.39, + "grad_norm": 0.73828125, + "learning_rate": 0.00019372915276851235, + "loss": 0.8947, + "step": 2904 + }, + { + "epoch": 0.39, + "grad_norm": 0.78125, + "learning_rate": 0.00019379586390927286, + "loss": 0.8625, + "step": 2905 + }, + { + "epoch": 0.39, + "grad_norm": 0.8046875, + "learning_rate": 0.00019386257505003336, + "loss": 0.7353, + "step": 2906 + }, + { + "epoch": 0.39, + "grad_norm": 0.921875, + "learning_rate": 0.00019392928619079388, + "loss": 1.12, + "step": 2907 + }, + { + "epoch": 0.39, + "grad_norm": 0.76171875, + "learning_rate": 0.00019399599733155437, + "loss": 0.5658, + "step": 2908 + }, + { + "epoch": 0.39, + "grad_norm": 0.76953125, + "learning_rate": 0.00019406270847231489, + "loss": 0.65, + "step": 2909 + }, + { + "epoch": 0.39, + "grad_norm": 0.84765625, + "learning_rate": 0.0001941294196130754, + "loss": 0.8121, + "step": 2910 + }, + { + "epoch": 0.39, + "grad_norm": 0.828125, + "learning_rate": 0.0001941961307538359, + "loss": 0.7569, + "step": 2911 + }, + { + "epoch": 0.39, + "grad_norm": 0.7578125, + "learning_rate": 0.00019426284189459641, + "loss": 0.5676, + "step": 2912 + }, + { + "epoch": 0.39, + "grad_norm": 0.76171875, + "learning_rate": 0.0001943295530353569, + "loss": 0.7445, + "step": 2913 + }, + { + "epoch": 0.39, + "grad_norm": 0.65625, + "learning_rate": 0.00019439626417611742, + "loss": 0.4822, + "step": 2914 + }, + { + "epoch": 0.39, + "grad_norm": 1.0234375, + "learning_rate": 0.00019446297531687794, + "loss": 0.5966, + "step": 2915 + }, + { + "epoch": 0.39, + "grad_norm": 0.78515625, + "learning_rate": 0.00019452968645763843, + "loss": 0.7277, + "step": 2916 + }, + { + "epoch": 0.39, + "grad_norm": 0.88671875, + "learning_rate": 0.00019459639759839893, + "loss": 0.5132, + "step": 2917 + }, + { + "epoch": 0.39, + "grad_norm": 0.625, + "learning_rate": 0.00019466310873915944, + "loss": 0.7299, + "step": 2918 + }, + { + "epoch": 0.39, + "grad_norm": 0.71875, + "learning_rate": 0.00019472981987991996, + "loss": 0.4818, + "step": 2919 + }, + { + "epoch": 0.39, + "grad_norm": 0.75, + "learning_rate": 0.00019479653102068048, + "loss": 1.0165, + "step": 2920 + }, + { + "epoch": 0.39, + "grad_norm": 0.85546875, + "learning_rate": 0.00019486324216144097, + "loss": 0.8224, + "step": 2921 + }, + { + "epoch": 0.39, + "grad_norm": 0.78515625, + "learning_rate": 0.00019492995330220146, + "loss": 0.5484, + "step": 2922 + }, + { + "epoch": 0.39, + "grad_norm": 0.76171875, + "learning_rate": 0.00019499666444296198, + "loss": 0.6664, + "step": 2923 + }, + { + "epoch": 0.39, + "grad_norm": 0.7265625, + "learning_rate": 0.0001950633755837225, + "loss": 0.3367, + "step": 2924 + }, + { + "epoch": 0.39, + "grad_norm": 0.66015625, + "learning_rate": 0.00019513008672448302, + "loss": 0.7486, + "step": 2925 + }, + { + "epoch": 0.39, + "grad_norm": 0.62890625, + "learning_rate": 0.00019519679786524348, + "loss": 0.3998, + "step": 2926 + }, + { + "epoch": 0.39, + "grad_norm": 0.77734375, + "learning_rate": 0.000195263509006004, + "loss": 0.5131, + "step": 2927 + }, + { + "epoch": 0.39, + "grad_norm": 0.91015625, + "learning_rate": 0.00019533022014676452, + "loss": 0.654, + "step": 2928 + }, + { + "epoch": 0.39, + "grad_norm": 0.765625, + "learning_rate": 0.00019539693128752504, + "loss": 0.6486, + "step": 2929 + }, + { + "epoch": 0.39, + "grad_norm": 0.90234375, + "learning_rate": 0.00019546364242828553, + "loss": 0.3393, + "step": 2930 + }, + { + "epoch": 0.39, + "grad_norm": 0.76953125, + "learning_rate": 0.00019553035356904602, + "loss": 0.5325, + "step": 2931 + }, + { + "epoch": 0.39, + "grad_norm": 1.046875, + "learning_rate": 0.00019559706470980654, + "loss": 0.6566, + "step": 2932 + }, + { + "epoch": 0.39, + "grad_norm": 0.69140625, + "learning_rate": 0.00019566377585056706, + "loss": 0.6798, + "step": 2933 + }, + { + "epoch": 0.39, + "grad_norm": 0.75, + "learning_rate": 0.00019573048699132758, + "loss": 0.7998, + "step": 2934 + }, + { + "epoch": 0.39, + "grad_norm": 1.0703125, + "learning_rate": 0.00019579719813208807, + "loss": 0.6292, + "step": 2935 + }, + { + "epoch": 0.39, + "grad_norm": 0.7578125, + "learning_rate": 0.00019586390927284856, + "loss": 0.4877, + "step": 2936 + }, + { + "epoch": 0.39, + "grad_norm": 0.6328125, + "learning_rate": 0.00019593062041360908, + "loss": 0.8382, + "step": 2937 + }, + { + "epoch": 0.39, + "grad_norm": 0.6875, + "learning_rate": 0.0001959973315543696, + "loss": 0.7128, + "step": 2938 + }, + { + "epoch": 0.39, + "grad_norm": 1.1875, + "learning_rate": 0.0001960640426951301, + "loss": 0.8859, + "step": 2939 + }, + { + "epoch": 0.39, + "grad_norm": 0.88671875, + "learning_rate": 0.0001961307538358906, + "loss": 0.8838, + "step": 2940 + }, + { + "epoch": 0.39, + "grad_norm": 0.8359375, + "learning_rate": 0.0001961974649766511, + "loss": 0.6007, + "step": 2941 + }, + { + "epoch": 0.39, + "grad_norm": 0.75, + "learning_rate": 0.00019626417611741162, + "loss": 0.3866, + "step": 2942 + }, + { + "epoch": 0.39, + "grad_norm": 0.64453125, + "learning_rate": 0.00019633088725817214, + "loss": 0.5515, + "step": 2943 + }, + { + "epoch": 0.39, + "grad_norm": 0.94921875, + "learning_rate": 0.00019639759839893263, + "loss": 0.3709, + "step": 2944 + }, + { + "epoch": 0.39, + "grad_norm": 0.66015625, + "learning_rate": 0.00019646430953969315, + "loss": 0.7089, + "step": 2945 + }, + { + "epoch": 0.39, + "grad_norm": 0.79296875, + "learning_rate": 0.00019653102068045364, + "loss": 0.9386, + "step": 2946 + }, + { + "epoch": 0.39, + "grad_norm": 0.6796875, + "learning_rate": 0.00019659773182121416, + "loss": 0.6231, + "step": 2947 + }, + { + "epoch": 0.39, + "grad_norm": 0.6953125, + "learning_rate": 0.00019666444296197465, + "loss": 0.5873, + "step": 2948 + }, + { + "epoch": 0.39, + "grad_norm": 0.73046875, + "learning_rate": 0.00019673115410273517, + "loss": 0.5476, + "step": 2949 + }, + { + "epoch": 0.39, + "grad_norm": 0.7578125, + "learning_rate": 0.0001967978652434957, + "loss": 0.5621, + "step": 2950 + }, + { + "epoch": 0.39, + "grad_norm": 0.91015625, + "learning_rate": 0.00019686457638425618, + "loss": 0.4419, + "step": 2951 + }, + { + "epoch": 0.39, + "grad_norm": 0.703125, + "learning_rate": 0.0001969312875250167, + "loss": 0.6874, + "step": 2952 + }, + { + "epoch": 0.39, + "grad_norm": 0.58984375, + "learning_rate": 0.0001969979986657772, + "loss": 0.7484, + "step": 2953 + }, + { + "epoch": 0.39, + "grad_norm": 0.671875, + "learning_rate": 0.0001970647098065377, + "loss": 0.4594, + "step": 2954 + }, + { + "epoch": 0.39, + "grad_norm": 0.7421875, + "learning_rate": 0.00019713142094729823, + "loss": 0.6617, + "step": 2955 + }, + { + "epoch": 0.39, + "grad_norm": 1.015625, + "learning_rate": 0.00019719813208805872, + "loss": 0.6434, + "step": 2956 + }, + { + "epoch": 0.39, + "grad_norm": 0.69921875, + "learning_rate": 0.0001972648432288192, + "loss": 0.7237, + "step": 2957 + }, + { + "epoch": 0.39, + "grad_norm": 0.65625, + "learning_rate": 0.00019733155436957973, + "loss": 0.5681, + "step": 2958 + }, + { + "epoch": 0.39, + "grad_norm": 0.6484375, + "learning_rate": 0.00019739826551034025, + "loss": 0.6632, + "step": 2959 + }, + { + "epoch": 0.39, + "grad_norm": 0.71875, + "learning_rate": 0.00019746497665110076, + "loss": 0.4593, + "step": 2960 + }, + { + "epoch": 0.4, + "grad_norm": 0.9609375, + "learning_rate": 0.00019753168779186123, + "loss": 0.803, + "step": 2961 + }, + { + "epoch": 0.4, + "grad_norm": 0.84765625, + "learning_rate": 0.00019759839893262175, + "loss": 0.5207, + "step": 2962 + }, + { + "epoch": 0.4, + "grad_norm": 0.85546875, + "learning_rate": 0.00019766511007338227, + "loss": 0.7807, + "step": 2963 + }, + { + "epoch": 0.4, + "grad_norm": 0.58984375, + "learning_rate": 0.00019773182121414278, + "loss": 0.5898, + "step": 2964 + }, + { + "epoch": 0.4, + "grad_norm": 0.7421875, + "learning_rate": 0.00019779853235490328, + "loss": 0.4302, + "step": 2965 + }, + { + "epoch": 0.4, + "grad_norm": 0.73828125, + "learning_rate": 0.00019786524349566377, + "loss": 0.6158, + "step": 2966 + }, + { + "epoch": 0.4, + "grad_norm": 0.58984375, + "learning_rate": 0.00019793195463642429, + "loss": 0.5256, + "step": 2967 + }, + { + "epoch": 0.4, + "grad_norm": 0.953125, + "learning_rate": 0.0001979986657771848, + "loss": 0.6052, + "step": 2968 + }, + { + "epoch": 0.4, + "grad_norm": 0.59765625, + "learning_rate": 0.00019806537691794532, + "loss": 0.3369, + "step": 2969 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.00019813208805870581, + "loss": 0.6449, + "step": 2970 + }, + { + "epoch": 0.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0001981987991994663, + "loss": 0.7852, + "step": 2971 + }, + { + "epoch": 0.4, + "grad_norm": 0.64453125, + "learning_rate": 0.00019826551034022682, + "loss": 0.4903, + "step": 2972 + }, + { + "epoch": 0.4, + "grad_norm": 0.9609375, + "learning_rate": 0.00019833222148098734, + "loss": 0.4865, + "step": 2973 + }, + { + "epoch": 0.4, + "grad_norm": 0.9921875, + "learning_rate": 0.00019839893262174783, + "loss": 0.6225, + "step": 2974 + }, + { + "epoch": 0.4, + "grad_norm": 0.6953125, + "learning_rate": 0.00019846564376250835, + "loss": 0.7609, + "step": 2975 + }, + { + "epoch": 0.4, + "grad_norm": 1.109375, + "learning_rate": 0.00019853235490326884, + "loss": 0.6892, + "step": 2976 + }, + { + "epoch": 0.4, + "grad_norm": 0.6171875, + "learning_rate": 0.00019859906604402936, + "loss": 0.7035, + "step": 2977 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.00019866577718478988, + "loss": 0.6019, + "step": 2978 + }, + { + "epoch": 0.4, + "grad_norm": 0.890625, + "learning_rate": 0.00019873248832555037, + "loss": 0.5363, + "step": 2979 + }, + { + "epoch": 0.4, + "grad_norm": 0.77734375, + "learning_rate": 0.0001987991994663109, + "loss": 0.4455, + "step": 2980 + }, + { + "epoch": 0.4, + "grad_norm": 0.6328125, + "learning_rate": 0.00019886591060707138, + "loss": 0.5304, + "step": 2981 + }, + { + "epoch": 0.4, + "grad_norm": 0.7734375, + "learning_rate": 0.0001989326217478319, + "loss": 0.7251, + "step": 2982 + }, + { + "epoch": 0.4, + "grad_norm": 0.84375, + "learning_rate": 0.0001989993328885924, + "loss": 0.4465, + "step": 2983 + }, + { + "epoch": 0.4, + "grad_norm": 0.91796875, + "learning_rate": 0.0001990660440293529, + "loss": 0.6893, + "step": 2984 + }, + { + "epoch": 0.4, + "grad_norm": 0.80078125, + "learning_rate": 0.00019913275517011343, + "loss": 0.6254, + "step": 2985 + }, + { + "epoch": 0.4, + "grad_norm": 0.80078125, + "learning_rate": 0.00019919946631087392, + "loss": 0.7717, + "step": 2986 + }, + { + "epoch": 0.4, + "grad_norm": 0.57421875, + "learning_rate": 0.00019926617745163444, + "loss": 0.5352, + "step": 2987 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.00019933288859239493, + "loss": 0.884, + "step": 2988 + }, + { + "epoch": 0.4, + "grad_norm": 0.7421875, + "learning_rate": 0.00019939959973315545, + "loss": 0.8399, + "step": 2989 + }, + { + "epoch": 0.4, + "grad_norm": 0.8984375, + "learning_rate": 0.00019946631087391597, + "loss": 0.7985, + "step": 2990 + }, + { + "epoch": 0.4, + "grad_norm": 0.71875, + "learning_rate": 0.00019953302201467646, + "loss": 0.6324, + "step": 2991 + }, + { + "epoch": 0.4, + "grad_norm": 0.875, + "learning_rate": 0.00019959973315543695, + "loss": 0.7357, + "step": 2992 + }, + { + "epoch": 0.4, + "grad_norm": 0.75390625, + "learning_rate": 0.00019966644429619747, + "loss": 0.4998, + "step": 2993 + }, + { + "epoch": 0.4, + "grad_norm": 0.7421875, + "learning_rate": 0.000199733155436958, + "loss": 0.5014, + "step": 2994 + }, + { + "epoch": 0.4, + "grad_norm": 0.68359375, + "learning_rate": 0.0001997998665777185, + "loss": 0.4477, + "step": 2995 + }, + { + "epoch": 0.4, + "grad_norm": 0.7578125, + "learning_rate": 0.000199866577718479, + "loss": 0.4979, + "step": 2996 + }, + { + "epoch": 0.4, + "grad_norm": 0.875, + "learning_rate": 0.0001999332888592395, + "loss": 0.5937, + "step": 2997 + }, + { + "epoch": 0.4, + "grad_norm": 0.765625, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 2998 + }, + { + "epoch": 0.4, + "grad_norm": 0.66796875, + "learning_rate": 0.00019999999932196794, + "loss": 0.6856, + "step": 2999 + }, + { + "epoch": 0.4, + "grad_norm": 0.578125, + "learning_rate": 0.00019999999728787166, + "loss": 0.6348, + "step": 3000 + }, + { + "epoch": 0.4, + "grad_norm": 0.8046875, + "learning_rate": 0.00019999999389771125, + "loss": 0.7693, + "step": 3001 + }, + { + "epoch": 0.4, + "grad_norm": 0.8203125, + "learning_rate": 0.00019999998915148676, + "loss": 0.4154, + "step": 3002 + }, + { + "epoch": 0.4, + "grad_norm": 0.71875, + "learning_rate": 0.00019999998304919822, + "loss": 0.5905, + "step": 3003 + }, + { + "epoch": 0.4, + "grad_norm": 0.70703125, + "learning_rate": 0.00019999997559084576, + "loss": 0.5659, + "step": 3004 + }, + { + "epoch": 0.4, + "grad_norm": 1.359375, + "learning_rate": 0.0001999999667764294, + "loss": 0.7743, + "step": 3005 + }, + { + "epoch": 0.4, + "grad_norm": 0.95703125, + "learning_rate": 0.00019999995660594934, + "loss": 0.6252, + "step": 3006 + }, + { + "epoch": 0.4, + "grad_norm": 0.8359375, + "learning_rate": 0.0001999999450794057, + "loss": 0.5476, + "step": 3007 + }, + { + "epoch": 0.4, + "grad_norm": 0.81640625, + "learning_rate": 0.0001999999321967986, + "loss": 0.699, + "step": 3008 + }, + { + "epoch": 0.4, + "grad_norm": 0.6875, + "learning_rate": 0.00019999991795812826, + "loss": 0.8846, + "step": 3009 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.00019999990236339486, + "loss": 0.6281, + "step": 3010 + }, + { + "epoch": 0.4, + "grad_norm": 0.671875, + "learning_rate": 0.00019999988541259857, + "loss": 0.3689, + "step": 3011 + }, + { + "epoch": 0.4, + "grad_norm": 0.8046875, + "learning_rate": 0.0001999998671057397, + "loss": 0.5219, + "step": 3012 + }, + { + "epoch": 0.4, + "grad_norm": 0.68359375, + "learning_rate": 0.00019999984744281841, + "loss": 0.4011, + "step": 3013 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.00019999982642383507, + "loss": 0.7982, + "step": 3014 + }, + { + "epoch": 0.4, + "grad_norm": 0.8125, + "learning_rate": 0.00019999980404878984, + "loss": 0.9999, + "step": 3015 + }, + { + "epoch": 0.4, + "grad_norm": 0.87109375, + "learning_rate": 0.0001999997803176831, + "loss": 0.6285, + "step": 3016 + }, + { + "epoch": 0.4, + "grad_norm": 0.76953125, + "learning_rate": 0.00019999975523051516, + "loss": 0.6288, + "step": 3017 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.0001999997287872864, + "loss": 0.5805, + "step": 3018 + }, + { + "epoch": 0.4, + "grad_norm": 0.76171875, + "learning_rate": 0.0001999997009879971, + "loss": 0.56, + "step": 3019 + }, + { + "epoch": 0.4, + "grad_norm": 0.8515625, + "learning_rate": 0.00019999967183264767, + "loss": 0.4725, + "step": 3020 + }, + { + "epoch": 0.4, + "grad_norm": 0.7109375, + "learning_rate": 0.0001999996413212385, + "loss": 0.4931, + "step": 3021 + }, + { + "epoch": 0.4, + "grad_norm": 0.703125, + "learning_rate": 0.00019999960945377008, + "loss": 0.7408, + "step": 3022 + }, + { + "epoch": 0.4, + "grad_norm": 0.66796875, + "learning_rate": 0.00019999957623024271, + "loss": 0.6139, + "step": 3023 + }, + { + "epoch": 0.4, + "grad_norm": 0.71875, + "learning_rate": 0.00019999954165065696, + "loss": 0.624, + "step": 3024 + }, + { + "epoch": 0.4, + "grad_norm": 0.72265625, + "learning_rate": 0.0001999995057150132, + "loss": 0.7025, + "step": 3025 + }, + { + "epoch": 0.4, + "grad_norm": 0.81640625, + "learning_rate": 0.00019999946842331196, + "loss": 0.7859, + "step": 3026 + }, + { + "epoch": 0.4, + "grad_norm": 0.76171875, + "learning_rate": 0.00019999942977555376, + "loss": 0.3436, + "step": 3027 + }, + { + "epoch": 0.4, + "grad_norm": 0.6796875, + "learning_rate": 0.00019999938977173916, + "loss": 0.3242, + "step": 3028 + }, + { + "epoch": 0.4, + "grad_norm": 0.95703125, + "learning_rate": 0.0001999993484118686, + "loss": 0.7098, + "step": 3029 + }, + { + "epoch": 0.4, + "grad_norm": 0.65625, + "learning_rate": 0.00019999930569594273, + "loss": 0.7843, + "step": 3030 + }, + { + "epoch": 0.4, + "grad_norm": 0.8671875, + "learning_rate": 0.00019999926162396204, + "loss": 0.5935, + "step": 3031 + }, + { + "epoch": 0.4, + "grad_norm": 0.6171875, + "learning_rate": 0.00019999921619592726, + "loss": 0.4866, + "step": 3032 + }, + { + "epoch": 0.4, + "grad_norm": 0.94140625, + "learning_rate": 0.00019999916941183888, + "loss": 0.5361, + "step": 3033 + }, + { + "epoch": 0.4, + "grad_norm": 1.1796875, + "learning_rate": 0.0001999991212716976, + "loss": 0.9045, + "step": 3034 + }, + { + "epoch": 0.4, + "grad_norm": 0.875, + "learning_rate": 0.00019999907177550405, + "loss": 0.9395, + "step": 3035 + }, + { + "epoch": 0.41, + "grad_norm": 0.70703125, + "learning_rate": 0.00019999902092325891, + "loss": 0.3287, + "step": 3036 + }, + { + "epoch": 0.41, + "grad_norm": 0.8515625, + "learning_rate": 0.00019999896871496285, + "loss": 0.7849, + "step": 3037 + }, + { + "epoch": 0.41, + "grad_norm": 0.9765625, + "learning_rate": 0.00019999891515061662, + "loss": 0.5861, + "step": 3038 + }, + { + "epoch": 0.41, + "grad_norm": 1.0390625, + "learning_rate": 0.0001999988602302209, + "loss": 0.8457, + "step": 3039 + }, + { + "epoch": 0.41, + "grad_norm": 0.5234375, + "learning_rate": 0.0001999988039537765, + "loss": 0.3985, + "step": 3040 + }, + { + "epoch": 0.41, + "grad_norm": 0.61328125, + "learning_rate": 0.0001999987463212841, + "loss": 0.5209, + "step": 3041 + }, + { + "epoch": 0.41, + "grad_norm": 0.84765625, + "learning_rate": 0.00019999868733274454, + "loss": 0.7852, + "step": 3042 + }, + { + "epoch": 0.41, + "grad_norm": 0.765625, + "learning_rate": 0.00019999862698815858, + "loss": 0.554, + "step": 3043 + }, + { + "epoch": 0.41, + "grad_norm": 0.76953125, + "learning_rate": 0.0001999985652875271, + "loss": 0.6402, + "step": 3044 + }, + { + "epoch": 0.41, + "grad_norm": 0.7578125, + "learning_rate": 0.00019999850223085085, + "loss": 0.4162, + "step": 3045 + }, + { + "epoch": 0.41, + "grad_norm": 0.609375, + "learning_rate": 0.00019999843781813077, + "loss": 0.425, + "step": 3046 + }, + { + "epoch": 0.41, + "grad_norm": 0.65234375, + "learning_rate": 0.00019999837204936766, + "loss": 0.5266, + "step": 3047 + }, + { + "epoch": 0.41, + "grad_norm": 0.61328125, + "learning_rate": 0.00019999830492456247, + "loss": 0.5932, + "step": 3048 + }, + { + "epoch": 0.41, + "grad_norm": 0.7890625, + "learning_rate": 0.00019999823644371607, + "loss": 0.7364, + "step": 3049 + }, + { + "epoch": 0.41, + "grad_norm": 0.69921875, + "learning_rate": 0.00019999816660682938, + "loss": 0.6791, + "step": 3050 + }, + { + "epoch": 0.41, + "grad_norm": 0.765625, + "learning_rate": 0.00019999809541390345, + "loss": 0.7809, + "step": 3051 + }, + { + "epoch": 0.41, + "grad_norm": 0.82421875, + "learning_rate": 0.0001999980228649391, + "loss": 0.485, + "step": 3052 + }, + { + "epoch": 0.41, + "grad_norm": 1.0234375, + "learning_rate": 0.00019999794895993742, + "loss": 0.6419, + "step": 3053 + }, + { + "epoch": 0.41, + "grad_norm": 0.72265625, + "learning_rate": 0.00019999787369889935, + "loss": 1.0179, + "step": 3054 + }, + { + "epoch": 0.41, + "grad_norm": 0.93359375, + "learning_rate": 0.00019999779708182596, + "loss": 0.5838, + "step": 3055 + }, + { + "epoch": 0.41, + "grad_norm": 0.7578125, + "learning_rate": 0.00019999771910871823, + "loss": 0.6007, + "step": 3056 + }, + { + "epoch": 0.41, + "grad_norm": 0.85546875, + "learning_rate": 0.00019999763977957726, + "loss": 0.6091, + "step": 3057 + }, + { + "epoch": 0.41, + "grad_norm": 0.89453125, + "learning_rate": 0.00019999755909440414, + "loss": 0.7955, + "step": 3058 + }, + { + "epoch": 0.41, + "grad_norm": 0.7421875, + "learning_rate": 0.0001999974770531999, + "loss": 0.4815, + "step": 3059 + }, + { + "epoch": 0.41, + "grad_norm": 0.875, + "learning_rate": 0.00019999739365596572, + "loss": 0.681, + "step": 3060 + }, + { + "epoch": 0.41, + "grad_norm": 0.78515625, + "learning_rate": 0.00019999730890270272, + "loss": 0.5589, + "step": 3061 + }, + { + "epoch": 0.41, + "grad_norm": 1.1171875, + "learning_rate": 0.00019999722279341202, + "loss": 0.8862, + "step": 3062 + }, + { + "epoch": 0.41, + "grad_norm": 0.625, + "learning_rate": 0.00019999713532809478, + "loss": 0.6754, + "step": 3063 + }, + { + "epoch": 0.41, + "grad_norm": 0.88671875, + "learning_rate": 0.00019999704650675221, + "loss": 0.5664, + "step": 3064 + }, + { + "epoch": 0.41, + "grad_norm": 0.7109375, + "learning_rate": 0.00019999695632938553, + "loss": 0.792, + "step": 3065 + }, + { + "epoch": 0.41, + "grad_norm": 0.69921875, + "learning_rate": 0.00019999686479599596, + "loss": 0.6611, + "step": 3066 + }, + { + "epoch": 0.41, + "grad_norm": 0.84765625, + "learning_rate": 0.0001999967719065847, + "loss": 0.6895, + "step": 3067 + }, + { + "epoch": 0.41, + "grad_norm": 0.66796875, + "learning_rate": 0.00019999667766115305, + "loss": 0.6883, + "step": 3068 + }, + { + "epoch": 0.41, + "grad_norm": 0.84375, + "learning_rate": 0.00019999658205970225, + "loss": 0.7416, + "step": 3069 + }, + { + "epoch": 0.41, + "grad_norm": 0.75390625, + "learning_rate": 0.00019999648510223367, + "loss": 0.6195, + "step": 3070 + }, + { + "epoch": 0.41, + "grad_norm": 0.9140625, + "learning_rate": 0.0001999963867887485, + "loss": 0.5022, + "step": 3071 + }, + { + "epoch": 0.41, + "grad_norm": 0.71484375, + "learning_rate": 0.0001999962871192482, + "loss": 0.7824, + "step": 3072 + }, + { + "epoch": 0.41, + "grad_norm": 0.6328125, + "learning_rate": 0.00019999618609373404, + "loss": 0.5525, + "step": 3073 + }, + { + "epoch": 0.41, + "grad_norm": 0.6484375, + "learning_rate": 0.00019999608371220747, + "loss": 0.7534, + "step": 3074 + }, + { + "epoch": 0.41, + "grad_norm": 0.78515625, + "learning_rate": 0.00019999597997466976, + "loss": 0.7529, + "step": 3075 + }, + { + "epoch": 0.41, + "grad_norm": 0.70703125, + "learning_rate": 0.00019999587488112242, + "loss": 0.398, + "step": 3076 + }, + { + "epoch": 0.41, + "grad_norm": 0.875, + "learning_rate": 0.00019999576843156684, + "loss": 0.7634, + "step": 3077 + }, + { + "epoch": 0.41, + "grad_norm": 0.87109375, + "learning_rate": 0.00019999566062600446, + "loss": 0.8086, + "step": 3078 + }, + { + "epoch": 0.41, + "grad_norm": 0.94140625, + "learning_rate": 0.00019999555146443676, + "loss": 0.7582, + "step": 3079 + }, + { + "epoch": 0.41, + "grad_norm": 0.74609375, + "learning_rate": 0.0001999954409468652, + "loss": 0.5781, + "step": 3080 + }, + { + "epoch": 0.41, + "grad_norm": 0.76171875, + "learning_rate": 0.00019999532907329125, + "loss": 0.8262, + "step": 3081 + }, + { + "epoch": 0.41, + "grad_norm": 1.1171875, + "learning_rate": 0.0001999952158437165, + "loss": 0.77, + "step": 3082 + }, + { + "epoch": 0.41, + "grad_norm": 1.25, + "learning_rate": 0.00019999510125814243, + "loss": 0.8898, + "step": 3083 + }, + { + "epoch": 0.41, + "grad_norm": 0.69140625, + "learning_rate": 0.00019999498531657063, + "loss": 0.7402, + "step": 3084 + }, + { + "epoch": 0.41, + "grad_norm": 0.84375, + "learning_rate": 0.00019999486801900262, + "loss": 0.6544, + "step": 3085 + }, + { + "epoch": 0.41, + "grad_norm": 0.99609375, + "learning_rate": 0.00019999474936544006, + "loss": 0.3996, + "step": 3086 + }, + { + "epoch": 0.41, + "grad_norm": 0.77734375, + "learning_rate": 0.0001999946293558845, + "loss": 0.3566, + "step": 3087 + }, + { + "epoch": 0.41, + "grad_norm": 0.6953125, + "learning_rate": 0.0001999945079903376, + "loss": 0.7222, + "step": 3088 + }, + { + "epoch": 0.41, + "grad_norm": 0.79296875, + "learning_rate": 0.00019999438526880096, + "loss": 0.5852, + "step": 3089 + }, + { + "epoch": 0.41, + "grad_norm": 0.83203125, + "learning_rate": 0.0001999942611912763, + "loss": 0.5726, + "step": 3090 + }, + { + "epoch": 0.41, + "grad_norm": 0.65234375, + "learning_rate": 0.00019999413575776532, + "loss": 0.7358, + "step": 3091 + }, + { + "epoch": 0.41, + "grad_norm": 0.890625, + "learning_rate": 0.00019999400896826965, + "loss": 0.6169, + "step": 3092 + }, + { + "epoch": 0.41, + "grad_norm": 0.703125, + "learning_rate": 0.00019999388082279101, + "loss": 0.3532, + "step": 3093 + }, + { + "epoch": 0.41, + "grad_norm": 0.78515625, + "learning_rate": 0.00019999375132133125, + "loss": 0.8087, + "step": 3094 + }, + { + "epoch": 0.41, + "grad_norm": 0.6171875, + "learning_rate": 0.000199993620463892, + "loss": 0.5504, + "step": 3095 + }, + { + "epoch": 0.41, + "grad_norm": 0.94921875, + "learning_rate": 0.00019999348825047508, + "loss": 0.8367, + "step": 3096 + }, + { + "epoch": 0.41, + "grad_norm": 0.796875, + "learning_rate": 0.00019999335468108228, + "loss": 0.8789, + "step": 3097 + }, + { + "epoch": 0.41, + "grad_norm": 0.78515625, + "learning_rate": 0.00019999321975571542, + "loss": 0.828, + "step": 3098 + }, + { + "epoch": 0.41, + "grad_norm": 0.8046875, + "learning_rate": 0.00019999308347437632, + "loss": 0.6557, + "step": 3099 + }, + { + "epoch": 0.41, + "grad_norm": 0.71484375, + "learning_rate": 0.00019999294583706684, + "loss": 0.738, + "step": 3100 + }, + { + "epoch": 0.41, + "grad_norm": 0.7265625, + "learning_rate": 0.00019999280684378885, + "loss": 0.7619, + "step": 3101 + }, + { + "epoch": 0.41, + "grad_norm": 0.6953125, + "learning_rate": 0.00019999266649454423, + "loss": 0.7029, + "step": 3102 + }, + { + "epoch": 0.41, + "grad_norm": 0.7265625, + "learning_rate": 0.00019999252478933484, + "loss": 0.7007, + "step": 3103 + }, + { + "epoch": 0.41, + "grad_norm": 0.6796875, + "learning_rate": 0.0001999923817281627, + "loss": 0.6889, + "step": 3104 + }, + { + "epoch": 0.41, + "grad_norm": 0.54296875, + "learning_rate": 0.00019999223731102962, + "loss": 0.4186, + "step": 3105 + }, + { + "epoch": 0.41, + "grad_norm": 0.81640625, + "learning_rate": 0.00019999209153793768, + "loss": 0.707, + "step": 3106 + }, + { + "epoch": 0.41, + "grad_norm": 0.87890625, + "learning_rate": 0.00019999194440888877, + "loss": 0.4668, + "step": 3107 + }, + { + "epoch": 0.41, + "grad_norm": 0.796875, + "learning_rate": 0.00019999179592388494, + "loss": 0.3047, + "step": 3108 + }, + { + "epoch": 0.41, + "grad_norm": 0.6796875, + "learning_rate": 0.00019999164608292818, + "loss": 0.539, + "step": 3109 + }, + { + "epoch": 0.41, + "grad_norm": 0.75, + "learning_rate": 0.0001999914948860205, + "loss": 0.6219, + "step": 3110 + }, + { + "epoch": 0.42, + "grad_norm": 0.7578125, + "learning_rate": 0.000199991342333164, + "loss": 0.6805, + "step": 3111 + }, + { + "epoch": 0.42, + "grad_norm": 0.7421875, + "learning_rate": 0.0001999911884243607, + "loss": 0.5132, + "step": 3112 + }, + { + "epoch": 0.42, + "grad_norm": 0.9765625, + "learning_rate": 0.00019999103315961273, + "loss": 0.6369, + "step": 3113 + }, + { + "epoch": 0.42, + "grad_norm": 0.92578125, + "learning_rate": 0.0001999908765389222, + "loss": 0.4229, + "step": 3114 + }, + { + "epoch": 0.42, + "grad_norm": 0.89453125, + "learning_rate": 0.00019999071856229117, + "loss": 0.8796, + "step": 3115 + }, + { + "epoch": 0.42, + "grad_norm": 0.67578125, + "learning_rate": 0.00019999055922972184, + "loss": 0.8507, + "step": 3116 + }, + { + "epoch": 0.42, + "grad_norm": 0.65234375, + "learning_rate": 0.00019999039854121637, + "loss": 0.282, + "step": 3117 + }, + { + "epoch": 0.42, + "grad_norm": 0.734375, + "learning_rate": 0.0001999902364967769, + "loss": 0.5267, + "step": 3118 + }, + { + "epoch": 0.42, + "grad_norm": 0.671875, + "learning_rate": 0.00019999007309640565, + "loss": 0.4922, + "step": 3119 + }, + { + "epoch": 0.42, + "grad_norm": 0.66796875, + "learning_rate": 0.00019998990834010485, + "loss": 0.3201, + "step": 3120 + }, + { + "epoch": 0.42, + "grad_norm": 0.8984375, + "learning_rate": 0.0001999897422278767, + "loss": 0.5619, + "step": 3121 + }, + { + "epoch": 0.42, + "grad_norm": 0.6171875, + "learning_rate": 0.00019998957475972348, + "loss": 0.6683, + "step": 3122 + }, + { + "epoch": 0.42, + "grad_norm": 0.75390625, + "learning_rate": 0.00019998940593564745, + "loss": 0.6284, + "step": 3123 + }, + { + "epoch": 0.42, + "grad_norm": 0.78125, + "learning_rate": 0.00019998923575565092, + "loss": 0.5853, + "step": 3124 + }, + { + "epoch": 0.42, + "grad_norm": 0.8515625, + "learning_rate": 0.0001999890642197362, + "loss": 0.4179, + "step": 3125 + }, + { + "epoch": 0.42, + "grad_norm": 0.95703125, + "learning_rate": 0.00019998889132790554, + "loss": 0.4338, + "step": 3126 + }, + { + "epoch": 0.42, + "grad_norm": 0.640625, + "learning_rate": 0.00019998871708016135, + "loss": 0.5355, + "step": 3127 + }, + { + "epoch": 0.42, + "grad_norm": 0.62109375, + "learning_rate": 0.000199988541476506, + "loss": 0.7126, + "step": 3128 + }, + { + "epoch": 0.42, + "grad_norm": 0.984375, + "learning_rate": 0.00019998836451694187, + "loss": 0.6494, + "step": 3129 + }, + { + "epoch": 0.42, + "grad_norm": 0.65625, + "learning_rate": 0.00019998818620147133, + "loss": 1.0058, + "step": 3130 + }, + { + "epoch": 0.42, + "grad_norm": 0.8046875, + "learning_rate": 0.0001999880065300968, + "loss": 0.4165, + "step": 3131 + }, + { + "epoch": 0.42, + "grad_norm": 0.91796875, + "learning_rate": 0.00019998782550282074, + "loss": 0.5674, + "step": 3132 + }, + { + "epoch": 0.42, + "grad_norm": 0.68359375, + "learning_rate": 0.0001999876431196456, + "loss": 0.7163, + "step": 3133 + }, + { + "epoch": 0.42, + "grad_norm": 0.76171875, + "learning_rate": 0.00019998745938057386, + "loss": 0.8662, + "step": 3134 + }, + { + "epoch": 0.42, + "grad_norm": 0.578125, + "learning_rate": 0.000199987274285608, + "loss": 0.7216, + "step": 3135 + }, + { + "epoch": 0.42, + "grad_norm": 0.70703125, + "learning_rate": 0.0001999870878347505, + "loss": 0.8248, + "step": 3136 + }, + { + "epoch": 0.42, + "grad_norm": 0.5625, + "learning_rate": 0.00019998690002800395, + "loss": 0.3099, + "step": 3137 + }, + { + "epoch": 0.42, + "grad_norm": 0.71484375, + "learning_rate": 0.00019998671086537083, + "loss": 0.5298, + "step": 3138 + }, + { + "epoch": 0.42, + "grad_norm": 0.482421875, + "learning_rate": 0.00019998652034685377, + "loss": 0.3217, + "step": 3139 + }, + { + "epoch": 0.42, + "grad_norm": 0.703125, + "learning_rate": 0.0001999863284724553, + "loss": 0.664, + "step": 3140 + }, + { + "epoch": 0.42, + "grad_norm": 0.65625, + "learning_rate": 0.00019998613524217806, + "loss": 0.8119, + "step": 3141 + }, + { + "epoch": 0.42, + "grad_norm": 0.5546875, + "learning_rate": 0.00019998594065602466, + "loss": 0.5194, + "step": 3142 + }, + { + "epoch": 0.42, + "grad_norm": 0.78515625, + "learning_rate": 0.0001999857447139977, + "loss": 0.6737, + "step": 3143 + }, + { + "epoch": 0.42, + "grad_norm": 0.9609375, + "learning_rate": 0.0001999855474160999, + "loss": 0.884, + "step": 3144 + }, + { + "epoch": 0.42, + "grad_norm": 0.8046875, + "learning_rate": 0.0001999853487623339, + "loss": 0.8238, + "step": 3145 + }, + { + "epoch": 0.42, + "grad_norm": 0.8203125, + "learning_rate": 0.00019998514875270242, + "loss": 0.8609, + "step": 3146 + }, + { + "epoch": 0.42, + "grad_norm": 0.72265625, + "learning_rate": 0.00019998494738720815, + "loss": 0.401, + "step": 3147 + }, + { + "epoch": 0.42, + "grad_norm": 0.7890625, + "learning_rate": 0.0001999847446658538, + "loss": 0.8385, + "step": 3148 + }, + { + "epoch": 0.42, + "grad_norm": 0.7109375, + "learning_rate": 0.00019998454058864216, + "loss": 0.5832, + "step": 3149 + }, + { + "epoch": 0.42, + "grad_norm": 0.8515625, + "learning_rate": 0.00019998433515557596, + "loss": 0.6138, + "step": 3150 + }, + { + "epoch": 0.42, + "grad_norm": 0.80078125, + "learning_rate": 0.00019998412836665802, + "loss": 0.4542, + "step": 3151 + }, + { + "epoch": 0.42, + "grad_norm": 0.921875, + "learning_rate": 0.00019998392022189113, + "loss": 0.5439, + "step": 3152 + }, + { + "epoch": 0.42, + "grad_norm": 0.65625, + "learning_rate": 0.0001999837107212781, + "loss": 0.5863, + "step": 3153 + }, + { + "epoch": 0.42, + "grad_norm": 0.65625, + "learning_rate": 0.00019998349986482178, + "loss": 0.3966, + "step": 3154 + }, + { + "epoch": 0.42, + "grad_norm": 1.203125, + "learning_rate": 0.00019998328765252508, + "loss": 0.4806, + "step": 3155 + }, + { + "epoch": 0.42, + "grad_norm": 0.70703125, + "learning_rate": 0.0001999830740843908, + "loss": 0.4091, + "step": 3156 + }, + { + "epoch": 0.42, + "grad_norm": 0.82421875, + "learning_rate": 0.00019998285916042184, + "loss": 0.6065, + "step": 3157 + }, + { + "epoch": 0.42, + "grad_norm": 1.0546875, + "learning_rate": 0.00019998264288062116, + "loss": 0.5618, + "step": 3158 + }, + { + "epoch": 0.42, + "grad_norm": 0.87890625, + "learning_rate": 0.0001999824252449917, + "loss": 0.8585, + "step": 3159 + }, + { + "epoch": 0.42, + "grad_norm": 0.78515625, + "learning_rate": 0.00019998220625353636, + "loss": 0.6554, + "step": 3160 + }, + { + "epoch": 0.42, + "grad_norm": 0.8046875, + "learning_rate": 0.00019998198590625814, + "loss": 0.7467, + "step": 3161 + }, + { + "epoch": 0.42, + "grad_norm": 0.734375, + "learning_rate": 0.00019998176420316002, + "loss": 0.8667, + "step": 3162 + }, + { + "epoch": 0.42, + "grad_norm": 0.94921875, + "learning_rate": 0.00019998154114424504, + "loss": 0.5716, + "step": 3163 + }, + { + "epoch": 0.42, + "grad_norm": 0.70703125, + "learning_rate": 0.00019998131672951613, + "loss": 0.4827, + "step": 3164 + }, + { + "epoch": 0.42, + "grad_norm": 0.5390625, + "learning_rate": 0.00019998109095897647, + "loss": 0.5719, + "step": 3165 + }, + { + "epoch": 0.42, + "grad_norm": 0.8125, + "learning_rate": 0.00019998086383262898, + "loss": 0.4852, + "step": 3166 + }, + { + "epoch": 0.42, + "grad_norm": 0.58203125, + "learning_rate": 0.00019998063535047687, + "loss": 0.853, + "step": 3167 + }, + { + "epoch": 0.42, + "grad_norm": 0.87109375, + "learning_rate": 0.00019998040551252313, + "loss": 0.6401, + "step": 3168 + }, + { + "epoch": 0.42, + "grad_norm": 0.5078125, + "learning_rate": 0.00019998017431877095, + "loss": 0.6202, + "step": 3169 + }, + { + "epoch": 0.42, + "grad_norm": 0.55078125, + "learning_rate": 0.00019997994176922343, + "loss": 0.579, + "step": 3170 + }, + { + "epoch": 0.42, + "grad_norm": 1.125, + "learning_rate": 0.00019997970786388372, + "loss": 0.8755, + "step": 3171 + }, + { + "epoch": 0.42, + "grad_norm": 0.82421875, + "learning_rate": 0.00019997947260275506, + "loss": 0.5105, + "step": 3172 + }, + { + "epoch": 0.42, + "grad_norm": 0.57421875, + "learning_rate": 0.00019997923598584052, + "loss": 0.4403, + "step": 3173 + }, + { + "epoch": 0.42, + "grad_norm": 0.60546875, + "learning_rate": 0.00019997899801314342, + "loss": 0.4969, + "step": 3174 + }, + { + "epoch": 0.42, + "grad_norm": 0.734375, + "learning_rate": 0.00019997875868466692, + "loss": 0.592, + "step": 3175 + }, + { + "epoch": 0.42, + "grad_norm": 0.71875, + "learning_rate": 0.0001999785180004143, + "loss": 0.5137, + "step": 3176 + }, + { + "epoch": 0.42, + "grad_norm": 0.53125, + "learning_rate": 0.00019997827596038878, + "loss": 0.544, + "step": 3177 + }, + { + "epoch": 0.42, + "grad_norm": 0.875, + "learning_rate": 0.0001999780325645937, + "loss": 0.5363, + "step": 3178 + }, + { + "epoch": 0.42, + "grad_norm": 0.64453125, + "learning_rate": 0.0001999777878130323, + "loss": 0.4468, + "step": 3179 + }, + { + "epoch": 0.42, + "grad_norm": 0.8671875, + "learning_rate": 0.00019997754170570796, + "loss": 0.6411, + "step": 3180 + }, + { + "epoch": 0.42, + "grad_norm": 0.97265625, + "learning_rate": 0.000199977294242624, + "loss": 0.4555, + "step": 3181 + }, + { + "epoch": 0.42, + "grad_norm": 0.76953125, + "learning_rate": 0.00019997704542378374, + "loss": 0.5388, + "step": 3182 + }, + { + "epoch": 0.42, + "grad_norm": 1.125, + "learning_rate": 0.0001999767952491906, + "loss": 0.6484, + "step": 3183 + }, + { + "epoch": 0.42, + "grad_norm": 0.65625, + "learning_rate": 0.00019997654371884792, + "loss": 0.4604, + "step": 3184 + }, + { + "epoch": 0.43, + "grad_norm": 0.9765625, + "learning_rate": 0.0001999762908327592, + "loss": 0.6496, + "step": 3185 + }, + { + "epoch": 0.43, + "grad_norm": 1.0546875, + "learning_rate": 0.00019997603659092773, + "loss": 0.366, + "step": 3186 + }, + { + "epoch": 0.43, + "grad_norm": 0.7890625, + "learning_rate": 0.0001999757809933571, + "loss": 0.6997, + "step": 3187 + }, + { + "epoch": 0.43, + "grad_norm": 0.55078125, + "learning_rate": 0.00019997552404005072, + "loss": 0.6977, + "step": 3188 + }, + { + "epoch": 0.43, + "grad_norm": 0.80078125, + "learning_rate": 0.00019997526573101203, + "loss": 0.7, + "step": 3189 + }, + { + "epoch": 0.43, + "grad_norm": 0.5859375, + "learning_rate": 0.0001999750060662446, + "loss": 0.4999, + "step": 3190 + }, + { + "epoch": 0.43, + "grad_norm": 0.6875, + "learning_rate": 0.0001999747450457519, + "loss": 0.5064, + "step": 3191 + }, + { + "epoch": 0.43, + "grad_norm": 0.96875, + "learning_rate": 0.00019997448266953754, + "loss": 0.7495, + "step": 3192 + }, + { + "epoch": 0.43, + "grad_norm": 0.6640625, + "learning_rate": 0.000199974218937605, + "loss": 0.4637, + "step": 3193 + }, + { + "epoch": 0.43, + "grad_norm": 0.71484375, + "learning_rate": 0.00019997395384995789, + "loss": 0.5398, + "step": 3194 + }, + { + "epoch": 0.43, + "grad_norm": 0.82421875, + "learning_rate": 0.0001999736874065998, + "loss": 0.6454, + "step": 3195 + }, + { + "epoch": 0.43, + "grad_norm": 0.61328125, + "learning_rate": 0.00019997341960753434, + "loss": 0.6104, + "step": 3196 + }, + { + "epoch": 0.43, + "grad_norm": 0.65234375, + "learning_rate": 0.0001999731504527652, + "loss": 0.5493, + "step": 3197 + }, + { + "epoch": 0.43, + "grad_norm": 0.78125, + "learning_rate": 0.00019997287994229592, + "loss": 0.584, + "step": 3198 + }, + { + "epoch": 0.43, + "grad_norm": 0.63671875, + "learning_rate": 0.00019997260807613022, + "loss": 0.5002, + "step": 3199 + }, + { + "epoch": 0.43, + "grad_norm": 0.609375, + "learning_rate": 0.00019997233485427184, + "loss": 0.4973, + "step": 3200 + }, + { + "epoch": 0.43, + "grad_norm": 0.64453125, + "learning_rate": 0.0001999720602767244, + "loss": 0.5076, + "step": 3201 + }, + { + "epoch": 0.43, + "grad_norm": 0.7734375, + "learning_rate": 0.00019997178434349167, + "loss": 0.6742, + "step": 3202 + }, + { + "epoch": 0.43, + "grad_norm": 0.6640625, + "learning_rate": 0.0001999715070545774, + "loss": 0.7158, + "step": 3203 + }, + { + "epoch": 0.43, + "grad_norm": 1.1015625, + "learning_rate": 0.00019997122840998533, + "loss": 1.0029, + "step": 3204 + }, + { + "epoch": 0.43, + "grad_norm": 0.8046875, + "learning_rate": 0.0001999709484097192, + "loss": 0.5925, + "step": 3205 + }, + { + "epoch": 0.43, + "grad_norm": 0.8515625, + "learning_rate": 0.0001999706670537829, + "loss": 0.995, + "step": 3206 + }, + { + "epoch": 0.43, + "grad_norm": 0.6953125, + "learning_rate": 0.00019997038434218015, + "loss": 0.4074, + "step": 3207 + }, + { + "epoch": 0.43, + "grad_norm": 0.94140625, + "learning_rate": 0.00019997010027491486, + "loss": 0.6171, + "step": 3208 + }, + { + "epoch": 0.43, + "grad_norm": 0.77734375, + "learning_rate": 0.00019996981485199085, + "loss": 1.0049, + "step": 3209 + }, + { + "epoch": 0.43, + "grad_norm": 0.7265625, + "learning_rate": 0.00019996952807341197, + "loss": 0.7062, + "step": 3210 + }, + { + "epoch": 0.43, + "grad_norm": 0.8515625, + "learning_rate": 0.00019996923993918214, + "loss": 0.4615, + "step": 3211 + }, + { + "epoch": 0.43, + "grad_norm": 0.9921875, + "learning_rate": 0.00019996895044930527, + "loss": 0.5313, + "step": 3212 + }, + { + "epoch": 0.43, + "grad_norm": 1.0234375, + "learning_rate": 0.00019996865960378527, + "loss": 0.6089, + "step": 3213 + }, + { + "epoch": 0.43, + "grad_norm": 0.75, + "learning_rate": 0.00019996836740262606, + "loss": 0.4138, + "step": 3214 + }, + { + "epoch": 0.43, + "grad_norm": 0.578125, + "learning_rate": 0.00019996807384583165, + "loss": 0.4384, + "step": 3215 + }, + { + "epoch": 0.43, + "grad_norm": 0.73828125, + "learning_rate": 0.00019996777893340598, + "loss": 0.7623, + "step": 3216 + }, + { + "epoch": 0.43, + "grad_norm": 0.6796875, + "learning_rate": 0.00019996748266535308, + "loss": 0.691, + "step": 3217 + }, + { + "epoch": 0.43, + "grad_norm": 0.8203125, + "learning_rate": 0.00019996718504167697, + "loss": 0.592, + "step": 3218 + }, + { + "epoch": 0.43, + "grad_norm": 0.76953125, + "learning_rate": 0.00019996688606238166, + "loss": 0.6281, + "step": 3219 + }, + { + "epoch": 0.43, + "grad_norm": 0.859375, + "learning_rate": 0.0001999665857274712, + "loss": 0.9083, + "step": 3220 + }, + { + "epoch": 0.43, + "grad_norm": 0.890625, + "learning_rate": 0.0001999662840369497, + "loss": 0.4192, + "step": 3221 + }, + { + "epoch": 0.43, + "grad_norm": 0.8671875, + "learning_rate": 0.00019996598099082124, + "loss": 0.4626, + "step": 3222 + }, + { + "epoch": 0.43, + "grad_norm": 0.81640625, + "learning_rate": 0.0001999656765890899, + "loss": 0.5464, + "step": 3223 + }, + { + "epoch": 0.43, + "grad_norm": 0.62109375, + "learning_rate": 0.00019996537083175983, + "loss": 0.44, + "step": 3224 + }, + { + "epoch": 0.43, + "grad_norm": 0.67578125, + "learning_rate": 0.0001999650637188352, + "loss": 0.3956, + "step": 3225 + }, + { + "epoch": 0.43, + "grad_norm": 0.734375, + "learning_rate": 0.0001999647552503201, + "loss": 0.5672, + "step": 3226 + }, + { + "epoch": 0.43, + "grad_norm": 0.765625, + "learning_rate": 0.00019996444542621882, + "loss": 0.7037, + "step": 3227 + }, + { + "epoch": 0.43, + "grad_norm": 0.83984375, + "learning_rate": 0.00019996413424653548, + "loss": 0.7058, + "step": 3228 + }, + { + "epoch": 0.43, + "grad_norm": 0.77734375, + "learning_rate": 0.0001999638217112743, + "loss": 0.5237, + "step": 3229 + }, + { + "epoch": 0.43, + "grad_norm": 0.8046875, + "learning_rate": 0.00019996350782043956, + "loss": 0.6393, + "step": 3230 + }, + { + "epoch": 0.43, + "grad_norm": 0.64453125, + "learning_rate": 0.00019996319257403552, + "loss": 0.6314, + "step": 3231 + }, + { + "epoch": 0.43, + "grad_norm": 0.609375, + "learning_rate": 0.00019996287597206638, + "loss": 0.6114, + "step": 3232 + }, + { + "epoch": 0.43, + "grad_norm": 0.703125, + "learning_rate": 0.00019996255801453652, + "loss": 0.6595, + "step": 3233 + }, + { + "epoch": 0.43, + "grad_norm": 0.90234375, + "learning_rate": 0.00019996223870145024, + "loss": 0.6609, + "step": 3234 + }, + { + "epoch": 0.43, + "grad_norm": 0.8359375, + "learning_rate": 0.0001999619180328118, + "loss": 0.7201, + "step": 3235 + }, + { + "epoch": 0.43, + "grad_norm": 0.70703125, + "learning_rate": 0.00019996159600862564, + "loss": 0.5276, + "step": 3236 + }, + { + "epoch": 0.43, + "grad_norm": 0.69140625, + "learning_rate": 0.00019996127262889607, + "loss": 0.3111, + "step": 3237 + }, + { + "epoch": 0.43, + "grad_norm": 0.7578125, + "learning_rate": 0.00019996094789362744, + "loss": 0.7448, + "step": 3238 + }, + { + "epoch": 0.43, + "grad_norm": 0.74609375, + "learning_rate": 0.00019996062180282427, + "loss": 0.6726, + "step": 3239 + }, + { + "epoch": 0.43, + "grad_norm": 0.78515625, + "learning_rate": 0.00019996029435649087, + "loss": 0.9329, + "step": 3240 + }, + { + "epoch": 0.43, + "grad_norm": 0.8203125, + "learning_rate": 0.00019995996555463173, + "loss": 0.7635, + "step": 3241 + }, + { + "epoch": 0.43, + "grad_norm": 0.8125, + "learning_rate": 0.00019995963539725132, + "loss": 0.6785, + "step": 3242 + }, + { + "epoch": 0.43, + "grad_norm": 0.67578125, + "learning_rate": 0.00019995930388435408, + "loss": 0.3861, + "step": 3243 + }, + { + "epoch": 0.43, + "grad_norm": 0.78125, + "learning_rate": 0.00019995897101594454, + "loss": 0.7672, + "step": 3244 + }, + { + "epoch": 0.43, + "grad_norm": 0.75, + "learning_rate": 0.0001999586367920272, + "loss": 0.432, + "step": 3245 + }, + { + "epoch": 0.43, + "grad_norm": 0.68359375, + "learning_rate": 0.0001999583012126066, + "loss": 0.8404, + "step": 3246 + }, + { + "epoch": 0.43, + "grad_norm": 0.77734375, + "learning_rate": 0.00019995796427768728, + "loss": 0.5276, + "step": 3247 + }, + { + "epoch": 0.43, + "grad_norm": 0.9921875, + "learning_rate": 0.0001999576259872738, + "loss": 0.6805, + "step": 3248 + }, + { + "epoch": 0.43, + "grad_norm": 0.54296875, + "learning_rate": 0.00019995728634137076, + "loss": 0.5412, + "step": 3249 + }, + { + "epoch": 0.43, + "grad_norm": 0.640625, + "learning_rate": 0.00019995694533998278, + "loss": 0.6467, + "step": 3250 + }, + { + "epoch": 0.43, + "grad_norm": 1.0, + "learning_rate": 0.00019995660298311444, + "loss": 0.5039, + "step": 3251 + }, + { + "epoch": 0.43, + "grad_norm": 0.65625, + "learning_rate": 0.00019995625927077043, + "loss": 0.5545, + "step": 3252 + }, + { + "epoch": 0.43, + "grad_norm": 0.765625, + "learning_rate": 0.0001999559142029554, + "loss": 0.7834, + "step": 3253 + }, + { + "epoch": 0.43, + "grad_norm": 0.71484375, + "learning_rate": 0.000199955567779674, + "loss": 0.4453, + "step": 3254 + }, + { + "epoch": 0.43, + "grad_norm": 0.6796875, + "learning_rate": 0.000199955220000931, + "loss": 0.6782, + "step": 3255 + }, + { + "epoch": 0.43, + "grad_norm": 1.0546875, + "learning_rate": 0.00019995487086673101, + "loss": 0.7388, + "step": 3256 + }, + { + "epoch": 0.43, + "grad_norm": 0.80859375, + "learning_rate": 0.00019995452037707885, + "loss": 0.6751, + "step": 3257 + }, + { + "epoch": 0.43, + "grad_norm": 0.69140625, + "learning_rate": 0.00019995416853197924, + "loss": 0.961, + "step": 3258 + }, + { + "epoch": 0.43, + "grad_norm": 0.73046875, + "learning_rate": 0.00019995381533143694, + "loss": 0.5656, + "step": 3259 + }, + { + "epoch": 0.44, + "grad_norm": 0.859375, + "learning_rate": 0.00019995346077545677, + "loss": 0.6894, + "step": 3260 + }, + { + "epoch": 0.44, + "grad_norm": 0.51171875, + "learning_rate": 0.0001999531048640435, + "loss": 0.3882, + "step": 3261 + }, + { + "epoch": 0.44, + "grad_norm": 0.59375, + "learning_rate": 0.000199952747597202, + "loss": 0.5978, + "step": 3262 + }, + { + "epoch": 0.44, + "grad_norm": 0.75, + "learning_rate": 0.0001999523889749371, + "loss": 0.612, + "step": 3263 + }, + { + "epoch": 0.44, + "grad_norm": 0.7890625, + "learning_rate": 0.00019995202899725364, + "loss": 0.7716, + "step": 3264 + }, + { + "epoch": 0.44, + "grad_norm": 0.85546875, + "learning_rate": 0.00019995166766415654, + "loss": 0.633, + "step": 3265 + }, + { + "epoch": 0.44, + "grad_norm": 0.86328125, + "learning_rate": 0.00019995130497565063, + "loss": 0.6297, + "step": 3266 + }, + { + "epoch": 0.44, + "grad_norm": 0.890625, + "learning_rate": 0.00019995094093174092, + "loss": 0.6294, + "step": 3267 + }, + { + "epoch": 0.44, + "grad_norm": 0.734375, + "learning_rate": 0.0001999505755324323, + "loss": 0.6386, + "step": 3268 + }, + { + "epoch": 0.44, + "grad_norm": 0.703125, + "learning_rate": 0.00019995020877772973, + "loss": 0.5945, + "step": 3269 + }, + { + "epoch": 0.44, + "grad_norm": 0.80859375, + "learning_rate": 0.00019994984066763814, + "loss": 0.5426, + "step": 3270 + }, + { + "epoch": 0.44, + "grad_norm": 0.81640625, + "learning_rate": 0.0001999494712021626, + "loss": 0.5369, + "step": 3271 + }, + { + "epoch": 0.44, + "grad_norm": 0.90234375, + "learning_rate": 0.00019994910038130808, + "loss": 0.8152, + "step": 3272 + }, + { + "epoch": 0.44, + "grad_norm": 0.9765625, + "learning_rate": 0.0001999487282050796, + "loss": 0.5128, + "step": 3273 + }, + { + "epoch": 0.44, + "grad_norm": 0.72265625, + "learning_rate": 0.00019994835467348224, + "loss": 0.4867, + "step": 3274 + }, + { + "epoch": 0.44, + "grad_norm": 0.99609375, + "learning_rate": 0.00019994797978652103, + "loss": 0.6776, + "step": 3275 + }, + { + "epoch": 0.44, + "grad_norm": 0.9921875, + "learning_rate": 0.00019994760354420107, + "loss": 0.4677, + "step": 3276 + }, + { + "epoch": 0.44, + "grad_norm": 0.91796875, + "learning_rate": 0.00019994722594652745, + "loss": 0.595, + "step": 3277 + }, + { + "epoch": 0.44, + "grad_norm": 1.046875, + "learning_rate": 0.0001999468469935053, + "loss": 0.4981, + "step": 3278 + }, + { + "epoch": 0.44, + "grad_norm": 0.828125, + "learning_rate": 0.00019994646668513978, + "loss": 0.843, + "step": 3279 + }, + { + "epoch": 0.44, + "grad_norm": 0.796875, + "learning_rate": 0.00019994608502143602, + "loss": 0.4627, + "step": 3280 + }, + { + "epoch": 0.44, + "grad_norm": 0.8359375, + "learning_rate": 0.0001999457020023992, + "loss": 1.0297, + "step": 3281 + }, + { + "epoch": 0.44, + "grad_norm": 0.921875, + "learning_rate": 0.0001999453176280345, + "loss": 0.5567, + "step": 3282 + }, + { + "epoch": 0.44, + "grad_norm": 0.73046875, + "learning_rate": 0.00019994493189834717, + "loss": 0.7468, + "step": 3283 + }, + { + "epoch": 0.44, + "grad_norm": 0.8203125, + "learning_rate": 0.00019994454481334238, + "loss": 0.5885, + "step": 3284 + }, + { + "epoch": 0.44, + "grad_norm": 0.447265625, + "learning_rate": 0.00019994415637302547, + "loss": 0.5958, + "step": 3285 + }, + { + "epoch": 0.44, + "grad_norm": 0.88671875, + "learning_rate": 0.00019994376657740163, + "loss": 0.8796, + "step": 3286 + }, + { + "epoch": 0.44, + "grad_norm": 0.65234375, + "learning_rate": 0.00019994337542647618, + "loss": 0.8796, + "step": 3287 + }, + { + "epoch": 0.44, + "grad_norm": 0.85546875, + "learning_rate": 0.00019994298292025443, + "loss": 0.6422, + "step": 3288 + }, + { + "epoch": 0.44, + "grad_norm": 0.6875, + "learning_rate": 0.00019994258905874166, + "loss": 0.6856, + "step": 3289 + }, + { + "epoch": 0.44, + "grad_norm": 0.90625, + "learning_rate": 0.00019994219384194325, + "loss": 0.9541, + "step": 3290 + }, + { + "epoch": 0.44, + "grad_norm": 0.6796875, + "learning_rate": 0.00019994179726986455, + "loss": 0.4691, + "step": 3291 + }, + { + "epoch": 0.44, + "grad_norm": 0.68359375, + "learning_rate": 0.00019994139934251094, + "loss": 0.4644, + "step": 3292 + }, + { + "epoch": 0.44, + "grad_norm": 0.640625, + "learning_rate": 0.0001999410000598878, + "loss": 0.6782, + "step": 3293 + }, + { + "epoch": 0.44, + "grad_norm": 0.64453125, + "learning_rate": 0.00019994059942200057, + "loss": 0.2721, + "step": 3294 + }, + { + "epoch": 0.44, + "grad_norm": 0.8046875, + "learning_rate": 0.00019994019742885466, + "loss": 0.6157, + "step": 3295 + }, + { + "epoch": 0.44, + "grad_norm": 0.66015625, + "learning_rate": 0.00019993979408045552, + "loss": 0.9465, + "step": 3296 + }, + { + "epoch": 0.44, + "grad_norm": 0.953125, + "learning_rate": 0.00019993938937680868, + "loss": 0.739, + "step": 3297 + }, + { + "epoch": 0.44, + "grad_norm": 1.0078125, + "learning_rate": 0.00019993898331791953, + "loss": 0.2996, + "step": 3298 + }, + { + "epoch": 0.44, + "grad_norm": 0.85546875, + "learning_rate": 0.00019993857590379368, + "loss": 0.4829, + "step": 3299 + }, + { + "epoch": 0.44, + "grad_norm": 0.70703125, + "learning_rate": 0.00019993816713443656, + "loss": 0.5485, + "step": 3300 + }, + { + "epoch": 0.44, + "grad_norm": 0.91015625, + "learning_rate": 0.00019993775700985375, + "loss": 0.6806, + "step": 3301 + }, + { + "epoch": 0.44, + "grad_norm": 1.1015625, + "learning_rate": 0.00019993734553005083, + "loss": 0.9011, + "step": 3302 + }, + { + "epoch": 0.44, + "grad_norm": 0.87109375, + "learning_rate": 0.00019993693269503335, + "loss": 0.6036, + "step": 3303 + }, + { + "epoch": 0.44, + "grad_norm": 0.546875, + "learning_rate": 0.00019993651850480694, + "loss": 0.6399, + "step": 3304 + }, + { + "epoch": 0.44, + "grad_norm": 0.61328125, + "learning_rate": 0.0001999361029593772, + "loss": 0.4554, + "step": 3305 + }, + { + "epoch": 0.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00019993568605874976, + "loss": 0.5071, + "step": 3306 + }, + { + "epoch": 0.44, + "grad_norm": 0.7578125, + "learning_rate": 0.00019993526780293027, + "loss": 0.6697, + "step": 3307 + }, + { + "epoch": 0.44, + "grad_norm": 0.6953125, + "learning_rate": 0.00019993484819192444, + "loss": 0.5917, + "step": 3308 + }, + { + "epoch": 0.44, + "grad_norm": 0.76953125, + "learning_rate": 0.0001999344272257379, + "loss": 0.9301, + "step": 3309 + }, + { + "epoch": 0.44, + "grad_norm": 0.76171875, + "learning_rate": 0.00019993400490437638, + "loss": 0.8075, + "step": 3310 + }, + { + "epoch": 0.44, + "grad_norm": 0.828125, + "learning_rate": 0.00019993358122784565, + "loss": 0.7218, + "step": 3311 + }, + { + "epoch": 0.44, + "grad_norm": 1.046875, + "learning_rate": 0.0001999331561961514, + "loss": 1.0142, + "step": 3312 + }, + { + "epoch": 0.44, + "grad_norm": 0.6171875, + "learning_rate": 0.00019993272980929942, + "loss": 0.5039, + "step": 3313 + }, + { + "epoch": 0.44, + "grad_norm": 0.75390625, + "learning_rate": 0.0001999323020672955, + "loss": 0.5297, + "step": 3314 + }, + { + "epoch": 0.44, + "grad_norm": 0.6875, + "learning_rate": 0.00019993187297014542, + "loss": 0.8067, + "step": 3315 + }, + { + "epoch": 0.44, + "grad_norm": 0.58203125, + "learning_rate": 0.000199931442517855, + "loss": 0.5491, + "step": 3316 + }, + { + "epoch": 0.44, + "grad_norm": 0.73046875, + "learning_rate": 0.0001999310107104301, + "loss": 0.5416, + "step": 3317 + }, + { + "epoch": 0.44, + "grad_norm": 0.64453125, + "learning_rate": 0.00019993057754787654, + "loss": 0.4611, + "step": 3318 + }, + { + "epoch": 0.44, + "grad_norm": 0.890625, + "learning_rate": 0.00019993014303020022, + "loss": 0.4, + "step": 3319 + }, + { + "epoch": 0.44, + "grad_norm": 0.70703125, + "learning_rate": 0.00019992970715740702, + "loss": 0.3984, + "step": 3320 + }, + { + "epoch": 0.44, + "grad_norm": 0.83984375, + "learning_rate": 0.00019992926992950288, + "loss": 0.6659, + "step": 3321 + }, + { + "epoch": 0.44, + "grad_norm": 0.625, + "learning_rate": 0.00019992883134649367, + "loss": 0.6152, + "step": 3322 + }, + { + "epoch": 0.44, + "grad_norm": 0.80859375, + "learning_rate": 0.0001999283914083854, + "loss": 0.4619, + "step": 3323 + }, + { + "epoch": 0.44, + "grad_norm": 0.59765625, + "learning_rate": 0.000199927950115184, + "loss": 0.5678, + "step": 3324 + }, + { + "epoch": 0.44, + "grad_norm": 0.640625, + "learning_rate": 0.00019992750746689546, + "loss": 0.57, + "step": 3325 + }, + { + "epoch": 0.44, + "grad_norm": 0.671875, + "learning_rate": 0.00019992706346352577, + "loss": 0.7314, + "step": 3326 + }, + { + "epoch": 0.44, + "grad_norm": 0.765625, + "learning_rate": 0.000199926618105081, + "loss": 0.454, + "step": 3327 + }, + { + "epoch": 0.44, + "grad_norm": 0.87890625, + "learning_rate": 0.00019992617139156715, + "loss": 0.5699, + "step": 3328 + }, + { + "epoch": 0.44, + "grad_norm": 1.0078125, + "learning_rate": 0.00019992572332299026, + "loss": 0.6837, + "step": 3329 + }, + { + "epoch": 0.44, + "grad_norm": 0.87109375, + "learning_rate": 0.00019992527389935648, + "loss": 0.4976, + "step": 3330 + }, + { + "epoch": 0.44, + "grad_norm": 0.64453125, + "learning_rate": 0.0001999248231206718, + "loss": 0.6451, + "step": 3331 + }, + { + "epoch": 0.44, + "grad_norm": 0.76171875, + "learning_rate": 0.0001999243709869424, + "loss": 0.4845, + "step": 3332 + }, + { + "epoch": 0.44, + "grad_norm": 0.66796875, + "learning_rate": 0.0001999239174981744, + "loss": 0.6175, + "step": 3333 + }, + { + "epoch": 0.44, + "grad_norm": 0.82421875, + "learning_rate": 0.00019992346265437395, + "loss": 0.7102, + "step": 3334 + }, + { + "epoch": 0.45, + "grad_norm": 0.9296875, + "learning_rate": 0.00019992300645554724, + "loss": 0.7091, + "step": 3335 + }, + { + "epoch": 0.45, + "grad_norm": 0.7265625, + "learning_rate": 0.0001999225489017004, + "loss": 0.4473, + "step": 3336 + }, + { + "epoch": 0.45, + "grad_norm": 0.85546875, + "learning_rate": 0.0001999220899928397, + "loss": 0.562, + "step": 3337 + }, + { + "epoch": 0.45, + "grad_norm": 0.6875, + "learning_rate": 0.0001999216297289713, + "loss": 0.4499, + "step": 3338 + }, + { + "epoch": 0.45, + "grad_norm": 0.8046875, + "learning_rate": 0.00019992116811010148, + "loss": 0.5417, + "step": 3339 + }, + { + "epoch": 0.45, + "grad_norm": 0.81640625, + "learning_rate": 0.00019992070513623648, + "loss": 0.7217, + "step": 3340 + }, + { + "epoch": 0.45, + "grad_norm": 0.9453125, + "learning_rate": 0.00019992024080738263, + "loss": 0.5971, + "step": 3341 + }, + { + "epoch": 0.45, + "grad_norm": 0.6640625, + "learning_rate": 0.00019991977512354617, + "loss": 0.7592, + "step": 3342 + }, + { + "epoch": 0.45, + "grad_norm": 0.7578125, + "learning_rate": 0.0001999193080847334, + "loss": 0.6875, + "step": 3343 + }, + { + "epoch": 0.45, + "grad_norm": 0.58984375, + "learning_rate": 0.00019991883969095072, + "loss": 0.5859, + "step": 3344 + }, + { + "epoch": 0.45, + "grad_norm": 0.6171875, + "learning_rate": 0.00019991836994220444, + "loss": 0.7055, + "step": 3345 + }, + { + "epoch": 0.45, + "grad_norm": 1.15625, + "learning_rate": 0.00019991789883850092, + "loss": 0.6023, + "step": 3346 + }, + { + "epoch": 0.45, + "grad_norm": 0.609375, + "learning_rate": 0.00019991742637984653, + "loss": 0.538, + "step": 3347 + }, + { + "epoch": 0.45, + "grad_norm": 0.91015625, + "learning_rate": 0.0001999169525662478, + "loss": 0.4496, + "step": 3348 + }, + { + "epoch": 0.45, + "grad_norm": 0.53125, + "learning_rate": 0.000199916477397711, + "loss": 0.4395, + "step": 3349 + }, + { + "epoch": 0.45, + "grad_norm": 0.86328125, + "learning_rate": 0.00019991600087424267, + "loss": 1.0319, + "step": 3350 + }, + { + "epoch": 0.45, + "grad_norm": 0.65625, + "learning_rate": 0.00019991552299584924, + "loss": 0.9151, + "step": 3351 + }, + { + "epoch": 0.45, + "grad_norm": 0.765625, + "learning_rate": 0.00019991504376253719, + "loss": 0.4965, + "step": 3352 + }, + { + "epoch": 0.45, + "grad_norm": 0.97265625, + "learning_rate": 0.000199914563174313, + "loss": 0.6446, + "step": 3353 + }, + { + "epoch": 0.45, + "grad_norm": 0.89453125, + "learning_rate": 0.00019991408123118322, + "loss": 0.3964, + "step": 3354 + }, + { + "epoch": 0.45, + "grad_norm": 0.76953125, + "learning_rate": 0.00019991359793315437, + "loss": 0.5106, + "step": 3355 + }, + { + "epoch": 0.45, + "grad_norm": 0.7109375, + "learning_rate": 0.00019991311328023304, + "loss": 0.6675, + "step": 3356 + }, + { + "epoch": 0.45, + "grad_norm": 0.74609375, + "learning_rate": 0.00019991262727242574, + "loss": 0.4363, + "step": 3357 + }, + { + "epoch": 0.45, + "grad_norm": 0.734375, + "learning_rate": 0.0001999121399097391, + "loss": 0.4675, + "step": 3358 + }, + { + "epoch": 0.45, + "grad_norm": 0.75, + "learning_rate": 0.0001999116511921797, + "loss": 0.6948, + "step": 3359 + }, + { + "epoch": 0.45, + "grad_norm": 0.59375, + "learning_rate": 0.00019991116111975422, + "loss": 0.3957, + "step": 3360 + }, + { + "epoch": 0.45, + "grad_norm": 0.60546875, + "learning_rate": 0.00019991066969246926, + "loss": 0.435, + "step": 3361 + }, + { + "epoch": 0.45, + "grad_norm": 0.73046875, + "learning_rate": 0.0001999101769103315, + "loss": 0.774, + "step": 3362 + }, + { + "epoch": 0.45, + "grad_norm": 0.68359375, + "learning_rate": 0.00019990968277334764, + "loss": 0.5294, + "step": 3363 + }, + { + "epoch": 0.45, + "grad_norm": 0.98828125, + "learning_rate": 0.0001999091872815243, + "loss": 0.6622, + "step": 3364 + }, + { + "epoch": 0.45, + "grad_norm": 0.90234375, + "learning_rate": 0.00019990869043486832, + "loss": 0.2585, + "step": 3365 + }, + { + "epoch": 0.45, + "grad_norm": 0.55859375, + "learning_rate": 0.00019990819223338635, + "loss": 0.5072, + "step": 3366 + }, + { + "epoch": 0.45, + "grad_norm": 0.65625, + "learning_rate": 0.00019990769267708516, + "loss": 0.4058, + "step": 3367 + }, + { + "epoch": 0.45, + "grad_norm": 0.828125, + "learning_rate": 0.00019990719176597155, + "loss": 0.7358, + "step": 3368 + }, + { + "epoch": 0.45, + "grad_norm": 1.015625, + "learning_rate": 0.00019990668950005232, + "loss": 0.3982, + "step": 3369 + }, + { + "epoch": 0.45, + "grad_norm": 0.734375, + "learning_rate": 0.00019990618587933422, + "loss": 0.397, + "step": 3370 + }, + { + "epoch": 0.45, + "grad_norm": 0.765625, + "learning_rate": 0.00019990568090382413, + "loss": 0.7079, + "step": 3371 + }, + { + "epoch": 0.45, + "grad_norm": 0.8046875, + "learning_rate": 0.0001999051745735289, + "loss": 0.5302, + "step": 3372 + }, + { + "epoch": 0.45, + "grad_norm": 0.796875, + "learning_rate": 0.0001999046668884554, + "loss": 0.6843, + "step": 3373 + }, + { + "epoch": 0.45, + "grad_norm": 0.59375, + "learning_rate": 0.00019990415784861047, + "loss": 0.467, + "step": 3374 + }, + { + "epoch": 0.45, + "grad_norm": 0.96875, + "learning_rate": 0.00019990364745400106, + "loss": 0.7431, + "step": 3375 + }, + { + "epoch": 0.45, + "grad_norm": 0.609375, + "learning_rate": 0.00019990313570463405, + "loss": 0.5682, + "step": 3376 + }, + { + "epoch": 0.45, + "grad_norm": 0.875, + "learning_rate": 0.00019990262260051643, + "loss": 0.6618, + "step": 3377 + }, + { + "epoch": 0.45, + "grad_norm": 0.6640625, + "learning_rate": 0.00019990210814165512, + "loss": 0.7946, + "step": 3378 + }, + { + "epoch": 0.45, + "grad_norm": 1.0234375, + "learning_rate": 0.0001999015923280571, + "loss": 0.3955, + "step": 3379 + }, + { + "epoch": 0.45, + "grad_norm": 0.58984375, + "learning_rate": 0.00019990107515972937, + "loss": 0.5581, + "step": 3380 + }, + { + "epoch": 0.45, + "grad_norm": 0.75390625, + "learning_rate": 0.00019990055663667895, + "loss": 0.596, + "step": 3381 + }, + { + "epoch": 0.45, + "grad_norm": 1.125, + "learning_rate": 0.00019990003675891287, + "loss": 0.6308, + "step": 3382 + }, + { + "epoch": 0.45, + "grad_norm": 0.80078125, + "learning_rate": 0.00019989951552643818, + "loss": 0.7266, + "step": 3383 + }, + { + "epoch": 0.45, + "grad_norm": 0.796875, + "learning_rate": 0.00019989899293926193, + "loss": 0.5708, + "step": 3384 + }, + { + "epoch": 0.45, + "grad_norm": 0.609375, + "learning_rate": 0.00019989846899739125, + "loss": 0.6794, + "step": 3385 + }, + { + "epoch": 0.45, + "grad_norm": 0.5546875, + "learning_rate": 0.00019989794370083318, + "loss": 0.5434, + "step": 3386 + }, + { + "epoch": 0.45, + "grad_norm": 0.703125, + "learning_rate": 0.0001998974170495949, + "loss": 0.6323, + "step": 3387 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.0001998968890436835, + "loss": 0.7122, + "step": 3388 + }, + { + "epoch": 0.45, + "grad_norm": 0.6875, + "learning_rate": 0.00019989635968310621, + "loss": 0.6243, + "step": 3389 + }, + { + "epoch": 0.45, + "grad_norm": 0.7109375, + "learning_rate": 0.00019989582896787016, + "loss": 0.3892, + "step": 3390 + }, + { + "epoch": 0.45, + "grad_norm": 0.98046875, + "learning_rate": 0.00019989529689798255, + "loss": 0.6462, + "step": 3391 + }, + { + "epoch": 0.45, + "grad_norm": 0.765625, + "learning_rate": 0.00019989476347345063, + "loss": 0.593, + "step": 3392 + }, + { + "epoch": 0.45, + "grad_norm": 0.765625, + "learning_rate": 0.00019989422869428158, + "loss": 0.5274, + "step": 3393 + }, + { + "epoch": 0.45, + "grad_norm": 0.6953125, + "learning_rate": 0.00019989369256048268, + "loss": 0.5137, + "step": 3394 + }, + { + "epoch": 0.45, + "grad_norm": 0.6953125, + "learning_rate": 0.00019989315507206115, + "loss": 0.4055, + "step": 3395 + }, + { + "epoch": 0.45, + "grad_norm": 0.734375, + "learning_rate": 0.0001998926162290244, + "loss": 0.4888, + "step": 3396 + }, + { + "epoch": 0.45, + "grad_norm": 0.78125, + "learning_rate": 0.00019989207603137964, + "loss": 0.6791, + "step": 3397 + }, + { + "epoch": 0.45, + "grad_norm": 0.69921875, + "learning_rate": 0.0001998915344791342, + "loss": 0.655, + "step": 3398 + }, + { + "epoch": 0.45, + "grad_norm": 1.1640625, + "learning_rate": 0.00019989099157229548, + "loss": 0.5257, + "step": 3399 + }, + { + "epoch": 0.45, + "grad_norm": 0.671875, + "learning_rate": 0.00019989044731087076, + "loss": 0.9447, + "step": 3400 + }, + { + "epoch": 0.45, + "grad_norm": 0.8671875, + "learning_rate": 0.00019988990169486748, + "loss": 0.6097, + "step": 3401 + }, + { + "epoch": 0.45, + "grad_norm": 1.1015625, + "learning_rate": 0.00019988935472429304, + "loss": 0.8358, + "step": 3402 + }, + { + "epoch": 0.45, + "grad_norm": 0.71484375, + "learning_rate": 0.00019988880639915482, + "loss": 0.4755, + "step": 3403 + }, + { + "epoch": 0.45, + "grad_norm": 0.859375, + "learning_rate": 0.0001998882567194603, + "loss": 0.5353, + "step": 3404 + }, + { + "epoch": 0.45, + "grad_norm": 0.57421875, + "learning_rate": 0.0001998877056852169, + "loss": 0.4301, + "step": 3405 + }, + { + "epoch": 0.45, + "grad_norm": 0.890625, + "learning_rate": 0.0001998871532964321, + "loss": 0.4741, + "step": 3406 + }, + { + "epoch": 0.45, + "grad_norm": 0.57421875, + "learning_rate": 0.0001998865995531134, + "loss": 0.5648, + "step": 3407 + }, + { + "epoch": 0.45, + "grad_norm": 0.68359375, + "learning_rate": 0.00019988604445526827, + "loss": 0.7539, + "step": 3408 + }, + { + "epoch": 0.45, + "grad_norm": 0.8046875, + "learning_rate": 0.00019988548800290432, + "loss": 0.6408, + "step": 3409 + }, + { + "epoch": 0.46, + "grad_norm": 0.625, + "learning_rate": 0.00019988493019602906, + "loss": 0.4935, + "step": 3410 + }, + { + "epoch": 0.46, + "grad_norm": 0.62109375, + "learning_rate": 0.00019988437103465, + "loss": 0.6692, + "step": 3411 + }, + { + "epoch": 0.46, + "grad_norm": 0.796875, + "learning_rate": 0.00019988381051877477, + "loss": 0.5081, + "step": 3412 + }, + { + "epoch": 0.46, + "grad_norm": 1.09375, + "learning_rate": 0.00019988324864841095, + "loss": 0.6703, + "step": 3413 + }, + { + "epoch": 0.46, + "grad_norm": 0.5859375, + "learning_rate": 0.0001998826854235662, + "loss": 0.6715, + "step": 3414 + }, + { + "epoch": 0.46, + "grad_norm": 0.83203125, + "learning_rate": 0.00019988212084424812, + "loss": 0.4417, + "step": 3415 + }, + { + "epoch": 0.46, + "grad_norm": 0.7578125, + "learning_rate": 0.00019988155491046437, + "loss": 0.7723, + "step": 3416 + }, + { + "epoch": 0.46, + "grad_norm": 0.94140625, + "learning_rate": 0.00019988098762222265, + "loss": 0.7949, + "step": 3417 + }, + { + "epoch": 0.46, + "grad_norm": 0.94921875, + "learning_rate": 0.0001998804189795306, + "loss": 0.6928, + "step": 3418 + }, + { + "epoch": 0.46, + "grad_norm": 0.82421875, + "learning_rate": 0.000199879848982396, + "loss": 0.83, + "step": 3419 + }, + { + "epoch": 0.46, + "grad_norm": 0.75, + "learning_rate": 0.00019987927763082653, + "loss": 0.4184, + "step": 3420 + }, + { + "epoch": 0.46, + "grad_norm": 0.69140625, + "learning_rate": 0.00019987870492482997, + "loss": 0.4955, + "step": 3421 + }, + { + "epoch": 0.46, + "grad_norm": 0.69140625, + "learning_rate": 0.00019987813086441407, + "loss": 0.5938, + "step": 3422 + }, + { + "epoch": 0.46, + "grad_norm": 0.6484375, + "learning_rate": 0.0001998775554495866, + "loss": 0.4086, + "step": 3423 + }, + { + "epoch": 0.46, + "grad_norm": 0.58203125, + "learning_rate": 0.00019987697868035538, + "loss": 0.564, + "step": 3424 + }, + { + "epoch": 0.46, + "grad_norm": 0.75390625, + "learning_rate": 0.00019987640055672823, + "loss": 0.5914, + "step": 3425 + }, + { + "epoch": 0.46, + "grad_norm": 0.72265625, + "learning_rate": 0.00019987582107871296, + "loss": 0.5251, + "step": 3426 + }, + { + "epoch": 0.46, + "grad_norm": 0.8515625, + "learning_rate": 0.0001998752402463175, + "loss": 0.6182, + "step": 3427 + }, + { + "epoch": 0.46, + "grad_norm": 0.67578125, + "learning_rate": 0.00019987465805954967, + "loss": 0.5267, + "step": 3428 + }, + { + "epoch": 0.46, + "grad_norm": 0.76171875, + "learning_rate": 0.00019987407451841737, + "loss": 0.8564, + "step": 3429 + }, + { + "epoch": 0.46, + "grad_norm": 0.78125, + "learning_rate": 0.00019987348962292853, + "loss": 0.6052, + "step": 3430 + }, + { + "epoch": 0.46, + "grad_norm": 0.79296875, + "learning_rate": 0.00019987290337309107, + "loss": 0.65, + "step": 3431 + }, + { + "epoch": 0.46, + "grad_norm": 0.82421875, + "learning_rate": 0.00019987231576891294, + "loss": 0.6528, + "step": 3432 + }, + { + "epoch": 0.46, + "grad_norm": 0.90234375, + "learning_rate": 0.0001998717268104021, + "loss": 0.5994, + "step": 3433 + }, + { + "epoch": 0.46, + "grad_norm": 0.93359375, + "learning_rate": 0.00019987113649756654, + "loss": 0.8697, + "step": 3434 + }, + { + "epoch": 0.46, + "grad_norm": 0.68359375, + "learning_rate": 0.0001998705448304143, + "loss": 0.419, + "step": 3435 + }, + { + "epoch": 0.46, + "grad_norm": 0.8359375, + "learning_rate": 0.00019986995180895338, + "loss": 0.4279, + "step": 3436 + }, + { + "epoch": 0.46, + "grad_norm": 0.82421875, + "learning_rate": 0.0001998693574331918, + "loss": 0.5177, + "step": 3437 + }, + { + "epoch": 0.46, + "grad_norm": 0.62109375, + "learning_rate": 0.00019986876170313762, + "loss": 0.4838, + "step": 3438 + }, + { + "epoch": 0.46, + "grad_norm": 0.6953125, + "learning_rate": 0.00019986816461879897, + "loss": 0.6826, + "step": 3439 + }, + { + "epoch": 0.46, + "grad_norm": 0.7265625, + "learning_rate": 0.0001998675661801839, + "loss": 0.9351, + "step": 3440 + }, + { + "epoch": 0.46, + "grad_norm": 0.6875, + "learning_rate": 0.00019986696638730055, + "loss": 0.8211, + "step": 3441 + }, + { + "epoch": 0.46, + "grad_norm": 0.7265625, + "learning_rate": 0.000199866365240157, + "loss": 0.5851, + "step": 3442 + }, + { + "epoch": 0.46, + "grad_norm": 0.76953125, + "learning_rate": 0.0001998657627387615, + "loss": 0.568, + "step": 3443 + }, + { + "epoch": 0.46, + "grad_norm": 0.96875, + "learning_rate": 0.00019986515888312212, + "loss": 0.4303, + "step": 3444 + }, + { + "epoch": 0.46, + "grad_norm": 0.8828125, + "learning_rate": 0.0001998645536732471, + "loss": 0.6924, + "step": 3445 + }, + { + "epoch": 0.46, + "grad_norm": 0.69140625, + "learning_rate": 0.00019986394710914466, + "loss": 0.4497, + "step": 3446 + }, + { + "epoch": 0.46, + "grad_norm": 0.84765625, + "learning_rate": 0.000199863339190823, + "loss": 0.831, + "step": 3447 + }, + { + "epoch": 0.46, + "grad_norm": 0.79296875, + "learning_rate": 0.00019986272991829034, + "loss": 0.5158, + "step": 3448 + }, + { + "epoch": 0.46, + "grad_norm": 0.5703125, + "learning_rate": 0.000199862119291555, + "loss": 0.694, + "step": 3449 + }, + { + "epoch": 0.46, + "grad_norm": 0.72265625, + "learning_rate": 0.00019986150731062522, + "loss": 0.3474, + "step": 3450 + }, + { + "epoch": 0.46, + "grad_norm": 0.8515625, + "learning_rate": 0.00019986089397550932, + "loss": 0.7878, + "step": 3451 + }, + { + "epoch": 0.46, + "grad_norm": 0.73046875, + "learning_rate": 0.0001998602792862156, + "loss": 0.5604, + "step": 3452 + }, + { + "epoch": 0.46, + "grad_norm": 0.83203125, + "learning_rate": 0.00019985966324275237, + "loss": 0.618, + "step": 3453 + }, + { + "epoch": 0.46, + "grad_norm": 0.8046875, + "learning_rate": 0.00019985904584512808, + "loss": 0.536, + "step": 3454 + }, + { + "epoch": 0.46, + "grad_norm": 0.625, + "learning_rate": 0.000199858427093351, + "loss": 0.3507, + "step": 3455 + }, + { + "epoch": 0.46, + "grad_norm": 0.63671875, + "learning_rate": 0.00019985780698742958, + "loss": 0.7673, + "step": 3456 + }, + { + "epoch": 0.46, + "grad_norm": 0.72265625, + "learning_rate": 0.00019985718552737217, + "loss": 0.4522, + "step": 3457 + }, + { + "epoch": 0.46, + "grad_norm": 0.78125, + "learning_rate": 0.0001998565627131873, + "loss": 0.301, + "step": 3458 + }, + { + "epoch": 0.46, + "grad_norm": 0.86328125, + "learning_rate": 0.0001998559385448833, + "loss": 0.6461, + "step": 3459 + }, + { + "epoch": 0.46, + "grad_norm": 0.7109375, + "learning_rate": 0.0001998553130224687, + "loss": 0.3109, + "step": 3460 + }, + { + "epoch": 0.46, + "grad_norm": 0.62109375, + "learning_rate": 0.000199854686145952, + "loss": 0.4122, + "step": 3461 + }, + { + "epoch": 0.46, + "grad_norm": 0.7734375, + "learning_rate": 0.00019985405791534162, + "loss": 0.6165, + "step": 3462 + }, + { + "epoch": 0.46, + "grad_norm": 0.8828125, + "learning_rate": 0.00019985342833064615, + "loss": 0.5757, + "step": 3463 + }, + { + "epoch": 0.46, + "grad_norm": 0.90234375, + "learning_rate": 0.0001998527973918741, + "loss": 0.7526, + "step": 3464 + }, + { + "epoch": 0.46, + "grad_norm": 0.76953125, + "learning_rate": 0.00019985216509903406, + "loss": 0.4017, + "step": 3465 + }, + { + "epoch": 0.46, + "grad_norm": 0.734375, + "learning_rate": 0.00019985153145213455, + "loss": 0.4881, + "step": 3466 + }, + { + "epoch": 0.46, + "grad_norm": 0.69140625, + "learning_rate": 0.00019985089645118418, + "loss": 0.6194, + "step": 3467 + }, + { + "epoch": 0.46, + "grad_norm": 0.7890625, + "learning_rate": 0.0001998502600961916, + "loss": 0.6993, + "step": 3468 + }, + { + "epoch": 0.46, + "grad_norm": 0.8828125, + "learning_rate": 0.0001998496223871654, + "loss": 0.6044, + "step": 3469 + }, + { + "epoch": 0.46, + "grad_norm": 0.5234375, + "learning_rate": 0.00019984898332411422, + "loss": 0.4141, + "step": 3470 + }, + { + "epoch": 0.46, + "grad_norm": 0.66796875, + "learning_rate": 0.00019984834290704675, + "loss": 0.4757, + "step": 3471 + }, + { + "epoch": 0.46, + "grad_norm": 0.57421875, + "learning_rate": 0.00019984770113597167, + "loss": 0.4025, + "step": 3472 + }, + { + "epoch": 0.46, + "grad_norm": 0.59765625, + "learning_rate": 0.0001998470580108977, + "loss": 0.4719, + "step": 3473 + }, + { + "epoch": 0.46, + "grad_norm": 0.71484375, + "learning_rate": 0.0001998464135318335, + "loss": 0.5628, + "step": 3474 + }, + { + "epoch": 0.46, + "grad_norm": 0.6953125, + "learning_rate": 0.00019984576769878788, + "loss": 0.5141, + "step": 3475 + }, + { + "epoch": 0.46, + "grad_norm": 0.69921875, + "learning_rate": 0.0001998451205117696, + "loss": 0.4016, + "step": 3476 + }, + { + "epoch": 0.46, + "grad_norm": 1.3046875, + "learning_rate": 0.00019984447197078734, + "loss": 0.6835, + "step": 3477 + }, + { + "epoch": 0.46, + "grad_norm": 0.7421875, + "learning_rate": 0.00019984382207585002, + "loss": 0.5594, + "step": 3478 + }, + { + "epoch": 0.46, + "grad_norm": 0.609375, + "learning_rate": 0.00019984317082696635, + "loss": 0.5344, + "step": 3479 + }, + { + "epoch": 0.46, + "grad_norm": 0.609375, + "learning_rate": 0.00019984251822414523, + "loss": 0.8457, + "step": 3480 + }, + { + "epoch": 0.46, + "grad_norm": 0.6875, + "learning_rate": 0.00019984186426739547, + "loss": 0.4735, + "step": 3481 + }, + { + "epoch": 0.46, + "grad_norm": 0.703125, + "learning_rate": 0.00019984120895672595, + "loss": 0.5663, + "step": 3482 + }, + { + "epoch": 0.46, + "grad_norm": 0.75390625, + "learning_rate": 0.00019984055229214557, + "loss": 0.5655, + "step": 3483 + }, + { + "epoch": 0.46, + "grad_norm": 0.71875, + "learning_rate": 0.00019983989427366323, + "loss": 0.5307, + "step": 3484 + }, + { + "epoch": 0.47, + "grad_norm": 0.69140625, + "learning_rate": 0.00019983923490128784, + "loss": 0.4617, + "step": 3485 + }, + { + "epoch": 0.47, + "grad_norm": 0.671875, + "learning_rate": 0.00019983857417502833, + "loss": 0.4843, + "step": 3486 + }, + { + "epoch": 0.47, + "grad_norm": 0.82421875, + "learning_rate": 0.00019983791209489372, + "loss": 0.5776, + "step": 3487 + }, + { + "epoch": 0.47, + "grad_norm": 0.66015625, + "learning_rate": 0.0001998372486608929, + "loss": 0.474, + "step": 3488 + }, + { + "epoch": 0.47, + "grad_norm": 0.65625, + "learning_rate": 0.00019983658387303494, + "loss": 0.4851, + "step": 3489 + }, + { + "epoch": 0.47, + "grad_norm": 1.1484375, + "learning_rate": 0.00019983591773132882, + "loss": 0.5409, + "step": 3490 + }, + { + "epoch": 0.47, + "grad_norm": 0.78515625, + "learning_rate": 0.0001998352502357836, + "loss": 0.926, + "step": 3491 + }, + { + "epoch": 0.47, + "grad_norm": 0.94921875, + "learning_rate": 0.00019983458138640828, + "loss": 0.4206, + "step": 3492 + }, + { + "epoch": 0.47, + "grad_norm": 0.625, + "learning_rate": 0.00019983391118321198, + "loss": 0.4772, + "step": 3493 + }, + { + "epoch": 0.47, + "grad_norm": 0.77734375, + "learning_rate": 0.00019983323962620376, + "loss": 0.4443, + "step": 3494 + }, + { + "epoch": 0.47, + "grad_norm": 0.90234375, + "learning_rate": 0.00019983256671539276, + "loss": 0.8051, + "step": 3495 + }, + { + "epoch": 0.47, + "grad_norm": 0.95703125, + "learning_rate": 0.00019983189245078808, + "loss": 0.5682, + "step": 3496 + }, + { + "epoch": 0.47, + "grad_norm": 1.0625, + "learning_rate": 0.00019983121683239886, + "loss": 0.523, + "step": 3497 + }, + { + "epoch": 0.47, + "grad_norm": 0.64453125, + "learning_rate": 0.00019983053986023425, + "loss": 0.4037, + "step": 3498 + }, + { + "epoch": 0.47, + "grad_norm": 0.9609375, + "learning_rate": 0.00019982986153430345, + "loss": 0.5241, + "step": 3499 + }, + { + "epoch": 0.47, + "grad_norm": 0.53125, + "learning_rate": 0.00019982918185461568, + "loss": 0.578, + "step": 3500 + }, + { + "epoch": 0.47, + "grad_norm": 0.9609375, + "learning_rate": 0.00019982850082118014, + "loss": 0.4662, + "step": 3501 + }, + { + "epoch": 0.47, + "grad_norm": 0.79296875, + "learning_rate": 0.00019982781843400606, + "loss": 0.6971, + "step": 3502 + }, + { + "epoch": 0.47, + "grad_norm": 0.82421875, + "learning_rate": 0.00019982713469310267, + "loss": 0.3643, + "step": 3503 + }, + { + "epoch": 0.47, + "grad_norm": 0.68359375, + "learning_rate": 0.00019982644959847928, + "loss": 0.605, + "step": 3504 + }, + { + "epoch": 0.47, + "grad_norm": 0.71875, + "learning_rate": 0.00019982576315014515, + "loss": 0.6048, + "step": 3505 + }, + { + "epoch": 0.47, + "grad_norm": 0.66015625, + "learning_rate": 0.0001998250753481096, + "loss": 0.4575, + "step": 3506 + }, + { + "epoch": 0.47, + "grad_norm": 0.86328125, + "learning_rate": 0.000199824386192382, + "loss": 0.6363, + "step": 3507 + }, + { + "epoch": 0.47, + "grad_norm": 0.71875, + "learning_rate": 0.0001998236956829716, + "loss": 0.3359, + "step": 3508 + }, + { + "epoch": 0.47, + "grad_norm": 0.6171875, + "learning_rate": 0.0001998230038198879, + "loss": 0.8286, + "step": 3509 + }, + { + "epoch": 0.47, + "grad_norm": 0.75, + "learning_rate": 0.00019982231060314013, + "loss": 0.5717, + "step": 3510 + }, + { + "epoch": 0.47, + "grad_norm": 0.78125, + "learning_rate": 0.00019982161603273777, + "loss": 0.5929, + "step": 3511 + }, + { + "epoch": 0.47, + "grad_norm": 0.59765625, + "learning_rate": 0.00019982092010869024, + "loss": 0.6877, + "step": 3512 + }, + { + "epoch": 0.47, + "grad_norm": 0.828125, + "learning_rate": 0.00019982022283100698, + "loss": 0.6852, + "step": 3513 + }, + { + "epoch": 0.47, + "grad_norm": 0.859375, + "learning_rate": 0.00019981952419969746, + "loss": 0.7955, + "step": 3514 + }, + { + "epoch": 0.47, + "grad_norm": 0.78125, + "learning_rate": 0.00019981882421477108, + "loss": 0.5653, + "step": 3515 + }, + { + "epoch": 0.47, + "grad_norm": 0.74609375, + "learning_rate": 0.0001998181228762374, + "loss": 0.7926, + "step": 3516 + }, + { + "epoch": 0.47, + "grad_norm": 0.73046875, + "learning_rate": 0.0001998174201841059, + "loss": 0.5071, + "step": 3517 + }, + { + "epoch": 0.47, + "grad_norm": 0.76171875, + "learning_rate": 0.00019981671613838613, + "loss": 0.8704, + "step": 3518 + }, + { + "epoch": 0.47, + "grad_norm": 0.76171875, + "learning_rate": 0.0001998160107390876, + "loss": 0.6969, + "step": 3519 + }, + { + "epoch": 0.47, + "grad_norm": 0.5390625, + "learning_rate": 0.00019981530398621998, + "loss": 0.5964, + "step": 3520 + }, + { + "epoch": 0.47, + "grad_norm": 0.75390625, + "learning_rate": 0.0001998145958797927, + "loss": 0.6789, + "step": 3521 + }, + { + "epoch": 0.47, + "grad_norm": 0.92578125, + "learning_rate": 0.00019981388641981547, + "loss": 0.5973, + "step": 3522 + }, + { + "epoch": 0.47, + "grad_norm": 0.98828125, + "learning_rate": 0.0001998131756062979, + "loss": 0.5764, + "step": 3523 + }, + { + "epoch": 0.47, + "grad_norm": 0.7421875, + "learning_rate": 0.00019981246343924957, + "loss": 0.6807, + "step": 3524 + }, + { + "epoch": 0.47, + "grad_norm": 0.734375, + "learning_rate": 0.0001998117499186802, + "loss": 0.6261, + "step": 3525 + }, + { + "epoch": 0.47, + "grad_norm": 0.703125, + "learning_rate": 0.00019981103504459943, + "loss": 0.6838, + "step": 3526 + }, + { + "epoch": 0.47, + "grad_norm": 0.6171875, + "learning_rate": 0.00019981031881701695, + "loss": 0.7064, + "step": 3527 + }, + { + "epoch": 0.47, + "grad_norm": 0.64453125, + "learning_rate": 0.0001998096012359425, + "loss": 0.6033, + "step": 3528 + }, + { + "epoch": 0.47, + "grad_norm": 0.8125, + "learning_rate": 0.0001998088823013858, + "loss": 0.5906, + "step": 3529 + }, + { + "epoch": 0.47, + "grad_norm": 0.99609375, + "learning_rate": 0.00019980816201335664, + "loss": 0.7863, + "step": 3530 + }, + { + "epoch": 0.47, + "grad_norm": 0.90625, + "learning_rate": 0.00019980744037186469, + "loss": 0.5219, + "step": 3531 + }, + { + "epoch": 0.47, + "grad_norm": 0.80859375, + "learning_rate": 0.00019980671737691985, + "loss": 0.6309, + "step": 3532 + }, + { + "epoch": 0.47, + "grad_norm": 0.75390625, + "learning_rate": 0.00019980599302853183, + "loss": 0.7192, + "step": 3533 + }, + { + "epoch": 0.47, + "grad_norm": 1.1171875, + "learning_rate": 0.00019980526732671047, + "loss": 0.6417, + "step": 3534 + }, + { + "epoch": 0.47, + "grad_norm": 0.921875, + "learning_rate": 0.00019980454027146568, + "loss": 0.8704, + "step": 3535 + }, + { + "epoch": 0.47, + "grad_norm": 0.88671875, + "learning_rate": 0.00019980381186280723, + "loss": 0.6766, + "step": 3536 + }, + { + "epoch": 0.47, + "grad_norm": 0.70703125, + "learning_rate": 0.00019980308210074506, + "loss": 0.7146, + "step": 3537 + }, + { + "epoch": 0.47, + "grad_norm": 0.7734375, + "learning_rate": 0.00019980235098528905, + "loss": 0.7411, + "step": 3538 + }, + { + "epoch": 0.47, + "grad_norm": 0.55078125, + "learning_rate": 0.00019980161851644907, + "loss": 0.5952, + "step": 3539 + }, + { + "epoch": 0.47, + "grad_norm": 0.65234375, + "learning_rate": 0.0001998008846942351, + "loss": 0.5154, + "step": 3540 + }, + { + "epoch": 0.47, + "grad_norm": 0.66015625, + "learning_rate": 0.00019980014951865712, + "loss": 0.6388, + "step": 3541 + }, + { + "epoch": 0.47, + "grad_norm": 0.75390625, + "learning_rate": 0.000199799412989725, + "loss": 0.4978, + "step": 3542 + }, + { + "epoch": 0.47, + "grad_norm": 0.7421875, + "learning_rate": 0.00019979867510744884, + "loss": 0.6926, + "step": 3543 + }, + { + "epoch": 0.47, + "grad_norm": 0.5390625, + "learning_rate": 0.0001997979358718386, + "loss": 0.5778, + "step": 3544 + }, + { + "epoch": 0.47, + "grad_norm": 0.796875, + "learning_rate": 0.00019979719528290425, + "loss": 0.6164, + "step": 3545 + }, + { + "epoch": 0.47, + "grad_norm": 0.890625, + "learning_rate": 0.00019979645334065592, + "loss": 0.5883, + "step": 3546 + }, + { + "epoch": 0.47, + "grad_norm": 0.71875, + "learning_rate": 0.00019979571004510364, + "loss": 0.4517, + "step": 3547 + }, + { + "epoch": 0.47, + "grad_norm": 0.7890625, + "learning_rate": 0.00019979496539625747, + "loss": 0.5005, + "step": 3548 + }, + { + "epoch": 0.47, + "grad_norm": 0.6640625, + "learning_rate": 0.00019979421939412753, + "loss": 0.4378, + "step": 3549 + }, + { + "epoch": 0.47, + "grad_norm": 0.921875, + "learning_rate": 0.00019979347203872392, + "loss": 0.6376, + "step": 3550 + }, + { + "epoch": 0.47, + "grad_norm": 0.90625, + "learning_rate": 0.00019979272333005675, + "loss": 0.789, + "step": 3551 + }, + { + "epoch": 0.47, + "grad_norm": 0.64453125, + "learning_rate": 0.00019979197326813625, + "loss": 0.3297, + "step": 3552 + }, + { + "epoch": 0.47, + "grad_norm": 0.7421875, + "learning_rate": 0.00019979122185297254, + "loss": 0.6727, + "step": 3553 + }, + { + "epoch": 0.47, + "grad_norm": 0.71875, + "learning_rate": 0.00019979046908457583, + "loss": 0.4643, + "step": 3554 + }, + { + "epoch": 0.47, + "grad_norm": 0.67578125, + "learning_rate": 0.00019978971496295626, + "loss": 0.8531, + "step": 3555 + }, + { + "epoch": 0.47, + "grad_norm": 0.9296875, + "learning_rate": 0.00019978895948812416, + "loss": 0.3833, + "step": 3556 + }, + { + "epoch": 0.47, + "grad_norm": 0.5546875, + "learning_rate": 0.0001997882026600897, + "loss": 0.4159, + "step": 3557 + }, + { + "epoch": 0.47, + "grad_norm": 0.8359375, + "learning_rate": 0.00019978744447886318, + "loss": 0.8451, + "step": 3558 + }, + { + "epoch": 0.47, + "grad_norm": 0.88671875, + "learning_rate": 0.00019978668494445486, + "loss": 0.4451, + "step": 3559 + }, + { + "epoch": 0.48, + "grad_norm": 0.9609375, + "learning_rate": 0.00019978592405687506, + "loss": 0.3672, + "step": 3560 + }, + { + "epoch": 0.48, + "grad_norm": 0.6953125, + "learning_rate": 0.0001997851618161341, + "loss": 0.4601, + "step": 3561 + }, + { + "epoch": 0.48, + "grad_norm": 0.859375, + "learning_rate": 0.00019978439822224226, + "loss": 0.5861, + "step": 3562 + }, + { + "epoch": 0.48, + "grad_norm": 0.82421875, + "learning_rate": 0.00019978363327520998, + "loss": 0.5582, + "step": 3563 + }, + { + "epoch": 0.48, + "grad_norm": 0.72265625, + "learning_rate": 0.00019978286697504757, + "loss": 0.469, + "step": 3564 + }, + { + "epoch": 0.48, + "grad_norm": 0.63671875, + "learning_rate": 0.00019978209932176549, + "loss": 0.5029, + "step": 3565 + }, + { + "epoch": 0.48, + "grad_norm": 0.91015625, + "learning_rate": 0.00019978133031537402, + "loss": 0.5797, + "step": 3566 + }, + { + "epoch": 0.48, + "grad_norm": 0.796875, + "learning_rate": 0.00019978055995588373, + "loss": 0.4476, + "step": 3567 + }, + { + "epoch": 0.48, + "grad_norm": 1.09375, + "learning_rate": 0.00019977978824330502, + "loss": 0.4021, + "step": 3568 + }, + { + "epoch": 0.48, + "grad_norm": 0.6171875, + "learning_rate": 0.0001997790151776483, + "loss": 0.5431, + "step": 3569 + }, + { + "epoch": 0.48, + "grad_norm": 0.734375, + "learning_rate": 0.00019977824075892412, + "loss": 0.5168, + "step": 3570 + }, + { + "epoch": 0.48, + "grad_norm": 0.68359375, + "learning_rate": 0.00019977746498714297, + "loss": 0.5553, + "step": 3571 + }, + { + "epoch": 0.48, + "grad_norm": 0.796875, + "learning_rate": 0.00019977668786231534, + "loss": 0.8375, + "step": 3572 + }, + { + "epoch": 0.48, + "grad_norm": 0.96875, + "learning_rate": 0.0001997759093844518, + "loss": 0.5866, + "step": 3573 + }, + { + "epoch": 0.48, + "grad_norm": 0.6484375, + "learning_rate": 0.00019977512955356286, + "loss": 0.724, + "step": 3574 + }, + { + "epoch": 0.48, + "grad_norm": 0.48828125, + "learning_rate": 0.00019977434836965916, + "loss": 0.3947, + "step": 3575 + }, + { + "epoch": 0.48, + "grad_norm": 0.6328125, + "learning_rate": 0.00019977356583275122, + "loss": 1.0645, + "step": 3576 + }, + { + "epoch": 0.48, + "grad_norm": 0.671875, + "learning_rate": 0.00019977278194284974, + "loss": 0.5138, + "step": 3577 + }, + { + "epoch": 0.48, + "grad_norm": 0.80859375, + "learning_rate": 0.0001997719966999653, + "loss": 0.6558, + "step": 3578 + }, + { + "epoch": 0.48, + "grad_norm": 0.640625, + "learning_rate": 0.00019977121010410852, + "loss": 0.6752, + "step": 3579 + }, + { + "epoch": 0.48, + "grad_norm": 0.67578125, + "learning_rate": 0.00019977042215529013, + "loss": 0.6166, + "step": 3580 + }, + { + "epoch": 0.48, + "grad_norm": 0.66796875, + "learning_rate": 0.00019976963285352076, + "loss": 0.3923, + "step": 3581 + }, + { + "epoch": 0.48, + "grad_norm": 0.89453125, + "learning_rate": 0.00019976884219881114, + "loss": 0.479, + "step": 3582 + }, + { + "epoch": 0.48, + "grad_norm": 0.73828125, + "learning_rate": 0.000199768050191172, + "loss": 0.5614, + "step": 3583 + }, + { + "epoch": 0.48, + "grad_norm": 0.65625, + "learning_rate": 0.00019976725683061407, + "loss": 0.2943, + "step": 3584 + }, + { + "epoch": 0.48, + "grad_norm": 0.7421875, + "learning_rate": 0.00019976646211714808, + "loss": 0.5061, + "step": 3585 + }, + { + "epoch": 0.48, + "grad_norm": 0.8984375, + "learning_rate": 0.00019976566605078486, + "loss": 0.7, + "step": 3586 + }, + { + "epoch": 0.48, + "grad_norm": 0.64453125, + "learning_rate": 0.0001997648686315352, + "loss": 0.4442, + "step": 3587 + }, + { + "epoch": 0.48, + "grad_norm": 0.703125, + "learning_rate": 0.00019976406985940985, + "loss": 0.4745, + "step": 3588 + }, + { + "epoch": 0.48, + "grad_norm": 0.76171875, + "learning_rate": 0.0001997632697344197, + "loss": 0.5786, + "step": 3589 + }, + { + "epoch": 0.48, + "grad_norm": 0.73828125, + "learning_rate": 0.00019976246825657558, + "loss": 0.5225, + "step": 3590 + }, + { + "epoch": 0.48, + "grad_norm": 0.734375, + "learning_rate": 0.0001997616654258884, + "loss": 0.6695, + "step": 3591 + }, + { + "epoch": 0.48, + "grad_norm": 0.87890625, + "learning_rate": 0.00019976086124236898, + "loss": 0.595, + "step": 3592 + }, + { + "epoch": 0.48, + "grad_norm": 0.87109375, + "learning_rate": 0.00019976005570602822, + "loss": 0.5261, + "step": 3593 + }, + { + "epoch": 0.48, + "grad_norm": 0.76953125, + "learning_rate": 0.00019975924881687713, + "loss": 0.7109, + "step": 3594 + }, + { + "epoch": 0.48, + "grad_norm": 0.640625, + "learning_rate": 0.00019975844057492655, + "loss": 0.364, + "step": 3595 + }, + { + "epoch": 0.48, + "grad_norm": 0.61328125, + "learning_rate": 0.00019975763098018757, + "loss": 0.6462, + "step": 3596 + }, + { + "epoch": 0.48, + "grad_norm": 0.75, + "learning_rate": 0.00019975682003267102, + "loss": 0.4268, + "step": 3597 + }, + { + "epoch": 0.48, + "grad_norm": 0.609375, + "learning_rate": 0.00019975600773238802, + "loss": 0.4954, + "step": 3598 + }, + { + "epoch": 0.48, + "grad_norm": 0.796875, + "learning_rate": 0.0001997551940793495, + "loss": 0.5553, + "step": 3599 + }, + { + "epoch": 0.48, + "grad_norm": 0.72265625, + "learning_rate": 0.00019975437907356652, + "loss": 0.7805, + "step": 3600 + }, + { + "epoch": 0.48, + "grad_norm": 0.76171875, + "learning_rate": 0.00019975356271505014, + "loss": 0.6064, + "step": 3601 + }, + { + "epoch": 0.48, + "grad_norm": 0.69140625, + "learning_rate": 0.00019975274500381144, + "loss": 0.6433, + "step": 3602 + }, + { + "epoch": 0.48, + "grad_norm": 0.765625, + "learning_rate": 0.0001997519259398615, + "loss": 0.8373, + "step": 3603 + }, + { + "epoch": 0.48, + "grad_norm": 0.83203125, + "learning_rate": 0.00019975110552321144, + "loss": 0.402, + "step": 3604 + }, + { + "epoch": 0.48, + "grad_norm": 0.8203125, + "learning_rate": 0.00019975028375387232, + "loss": 0.4901, + "step": 3605 + }, + { + "epoch": 0.48, + "grad_norm": 0.6953125, + "learning_rate": 0.00019974946063185537, + "loss": 0.5038, + "step": 3606 + }, + { + "epoch": 0.48, + "grad_norm": 0.8671875, + "learning_rate": 0.0001997486361571717, + "loss": 0.512, + "step": 3607 + }, + { + "epoch": 0.48, + "grad_norm": 0.76171875, + "learning_rate": 0.00019974781032983251, + "loss": 0.7776, + "step": 3608 + }, + { + "epoch": 0.48, + "grad_norm": 0.59375, + "learning_rate": 0.000199746983149849, + "loss": 0.5468, + "step": 3609 + }, + { + "epoch": 0.48, + "grad_norm": 1.1953125, + "learning_rate": 0.0001997461546172324, + "loss": 0.7043, + "step": 3610 + }, + { + "epoch": 0.48, + "grad_norm": 0.71484375, + "learning_rate": 0.00019974532473199388, + "loss": 0.4282, + "step": 3611 + }, + { + "epoch": 0.48, + "grad_norm": 1.109375, + "learning_rate": 0.00019974449349414476, + "loss": 0.8354, + "step": 3612 + }, + { + "epoch": 0.48, + "grad_norm": 1.4140625, + "learning_rate": 0.0001997436609036963, + "loss": 0.7982, + "step": 3613 + }, + { + "epoch": 0.48, + "grad_norm": 0.921875, + "learning_rate": 0.00019974282696065977, + "loss": 1.0527, + "step": 3614 + }, + { + "epoch": 0.48, + "grad_norm": 0.57421875, + "learning_rate": 0.00019974199166504652, + "loss": 0.7527, + "step": 3615 + }, + { + "epoch": 0.48, + "grad_norm": 0.98828125, + "learning_rate": 0.00019974115501686783, + "loss": 0.6669, + "step": 3616 + }, + { + "epoch": 0.48, + "grad_norm": 0.78515625, + "learning_rate": 0.00019974031701613506, + "loss": 0.5999, + "step": 3617 + }, + { + "epoch": 0.48, + "grad_norm": 0.7890625, + "learning_rate": 0.00019973947766285956, + "loss": 0.4121, + "step": 3618 + }, + { + "epoch": 0.48, + "grad_norm": 0.87890625, + "learning_rate": 0.00019973863695705278, + "loss": 0.5111, + "step": 3619 + }, + { + "epoch": 0.48, + "grad_norm": 0.640625, + "learning_rate": 0.00019973779489872604, + "loss": 0.5236, + "step": 3620 + }, + { + "epoch": 0.48, + "grad_norm": 0.60546875, + "learning_rate": 0.0001997369514878908, + "loss": 0.5636, + "step": 3621 + }, + { + "epoch": 0.48, + "grad_norm": 0.69140625, + "learning_rate": 0.00019973610672455847, + "loss": 0.3532, + "step": 3622 + }, + { + "epoch": 0.48, + "grad_norm": 0.78125, + "learning_rate": 0.00019973526060874055, + "loss": 0.5083, + "step": 3623 + }, + { + "epoch": 0.48, + "grad_norm": 0.66796875, + "learning_rate": 0.00019973441314044844, + "loss": 0.6392, + "step": 3624 + }, + { + "epoch": 0.48, + "grad_norm": 0.75390625, + "learning_rate": 0.0001997335643196937, + "loss": 0.3223, + "step": 3625 + }, + { + "epoch": 0.48, + "grad_norm": 0.75390625, + "learning_rate": 0.00019973271414648784, + "loss": 0.3218, + "step": 3626 + }, + { + "epoch": 0.48, + "grad_norm": 0.75390625, + "learning_rate": 0.00019973186262084238, + "loss": 0.4533, + "step": 3627 + }, + { + "epoch": 0.48, + "grad_norm": 0.73828125, + "learning_rate": 0.00019973100974276885, + "loss": 0.615, + "step": 3628 + }, + { + "epoch": 0.48, + "grad_norm": 0.83984375, + "learning_rate": 0.00019973015551227877, + "loss": 0.6991, + "step": 3629 + }, + { + "epoch": 0.48, + "grad_norm": 0.7734375, + "learning_rate": 0.00019972929992938382, + "loss": 0.5778, + "step": 3630 + }, + { + "epoch": 0.48, + "grad_norm": 0.8828125, + "learning_rate": 0.00019972844299409554, + "loss": 0.4598, + "step": 3631 + }, + { + "epoch": 0.48, + "grad_norm": 0.63671875, + "learning_rate": 0.00019972758470642558, + "loss": 0.5297, + "step": 3632 + }, + { + "epoch": 0.48, + "grad_norm": 0.8671875, + "learning_rate": 0.00019972672506638554, + "loss": 0.6872, + "step": 3633 + }, + { + "epoch": 0.48, + "grad_norm": 0.60546875, + "learning_rate": 0.00019972586407398716, + "loss": 0.6113, + "step": 3634 + }, + { + "epoch": 0.49, + "grad_norm": 0.66796875, + "learning_rate": 0.000199725001729242, + "loss": 0.6716, + "step": 3635 + }, + { + "epoch": 0.49, + "grad_norm": 0.796875, + "learning_rate": 0.00019972413803216185, + "loss": 0.7114, + "step": 3636 + }, + { + "epoch": 0.49, + "grad_norm": 1.1796875, + "learning_rate": 0.00019972327298275837, + "loss": 0.5563, + "step": 3637 + }, + { + "epoch": 0.49, + "grad_norm": 0.80078125, + "learning_rate": 0.0001997224065810433, + "loss": 0.5392, + "step": 3638 + }, + { + "epoch": 0.49, + "grad_norm": 0.84375, + "learning_rate": 0.00019972153882702839, + "loss": 0.5851, + "step": 3639 + }, + { + "epoch": 0.49, + "grad_norm": 0.71875, + "learning_rate": 0.00019972066972072546, + "loss": 0.5396, + "step": 3640 + }, + { + "epoch": 0.49, + "grad_norm": 0.671875, + "learning_rate": 0.0001997197992621462, + "loss": 0.6511, + "step": 3641 + }, + { + "epoch": 0.49, + "grad_norm": 0.58984375, + "learning_rate": 0.00019971892745130246, + "loss": 0.7394, + "step": 3642 + }, + { + "epoch": 0.49, + "grad_norm": 0.77734375, + "learning_rate": 0.00019971805428820608, + "loss": 0.3552, + "step": 3643 + }, + { + "epoch": 0.49, + "grad_norm": 0.6640625, + "learning_rate": 0.0001997171797728689, + "loss": 0.5183, + "step": 3644 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.00019971630390530276, + "loss": 0.6287, + "step": 3645 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.0001997154266855195, + "loss": 0.414, + "step": 3646 + }, + { + "epoch": 0.49, + "grad_norm": 0.765625, + "learning_rate": 0.0001997145481135311, + "loss": 0.6228, + "step": 3647 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.00019971366818934944, + "loss": 0.6331, + "step": 3648 + }, + { + "epoch": 0.49, + "grad_norm": 0.71875, + "learning_rate": 0.0001997127869129864, + "loss": 0.5436, + "step": 3649 + }, + { + "epoch": 0.49, + "grad_norm": 0.7734375, + "learning_rate": 0.000199711904284454, + "loss": 0.5276, + "step": 3650 + }, + { + "epoch": 0.49, + "grad_norm": 0.76171875, + "learning_rate": 0.00019971102030376422, + "loss": 0.6548, + "step": 3651 + }, + { + "epoch": 0.49, + "grad_norm": 0.578125, + "learning_rate": 0.00019971013497092893, + "loss": 0.6396, + "step": 3652 + }, + { + "epoch": 0.49, + "grad_norm": 0.8984375, + "learning_rate": 0.0001997092482859603, + "loss": 0.3459, + "step": 3653 + }, + { + "epoch": 0.49, + "grad_norm": 0.640625, + "learning_rate": 0.0001997083602488702, + "loss": 0.4409, + "step": 3654 + }, + { + "epoch": 0.49, + "grad_norm": 0.80078125, + "learning_rate": 0.00019970747085967076, + "loss": 0.4601, + "step": 3655 + }, + { + "epoch": 0.49, + "grad_norm": 0.96484375, + "learning_rate": 0.00019970658011837404, + "loss": 0.5862, + "step": 3656 + }, + { + "epoch": 0.49, + "grad_norm": 0.69140625, + "learning_rate": 0.00019970568802499212, + "loss": 0.5156, + "step": 3657 + }, + { + "epoch": 0.49, + "grad_norm": 0.8046875, + "learning_rate": 0.00019970479457953702, + "loss": 0.7493, + "step": 3658 + }, + { + "epoch": 0.49, + "grad_norm": 1.0546875, + "learning_rate": 0.000199703899782021, + "loss": 0.576, + "step": 3659 + }, + { + "epoch": 0.49, + "grad_norm": 0.75, + "learning_rate": 0.00019970300363245604, + "loss": 0.3705, + "step": 3660 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.00019970210613085438, + "loss": 0.6424, + "step": 3661 + }, + { + "epoch": 0.49, + "grad_norm": 0.79296875, + "learning_rate": 0.0001997012072772282, + "loss": 0.4902, + "step": 3662 + }, + { + "epoch": 0.49, + "grad_norm": 0.640625, + "learning_rate": 0.00019970030707158962, + "loss": 0.2703, + "step": 3663 + }, + { + "epoch": 0.49, + "grad_norm": 0.51171875, + "learning_rate": 0.0001996994055139509, + "loss": 0.3983, + "step": 3664 + }, + { + "epoch": 0.49, + "grad_norm": 1.1875, + "learning_rate": 0.0001996985026043243, + "loss": 0.4367, + "step": 3665 + }, + { + "epoch": 0.49, + "grad_norm": 0.69921875, + "learning_rate": 0.00019969759834272195, + "loss": 0.4998, + "step": 3666 + }, + { + "epoch": 0.49, + "grad_norm": 0.8671875, + "learning_rate": 0.0001996966927291562, + "loss": 0.4109, + "step": 3667 + }, + { + "epoch": 0.49, + "grad_norm": 0.61328125, + "learning_rate": 0.00019969578576363933, + "loss": 0.7798, + "step": 3668 + }, + { + "epoch": 0.49, + "grad_norm": 0.75, + "learning_rate": 0.00019969487744618363, + "loss": 0.6387, + "step": 3669 + }, + { + "epoch": 0.49, + "grad_norm": 0.74609375, + "learning_rate": 0.00019969396777680138, + "loss": 0.4136, + "step": 3670 + }, + { + "epoch": 0.49, + "grad_norm": 0.74609375, + "learning_rate": 0.00019969305675550495, + "loss": 0.5126, + "step": 3671 + }, + { + "epoch": 0.49, + "grad_norm": 0.5625, + "learning_rate": 0.00019969214438230671, + "loss": 0.2868, + "step": 3672 + }, + { + "epoch": 0.49, + "grad_norm": 0.6484375, + "learning_rate": 0.000199691230657219, + "loss": 0.6176, + "step": 3673 + }, + { + "epoch": 0.49, + "grad_norm": 0.765625, + "learning_rate": 0.00019969031558025418, + "loss": 0.621, + "step": 3674 + }, + { + "epoch": 0.49, + "grad_norm": 0.78125, + "learning_rate": 0.00019968939915142474, + "loss": 0.6746, + "step": 3675 + }, + { + "epoch": 0.49, + "grad_norm": 1.4609375, + "learning_rate": 0.00019968848137074307, + "loss": 0.6207, + "step": 3676 + }, + { + "epoch": 0.49, + "grad_norm": 0.7578125, + "learning_rate": 0.0001996875622382216, + "loss": 0.7495, + "step": 3677 + }, + { + "epoch": 0.49, + "grad_norm": 0.625, + "learning_rate": 0.0001996866417538728, + "loss": 0.5123, + "step": 3678 + }, + { + "epoch": 0.49, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996857199177092, + "loss": 0.6653, + "step": 3679 + }, + { + "epoch": 0.49, + "grad_norm": 0.76953125, + "learning_rate": 0.00019968479672974324, + "loss": 0.9945, + "step": 3680 + }, + { + "epoch": 0.49, + "grad_norm": 0.71875, + "learning_rate": 0.00019968387218998742, + "loss": 0.4513, + "step": 3681 + }, + { + "epoch": 0.49, + "grad_norm": 0.86328125, + "learning_rate": 0.00019968294629845435, + "loss": 0.8861, + "step": 3682 + }, + { + "epoch": 0.49, + "grad_norm": 0.67578125, + "learning_rate": 0.00019968201905515656, + "loss": 0.5301, + "step": 3683 + }, + { + "epoch": 0.49, + "grad_norm": 0.875, + "learning_rate": 0.0001996810904601066, + "loss": 0.652, + "step": 3684 + }, + { + "epoch": 0.49, + "grad_norm": 0.93359375, + "learning_rate": 0.0001996801605133171, + "loss": 0.4122, + "step": 3685 + }, + { + "epoch": 0.49, + "grad_norm": 0.73828125, + "learning_rate": 0.0001996792292148006, + "loss": 0.6652, + "step": 3686 + }, + { + "epoch": 0.49, + "grad_norm": 0.73828125, + "learning_rate": 0.0001996782965645698, + "loss": 0.5507, + "step": 3687 + }, + { + "epoch": 0.49, + "grad_norm": 1.046875, + "learning_rate": 0.00019967736256263735, + "loss": 0.4365, + "step": 3688 + }, + { + "epoch": 0.49, + "grad_norm": 0.7890625, + "learning_rate": 0.0001996764272090159, + "loss": 0.7749, + "step": 3689 + }, + { + "epoch": 0.49, + "grad_norm": 0.8828125, + "learning_rate": 0.0001996754905037181, + "loss": 0.4401, + "step": 3690 + }, + { + "epoch": 0.49, + "grad_norm": 0.8984375, + "learning_rate": 0.00019967455244675666, + "loss": 0.5998, + "step": 3691 + }, + { + "epoch": 0.49, + "grad_norm": 0.70703125, + "learning_rate": 0.00019967361303814435, + "loss": 0.4664, + "step": 3692 + }, + { + "epoch": 0.49, + "grad_norm": 0.5078125, + "learning_rate": 0.00019967267227789386, + "loss": 0.4011, + "step": 3693 + }, + { + "epoch": 0.49, + "grad_norm": 1.15625, + "learning_rate": 0.00019967173016601796, + "loss": 0.5829, + "step": 3694 + }, + { + "epoch": 0.49, + "grad_norm": 0.69921875, + "learning_rate": 0.00019967078670252945, + "loss": 0.4955, + "step": 3695 + }, + { + "epoch": 0.49, + "grad_norm": 0.75390625, + "learning_rate": 0.00019966984188744108, + "loss": 0.6871, + "step": 3696 + }, + { + "epoch": 0.49, + "grad_norm": 0.76171875, + "learning_rate": 0.0001996688957207657, + "loss": 0.368, + "step": 3697 + }, + { + "epoch": 0.49, + "grad_norm": 0.8984375, + "learning_rate": 0.00019966794820251615, + "loss": 0.6363, + "step": 3698 + }, + { + "epoch": 0.49, + "grad_norm": 0.7734375, + "learning_rate": 0.00019966699933270522, + "loss": 0.6915, + "step": 3699 + }, + { + "epoch": 0.49, + "grad_norm": 0.65625, + "learning_rate": 0.00019966604911134583, + "loss": 0.3898, + "step": 3700 + }, + { + "epoch": 0.49, + "grad_norm": 0.640625, + "learning_rate": 0.00019966509753845083, + "loss": 0.6695, + "step": 3701 + }, + { + "epoch": 0.49, + "grad_norm": 0.5546875, + "learning_rate": 0.00019966414461403318, + "loss": 0.6404, + "step": 3702 + }, + { + "epoch": 0.49, + "grad_norm": 0.9453125, + "learning_rate": 0.00019966319033810575, + "loss": 0.74, + "step": 3703 + }, + { + "epoch": 0.49, + "grad_norm": 0.58984375, + "learning_rate": 0.00019966223471068147, + "loss": 0.5846, + "step": 3704 + }, + { + "epoch": 0.49, + "grad_norm": 0.7265625, + "learning_rate": 0.00019966127773177335, + "loss": 0.3815, + "step": 3705 + }, + { + "epoch": 0.49, + "grad_norm": 0.88671875, + "learning_rate": 0.00019966031940139433, + "loss": 0.7164, + "step": 3706 + }, + { + "epoch": 0.49, + "grad_norm": 1.0, + "learning_rate": 0.00019965935971955743, + "loss": 0.658, + "step": 3707 + }, + { + "epoch": 0.49, + "grad_norm": 0.8203125, + "learning_rate": 0.00019965839868627564, + "loss": 0.5047, + "step": 3708 + }, + { + "epoch": 0.49, + "grad_norm": 0.640625, + "learning_rate": 0.000199657436301562, + "loss": 0.3779, + "step": 3709 + }, + { + "epoch": 0.5, + "grad_norm": 0.7734375, + "learning_rate": 0.00019965647256542956, + "loss": 0.4725, + "step": 3710 + }, + { + "epoch": 0.5, + "grad_norm": 0.98828125, + "learning_rate": 0.00019965550747789141, + "loss": 0.6782, + "step": 3711 + }, + { + "epoch": 0.5, + "grad_norm": 0.65625, + "learning_rate": 0.00019965454103896063, + "loss": 0.3754, + "step": 3712 + }, + { + "epoch": 0.5, + "grad_norm": 0.546875, + "learning_rate": 0.00019965357324865028, + "loss": 0.6085, + "step": 3713 + }, + { + "epoch": 0.5, + "grad_norm": 0.62109375, + "learning_rate": 0.00019965260410697354, + "loss": 0.4433, + "step": 3714 + }, + { + "epoch": 0.5, + "grad_norm": 0.7890625, + "learning_rate": 0.00019965163361394353, + "loss": 0.5942, + "step": 3715 + }, + { + "epoch": 0.5, + "grad_norm": 0.60546875, + "learning_rate": 0.00019965066176957344, + "loss": 0.573, + "step": 3716 + }, + { + "epoch": 0.5, + "grad_norm": 0.546875, + "learning_rate": 0.00019964968857387637, + "loss": 0.4306, + "step": 3717 + }, + { + "epoch": 0.5, + "grad_norm": 0.6015625, + "learning_rate": 0.0001996487140268656, + "loss": 0.8435, + "step": 3718 + }, + { + "epoch": 0.5, + "grad_norm": 0.70703125, + "learning_rate": 0.00019964773812855427, + "loss": 0.9184, + "step": 3719 + }, + { + "epoch": 0.5, + "grad_norm": 0.671875, + "learning_rate": 0.0001996467608789557, + "loss": 0.6495, + "step": 3720 + }, + { + "epoch": 0.5, + "grad_norm": 0.74609375, + "learning_rate": 0.0001996457822780831, + "loss": 0.6183, + "step": 3721 + }, + { + "epoch": 0.5, + "grad_norm": 0.67578125, + "learning_rate": 0.00019964480232594975, + "loss": 0.3665, + "step": 3722 + }, + { + "epoch": 0.5, + "grad_norm": 0.8125, + "learning_rate": 0.0001996438210225689, + "loss": 0.5662, + "step": 3723 + }, + { + "epoch": 0.5, + "grad_norm": 0.8125, + "learning_rate": 0.0001996428383679539, + "loss": 0.6023, + "step": 3724 + }, + { + "epoch": 0.5, + "grad_norm": 0.91796875, + "learning_rate": 0.00019964185436211808, + "loss": 0.4868, + "step": 3725 + }, + { + "epoch": 0.5, + "grad_norm": 0.5625, + "learning_rate": 0.00019964086900507472, + "loss": 0.5406, + "step": 3726 + }, + { + "epoch": 0.5, + "grad_norm": 1.078125, + "learning_rate": 0.00019963988229683725, + "loss": 0.4825, + "step": 3727 + }, + { + "epoch": 0.5, + "grad_norm": 0.7578125, + "learning_rate": 0.00019963889423741903, + "loss": 0.5711, + "step": 3728 + }, + { + "epoch": 0.5, + "grad_norm": 0.6328125, + "learning_rate": 0.00019963790482683345, + "loss": 0.2559, + "step": 3729 + }, + { + "epoch": 0.5, + "grad_norm": 0.76171875, + "learning_rate": 0.00019963691406509396, + "loss": 0.8153, + "step": 3730 + }, + { + "epoch": 0.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00019963592195221394, + "loss": 0.4216, + "step": 3731 + }, + { + "epoch": 0.5, + "grad_norm": 0.77734375, + "learning_rate": 0.00019963492848820686, + "loss": 0.4052, + "step": 3732 + }, + { + "epoch": 0.5, + "grad_norm": 0.6796875, + "learning_rate": 0.00019963393367308622, + "loss": 0.4069, + "step": 3733 + }, + { + "epoch": 0.5, + "grad_norm": 0.46875, + "learning_rate": 0.0001996329375068655, + "loss": 0.2081, + "step": 3734 + }, + { + "epoch": 0.5, + "grad_norm": 0.703125, + "learning_rate": 0.00019963193998955822, + "loss": 0.3455, + "step": 3735 + }, + { + "epoch": 0.5, + "grad_norm": 0.7421875, + "learning_rate": 0.00019963094112117785, + "loss": 0.4595, + "step": 3736 + }, + { + "epoch": 0.5, + "grad_norm": 0.90625, + "learning_rate": 0.000199629940901738, + "loss": 0.4358, + "step": 3737 + }, + { + "epoch": 0.5, + "grad_norm": 1.0703125, + "learning_rate": 0.00019962893933125222, + "loss": 0.5377, + "step": 3738 + }, + { + "epoch": 0.5, + "grad_norm": 0.79296875, + "learning_rate": 0.00019962793640973406, + "loss": 0.4764, + "step": 3739 + }, + { + "epoch": 0.5, + "grad_norm": 0.96875, + "learning_rate": 0.00019962693213719716, + "loss": 0.7317, + "step": 3740 + }, + { + "epoch": 0.5, + "grad_norm": 0.765625, + "learning_rate": 0.00019962592651365512, + "loss": 0.6042, + "step": 3741 + }, + { + "epoch": 0.5, + "grad_norm": 0.88671875, + "learning_rate": 0.00019962491953912158, + "loss": 0.7578, + "step": 3742 + }, + { + "epoch": 0.5, + "grad_norm": 0.71484375, + "learning_rate": 0.00019962391121361016, + "loss": 0.5359, + "step": 3743 + }, + { + "epoch": 0.5, + "grad_norm": 0.6953125, + "learning_rate": 0.0001996229015371346, + "loss": 0.3809, + "step": 3744 + }, + { + "epoch": 0.5, + "grad_norm": 0.64453125, + "learning_rate": 0.00019962189050970854, + "loss": 0.8055, + "step": 3745 + }, + { + "epoch": 0.5, + "grad_norm": 0.7109375, + "learning_rate": 0.0001996208781313457, + "loss": 0.591, + "step": 3746 + }, + { + "epoch": 0.5, + "grad_norm": 0.91015625, + "learning_rate": 0.00019961986440205983, + "loss": 0.6646, + "step": 3747 + }, + { + "epoch": 0.5, + "grad_norm": 0.7890625, + "learning_rate": 0.00019961884932186464, + "loss": 0.6348, + "step": 3748 + }, + { + "epoch": 0.5, + "grad_norm": 0.9296875, + "learning_rate": 0.00019961783289077398, + "loss": 0.4822, + "step": 3749 + }, + { + "epoch": 0.5, + "grad_norm": 0.796875, + "learning_rate": 0.0001996168151088015, + "loss": 0.9698, + "step": 3750 + }, + { + "epoch": 0.5, + "grad_norm": 0.77734375, + "learning_rate": 0.0001996157959759611, + "loss": 0.3897, + "step": 3751 + }, + { + "epoch": 0.5, + "grad_norm": 0.69140625, + "learning_rate": 0.0001996147754922666, + "loss": 0.5377, + "step": 3752 + }, + { + "epoch": 0.5, + "grad_norm": 0.9375, + "learning_rate": 0.00019961375365773178, + "loss": 0.4902, + "step": 3753 + }, + { + "epoch": 0.5, + "grad_norm": 0.7421875, + "learning_rate": 0.00019961273047237055, + "loss": 0.6265, + "step": 3754 + }, + { + "epoch": 0.5, + "grad_norm": 0.703125, + "learning_rate": 0.00019961170593619674, + "loss": 0.5765, + "step": 3755 + }, + { + "epoch": 0.5, + "grad_norm": 0.8046875, + "learning_rate": 0.00019961068004922428, + "loss": 0.9509, + "step": 3756 + }, + { + "epoch": 0.5, + "grad_norm": 0.76171875, + "learning_rate": 0.00019960965281146708, + "loss": 0.6139, + "step": 3757 + }, + { + "epoch": 0.5, + "grad_norm": 0.66796875, + "learning_rate": 0.00019960862422293904, + "loss": 0.6336, + "step": 3758 + }, + { + "epoch": 0.5, + "grad_norm": 0.50390625, + "learning_rate": 0.0001996075942836541, + "loss": 0.655, + "step": 3759 + }, + { + "epoch": 0.5, + "grad_norm": 0.7890625, + "learning_rate": 0.00019960656299362631, + "loss": 0.4554, + "step": 3760 + }, + { + "epoch": 0.5, + "grad_norm": 0.8828125, + "learning_rate": 0.00019960553035286958, + "loss": 0.5872, + "step": 3761 + }, + { + "epoch": 0.5, + "grad_norm": 0.84375, + "learning_rate": 0.00019960449636139793, + "loss": 0.395, + "step": 3762 + }, + { + "epoch": 0.5, + "grad_norm": 0.76171875, + "learning_rate": 0.0001996034610192254, + "loss": 0.7315, + "step": 3763 + }, + { + "epoch": 0.5, + "grad_norm": 0.71875, + "learning_rate": 0.00019960242432636599, + "loss": 0.4611, + "step": 3764 + }, + { + "epoch": 0.5, + "grad_norm": 0.67578125, + "learning_rate": 0.00019960138628283378, + "loss": 0.465, + "step": 3765 + }, + { + "epoch": 0.5, + "grad_norm": 0.76171875, + "learning_rate": 0.00019960034688864284, + "loss": 0.3833, + "step": 3766 + }, + { + "epoch": 0.5, + "grad_norm": 1.0859375, + "learning_rate": 0.0001995993061438073, + "loss": 0.6704, + "step": 3767 + }, + { + "epoch": 0.5, + "grad_norm": 0.67578125, + "learning_rate": 0.00019959826404834124, + "loss": 0.6648, + "step": 3768 + }, + { + "epoch": 0.5, + "grad_norm": 0.9609375, + "learning_rate": 0.0001995972206022588, + "loss": 0.4416, + "step": 3769 + }, + { + "epoch": 0.5, + "grad_norm": 0.6875, + "learning_rate": 0.00019959617580557414, + "loss": 0.5966, + "step": 3770 + }, + { + "epoch": 0.5, + "grad_norm": 0.64453125, + "learning_rate": 0.00019959512965830135, + "loss": 0.6054, + "step": 3771 + }, + { + "epoch": 0.5, + "grad_norm": 0.56640625, + "learning_rate": 0.00019959408216045475, + "loss": 0.4561, + "step": 3772 + }, + { + "epoch": 0.5, + "grad_norm": 0.59765625, + "learning_rate": 0.00019959303331204846, + "loss": 0.5864, + "step": 3773 + }, + { + "epoch": 0.5, + "grad_norm": 0.74609375, + "learning_rate": 0.0001995919831130967, + "loss": 0.7727, + "step": 3774 + }, + { + "epoch": 0.5, + "grad_norm": 0.7421875, + "learning_rate": 0.00019959093156361374, + "loss": 0.5091, + "step": 3775 + }, + { + "epoch": 0.5, + "grad_norm": 0.828125, + "learning_rate": 0.00019958987866361386, + "loss": 0.6623, + "step": 3776 + }, + { + "epoch": 0.5, + "grad_norm": 0.8046875, + "learning_rate": 0.00019958882441311126, + "loss": 0.5497, + "step": 3777 + }, + { + "epoch": 0.5, + "grad_norm": 0.78125, + "learning_rate": 0.00019958776881212034, + "loss": 0.6614, + "step": 3778 + }, + { + "epoch": 0.5, + "grad_norm": 0.9296875, + "learning_rate": 0.00019958671186065533, + "loss": 0.7171, + "step": 3779 + }, + { + "epoch": 0.5, + "grad_norm": 0.671875, + "learning_rate": 0.00019958565355873058, + "loss": 0.5224, + "step": 3780 + }, + { + "epoch": 0.5, + "grad_norm": 0.64453125, + "learning_rate": 0.00019958459390636047, + "loss": 0.3244, + "step": 3781 + }, + { + "epoch": 0.5, + "grad_norm": 0.95703125, + "learning_rate": 0.00019958353290355936, + "loss": 0.4283, + "step": 3782 + }, + { + "epoch": 0.5, + "grad_norm": 1.09375, + "learning_rate": 0.0001995824705503416, + "loss": 0.6633, + "step": 3783 + }, + { + "epoch": 0.5, + "grad_norm": 0.6015625, + "learning_rate": 0.00019958140684672168, + "loss": 0.4038, + "step": 3784 + }, + { + "epoch": 0.51, + "grad_norm": 0.451171875, + "learning_rate": 0.00019958034179271393, + "loss": 0.5561, + "step": 3785 + }, + { + "epoch": 0.51, + "grad_norm": 0.5546875, + "learning_rate": 0.00019957927538833285, + "loss": 0.3749, + "step": 3786 + }, + { + "epoch": 0.51, + "grad_norm": 0.828125, + "learning_rate": 0.0001995782076335929, + "loss": 0.9149, + "step": 3787 + }, + { + "epoch": 0.51, + "grad_norm": 0.66796875, + "learning_rate": 0.00019957713852850853, + "loss": 0.4326, + "step": 3788 + }, + { + "epoch": 0.51, + "grad_norm": 0.5703125, + "learning_rate": 0.00019957606807309423, + "loss": 0.6178, + "step": 3789 + }, + { + "epoch": 0.51, + "grad_norm": 0.703125, + "learning_rate": 0.00019957499626736458, + "loss": 0.5675, + "step": 3790 + }, + { + "epoch": 0.51, + "grad_norm": 0.6953125, + "learning_rate": 0.00019957392311133404, + "loss": 0.3771, + "step": 3791 + }, + { + "epoch": 0.51, + "grad_norm": 0.88671875, + "learning_rate": 0.00019957284860501722, + "loss": 0.4783, + "step": 3792 + }, + { + "epoch": 0.51, + "grad_norm": 0.65234375, + "learning_rate": 0.00019957177274842867, + "loss": 0.5066, + "step": 3793 + }, + { + "epoch": 0.51, + "grad_norm": 0.6953125, + "learning_rate": 0.00019957069554158295, + "loss": 0.744, + "step": 3794 + }, + { + "epoch": 0.51, + "grad_norm": 0.640625, + "learning_rate": 0.00019956961698449472, + "loss": 0.679, + "step": 3795 + }, + { + "epoch": 0.51, + "grad_norm": 0.77734375, + "learning_rate": 0.00019956853707717859, + "loss": 0.5085, + "step": 3796 + }, + { + "epoch": 0.51, + "grad_norm": 0.6640625, + "learning_rate": 0.00019956745581964916, + "loss": 0.4313, + "step": 3797 + }, + { + "epoch": 0.51, + "grad_norm": 0.75390625, + "learning_rate": 0.00019956637321192115, + "loss": 0.5853, + "step": 3798 + }, + { + "epoch": 0.51, + "grad_norm": 0.91796875, + "learning_rate": 0.0001995652892540092, + "loss": 0.4078, + "step": 3799 + }, + { + "epoch": 0.51, + "grad_norm": 0.703125, + "learning_rate": 0.00019956420394592805, + "loss": 0.6371, + "step": 3800 + }, + { + "epoch": 0.51, + "grad_norm": 0.671875, + "learning_rate": 0.0001995631172876924, + "loss": 0.7773, + "step": 3801 + }, + { + "epoch": 0.51, + "grad_norm": 1.0390625, + "learning_rate": 0.00019956202927931695, + "loss": 0.4848, + "step": 3802 + }, + { + "epoch": 0.51, + "grad_norm": 0.7578125, + "learning_rate": 0.0001995609399208165, + "loss": 0.5852, + "step": 3803 + }, + { + "epoch": 0.51, + "grad_norm": 0.734375, + "learning_rate": 0.0001995598492122058, + "loss": 0.4609, + "step": 3804 + }, + { + "epoch": 0.51, + "grad_norm": 0.85546875, + "learning_rate": 0.00019955875715349966, + "loss": 0.6389, + "step": 3805 + }, + { + "epoch": 0.51, + "grad_norm": 0.7578125, + "learning_rate": 0.00019955766374471287, + "loss": 0.6989, + "step": 3806 + }, + { + "epoch": 0.51, + "grad_norm": 1.1015625, + "learning_rate": 0.00019955656898586023, + "loss": 0.7767, + "step": 3807 + }, + { + "epoch": 0.51, + "grad_norm": 0.94140625, + "learning_rate": 0.0001995554728769567, + "loss": 0.7627, + "step": 3808 + }, + { + "epoch": 0.51, + "grad_norm": 0.60546875, + "learning_rate": 0.000199554375418017, + "loss": 0.5395, + "step": 3809 + }, + { + "epoch": 0.51, + "grad_norm": 1.1484375, + "learning_rate": 0.00019955327660905607, + "loss": 0.5229, + "step": 3810 + }, + { + "epoch": 0.51, + "grad_norm": 0.64453125, + "learning_rate": 0.00019955217645008886, + "loss": 0.4843, + "step": 3811 + }, + { + "epoch": 0.51, + "grad_norm": 0.55859375, + "learning_rate": 0.00019955107494113025, + "loss": 0.3393, + "step": 3812 + }, + { + "epoch": 0.51, + "grad_norm": 0.5859375, + "learning_rate": 0.00019954997208219513, + "loss": 0.5079, + "step": 3813 + }, + { + "epoch": 0.51, + "grad_norm": 0.8359375, + "learning_rate": 0.0001995488678732985, + "loss": 0.5538, + "step": 3814 + }, + { + "epoch": 0.51, + "grad_norm": 0.482421875, + "learning_rate": 0.00019954776231445535, + "loss": 0.3604, + "step": 3815 + }, + { + "epoch": 0.51, + "grad_norm": 0.70703125, + "learning_rate": 0.00019954665540568064, + "loss": 0.4999, + "step": 3816 + }, + { + "epoch": 0.51, + "grad_norm": 0.447265625, + "learning_rate": 0.00019954554714698942, + "loss": 0.459, + "step": 3817 + }, + { + "epoch": 0.51, + "grad_norm": 0.88671875, + "learning_rate": 0.00019954443753839667, + "loss": 1.0151, + "step": 3818 + }, + { + "epoch": 0.51, + "grad_norm": 0.640625, + "learning_rate": 0.00019954332657991746, + "loss": 0.461, + "step": 3819 + }, + { + "epoch": 0.51, + "grad_norm": 0.5, + "learning_rate": 0.00019954221427156686, + "loss": 0.3875, + "step": 3820 + }, + { + "epoch": 0.51, + "grad_norm": 0.7109375, + "learning_rate": 0.00019954110061335995, + "loss": 0.48, + "step": 3821 + }, + { + "epoch": 0.51, + "grad_norm": 0.64453125, + "learning_rate": 0.00019953998560531182, + "loss": 0.7071, + "step": 3822 + }, + { + "epoch": 0.51, + "grad_norm": 0.94140625, + "learning_rate": 0.00019953886924743762, + "loss": 0.3665, + "step": 3823 + }, + { + "epoch": 0.51, + "grad_norm": 0.9453125, + "learning_rate": 0.00019953775153975248, + "loss": 0.4893, + "step": 3824 + }, + { + "epoch": 0.51, + "grad_norm": 0.66796875, + "learning_rate": 0.00019953663248227152, + "loss": 0.5492, + "step": 3825 + }, + { + "epoch": 0.51, + "grad_norm": 0.703125, + "learning_rate": 0.00019953551207500992, + "loss": 0.3961, + "step": 3826 + }, + { + "epoch": 0.51, + "grad_norm": 0.9921875, + "learning_rate": 0.00019953439031798292, + "loss": 0.4304, + "step": 3827 + }, + { + "epoch": 0.51, + "grad_norm": 0.79296875, + "learning_rate": 0.00019953326721120572, + "loss": 0.4655, + "step": 3828 + }, + { + "epoch": 0.51, + "grad_norm": 0.7890625, + "learning_rate": 0.00019953214275469355, + "loss": 0.4078, + "step": 3829 + }, + { + "epoch": 0.51, + "grad_norm": 0.8984375, + "learning_rate": 0.00019953101694846162, + "loss": 0.5416, + "step": 3830 + }, + { + "epoch": 0.51, + "grad_norm": 0.9921875, + "learning_rate": 0.00019952988979252522, + "loss": 0.7047, + "step": 3831 + }, + { + "epoch": 0.51, + "grad_norm": 0.60546875, + "learning_rate": 0.00019952876128689963, + "loss": 0.765, + "step": 3832 + }, + { + "epoch": 0.51, + "grad_norm": 0.6875, + "learning_rate": 0.0001995276314316002, + "loss": 0.5563, + "step": 3833 + }, + { + "epoch": 0.51, + "grad_norm": 0.75390625, + "learning_rate": 0.00019952650022664217, + "loss": 0.6608, + "step": 3834 + }, + { + "epoch": 0.51, + "grad_norm": 0.625, + "learning_rate": 0.0001995253676720409, + "loss": 0.5376, + "step": 3835 + }, + { + "epoch": 0.51, + "grad_norm": 0.5859375, + "learning_rate": 0.00019952423376781183, + "loss": 0.489, + "step": 3836 + }, + { + "epoch": 0.51, + "grad_norm": 0.6640625, + "learning_rate": 0.00019952309851397024, + "loss": 0.6138, + "step": 3837 + }, + { + "epoch": 0.51, + "grad_norm": 0.67578125, + "learning_rate": 0.00019952196191053156, + "loss": 0.6145, + "step": 3838 + }, + { + "epoch": 0.51, + "grad_norm": 0.76171875, + "learning_rate": 0.0001995208239575112, + "loss": 0.6262, + "step": 3839 + }, + { + "epoch": 0.51, + "grad_norm": 0.546875, + "learning_rate": 0.00019951968465492463, + "loss": 0.5202, + "step": 3840 + }, + { + "epoch": 0.51, + "grad_norm": 0.625, + "learning_rate": 0.00019951854400278722, + "loss": 0.5769, + "step": 3841 + }, + { + "epoch": 0.51, + "grad_norm": 0.671875, + "learning_rate": 0.00019951740200111452, + "loss": 0.7861, + "step": 3842 + }, + { + "epoch": 0.51, + "grad_norm": 0.68359375, + "learning_rate": 0.00019951625864992196, + "loss": 0.7142, + "step": 3843 + }, + { + "epoch": 0.51, + "grad_norm": 0.64453125, + "learning_rate": 0.00019951511394922507, + "loss": 0.4866, + "step": 3844 + }, + { + "epoch": 0.51, + "grad_norm": 0.83203125, + "learning_rate": 0.00019951396789903939, + "loss": 0.5877, + "step": 3845 + }, + { + "epoch": 0.51, + "grad_norm": 0.51953125, + "learning_rate": 0.00019951282049938038, + "loss": 0.4314, + "step": 3846 + }, + { + "epoch": 0.51, + "grad_norm": 0.69140625, + "learning_rate": 0.00019951167175026369, + "loss": 0.6767, + "step": 3847 + }, + { + "epoch": 0.51, + "grad_norm": 0.7265625, + "learning_rate": 0.00019951052165170484, + "loss": 0.4879, + "step": 3848 + }, + { + "epoch": 0.51, + "grad_norm": 0.7734375, + "learning_rate": 0.0001995093702037195, + "loss": 0.4587, + "step": 3849 + }, + { + "epoch": 0.51, + "grad_norm": 0.671875, + "learning_rate": 0.00019950821740632318, + "loss": 0.3714, + "step": 3850 + }, + { + "epoch": 0.51, + "grad_norm": 0.72265625, + "learning_rate": 0.00019950706325953165, + "loss": 0.4647, + "step": 3851 + }, + { + "epoch": 0.51, + "grad_norm": 0.6171875, + "learning_rate": 0.0001995059077633604, + "loss": 0.5534, + "step": 3852 + }, + { + "epoch": 0.51, + "grad_norm": 0.80859375, + "learning_rate": 0.00019950475091782521, + "loss": 0.8282, + "step": 3853 + }, + { + "epoch": 0.51, + "grad_norm": 0.796875, + "learning_rate": 0.00019950359272294174, + "loss": 0.4562, + "step": 3854 + }, + { + "epoch": 0.51, + "grad_norm": 0.56640625, + "learning_rate": 0.00019950243317872567, + "loss": 0.3447, + "step": 3855 + }, + { + "epoch": 0.51, + "grad_norm": 1.0546875, + "learning_rate": 0.00019950127228519278, + "loss": 0.611, + "step": 3856 + }, + { + "epoch": 0.51, + "grad_norm": 0.57421875, + "learning_rate": 0.00019950011004235874, + "loss": 0.6834, + "step": 3857 + }, + { + "epoch": 0.51, + "grad_norm": 0.54296875, + "learning_rate": 0.00019949894645023935, + "loss": 0.3425, + "step": 3858 + }, + { + "epoch": 0.51, + "grad_norm": 0.8046875, + "learning_rate": 0.00019949778150885042, + "loss": 0.3263, + "step": 3859 + }, + { + "epoch": 0.52, + "grad_norm": 0.8515625, + "learning_rate": 0.0001994966152182077, + "loss": 0.4761, + "step": 3860 + }, + { + "epoch": 0.52, + "grad_norm": 0.515625, + "learning_rate": 0.00019949544757832702, + "loss": 0.5352, + "step": 3861 + }, + { + "epoch": 0.52, + "grad_norm": 0.7265625, + "learning_rate": 0.0001994942785892242, + "loss": 0.6429, + "step": 3862 + }, + { + "epoch": 0.52, + "grad_norm": 0.59765625, + "learning_rate": 0.00019949310825091514, + "loss": 0.564, + "step": 3863 + }, + { + "epoch": 0.52, + "grad_norm": 0.8671875, + "learning_rate": 0.00019949193656341568, + "loss": 0.8604, + "step": 3864 + }, + { + "epoch": 0.52, + "grad_norm": 0.6171875, + "learning_rate": 0.00019949076352674164, + "loss": 0.4776, + "step": 3865 + }, + { + "epoch": 0.52, + "grad_norm": 0.734375, + "learning_rate": 0.00019948958914090909, + "loss": 0.6728, + "step": 3866 + }, + { + "epoch": 0.52, + "grad_norm": 0.63671875, + "learning_rate": 0.00019948841340593378, + "loss": 0.4944, + "step": 3867 + }, + { + "epoch": 0.52, + "grad_norm": 0.6640625, + "learning_rate": 0.00019948723632183177, + "loss": 0.5398, + "step": 3868 + }, + { + "epoch": 0.52, + "grad_norm": 0.55859375, + "learning_rate": 0.000199486057888619, + "loss": 0.4643, + "step": 3869 + }, + { + "epoch": 0.52, + "grad_norm": 0.95703125, + "learning_rate": 0.0001994848781063114, + "loss": 0.4212, + "step": 3870 + }, + { + "epoch": 0.52, + "grad_norm": 0.65625, + "learning_rate": 0.000199483696974925, + "loss": 0.405, + "step": 3871 + }, + { + "epoch": 0.52, + "grad_norm": 0.90625, + "learning_rate": 0.00019948251449447587, + "loss": 0.5521, + "step": 3872 + }, + { + "epoch": 0.52, + "grad_norm": 0.76953125, + "learning_rate": 0.00019948133066497993, + "loss": 0.6339, + "step": 3873 + }, + { + "epoch": 0.52, + "grad_norm": 1.1328125, + "learning_rate": 0.00019948014548645335, + "loss": 0.676, + "step": 3874 + }, + { + "epoch": 0.52, + "grad_norm": 0.7578125, + "learning_rate": 0.00019947895895891212, + "loss": 0.8419, + "step": 3875 + }, + { + "epoch": 0.52, + "grad_norm": 0.466796875, + "learning_rate": 0.0001994777710823724, + "loss": 0.1921, + "step": 3876 + }, + { + "epoch": 0.52, + "grad_norm": 0.84375, + "learning_rate": 0.00019947658185685022, + "loss": 0.6982, + "step": 3877 + }, + { + "epoch": 0.52, + "grad_norm": 0.75, + "learning_rate": 0.00019947539128236177, + "loss": 0.5817, + "step": 3878 + }, + { + "epoch": 0.52, + "grad_norm": 0.93359375, + "learning_rate": 0.00019947419935892316, + "loss": 0.7495, + "step": 3879 + }, + { + "epoch": 0.52, + "grad_norm": 0.71484375, + "learning_rate": 0.00019947300608655057, + "loss": 0.5442, + "step": 3880 + }, + { + "epoch": 0.52, + "grad_norm": 0.765625, + "learning_rate": 0.00019947181146526017, + "loss": 0.6745, + "step": 3881 + }, + { + "epoch": 0.52, + "grad_norm": 0.7890625, + "learning_rate": 0.00019947061549506817, + "loss": 0.6896, + "step": 3882 + }, + { + "epoch": 0.52, + "grad_norm": 0.73828125, + "learning_rate": 0.0001994694181759908, + "loss": 0.6348, + "step": 3883 + }, + { + "epoch": 0.52, + "grad_norm": 0.62109375, + "learning_rate": 0.00019946821950804426, + "loss": 0.6876, + "step": 3884 + }, + { + "epoch": 0.52, + "grad_norm": 0.7890625, + "learning_rate": 0.00019946701949124482, + "loss": 0.649, + "step": 3885 + }, + { + "epoch": 0.52, + "grad_norm": 0.61328125, + "learning_rate": 0.00019946581812560878, + "loss": 0.7311, + "step": 3886 + }, + { + "epoch": 0.52, + "grad_norm": 0.59765625, + "learning_rate": 0.0001994646154111524, + "loss": 0.3633, + "step": 3887 + }, + { + "epoch": 0.52, + "grad_norm": 0.921875, + "learning_rate": 0.00019946341134789198, + "loss": 0.4569, + "step": 3888 + }, + { + "epoch": 0.52, + "grad_norm": 0.60546875, + "learning_rate": 0.0001994622059358439, + "loss": 0.7076, + "step": 3889 + }, + { + "epoch": 0.52, + "grad_norm": 0.65625, + "learning_rate": 0.00019946099917502445, + "loss": 0.5287, + "step": 3890 + }, + { + "epoch": 0.52, + "grad_norm": 0.59375, + "learning_rate": 0.00019945979106545002, + "loss": 0.5326, + "step": 3891 + }, + { + "epoch": 0.52, + "grad_norm": 0.59765625, + "learning_rate": 0.000199458581607137, + "loss": 0.5015, + "step": 3892 + }, + { + "epoch": 0.52, + "grad_norm": 0.7421875, + "learning_rate": 0.00019945737080010178, + "loss": 0.3899, + "step": 3893 + }, + { + "epoch": 0.52, + "grad_norm": 0.890625, + "learning_rate": 0.0001994561586443608, + "loss": 0.6638, + "step": 3894 + }, + { + "epoch": 0.52, + "grad_norm": 0.6484375, + "learning_rate": 0.00019945494513993044, + "loss": 0.4683, + "step": 3895 + }, + { + "epoch": 0.52, + "grad_norm": 0.87890625, + "learning_rate": 0.0001994537302868272, + "loss": 0.496, + "step": 3896 + }, + { + "epoch": 0.52, + "grad_norm": 0.609375, + "learning_rate": 0.00019945251408506757, + "loss": 0.423, + "step": 3897 + }, + { + "epoch": 0.52, + "grad_norm": 0.7734375, + "learning_rate": 0.00019945129653466798, + "loss": 0.6384, + "step": 3898 + }, + { + "epoch": 0.52, + "grad_norm": 1.015625, + "learning_rate": 0.00019945007763564503, + "loss": 0.6197, + "step": 3899 + }, + { + "epoch": 0.52, + "grad_norm": 0.74609375, + "learning_rate": 0.00019944885738801518, + "loss": 0.4685, + "step": 3900 + }, + { + "epoch": 0.52, + "grad_norm": 0.984375, + "learning_rate": 0.000199447635791795, + "loss": 0.3593, + "step": 3901 + }, + { + "epoch": 0.52, + "grad_norm": 0.76171875, + "learning_rate": 0.00019944641284700104, + "loss": 0.6298, + "step": 3902 + }, + { + "epoch": 0.52, + "grad_norm": 0.80078125, + "learning_rate": 0.0001994451885536499, + "loss": 0.6421, + "step": 3903 + }, + { + "epoch": 0.52, + "grad_norm": 0.55078125, + "learning_rate": 0.0001994439629117582, + "loss": 0.2595, + "step": 3904 + }, + { + "epoch": 0.52, + "grad_norm": 0.69140625, + "learning_rate": 0.00019944273592134253, + "loss": 0.4668, + "step": 3905 + }, + { + "epoch": 0.52, + "grad_norm": 0.609375, + "learning_rate": 0.00019944150758241955, + "loss": 0.6748, + "step": 3906 + }, + { + "epoch": 0.52, + "grad_norm": 0.7265625, + "learning_rate": 0.0001994402778950059, + "loss": 0.6403, + "step": 3907 + }, + { + "epoch": 0.52, + "grad_norm": 0.91015625, + "learning_rate": 0.00019943904685911824, + "loss": 0.7566, + "step": 3908 + }, + { + "epoch": 0.52, + "grad_norm": 0.73046875, + "learning_rate": 0.00019943781447477333, + "loss": 0.403, + "step": 3909 + }, + { + "epoch": 0.52, + "grad_norm": 0.921875, + "learning_rate": 0.00019943658074198778, + "loss": 0.5445, + "step": 3910 + }, + { + "epoch": 0.52, + "grad_norm": 0.54296875, + "learning_rate": 0.0001994353456607784, + "loss": 0.5977, + "step": 3911 + }, + { + "epoch": 0.52, + "grad_norm": 0.67578125, + "learning_rate": 0.00019943410923116192, + "loss": 0.8561, + "step": 3912 + }, + { + "epoch": 0.52, + "grad_norm": 0.6875, + "learning_rate": 0.00019943287145315512, + "loss": 0.7165, + "step": 3913 + }, + { + "epoch": 0.52, + "grad_norm": 0.85546875, + "learning_rate": 0.00019943163232677476, + "loss": 0.7305, + "step": 3914 + }, + { + "epoch": 0.52, + "grad_norm": 0.66015625, + "learning_rate": 0.00019943039185203761, + "loss": 0.565, + "step": 3915 + }, + { + "epoch": 0.52, + "grad_norm": 0.7890625, + "learning_rate": 0.0001994291500289606, + "loss": 0.5937, + "step": 3916 + }, + { + "epoch": 0.52, + "grad_norm": 0.95703125, + "learning_rate": 0.00019942790685756044, + "loss": 0.4933, + "step": 3917 + }, + { + "epoch": 0.52, + "grad_norm": 0.6953125, + "learning_rate": 0.0001994266623378541, + "loss": 0.7333, + "step": 3918 + }, + { + "epoch": 0.52, + "grad_norm": 0.65625, + "learning_rate": 0.0001994254164698584, + "loss": 0.723, + "step": 3919 + }, + { + "epoch": 0.52, + "grad_norm": 0.63671875, + "learning_rate": 0.00019942416925359022, + "loss": 0.5277, + "step": 3920 + }, + { + "epoch": 0.52, + "grad_norm": 0.875, + "learning_rate": 0.0001994229206890665, + "loss": 0.6284, + "step": 3921 + }, + { + "epoch": 0.52, + "grad_norm": 0.7578125, + "learning_rate": 0.00019942167077630416, + "loss": 0.4671, + "step": 3922 + }, + { + "epoch": 0.52, + "grad_norm": 0.6796875, + "learning_rate": 0.00019942041951532015, + "loss": 0.7013, + "step": 3923 + }, + { + "epoch": 0.52, + "grad_norm": 0.6640625, + "learning_rate": 0.00019941916690613146, + "loss": 0.2609, + "step": 3924 + }, + { + "epoch": 0.52, + "grad_norm": 0.7421875, + "learning_rate": 0.00019941791294875505, + "loss": 0.7367, + "step": 3925 + }, + { + "epoch": 0.52, + "grad_norm": 1.03125, + "learning_rate": 0.00019941665764320796, + "loss": 0.5648, + "step": 3926 + }, + { + "epoch": 0.52, + "grad_norm": 0.7890625, + "learning_rate": 0.00019941540098950716, + "loss": 0.3993, + "step": 3927 + }, + { + "epoch": 0.52, + "grad_norm": 0.671875, + "learning_rate": 0.00019941414298766974, + "loss": 0.6876, + "step": 3928 + }, + { + "epoch": 0.52, + "grad_norm": 0.83984375, + "learning_rate": 0.00019941288363771272, + "loss": 0.2945, + "step": 3929 + }, + { + "epoch": 0.52, + "grad_norm": 0.65234375, + "learning_rate": 0.0001994116229396532, + "loss": 0.5776, + "step": 3930 + }, + { + "epoch": 0.52, + "grad_norm": 0.640625, + "learning_rate": 0.00019941036089350826, + "loss": 0.6446, + "step": 3931 + }, + { + "epoch": 0.52, + "grad_norm": 1.015625, + "learning_rate": 0.00019940909749929504, + "loss": 0.6814, + "step": 3932 + }, + { + "epoch": 0.52, + "grad_norm": 0.734375, + "learning_rate": 0.00019940783275703065, + "loss": 0.6578, + "step": 3933 + }, + { + "epoch": 0.52, + "grad_norm": 0.68359375, + "learning_rate": 0.00019940656666673226, + "loss": 0.5754, + "step": 3934 + }, + { + "epoch": 0.53, + "grad_norm": 0.6015625, + "learning_rate": 0.00019940529922841702, + "loss": 0.4359, + "step": 3935 + }, + { + "epoch": 0.53, + "grad_norm": 0.8125, + "learning_rate": 0.00019940403044210213, + "loss": 0.4338, + "step": 3936 + }, + { + "epoch": 0.53, + "grad_norm": 0.828125, + "learning_rate": 0.00019940276030780475, + "loss": 0.7082, + "step": 3937 + }, + { + "epoch": 0.53, + "grad_norm": 0.94921875, + "learning_rate": 0.0001994014888255422, + "loss": 0.6034, + "step": 3938 + }, + { + "epoch": 0.53, + "grad_norm": 0.71484375, + "learning_rate": 0.00019940021599533164, + "loss": 0.4998, + "step": 3939 + }, + { + "epoch": 0.53, + "grad_norm": 0.66796875, + "learning_rate": 0.00019939894181719036, + "loss": 0.5554, + "step": 3940 + }, + { + "epoch": 0.53, + "grad_norm": 0.85546875, + "learning_rate": 0.00019939766629113566, + "loss": 0.5038, + "step": 3941 + }, + { + "epoch": 0.53, + "grad_norm": 0.88671875, + "learning_rate": 0.00019939638941718476, + "loss": 0.4222, + "step": 3942 + }, + { + "epoch": 0.53, + "grad_norm": 0.69140625, + "learning_rate": 0.00019939511119535506, + "loss": 0.5227, + "step": 3943 + }, + { + "epoch": 0.53, + "grad_norm": 0.74609375, + "learning_rate": 0.00019939383162566386, + "loss": 0.3516, + "step": 3944 + }, + { + "epoch": 0.53, + "grad_norm": 0.66796875, + "learning_rate": 0.0001993925507081285, + "loss": 0.4742, + "step": 3945 + }, + { + "epoch": 0.53, + "grad_norm": 0.76953125, + "learning_rate": 0.00019939126844276638, + "loss": 0.5362, + "step": 3946 + }, + { + "epoch": 0.53, + "grad_norm": 1.03125, + "learning_rate": 0.00019938998482959487, + "loss": 0.6758, + "step": 3947 + }, + { + "epoch": 0.53, + "grad_norm": 0.5859375, + "learning_rate": 0.00019938869986863136, + "loss": 0.5465, + "step": 3948 + }, + { + "epoch": 0.53, + "grad_norm": 0.81640625, + "learning_rate": 0.0001993874135598933, + "loss": 0.5506, + "step": 3949 + }, + { + "epoch": 0.53, + "grad_norm": 0.67578125, + "learning_rate": 0.00019938612590339813, + "loss": 0.7415, + "step": 3950 + }, + { + "epoch": 0.53, + "grad_norm": 0.5625, + "learning_rate": 0.00019938483689916332, + "loss": 0.3975, + "step": 3951 + }, + { + "epoch": 0.53, + "grad_norm": 0.64453125, + "learning_rate": 0.0001993835465472063, + "loss": 0.6457, + "step": 3952 + }, + { + "epoch": 0.53, + "grad_norm": 0.90234375, + "learning_rate": 0.00019938225484754463, + "loss": 0.7179, + "step": 3953 + }, + { + "epoch": 0.53, + "grad_norm": 0.79296875, + "learning_rate": 0.0001993809618001958, + "loss": 0.5018, + "step": 3954 + }, + { + "epoch": 0.53, + "grad_norm": 0.7109375, + "learning_rate": 0.00019937966740517734, + "loss": 0.4209, + "step": 3955 + }, + { + "epoch": 0.53, + "grad_norm": 0.69921875, + "learning_rate": 0.00019937837166250684, + "loss": 0.6065, + "step": 3956 + }, + { + "epoch": 0.53, + "grad_norm": 0.86328125, + "learning_rate": 0.0001993770745722018, + "loss": 0.6598, + "step": 3957 + }, + { + "epoch": 0.53, + "grad_norm": 0.7734375, + "learning_rate": 0.00019937577613427987, + "loss": 0.4948, + "step": 3958 + }, + { + "epoch": 0.53, + "grad_norm": 0.875, + "learning_rate": 0.0001993744763487586, + "loss": 0.6265, + "step": 3959 + }, + { + "epoch": 0.53, + "grad_norm": 0.6484375, + "learning_rate": 0.0001993731752156557, + "loss": 0.4748, + "step": 3960 + }, + { + "epoch": 0.53, + "grad_norm": 0.73046875, + "learning_rate": 0.00019937187273498874, + "loss": 0.3448, + "step": 3961 + }, + { + "epoch": 0.53, + "grad_norm": 0.7578125, + "learning_rate": 0.0001993705689067754, + "loss": 0.4223, + "step": 3962 + }, + { + "epoch": 0.53, + "grad_norm": 0.71484375, + "learning_rate": 0.0001993692637310334, + "loss": 0.6359, + "step": 3963 + }, + { + "epoch": 0.53, + "grad_norm": 1.0859375, + "learning_rate": 0.0001993679572077804, + "loss": 0.794, + "step": 3964 + }, + { + "epoch": 0.53, + "grad_norm": 0.6953125, + "learning_rate": 0.00019936664933703412, + "loss": 0.6222, + "step": 3965 + }, + { + "epoch": 0.53, + "grad_norm": 0.82421875, + "learning_rate": 0.0001993653401188123, + "loss": 0.5707, + "step": 3966 + }, + { + "epoch": 0.53, + "grad_norm": 0.984375, + "learning_rate": 0.00019936402955313272, + "loss": 0.7363, + "step": 3967 + }, + { + "epoch": 0.53, + "grad_norm": 0.6875, + "learning_rate": 0.00019936271764001308, + "loss": 0.3393, + "step": 3968 + }, + { + "epoch": 0.53, + "grad_norm": 0.3984375, + "learning_rate": 0.00019936140437947124, + "loss": 0.3416, + "step": 3969 + }, + { + "epoch": 0.53, + "grad_norm": 0.609375, + "learning_rate": 0.000199360089771525, + "loss": 0.5765, + "step": 3970 + }, + { + "epoch": 0.53, + "grad_norm": 0.78515625, + "learning_rate": 0.00019935877381619218, + "loss": 0.4142, + "step": 3971 + }, + { + "epoch": 0.53, + "grad_norm": 0.75390625, + "learning_rate": 0.0001993574565134906, + "loss": 0.6277, + "step": 3972 + }, + { + "epoch": 0.53, + "grad_norm": 0.5078125, + "learning_rate": 0.00019935613786343815, + "loss": 0.5531, + "step": 3973 + }, + { + "epoch": 0.53, + "grad_norm": 0.65625, + "learning_rate": 0.0001993548178660527, + "loss": 0.7408, + "step": 3974 + }, + { + "epoch": 0.53, + "grad_norm": 0.61328125, + "learning_rate": 0.00019935349652135215, + "loss": 0.4468, + "step": 3975 + }, + { + "epoch": 0.53, + "grad_norm": 0.67578125, + "learning_rate": 0.00019935217382935445, + "loss": 0.3372, + "step": 3976 + }, + { + "epoch": 0.53, + "grad_norm": 1.390625, + "learning_rate": 0.00019935084979007747, + "loss": 0.7452, + "step": 3977 + }, + { + "epoch": 0.53, + "grad_norm": 0.7890625, + "learning_rate": 0.00019934952440353923, + "loss": 0.6643, + "step": 3978 + }, + { + "epoch": 0.53, + "grad_norm": 0.66015625, + "learning_rate": 0.00019934819766975767, + "loss": 0.6042, + "step": 3979 + }, + { + "epoch": 0.53, + "grad_norm": 0.7421875, + "learning_rate": 0.00019934686958875076, + "loss": 0.8994, + "step": 3980 + }, + { + "epoch": 0.53, + "grad_norm": 0.8046875, + "learning_rate": 0.0001993455401605366, + "loss": 0.5396, + "step": 3981 + }, + { + "epoch": 0.53, + "grad_norm": 0.8046875, + "learning_rate": 0.00019934420938513313, + "loss": 0.5717, + "step": 3982 + }, + { + "epoch": 0.53, + "grad_norm": 0.77734375, + "learning_rate": 0.0001993428772625584, + "loss": 0.5508, + "step": 3983 + }, + { + "epoch": 0.53, + "grad_norm": 0.58984375, + "learning_rate": 0.0001993415437928305, + "loss": 0.4554, + "step": 3984 + }, + { + "epoch": 0.53, + "grad_norm": 0.76953125, + "learning_rate": 0.0001993402089759675, + "loss": 0.8681, + "step": 3985 + }, + { + "epoch": 0.53, + "grad_norm": 0.59375, + "learning_rate": 0.00019933887281198756, + "loss": 0.8938, + "step": 3986 + }, + { + "epoch": 0.53, + "grad_norm": 0.62890625, + "learning_rate": 0.0001993375353009087, + "loss": 0.5877, + "step": 3987 + }, + { + "epoch": 0.53, + "grad_norm": 1.0234375, + "learning_rate": 0.00019933619644274913, + "loss": 0.8273, + "step": 3988 + }, + { + "epoch": 0.53, + "grad_norm": 0.78515625, + "learning_rate": 0.00019933485623752697, + "loss": 0.676, + "step": 3989 + }, + { + "epoch": 0.53, + "grad_norm": 0.8515625, + "learning_rate": 0.00019933351468526042, + "loss": 0.7085, + "step": 3990 + }, + { + "epoch": 0.53, + "grad_norm": 0.87109375, + "learning_rate": 0.00019933217178596765, + "loss": 0.4359, + "step": 3991 + }, + { + "epoch": 0.53, + "grad_norm": 0.53515625, + "learning_rate": 0.0001993308275396669, + "loss": 0.5762, + "step": 3992 + }, + { + "epoch": 0.53, + "grad_norm": 0.6171875, + "learning_rate": 0.00019932948194637635, + "loss": 0.491, + "step": 3993 + }, + { + "epoch": 0.53, + "grad_norm": 0.77734375, + "learning_rate": 0.00019932813500611428, + "loss": 0.4326, + "step": 3994 + }, + { + "epoch": 0.53, + "grad_norm": 0.609375, + "learning_rate": 0.00019932678671889895, + "loss": 0.5442, + "step": 3995 + }, + { + "epoch": 0.53, + "grad_norm": 0.6953125, + "learning_rate": 0.00019932543708474866, + "loss": 0.4928, + "step": 3996 + }, + { + "epoch": 0.53, + "grad_norm": 0.490234375, + "learning_rate": 0.00019932408610368168, + "loss": 0.2712, + "step": 3997 + }, + { + "epoch": 0.53, + "grad_norm": 0.72265625, + "learning_rate": 0.00019932273377571636, + "loss": 0.4967, + "step": 3998 + }, + { + "epoch": 0.53, + "grad_norm": 0.671875, + "learning_rate": 0.00019932138010087105, + "loss": 0.5332, + "step": 3999 + }, + { + "epoch": 0.53, + "grad_norm": 0.59765625, + "learning_rate": 0.00019932002507916404, + "loss": 0.5622, + "step": 4000 + }, + { + "epoch": 0.53, + "grad_norm": 0.8515625, + "learning_rate": 0.00019931866871061374, + "loss": 0.8618, + "step": 4001 + }, + { + "epoch": 0.53, + "grad_norm": 0.71484375, + "learning_rate": 0.0001993173109952386, + "loss": 0.3788, + "step": 4002 + }, + { + "epoch": 0.53, + "grad_norm": 0.65625, + "learning_rate": 0.00019931595193305694, + "loss": 0.4253, + "step": 4003 + }, + { + "epoch": 0.53, + "grad_norm": 0.890625, + "learning_rate": 0.00019931459152408725, + "loss": 0.3018, + "step": 4004 + }, + { + "epoch": 0.53, + "grad_norm": 0.9453125, + "learning_rate": 0.00019931322976834794, + "loss": 0.4756, + "step": 4005 + }, + { + "epoch": 0.53, + "grad_norm": 0.80859375, + "learning_rate": 0.00019931186666585752, + "loss": 0.2881, + "step": 4006 + }, + { + "epoch": 0.53, + "grad_norm": 0.8828125, + "learning_rate": 0.00019931050221663442, + "loss": 0.5964, + "step": 4007 + }, + { + "epoch": 0.53, + "grad_norm": 0.87890625, + "learning_rate": 0.0001993091364206972, + "loss": 0.4731, + "step": 4008 + }, + { + "epoch": 0.53, + "grad_norm": 0.58984375, + "learning_rate": 0.00019930776927806434, + "loss": 0.6961, + "step": 4009 + }, + { + "epoch": 0.54, + "grad_norm": 0.7421875, + "learning_rate": 0.0001993064007887544, + "loss": 0.5133, + "step": 4010 + }, + { + "epoch": 0.54, + "grad_norm": 0.94140625, + "learning_rate": 0.00019930503095278592, + "loss": 0.6623, + "step": 4011 + }, + { + "epoch": 0.54, + "grad_norm": 0.73046875, + "learning_rate": 0.0001993036597701775, + "loss": 0.8065, + "step": 4012 + }, + { + "epoch": 0.54, + "grad_norm": 0.59765625, + "learning_rate": 0.0001993022872409477, + "loss": 0.6171, + "step": 4013 + }, + { + "epoch": 0.54, + "grad_norm": 0.7265625, + "learning_rate": 0.0001993009133651152, + "loss": 0.3603, + "step": 4014 + }, + { + "epoch": 0.54, + "grad_norm": 0.79296875, + "learning_rate": 0.00019929953814269854, + "loss": 0.636, + "step": 4015 + }, + { + "epoch": 0.54, + "grad_norm": 0.4296875, + "learning_rate": 0.00019929816157371644, + "loss": 0.3613, + "step": 4016 + }, + { + "epoch": 0.54, + "grad_norm": 0.58203125, + "learning_rate": 0.0001992967836581875, + "loss": 0.5257, + "step": 4017 + }, + { + "epoch": 0.54, + "grad_norm": 0.59375, + "learning_rate": 0.00019929540439613048, + "loss": 0.5733, + "step": 4018 + }, + { + "epoch": 0.54, + "grad_norm": 0.62109375, + "learning_rate": 0.00019929402378756405, + "loss": 0.5398, + "step": 4019 + }, + { + "epoch": 0.54, + "grad_norm": 0.84765625, + "learning_rate": 0.0001992926418325069, + "loss": 0.5029, + "step": 4020 + }, + { + "epoch": 0.54, + "grad_norm": 0.75, + "learning_rate": 0.00019929125853097786, + "loss": 0.5853, + "step": 4021 + }, + { + "epoch": 0.54, + "grad_norm": 0.6484375, + "learning_rate": 0.00019928987388299557, + "loss": 0.5954, + "step": 4022 + }, + { + "epoch": 0.54, + "grad_norm": 1.03125, + "learning_rate": 0.00019928848788857887, + "loss": 0.5129, + "step": 4023 + }, + { + "epoch": 0.54, + "grad_norm": 0.75390625, + "learning_rate": 0.00019928710054774657, + "loss": 0.6285, + "step": 4024 + }, + { + "epoch": 0.54, + "grad_norm": 0.8515625, + "learning_rate": 0.0001992857118605175, + "loss": 0.5601, + "step": 4025 + }, + { + "epoch": 0.54, + "grad_norm": 0.67578125, + "learning_rate": 0.0001992843218269104, + "loss": 0.5362, + "step": 4026 + }, + { + "epoch": 0.54, + "grad_norm": 0.9140625, + "learning_rate": 0.00019928293044694422, + "loss": 0.3493, + "step": 4027 + }, + { + "epoch": 0.54, + "grad_norm": 0.77734375, + "learning_rate": 0.00019928153772063773, + "loss": 0.7608, + "step": 4028 + }, + { + "epoch": 0.54, + "grad_norm": 0.69921875, + "learning_rate": 0.0001992801436480099, + "loss": 0.4546, + "step": 4029 + }, + { + "epoch": 0.54, + "grad_norm": 0.859375, + "learning_rate": 0.0001992787482290796, + "loss": 0.5588, + "step": 4030 + }, + { + "epoch": 0.54, + "grad_norm": 0.84765625, + "learning_rate": 0.00019927735146386578, + "loss": 0.5934, + "step": 4031 + }, + { + "epoch": 0.54, + "grad_norm": 0.73046875, + "learning_rate": 0.00019927595335238733, + "loss": 0.5859, + "step": 4032 + }, + { + "epoch": 0.54, + "grad_norm": 0.74609375, + "learning_rate": 0.00019927455389466325, + "loss": 0.4588, + "step": 4033 + }, + { + "epoch": 0.54, + "grad_norm": 0.87890625, + "learning_rate": 0.0001992731530907125, + "loss": 0.5933, + "step": 4034 + }, + { + "epoch": 0.54, + "grad_norm": 0.70703125, + "learning_rate": 0.0001992717509405541, + "loss": 0.3262, + "step": 4035 + }, + { + "epoch": 0.54, + "grad_norm": 0.59375, + "learning_rate": 0.00019927034744420704, + "loss": 0.6332, + "step": 4036 + }, + { + "epoch": 0.54, + "grad_norm": 0.74609375, + "learning_rate": 0.0001992689426016903, + "loss": 0.6079, + "step": 4037 + }, + { + "epoch": 0.54, + "grad_norm": 0.87890625, + "learning_rate": 0.00019926753641302306, + "loss": 0.6925, + "step": 4038 + }, + { + "epoch": 0.54, + "grad_norm": 0.84375, + "learning_rate": 0.00019926612887822428, + "loss": 0.3631, + "step": 4039 + }, + { + "epoch": 0.54, + "grad_norm": 0.64453125, + "learning_rate": 0.00019926471999731312, + "loss": 0.5652, + "step": 4040 + }, + { + "epoch": 0.54, + "grad_norm": 0.60546875, + "learning_rate": 0.0001992633097703086, + "loss": 0.581, + "step": 4041 + }, + { + "epoch": 0.54, + "grad_norm": 0.9375, + "learning_rate": 0.00019926189819722995, + "loss": 0.3622, + "step": 4042 + }, + { + "epoch": 0.54, + "grad_norm": 1.375, + "learning_rate": 0.0001992604852780962, + "loss": 0.8481, + "step": 4043 + }, + { + "epoch": 0.54, + "grad_norm": 0.60546875, + "learning_rate": 0.0001992590710129266, + "loss": 0.3269, + "step": 4044 + }, + { + "epoch": 0.54, + "grad_norm": 0.6328125, + "learning_rate": 0.00019925765540174025, + "loss": 0.4292, + "step": 4045 + }, + { + "epoch": 0.54, + "grad_norm": 0.890625, + "learning_rate": 0.00019925623844455642, + "loss": 0.5713, + "step": 4046 + }, + { + "epoch": 0.54, + "grad_norm": 0.9296875, + "learning_rate": 0.0001992548201413943, + "loss": 0.5644, + "step": 4047 + }, + { + "epoch": 0.54, + "grad_norm": 0.71875, + "learning_rate": 0.00019925340049227312, + "loss": 0.4293, + "step": 4048 + }, + { + "epoch": 0.54, + "grad_norm": 1.15625, + "learning_rate": 0.00019925197949721213, + "loss": 0.4641, + "step": 4049 + }, + { + "epoch": 0.54, + "grad_norm": 0.640625, + "learning_rate": 0.00019925055715623057, + "loss": 0.5461, + "step": 4050 + }, + { + "epoch": 0.54, + "grad_norm": 0.5625, + "learning_rate": 0.00019924913346934777, + "loss": 0.4285, + "step": 4051 + }, + { + "epoch": 0.54, + "grad_norm": 0.75, + "learning_rate": 0.00019924770843658305, + "loss": 0.5135, + "step": 4052 + }, + { + "epoch": 0.54, + "grad_norm": 0.9453125, + "learning_rate": 0.00019924628205795567, + "loss": 0.4276, + "step": 4053 + }, + { + "epoch": 0.54, + "grad_norm": 0.76953125, + "learning_rate": 0.000199244854333485, + "loss": 0.594, + "step": 4054 + }, + { + "epoch": 0.54, + "grad_norm": 0.8359375, + "learning_rate": 0.00019924342526319046, + "loss": 0.4925, + "step": 4055 + }, + { + "epoch": 0.54, + "grad_norm": 0.57421875, + "learning_rate": 0.00019924199484709133, + "loss": 0.2311, + "step": 4056 + }, + { + "epoch": 0.54, + "grad_norm": 0.57421875, + "learning_rate": 0.0001992405630852071, + "loss": 0.3263, + "step": 4057 + }, + { + "epoch": 0.54, + "grad_norm": 0.71484375, + "learning_rate": 0.00019923912997755713, + "loss": 0.5643, + "step": 4058 + }, + { + "epoch": 0.54, + "grad_norm": 0.83984375, + "learning_rate": 0.00019923769552416084, + "loss": 0.7912, + "step": 4059 + }, + { + "epoch": 0.54, + "grad_norm": 0.6484375, + "learning_rate": 0.00019923625972503774, + "loss": 0.7226, + "step": 4060 + }, + { + "epoch": 0.54, + "grad_norm": 0.90625, + "learning_rate": 0.00019923482258020725, + "loss": 0.4087, + "step": 4061 + }, + { + "epoch": 0.54, + "grad_norm": 0.828125, + "learning_rate": 0.0001992333840896889, + "loss": 0.5812, + "step": 4062 + }, + { + "epoch": 0.54, + "grad_norm": 0.51953125, + "learning_rate": 0.00019923194425350212, + "loss": 0.4297, + "step": 4063 + }, + { + "epoch": 0.54, + "grad_norm": 0.765625, + "learning_rate": 0.00019923050307166655, + "loss": 0.3703, + "step": 4064 + }, + { + "epoch": 0.54, + "grad_norm": 0.609375, + "learning_rate": 0.00019922906054420166, + "loss": 0.578, + "step": 4065 + }, + { + "epoch": 0.54, + "grad_norm": 0.69921875, + "learning_rate": 0.000199227616671127, + "loss": 0.4397, + "step": 4066 + }, + { + "epoch": 0.54, + "grad_norm": 0.83984375, + "learning_rate": 0.0001992261714524622, + "loss": 0.5807, + "step": 4067 + }, + { + "epoch": 0.54, + "grad_norm": 0.69921875, + "learning_rate": 0.0001992247248882268, + "loss": 0.5919, + "step": 4068 + }, + { + "epoch": 0.54, + "grad_norm": 0.7734375, + "learning_rate": 0.00019922327697844048, + "loss": 0.4406, + "step": 4069 + }, + { + "epoch": 0.54, + "grad_norm": 0.74609375, + "learning_rate": 0.00019922182772312284, + "loss": 0.6035, + "step": 4070 + }, + { + "epoch": 0.54, + "grad_norm": 0.765625, + "learning_rate": 0.00019922037712229352, + "loss": 0.6716, + "step": 4071 + }, + { + "epoch": 0.54, + "grad_norm": 0.6171875, + "learning_rate": 0.00019921892517597224, + "loss": 0.3246, + "step": 4072 + }, + { + "epoch": 0.54, + "grad_norm": 0.5625, + "learning_rate": 0.0001992174718841786, + "loss": 0.504, + "step": 4073 + }, + { + "epoch": 0.54, + "grad_norm": 1.1484375, + "learning_rate": 0.0001992160172469324, + "loss": 0.6962, + "step": 4074 + }, + { + "epoch": 0.54, + "grad_norm": 1.1015625, + "learning_rate": 0.0001992145612642533, + "loss": 0.7606, + "step": 4075 + }, + { + "epoch": 0.54, + "grad_norm": 0.8515625, + "learning_rate": 0.00019921310393616112, + "loss": 0.3793, + "step": 4076 + }, + { + "epoch": 0.54, + "grad_norm": 0.89453125, + "learning_rate": 0.00019921164526267554, + "loss": 0.8207, + "step": 4077 + }, + { + "epoch": 0.54, + "grad_norm": 0.6640625, + "learning_rate": 0.0001992101852438164, + "loss": 0.5084, + "step": 4078 + }, + { + "epoch": 0.54, + "grad_norm": 0.76953125, + "learning_rate": 0.00019920872387960348, + "loss": 0.5667, + "step": 4079 + }, + { + "epoch": 0.54, + "grad_norm": 0.609375, + "learning_rate": 0.00019920726117005657, + "loss": 0.54, + "step": 4080 + }, + { + "epoch": 0.54, + "grad_norm": 0.59375, + "learning_rate": 0.00019920579711519553, + "loss": 0.9884, + "step": 4081 + }, + { + "epoch": 0.54, + "grad_norm": 0.7265625, + "learning_rate": 0.00019920433171504023, + "loss": 0.4837, + "step": 4082 + }, + { + "epoch": 0.54, + "grad_norm": 0.75, + "learning_rate": 0.0001992028649696105, + "loss": 0.3576, + "step": 4083 + }, + { + "epoch": 0.54, + "grad_norm": 0.8515625, + "learning_rate": 0.00019920139687892627, + "loss": 0.6351, + "step": 4084 + }, + { + "epoch": 0.55, + "grad_norm": 0.7109375, + "learning_rate": 0.0001991999274430074, + "loss": 0.4263, + "step": 4085 + }, + { + "epoch": 0.55, + "grad_norm": 0.7578125, + "learning_rate": 0.0001991984566618739, + "loss": 0.7465, + "step": 4086 + }, + { + "epoch": 0.55, + "grad_norm": 0.65234375, + "learning_rate": 0.00019919698453554563, + "loss": 0.6527, + "step": 4087 + }, + { + "epoch": 0.55, + "grad_norm": 0.68359375, + "learning_rate": 0.00019919551106404259, + "loss": 0.4535, + "step": 4088 + }, + { + "epoch": 0.55, + "grad_norm": 0.87890625, + "learning_rate": 0.00019919403624738476, + "loss": 0.4425, + "step": 4089 + }, + { + "epoch": 0.55, + "grad_norm": 0.72265625, + "learning_rate": 0.00019919256008559214, + "loss": 0.7024, + "step": 4090 + }, + { + "epoch": 0.55, + "grad_norm": 0.625, + "learning_rate": 0.00019919108257868473, + "loss": 0.6062, + "step": 4091 + }, + { + "epoch": 0.55, + "grad_norm": 0.8984375, + "learning_rate": 0.00019918960372668259, + "loss": 0.6498, + "step": 4092 + }, + { + "epoch": 0.55, + "grad_norm": 0.8984375, + "learning_rate": 0.00019918812352960575, + "loss": 0.7758, + "step": 4093 + }, + { + "epoch": 0.55, + "grad_norm": 0.703125, + "learning_rate": 0.00019918664198747432, + "loss": 0.8009, + "step": 4094 + }, + { + "epoch": 0.55, + "grad_norm": 0.8125, + "learning_rate": 0.00019918515910030834, + "loss": 0.4682, + "step": 4095 + }, + { + "epoch": 0.55, + "grad_norm": 0.95703125, + "learning_rate": 0.00019918367486812798, + "loss": 0.5404, + "step": 4096 + }, + { + "epoch": 0.55, + "grad_norm": 0.625, + "learning_rate": 0.0001991821892909533, + "loss": 0.5015, + "step": 4097 + }, + { + "epoch": 0.55, + "grad_norm": 0.82421875, + "learning_rate": 0.0001991807023688045, + "loss": 0.4869, + "step": 4098 + }, + { + "epoch": 0.55, + "grad_norm": 0.87890625, + "learning_rate": 0.0001991792141017017, + "loss": 0.3376, + "step": 4099 + }, + { + "epoch": 0.55, + "grad_norm": 0.5234375, + "learning_rate": 0.00019917772448966515, + "loss": 0.4414, + "step": 4100 + }, + { + "epoch": 0.55, + "grad_norm": 1.1640625, + "learning_rate": 0.00019917623353271497, + "loss": 0.6556, + "step": 4101 + }, + { + "epoch": 0.55, + "grad_norm": 0.490234375, + "learning_rate": 0.0001991747412308714, + "loss": 0.2647, + "step": 4102 + }, + { + "epoch": 0.55, + "grad_norm": 0.671875, + "learning_rate": 0.0001991732475841547, + "loss": 0.4131, + "step": 4103 + }, + { + "epoch": 0.55, + "grad_norm": 0.59375, + "learning_rate": 0.00019917175259258514, + "loss": 0.5356, + "step": 4104 + }, + { + "epoch": 0.55, + "grad_norm": 0.70703125, + "learning_rate": 0.00019917025625618292, + "loss": 0.7284, + "step": 4105 + }, + { + "epoch": 0.55, + "grad_norm": 0.77734375, + "learning_rate": 0.0001991687585749684, + "loss": 0.7322, + "step": 4106 + }, + { + "epoch": 0.55, + "grad_norm": 0.7421875, + "learning_rate": 0.0001991672595489619, + "loss": 0.3593, + "step": 4107 + }, + { + "epoch": 0.55, + "grad_norm": 0.89453125, + "learning_rate": 0.00019916575917818368, + "loss": 0.6196, + "step": 4108 + }, + { + "epoch": 0.55, + "grad_norm": 0.578125, + "learning_rate": 0.00019916425746265413, + "loss": 0.8159, + "step": 4109 + }, + { + "epoch": 0.55, + "grad_norm": 0.62890625, + "learning_rate": 0.0001991627544023936, + "loss": 0.4942, + "step": 4110 + }, + { + "epoch": 0.55, + "grad_norm": 0.83203125, + "learning_rate": 0.00019916124999742248, + "loss": 0.4974, + "step": 4111 + }, + { + "epoch": 0.55, + "grad_norm": 0.6484375, + "learning_rate": 0.00019915974424776115, + "loss": 0.3973, + "step": 4112 + }, + { + "epoch": 0.55, + "grad_norm": 0.62890625, + "learning_rate": 0.0001991582371534301, + "loss": 0.4322, + "step": 4113 + }, + { + "epoch": 0.55, + "grad_norm": 0.70703125, + "learning_rate": 0.00019915672871444965, + "loss": 0.5288, + "step": 4114 + }, + { + "epoch": 0.55, + "grad_norm": 0.5390625, + "learning_rate": 0.00019915521893084038, + "loss": 0.5591, + "step": 4115 + }, + { + "epoch": 0.55, + "grad_norm": 0.61328125, + "learning_rate": 0.00019915370780262265, + "loss": 0.4858, + "step": 4116 + }, + { + "epoch": 0.55, + "grad_norm": 0.67578125, + "learning_rate": 0.00019915219532981704, + "loss": 0.6474, + "step": 4117 + }, + { + "epoch": 0.55, + "grad_norm": 0.56640625, + "learning_rate": 0.00019915068151244402, + "loss": 0.6849, + "step": 4118 + }, + { + "epoch": 0.55, + "grad_norm": 0.474609375, + "learning_rate": 0.00019914916635052415, + "loss": 0.3366, + "step": 4119 + }, + { + "epoch": 0.55, + "grad_norm": 0.83984375, + "learning_rate": 0.0001991476498440779, + "loss": 0.9501, + "step": 4120 + }, + { + "epoch": 0.55, + "grad_norm": 0.6328125, + "learning_rate": 0.0001991461319931259, + "loss": 0.6955, + "step": 4121 + }, + { + "epoch": 0.55, + "grad_norm": 1.0078125, + "learning_rate": 0.00019914461279768873, + "loss": 0.6797, + "step": 4122 + }, + { + "epoch": 0.55, + "grad_norm": 0.7265625, + "learning_rate": 0.000199143092257787, + "loss": 0.6422, + "step": 4123 + }, + { + "epoch": 0.55, + "grad_norm": 0.490234375, + "learning_rate": 0.00019914157037344125, + "loss": 0.4856, + "step": 4124 + }, + { + "epoch": 0.55, + "grad_norm": 0.6328125, + "learning_rate": 0.00019914004714467222, + "loss": 0.5578, + "step": 4125 + }, + { + "epoch": 0.55, + "grad_norm": 0.8515625, + "learning_rate": 0.00019913852257150052, + "loss": 0.6422, + "step": 4126 + }, + { + "epoch": 0.55, + "grad_norm": 0.7265625, + "learning_rate": 0.00019913699665394683, + "loss": 0.551, + "step": 4127 + }, + { + "epoch": 0.55, + "grad_norm": 0.72265625, + "learning_rate": 0.00019913546939203186, + "loss": 0.6124, + "step": 4128 + }, + { + "epoch": 0.55, + "grad_norm": 0.7265625, + "learning_rate": 0.00019913394078577626, + "loss": 0.7551, + "step": 4129 + }, + { + "epoch": 0.55, + "grad_norm": 0.6953125, + "learning_rate": 0.0001991324108352008, + "loss": 0.2887, + "step": 4130 + }, + { + "epoch": 0.55, + "grad_norm": 0.78515625, + "learning_rate": 0.00019913087954032626, + "loss": 0.4475, + "step": 4131 + }, + { + "epoch": 0.55, + "grad_norm": 0.6328125, + "learning_rate": 0.00019912934690117333, + "loss": 0.6567, + "step": 4132 + }, + { + "epoch": 0.55, + "grad_norm": 1.265625, + "learning_rate": 0.00019912781291776287, + "loss": 0.6056, + "step": 4133 + }, + { + "epoch": 0.55, + "grad_norm": 0.8671875, + "learning_rate": 0.00019912627759011562, + "loss": 0.3962, + "step": 4134 + }, + { + "epoch": 0.55, + "grad_norm": 0.6171875, + "learning_rate": 0.00019912474091825244, + "loss": 0.5777, + "step": 4135 + }, + { + "epoch": 0.55, + "grad_norm": 0.48046875, + "learning_rate": 0.00019912320290219415, + "loss": 0.8462, + "step": 4136 + }, + { + "epoch": 0.55, + "grad_norm": 1.03125, + "learning_rate": 0.00019912166354196163, + "loss": 0.5791, + "step": 4137 + }, + { + "epoch": 0.55, + "grad_norm": 0.8515625, + "learning_rate": 0.00019912012283757573, + "loss": 0.5475, + "step": 4138 + }, + { + "epoch": 0.55, + "grad_norm": 0.80078125, + "learning_rate": 0.00019911858078905734, + "loss": 0.5752, + "step": 4139 + }, + { + "epoch": 0.55, + "grad_norm": 0.70703125, + "learning_rate": 0.00019911703739642738, + "loss": 0.4118, + "step": 4140 + }, + { + "epoch": 0.55, + "grad_norm": 0.73046875, + "learning_rate": 0.0001991154926597068, + "loss": 0.4131, + "step": 4141 + }, + { + "epoch": 0.55, + "grad_norm": 0.578125, + "learning_rate": 0.00019911394657891652, + "loss": 0.5371, + "step": 4142 + }, + { + "epoch": 0.55, + "grad_norm": 0.80859375, + "learning_rate": 0.00019911239915407748, + "loss": 0.3356, + "step": 4143 + }, + { + "epoch": 0.55, + "grad_norm": 0.9140625, + "learning_rate": 0.00019911085038521074, + "loss": 0.4513, + "step": 4144 + }, + { + "epoch": 0.55, + "grad_norm": 0.62109375, + "learning_rate": 0.00019910930027233726, + "loss": 0.1938, + "step": 4145 + }, + { + "epoch": 0.55, + "grad_norm": 0.85546875, + "learning_rate": 0.000199107748815478, + "loss": 0.8686, + "step": 4146 + }, + { + "epoch": 0.55, + "grad_norm": 0.890625, + "learning_rate": 0.00019910619601465413, + "loss": 1.1142, + "step": 4147 + }, + { + "epoch": 0.55, + "grad_norm": 0.6796875, + "learning_rate": 0.0001991046418698866, + "loss": 0.6948, + "step": 4148 + }, + { + "epoch": 0.55, + "grad_norm": 0.5390625, + "learning_rate": 0.00019910308638119656, + "loss": 0.509, + "step": 4149 + }, + { + "epoch": 0.55, + "grad_norm": 0.85546875, + "learning_rate": 0.000199101529548605, + "loss": 0.5115, + "step": 4150 + }, + { + "epoch": 0.55, + "grad_norm": 0.64453125, + "learning_rate": 0.00019909997137213315, + "loss": 0.7996, + "step": 4151 + }, + { + "epoch": 0.55, + "grad_norm": 0.6953125, + "learning_rate": 0.00019909841185180205, + "loss": 0.4686, + "step": 4152 + }, + { + "epoch": 0.55, + "grad_norm": 0.69140625, + "learning_rate": 0.0001990968509876329, + "loss": 0.4159, + "step": 4153 + }, + { + "epoch": 0.55, + "grad_norm": 0.6875, + "learning_rate": 0.00019909528877964686, + "loss": 0.524, + "step": 4154 + }, + { + "epoch": 0.55, + "grad_norm": 0.75390625, + "learning_rate": 0.00019909372522786508, + "loss": 0.4875, + "step": 4155 + }, + { + "epoch": 0.55, + "grad_norm": 0.87890625, + "learning_rate": 0.00019909216033230881, + "loss": 0.4719, + "step": 4156 + }, + { + "epoch": 0.55, + "grad_norm": 0.5234375, + "learning_rate": 0.00019909059409299926, + "loss": 0.3385, + "step": 4157 + }, + { + "epoch": 0.55, + "grad_norm": 0.78515625, + "learning_rate": 0.00019908902650995764, + "loss": 0.6952, + "step": 4158 + }, + { + "epoch": 0.55, + "grad_norm": 1.0390625, + "learning_rate": 0.0001990874575832052, + "loss": 0.5297, + "step": 4159 + }, + { + "epoch": 0.56, + "grad_norm": 0.92578125, + "learning_rate": 0.0001990858873127633, + "loss": 0.4508, + "step": 4160 + }, + { + "epoch": 0.56, + "grad_norm": 0.64453125, + "learning_rate": 0.00019908431569865313, + "loss": 0.4232, + "step": 4161 + }, + { + "epoch": 0.56, + "grad_norm": 0.81640625, + "learning_rate": 0.00019908274274089608, + "loss": 0.6551, + "step": 4162 + }, + { + "epoch": 0.56, + "grad_norm": 1.09375, + "learning_rate": 0.00019908116843951344, + "loss": 0.5726, + "step": 4163 + }, + { + "epoch": 0.56, + "grad_norm": 0.6953125, + "learning_rate": 0.00019907959279452656, + "loss": 0.5285, + "step": 4164 + }, + { + "epoch": 0.56, + "grad_norm": 0.60546875, + "learning_rate": 0.00019907801580595677, + "loss": 0.4036, + "step": 4165 + }, + { + "epoch": 0.56, + "grad_norm": 0.5390625, + "learning_rate": 0.00019907643747382557, + "loss": 0.4776, + "step": 4166 + }, + { + "epoch": 0.56, + "grad_norm": 0.91015625, + "learning_rate": 0.00019907485779815424, + "loss": 0.5973, + "step": 4167 + }, + { + "epoch": 0.56, + "grad_norm": 0.6484375, + "learning_rate": 0.00019907327677896425, + "loss": 0.5545, + "step": 4168 + }, + { + "epoch": 0.56, + "grad_norm": 0.70703125, + "learning_rate": 0.00019907169441627708, + "loss": 0.3405, + "step": 4169 + }, + { + "epoch": 0.56, + "grad_norm": 0.609375, + "learning_rate": 0.00019907011071011415, + "loss": 0.6151, + "step": 4170 + }, + { + "epoch": 0.56, + "grad_norm": 0.703125, + "learning_rate": 0.0001990685256604969, + "loss": 0.7126, + "step": 4171 + }, + { + "epoch": 0.56, + "grad_norm": 0.74609375, + "learning_rate": 0.00019906693926744688, + "loss": 0.5575, + "step": 4172 + }, + { + "epoch": 0.56, + "grad_norm": 0.609375, + "learning_rate": 0.0001990653515309856, + "loss": 0.7704, + "step": 4173 + }, + { + "epoch": 0.56, + "grad_norm": 0.93359375, + "learning_rate": 0.00019906376245113454, + "loss": 0.4801, + "step": 4174 + }, + { + "epoch": 0.56, + "grad_norm": 1.3984375, + "learning_rate": 0.00019906217202791528, + "loss": 0.615, + "step": 4175 + }, + { + "epoch": 0.56, + "grad_norm": 0.58203125, + "learning_rate": 0.00019906058026134945, + "loss": 0.3255, + "step": 4176 + }, + { + "epoch": 0.56, + "grad_norm": 0.69140625, + "learning_rate": 0.0001990589871514585, + "loss": 0.5359, + "step": 4177 + }, + { + "epoch": 0.56, + "grad_norm": 0.8671875, + "learning_rate": 0.00019905739269826414, + "loss": 0.5364, + "step": 4178 + }, + { + "epoch": 0.56, + "grad_norm": 0.734375, + "learning_rate": 0.00019905579690178798, + "loss": 0.7373, + "step": 4179 + }, + { + "epoch": 0.56, + "grad_norm": 0.5390625, + "learning_rate": 0.0001990541997620516, + "loss": 0.5124, + "step": 4180 + }, + { + "epoch": 0.56, + "grad_norm": 1.03125, + "learning_rate": 0.00019905260127907673, + "loss": 0.7706, + "step": 4181 + }, + { + "epoch": 0.56, + "grad_norm": 0.66796875, + "learning_rate": 0.000199051001452885, + "loss": 0.8027, + "step": 4182 + }, + { + "epoch": 0.56, + "grad_norm": 0.9921875, + "learning_rate": 0.00019904940028349813, + "loss": 0.501, + "step": 4183 + }, + { + "epoch": 0.56, + "grad_norm": 0.66015625, + "learning_rate": 0.00019904779777093782, + "loss": 0.4026, + "step": 4184 + }, + { + "epoch": 0.56, + "grad_norm": 0.70703125, + "learning_rate": 0.00019904619391522581, + "loss": 0.6472, + "step": 4185 + }, + { + "epoch": 0.56, + "grad_norm": 0.625, + "learning_rate": 0.00019904458871638383, + "loss": 0.5767, + "step": 4186 + }, + { + "epoch": 0.56, + "grad_norm": 0.8359375, + "learning_rate": 0.00019904298217443366, + "loss": 0.9662, + "step": 4187 + }, + { + "epoch": 0.56, + "grad_norm": 0.7265625, + "learning_rate": 0.0001990413742893971, + "loss": 0.45, + "step": 4188 + }, + { + "epoch": 0.56, + "grad_norm": 0.6875, + "learning_rate": 0.00019903976506129592, + "loss": 0.4297, + "step": 4189 + }, + { + "epoch": 0.56, + "grad_norm": 0.6484375, + "learning_rate": 0.00019903815449015197, + "loss": 0.5154, + "step": 4190 + }, + { + "epoch": 0.56, + "grad_norm": 0.69921875, + "learning_rate": 0.00019903654257598708, + "loss": 0.5676, + "step": 4191 + }, + { + "epoch": 0.56, + "grad_norm": 0.796875, + "learning_rate": 0.00019903492931882313, + "loss": 0.7024, + "step": 4192 + }, + { + "epoch": 0.56, + "grad_norm": 0.84375, + "learning_rate": 0.00019903331471868196, + "loss": 0.3058, + "step": 4193 + }, + { + "epoch": 0.56, + "grad_norm": 0.7265625, + "learning_rate": 0.0001990316987755855, + "loss": 0.7379, + "step": 4194 + }, + { + "epoch": 0.56, + "grad_norm": 0.8125, + "learning_rate": 0.00019903008148955563, + "loss": 0.4302, + "step": 4195 + }, + { + "epoch": 0.56, + "grad_norm": 1.3125, + "learning_rate": 0.00019902846286061433, + "loss": 0.8105, + "step": 4196 + }, + { + "epoch": 0.56, + "grad_norm": 0.78515625, + "learning_rate": 0.00019902684288878347, + "loss": 0.8218, + "step": 4197 + }, + { + "epoch": 0.56, + "grad_norm": 0.765625, + "learning_rate": 0.0001990252215740851, + "loss": 0.5678, + "step": 4198 + }, + { + "epoch": 0.56, + "grad_norm": 0.76953125, + "learning_rate": 0.00019902359891654114, + "loss": 0.6975, + "step": 4199 + }, + { + "epoch": 0.56, + "grad_norm": 0.69140625, + "learning_rate": 0.00019902197491617366, + "loss": 0.5517, + "step": 4200 + }, + { + "epoch": 0.56, + "grad_norm": 0.92578125, + "learning_rate": 0.0001990203495730046, + "loss": 0.3981, + "step": 4201 + }, + { + "epoch": 0.56, + "grad_norm": 0.72265625, + "learning_rate": 0.0001990187228870561, + "loss": 0.6935, + "step": 4202 + }, + { + "epoch": 0.56, + "grad_norm": 1.046875, + "learning_rate": 0.00019901709485835016, + "loss": 0.5293, + "step": 4203 + }, + { + "epoch": 0.56, + "grad_norm": 0.890625, + "learning_rate": 0.00019901546548690884, + "loss": 0.5682, + "step": 4204 + }, + { + "epoch": 0.56, + "grad_norm": 0.55078125, + "learning_rate": 0.00019901383477275429, + "loss": 0.6514, + "step": 4205 + }, + { + "epoch": 0.56, + "grad_norm": 0.7109375, + "learning_rate": 0.00019901220271590858, + "loss": 0.5653, + "step": 4206 + }, + { + "epoch": 0.56, + "grad_norm": 0.6484375, + "learning_rate": 0.00019901056931639384, + "loss": 0.4483, + "step": 4207 + }, + { + "epoch": 0.56, + "grad_norm": 0.859375, + "learning_rate": 0.00019900893457423227, + "loss": 0.574, + "step": 4208 + }, + { + "epoch": 0.56, + "grad_norm": 0.53515625, + "learning_rate": 0.00019900729848944596, + "loss": 0.4742, + "step": 4209 + }, + { + "epoch": 0.56, + "grad_norm": 0.78125, + "learning_rate": 0.00019900566106205716, + "loss": 0.2914, + "step": 4210 + }, + { + "epoch": 0.56, + "grad_norm": 0.61328125, + "learning_rate": 0.00019900402229208806, + "loss": 0.3543, + "step": 4211 + }, + { + "epoch": 0.56, + "grad_norm": 0.67578125, + "learning_rate": 0.00019900238217956088, + "loss": 0.5384, + "step": 4212 + }, + { + "epoch": 0.56, + "grad_norm": 0.94140625, + "learning_rate": 0.00019900074072449786, + "loss": 0.8124, + "step": 4213 + }, + { + "epoch": 0.56, + "grad_norm": 0.65234375, + "learning_rate": 0.00019899909792692126, + "loss": 0.5654, + "step": 4214 + }, + { + "epoch": 0.56, + "grad_norm": 0.625, + "learning_rate": 0.00019899745378685333, + "loss": 0.3339, + "step": 4215 + }, + { + "epoch": 0.56, + "grad_norm": 0.73046875, + "learning_rate": 0.0001989958083043164, + "loss": 0.6241, + "step": 4216 + }, + { + "epoch": 0.56, + "grad_norm": 0.58984375, + "learning_rate": 0.00019899416147933278, + "loss": 0.5638, + "step": 4217 + }, + { + "epoch": 0.56, + "grad_norm": 0.5625, + "learning_rate": 0.0001989925133119248, + "loss": 0.4173, + "step": 4218 + }, + { + "epoch": 0.56, + "grad_norm": 0.71875, + "learning_rate": 0.00019899086380211482, + "loss": 0.2984, + "step": 4219 + }, + { + "epoch": 0.56, + "grad_norm": 0.859375, + "learning_rate": 0.00019898921294992516, + "loss": 0.5671, + "step": 4220 + }, + { + "epoch": 0.56, + "grad_norm": 0.455078125, + "learning_rate": 0.00019898756075537825, + "loss": 0.4883, + "step": 4221 + }, + { + "epoch": 0.56, + "grad_norm": 0.96484375, + "learning_rate": 0.00019898590721849653, + "loss": 0.4566, + "step": 4222 + }, + { + "epoch": 0.56, + "grad_norm": 0.61328125, + "learning_rate": 0.00019898425233930234, + "loss": 0.3553, + "step": 4223 + }, + { + "epoch": 0.56, + "grad_norm": 0.5625, + "learning_rate": 0.00019898259611781817, + "loss": 0.1995, + "step": 4224 + }, + { + "epoch": 0.56, + "grad_norm": 0.703125, + "learning_rate": 0.00019898093855406647, + "loss": 0.3478, + "step": 4225 + }, + { + "epoch": 0.56, + "grad_norm": 0.953125, + "learning_rate": 0.0001989792796480697, + "loss": 0.4272, + "step": 4226 + }, + { + "epoch": 0.56, + "grad_norm": 0.69921875, + "learning_rate": 0.0001989776193998504, + "loss": 0.5343, + "step": 4227 + }, + { + "epoch": 0.56, + "grad_norm": 0.6875, + "learning_rate": 0.00019897595780943102, + "loss": 0.4413, + "step": 4228 + }, + { + "epoch": 0.56, + "grad_norm": 0.73828125, + "learning_rate": 0.00019897429487683417, + "loss": 0.4223, + "step": 4229 + }, + { + "epoch": 0.56, + "grad_norm": 0.859375, + "learning_rate": 0.00019897263060208236, + "loss": 0.4227, + "step": 4230 + }, + { + "epoch": 0.56, + "grad_norm": 0.91015625, + "learning_rate": 0.0001989709649851981, + "loss": 0.7106, + "step": 4231 + }, + { + "epoch": 0.56, + "grad_norm": 0.87890625, + "learning_rate": 0.00019896929802620412, + "loss": 0.5492, + "step": 4232 + }, + { + "epoch": 0.56, + "grad_norm": 0.86328125, + "learning_rate": 0.00019896762972512288, + "loss": 0.4587, + "step": 4233 + }, + { + "epoch": 0.56, + "grad_norm": 0.7578125, + "learning_rate": 0.0001989659600819771, + "loss": 0.5543, + "step": 4234 + }, + { + "epoch": 0.57, + "grad_norm": 0.8984375, + "learning_rate": 0.00019896428909678938, + "loss": 0.3701, + "step": 4235 + }, + { + "epoch": 0.57, + "grad_norm": 0.55078125, + "learning_rate": 0.00019896261676958236, + "loss": 0.5191, + "step": 4236 + }, + { + "epoch": 0.57, + "grad_norm": 0.81640625, + "learning_rate": 0.00019896094310037875, + "loss": 0.4607, + "step": 4237 + }, + { + "epoch": 0.57, + "grad_norm": 0.77734375, + "learning_rate": 0.00019895926808920127, + "loss": 0.6067, + "step": 4238 + }, + { + "epoch": 0.57, + "grad_norm": 0.77734375, + "learning_rate": 0.00019895759173607256, + "loss": 0.62, + "step": 4239 + }, + { + "epoch": 0.57, + "grad_norm": 0.69921875, + "learning_rate": 0.0001989559140410154, + "loss": 0.3805, + "step": 4240 + }, + { + "epoch": 0.57, + "grad_norm": 0.7421875, + "learning_rate": 0.00019895423500405256, + "loss": 0.749, + "step": 4241 + }, + { + "epoch": 0.57, + "grad_norm": 0.68359375, + "learning_rate": 0.00019895255462520678, + "loss": 0.3448, + "step": 4242 + }, + { + "epoch": 0.57, + "grad_norm": 0.796875, + "learning_rate": 0.00019895087290450086, + "loss": 0.4509, + "step": 4243 + }, + { + "epoch": 0.57, + "grad_norm": 0.578125, + "learning_rate": 0.00019894918984195758, + "loss": 0.3926, + "step": 4244 + }, + { + "epoch": 0.57, + "grad_norm": 0.68359375, + "learning_rate": 0.00019894750543759978, + "loss": 0.4167, + "step": 4245 + }, + { + "epoch": 0.57, + "grad_norm": 0.8046875, + "learning_rate": 0.0001989458196914503, + "loss": 0.6246, + "step": 4246 + }, + { + "epoch": 0.57, + "grad_norm": 0.8359375, + "learning_rate": 0.00019894413260353205, + "loss": 0.6685, + "step": 4247 + }, + { + "epoch": 0.57, + "grad_norm": 0.6875, + "learning_rate": 0.0001989424441738678, + "loss": 0.4567, + "step": 4248 + }, + { + "epoch": 0.57, + "grad_norm": 0.59765625, + "learning_rate": 0.00019894075440248052, + "loss": 0.6702, + "step": 4249 + }, + { + "epoch": 0.57, + "grad_norm": 0.84765625, + "learning_rate": 0.00019893906328939312, + "loss": 0.5831, + "step": 4250 + }, + { + "epoch": 0.57, + "grad_norm": 0.78125, + "learning_rate": 0.00019893737083462854, + "loss": 0.409, + "step": 4251 + }, + { + "epoch": 0.57, + "grad_norm": 0.59765625, + "learning_rate": 0.00019893567703820967, + "loss": 0.3238, + "step": 4252 + }, + { + "epoch": 0.57, + "grad_norm": 0.70703125, + "learning_rate": 0.00019893398190015956, + "loss": 0.9279, + "step": 4253 + }, + { + "epoch": 0.57, + "grad_norm": 0.47265625, + "learning_rate": 0.00019893228542050114, + "loss": 0.5313, + "step": 4254 + }, + { + "epoch": 0.57, + "grad_norm": 0.6796875, + "learning_rate": 0.00019893058759925744, + "loss": 0.5033, + "step": 4255 + }, + { + "epoch": 0.57, + "grad_norm": 0.7265625, + "learning_rate": 0.00019892888843645149, + "loss": 0.5902, + "step": 4256 + }, + { + "epoch": 0.57, + "grad_norm": 0.5859375, + "learning_rate": 0.00019892718793210632, + "loss": 0.3455, + "step": 4257 + }, + { + "epoch": 0.57, + "grad_norm": 0.62890625, + "learning_rate": 0.00019892548608624498, + "loss": 0.575, + "step": 4258 + }, + { + "epoch": 0.57, + "grad_norm": 0.76171875, + "learning_rate": 0.00019892378289889054, + "loss": 0.5983, + "step": 4259 + }, + { + "epoch": 0.57, + "grad_norm": 0.71875, + "learning_rate": 0.00019892207837006614, + "loss": 0.2952, + "step": 4260 + }, + { + "epoch": 0.57, + "grad_norm": 0.64453125, + "learning_rate": 0.00019892037249979487, + "loss": 0.6357, + "step": 4261 + }, + { + "epoch": 0.57, + "grad_norm": 0.89453125, + "learning_rate": 0.00019891866528809986, + "loss": 0.4597, + "step": 4262 + }, + { + "epoch": 0.57, + "grad_norm": 0.80859375, + "learning_rate": 0.00019891695673500425, + "loss": 0.7315, + "step": 4263 + }, + { + "epoch": 0.57, + "grad_norm": 0.859375, + "learning_rate": 0.00019891524684053125, + "loss": 0.547, + "step": 4264 + }, + { + "epoch": 0.57, + "grad_norm": 1.09375, + "learning_rate": 0.000198913535604704, + "loss": 0.6485, + "step": 4265 + }, + { + "epoch": 0.57, + "grad_norm": 0.82421875, + "learning_rate": 0.0001989118230275457, + "loss": 0.3544, + "step": 4266 + }, + { + "epoch": 0.57, + "grad_norm": 0.609375, + "learning_rate": 0.00019891010910907966, + "loss": 0.5755, + "step": 4267 + }, + { + "epoch": 0.57, + "grad_norm": 0.89453125, + "learning_rate": 0.00019890839384932902, + "loss": 0.627, + "step": 4268 + }, + { + "epoch": 0.57, + "grad_norm": 0.7265625, + "learning_rate": 0.00019890667724831707, + "loss": 0.4839, + "step": 4269 + }, + { + "epoch": 0.57, + "grad_norm": 0.59765625, + "learning_rate": 0.00019890495930606714, + "loss": 0.226, + "step": 4270 + }, + { + "epoch": 0.57, + "grad_norm": 0.546875, + "learning_rate": 0.00019890324002260244, + "loss": 0.3848, + "step": 4271 + }, + { + "epoch": 0.57, + "grad_norm": 1.0234375, + "learning_rate": 0.00019890151939794634, + "loss": 0.6484, + "step": 4272 + }, + { + "epoch": 0.57, + "grad_norm": 0.69921875, + "learning_rate": 0.0001988997974321222, + "loss": 0.4623, + "step": 4273 + }, + { + "epoch": 0.57, + "grad_norm": 0.80078125, + "learning_rate": 0.0001988980741251533, + "loss": 0.5072, + "step": 4274 + }, + { + "epoch": 0.57, + "grad_norm": 0.96484375, + "learning_rate": 0.00019889634947706304, + "loss": 0.2761, + "step": 4275 + }, + { + "epoch": 0.57, + "grad_norm": 0.5546875, + "learning_rate": 0.00019889462348787482, + "loss": 0.443, + "step": 4276 + }, + { + "epoch": 0.57, + "grad_norm": 0.67578125, + "learning_rate": 0.00019889289615761203, + "loss": 0.551, + "step": 4277 + }, + { + "epoch": 0.57, + "grad_norm": 0.5859375, + "learning_rate": 0.0001988911674862981, + "loss": 0.5404, + "step": 4278 + }, + { + "epoch": 0.57, + "grad_norm": 0.6875, + "learning_rate": 0.0001988894374739565, + "loss": 0.4459, + "step": 4279 + }, + { + "epoch": 0.57, + "grad_norm": 0.6171875, + "learning_rate": 0.0001988877061206106, + "loss": 0.4794, + "step": 4280 + }, + { + "epoch": 0.57, + "grad_norm": 0.77734375, + "learning_rate": 0.000198885973426284, + "loss": 0.8579, + "step": 4281 + }, + { + "epoch": 0.57, + "grad_norm": 0.78125, + "learning_rate": 0.0001988842393910001, + "loss": 0.4363, + "step": 4282 + }, + { + "epoch": 0.57, + "grad_norm": 0.87109375, + "learning_rate": 0.00019888250401478246, + "loss": 0.5116, + "step": 4283 + }, + { + "epoch": 0.57, + "grad_norm": 0.59375, + "learning_rate": 0.0001988807672976546, + "loss": 0.4039, + "step": 4284 + }, + { + "epoch": 0.57, + "grad_norm": 0.66796875, + "learning_rate": 0.0001988790292396401, + "loss": 0.4663, + "step": 4285 + }, + { + "epoch": 0.57, + "grad_norm": 0.51171875, + "learning_rate": 0.00019887728984076247, + "loss": 0.4289, + "step": 4286 + }, + { + "epoch": 0.57, + "grad_norm": 0.74609375, + "learning_rate": 0.00019887554910104536, + "loss": 0.4378, + "step": 4287 + }, + { + "epoch": 0.57, + "grad_norm": 1.0390625, + "learning_rate": 0.0001988738070205123, + "loss": 0.4608, + "step": 4288 + }, + { + "epoch": 0.57, + "grad_norm": 0.8515625, + "learning_rate": 0.000198872063599187, + "loss": 0.6585, + "step": 4289 + }, + { + "epoch": 0.57, + "grad_norm": 0.64453125, + "learning_rate": 0.00019887031883709306, + "loss": 0.6999, + "step": 4290 + }, + { + "epoch": 0.57, + "grad_norm": 0.80859375, + "learning_rate": 0.00019886857273425411, + "loss": 0.549, + "step": 4291 + }, + { + "epoch": 0.57, + "grad_norm": 0.71484375, + "learning_rate": 0.0001988668252906939, + "loss": 0.2933, + "step": 4292 + }, + { + "epoch": 0.57, + "grad_norm": 0.91796875, + "learning_rate": 0.00019886507650643606, + "loss": 0.4339, + "step": 4293 + }, + { + "epoch": 0.57, + "grad_norm": 0.65234375, + "learning_rate": 0.00019886332638150434, + "loss": 0.5294, + "step": 4294 + }, + { + "epoch": 0.57, + "grad_norm": 0.6640625, + "learning_rate": 0.00019886157491592248, + "loss": 0.4515, + "step": 4295 + }, + { + "epoch": 0.57, + "grad_norm": 0.55859375, + "learning_rate": 0.0001988598221097142, + "loss": 0.3424, + "step": 4296 + }, + { + "epoch": 0.57, + "grad_norm": 0.6015625, + "learning_rate": 0.00019885806796290326, + "loss": 0.6224, + "step": 4297 + }, + { + "epoch": 0.57, + "grad_norm": 0.61328125, + "learning_rate": 0.0001988563124755135, + "loss": 0.535, + "step": 4298 + }, + { + "epoch": 0.57, + "grad_norm": 1.1015625, + "learning_rate": 0.0001988545556475687, + "loss": 0.5356, + "step": 4299 + }, + { + "epoch": 0.57, + "grad_norm": 0.50390625, + "learning_rate": 0.00019885279747909268, + "loss": 0.5182, + "step": 4300 + }, + { + "epoch": 0.57, + "grad_norm": 0.66796875, + "learning_rate": 0.00019885103797010926, + "loss": 0.8733, + "step": 4301 + }, + { + "epoch": 0.57, + "grad_norm": 0.416015625, + "learning_rate": 0.00019884927712064237, + "loss": 0.3948, + "step": 4302 + }, + { + "epoch": 0.57, + "grad_norm": 0.734375, + "learning_rate": 0.0001988475149307158, + "loss": 0.603, + "step": 4303 + }, + { + "epoch": 0.57, + "grad_norm": 0.6171875, + "learning_rate": 0.0001988457514003535, + "loss": 0.7474, + "step": 4304 + }, + { + "epoch": 0.57, + "grad_norm": 0.8671875, + "learning_rate": 0.00019884398652957936, + "loss": 0.3685, + "step": 4305 + }, + { + "epoch": 0.57, + "grad_norm": 0.66796875, + "learning_rate": 0.00019884222031841734, + "loss": 0.3971, + "step": 4306 + }, + { + "epoch": 0.57, + "grad_norm": 0.73828125, + "learning_rate": 0.00019884045276689135, + "loss": 0.6659, + "step": 4307 + }, + { + "epoch": 0.57, + "grad_norm": 0.875, + "learning_rate": 0.00019883868387502544, + "loss": 0.3406, + "step": 4308 + }, + { + "epoch": 0.57, + "grad_norm": 0.63671875, + "learning_rate": 0.0001988369136428435, + "loss": 0.4458, + "step": 4309 + }, + { + "epoch": 0.58, + "grad_norm": 0.9140625, + "learning_rate": 0.00019883514207036956, + "loss": 0.5014, + "step": 4310 + }, + { + "epoch": 0.58, + "grad_norm": 0.62109375, + "learning_rate": 0.0001988333691576277, + "loss": 0.4609, + "step": 4311 + }, + { + "epoch": 0.58, + "grad_norm": 0.74609375, + "learning_rate": 0.00019883159490464194, + "loss": 0.3744, + "step": 4312 + }, + { + "epoch": 0.58, + "grad_norm": 0.5546875, + "learning_rate": 0.00019882981931143627, + "loss": 0.4495, + "step": 4313 + }, + { + "epoch": 0.58, + "grad_norm": 0.78515625, + "learning_rate": 0.00019882804237803488, + "loss": 0.5652, + "step": 4314 + }, + { + "epoch": 0.58, + "grad_norm": 0.55078125, + "learning_rate": 0.00019882626410446178, + "loss": 0.4792, + "step": 4315 + }, + { + "epoch": 0.58, + "grad_norm": 0.8359375, + "learning_rate": 0.00019882448449074107, + "loss": 0.7499, + "step": 4316 + }, + { + "epoch": 0.58, + "grad_norm": 0.6171875, + "learning_rate": 0.00019882270353689698, + "loss": 0.5133, + "step": 4317 + }, + { + "epoch": 0.58, + "grad_norm": 0.73046875, + "learning_rate": 0.00019882092124295358, + "loss": 0.4671, + "step": 4318 + }, + { + "epoch": 0.58, + "grad_norm": 0.69140625, + "learning_rate": 0.00019881913760893508, + "loss": 0.424, + "step": 4319 + }, + { + "epoch": 0.58, + "grad_norm": 0.72265625, + "learning_rate": 0.00019881735263486565, + "loss": 0.6198, + "step": 4320 + }, + { + "epoch": 0.58, + "grad_norm": 0.70703125, + "learning_rate": 0.0001988155663207695, + "loss": 0.554, + "step": 4321 + }, + { + "epoch": 0.58, + "grad_norm": 0.5625, + "learning_rate": 0.00019881377866667082, + "loss": 0.4335, + "step": 4322 + }, + { + "epoch": 0.58, + "grad_norm": 0.57421875, + "learning_rate": 0.00019881198967259392, + "loss": 0.359, + "step": 4323 + }, + { + "epoch": 0.58, + "grad_norm": 0.9609375, + "learning_rate": 0.00019881019933856302, + "loss": 0.4479, + "step": 4324 + }, + { + "epoch": 0.58, + "grad_norm": 0.8515625, + "learning_rate": 0.0001988084076646024, + "loss": 0.5938, + "step": 4325 + }, + { + "epoch": 0.58, + "grad_norm": 0.78515625, + "learning_rate": 0.00019880661465073636, + "loss": 0.631, + "step": 4326 + }, + { + "epoch": 0.58, + "grad_norm": 1.0390625, + "learning_rate": 0.00019880482029698918, + "loss": 0.6324, + "step": 4327 + }, + { + "epoch": 0.58, + "grad_norm": 0.83984375, + "learning_rate": 0.00019880302460338525, + "loss": 0.6433, + "step": 4328 + }, + { + "epoch": 0.58, + "grad_norm": 0.578125, + "learning_rate": 0.0001988012275699489, + "loss": 0.4517, + "step": 4329 + }, + { + "epoch": 0.58, + "grad_norm": 0.625, + "learning_rate": 0.00019879942919670448, + "loss": 0.5957, + "step": 4330 + }, + { + "epoch": 0.58, + "grad_norm": 0.9453125, + "learning_rate": 0.0001987976294836764, + "loss": 0.4536, + "step": 4331 + }, + { + "epoch": 0.58, + "grad_norm": 0.8359375, + "learning_rate": 0.00019879582843088903, + "loss": 0.3799, + "step": 4332 + }, + { + "epoch": 0.58, + "grad_norm": 0.703125, + "learning_rate": 0.00019879402603836685, + "loss": 0.416, + "step": 4333 + }, + { + "epoch": 0.58, + "grad_norm": 1.09375, + "learning_rate": 0.00019879222230613427, + "loss": 0.3188, + "step": 4334 + }, + { + "epoch": 0.58, + "grad_norm": 0.671875, + "learning_rate": 0.00019879041723421575, + "loss": 0.4511, + "step": 4335 + }, + { + "epoch": 0.58, + "grad_norm": 0.94140625, + "learning_rate": 0.00019878861082263575, + "loss": 0.5969, + "step": 4336 + }, + { + "epoch": 0.58, + "grad_norm": 0.57421875, + "learning_rate": 0.0001987868030714188, + "loss": 0.1848, + "step": 4337 + }, + { + "epoch": 0.58, + "grad_norm": 0.5859375, + "learning_rate": 0.0001987849939805894, + "loss": 0.5642, + "step": 4338 + }, + { + "epoch": 0.58, + "grad_norm": 0.76171875, + "learning_rate": 0.0001987831835501721, + "loss": 0.7171, + "step": 4339 + }, + { + "epoch": 0.58, + "grad_norm": 0.7109375, + "learning_rate": 0.0001987813717801914, + "loss": 0.4449, + "step": 4340 + }, + { + "epoch": 0.58, + "grad_norm": 0.890625, + "learning_rate": 0.0001987795586706719, + "loss": 0.3366, + "step": 4341 + }, + { + "epoch": 0.58, + "grad_norm": 0.5, + "learning_rate": 0.00019877774422163823, + "loss": 0.6345, + "step": 4342 + }, + { + "epoch": 0.58, + "grad_norm": 0.7109375, + "learning_rate": 0.0001987759284331149, + "loss": 0.6362, + "step": 4343 + }, + { + "epoch": 0.58, + "grad_norm": 0.65625, + "learning_rate": 0.00019877411130512663, + "loss": 0.8034, + "step": 4344 + }, + { + "epoch": 0.58, + "grad_norm": 0.71875, + "learning_rate": 0.000198772292837698, + "loss": 0.6401, + "step": 4345 + }, + { + "epoch": 0.58, + "grad_norm": 0.9453125, + "learning_rate": 0.0001987704730308537, + "loss": 0.5888, + "step": 4346 + }, + { + "epoch": 0.58, + "grad_norm": 0.75390625, + "learning_rate": 0.00019876865188461835, + "loss": 0.5589, + "step": 4347 + }, + { + "epoch": 0.58, + "grad_norm": 0.68359375, + "learning_rate": 0.0001987668293990167, + "loss": 0.631, + "step": 4348 + }, + { + "epoch": 0.58, + "grad_norm": 0.68359375, + "learning_rate": 0.0001987650055740735, + "loss": 0.5466, + "step": 4349 + }, + { + "epoch": 0.58, + "grad_norm": 0.75, + "learning_rate": 0.00019876318040981344, + "loss": 0.5824, + "step": 4350 + }, + { + "epoch": 0.58, + "grad_norm": 0.71484375, + "learning_rate": 0.00019876135390626122, + "loss": 0.5788, + "step": 4351 + }, + { + "epoch": 0.58, + "grad_norm": 0.83984375, + "learning_rate": 0.0001987595260634417, + "loss": 0.8124, + "step": 4352 + }, + { + "epoch": 0.58, + "grad_norm": 0.71875, + "learning_rate": 0.0001987576968813796, + "loss": 0.6207, + "step": 4353 + }, + { + "epoch": 0.58, + "grad_norm": 0.66796875, + "learning_rate": 0.00019875586636009974, + "loss": 0.433, + "step": 4354 + }, + { + "epoch": 0.58, + "grad_norm": 0.69140625, + "learning_rate": 0.00019875403449962697, + "loss": 0.357, + "step": 4355 + }, + { + "epoch": 0.58, + "grad_norm": 0.83984375, + "learning_rate": 0.0001987522012999861, + "loss": 0.6539, + "step": 4356 + }, + { + "epoch": 0.58, + "grad_norm": 0.54296875, + "learning_rate": 0.00019875036676120204, + "loss": 0.6948, + "step": 4357 + }, + { + "epoch": 0.58, + "grad_norm": 0.765625, + "learning_rate": 0.00019874853088329956, + "loss": 0.2764, + "step": 4358 + }, + { + "epoch": 0.58, + "grad_norm": 0.91796875, + "learning_rate": 0.00019874669366630365, + "loss": 0.5167, + "step": 4359 + }, + { + "epoch": 0.58, + "grad_norm": 0.66796875, + "learning_rate": 0.00019874485511023923, + "loss": 0.5701, + "step": 4360 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 0.0001987430152151312, + "loss": 0.3767, + "step": 4361 + }, + { + "epoch": 0.58, + "grad_norm": 0.734375, + "learning_rate": 0.00019874117398100446, + "loss": 0.4075, + "step": 4362 + }, + { + "epoch": 0.58, + "grad_norm": 0.77734375, + "learning_rate": 0.00019873933140788407, + "loss": 0.3234, + "step": 4363 + }, + { + "epoch": 0.58, + "grad_norm": 0.8828125, + "learning_rate": 0.00019873748749579497, + "loss": 0.8405, + "step": 4364 + }, + { + "epoch": 0.58, + "grad_norm": 0.53515625, + "learning_rate": 0.00019873564224476216, + "loss": 0.4023, + "step": 4365 + }, + { + "epoch": 0.58, + "grad_norm": 0.49609375, + "learning_rate": 0.00019873379565481068, + "loss": 0.5546, + "step": 4366 + }, + { + "epoch": 0.58, + "grad_norm": 0.82421875, + "learning_rate": 0.00019873194772596558, + "loss": 0.7257, + "step": 4367 + }, + { + "epoch": 0.58, + "grad_norm": 0.68359375, + "learning_rate": 0.00019873009845825185, + "loss": 0.6335, + "step": 4368 + }, + { + "epoch": 0.58, + "grad_norm": 0.70703125, + "learning_rate": 0.00019872824785169467, + "loss": 0.539, + "step": 4369 + }, + { + "epoch": 0.58, + "grad_norm": 0.64453125, + "learning_rate": 0.00019872639590631908, + "loss": 0.3491, + "step": 4370 + }, + { + "epoch": 0.58, + "grad_norm": 1.03125, + "learning_rate": 0.00019872454262215018, + "loss": 0.4682, + "step": 4371 + }, + { + "epoch": 0.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00019872268799921315, + "loss": 0.6131, + "step": 4372 + }, + { + "epoch": 0.58, + "grad_norm": 0.9296875, + "learning_rate": 0.00019872083203753308, + "loss": 0.5128, + "step": 4373 + }, + { + "epoch": 0.58, + "grad_norm": 1.125, + "learning_rate": 0.00019871897473713518, + "loss": 0.6883, + "step": 4374 + }, + { + "epoch": 0.58, + "grad_norm": 0.431640625, + "learning_rate": 0.00019871711609804465, + "loss": 0.3681, + "step": 4375 + }, + { + "epoch": 0.58, + "grad_norm": 0.71875, + "learning_rate": 0.0001987152561202866, + "loss": 0.673, + "step": 4376 + }, + { + "epoch": 0.58, + "grad_norm": 0.63671875, + "learning_rate": 0.0001987133948038864, + "loss": 0.3378, + "step": 4377 + }, + { + "epoch": 0.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00019871153214886918, + "loss": 0.6758, + "step": 4378 + }, + { + "epoch": 0.58, + "grad_norm": 0.6640625, + "learning_rate": 0.00019870966815526022, + "loss": 0.3511, + "step": 4379 + }, + { + "epoch": 0.58, + "grad_norm": 0.7109375, + "learning_rate": 0.00019870780282308482, + "loss": 0.3737, + "step": 4380 + }, + { + "epoch": 0.58, + "grad_norm": 0.671875, + "learning_rate": 0.00019870593615236827, + "loss": 0.7215, + "step": 4381 + }, + { + "epoch": 0.58, + "grad_norm": 0.8203125, + "learning_rate": 0.0001987040681431359, + "loss": 0.5248, + "step": 4382 + }, + { + "epoch": 0.58, + "grad_norm": 0.5859375, + "learning_rate": 0.00019870219879541297, + "loss": 0.4818, + "step": 4383 + }, + { + "epoch": 0.59, + "grad_norm": 0.82421875, + "learning_rate": 0.00019870032810922488, + "loss": 0.4739, + "step": 4384 + }, + { + "epoch": 0.59, + "grad_norm": 0.73828125, + "learning_rate": 0.00019869845608459704, + "loss": 0.4284, + "step": 4385 + }, + { + "epoch": 0.59, + "grad_norm": 0.8515625, + "learning_rate": 0.00019869658272155474, + "loss": 0.6248, + "step": 4386 + }, + { + "epoch": 0.59, + "grad_norm": 0.61328125, + "learning_rate": 0.00019869470802012348, + "loss": 0.5994, + "step": 4387 + }, + { + "epoch": 0.59, + "grad_norm": 0.80078125, + "learning_rate": 0.00019869283198032859, + "loss": 0.535, + "step": 4388 + }, + { + "epoch": 0.59, + "grad_norm": 0.86328125, + "learning_rate": 0.00019869095460219561, + "loss": 0.5117, + "step": 4389 + }, + { + "epoch": 0.59, + "grad_norm": 1.0234375, + "learning_rate": 0.0001986890758857499, + "loss": 0.2835, + "step": 4390 + }, + { + "epoch": 0.59, + "grad_norm": 0.8203125, + "learning_rate": 0.00019868719583101703, + "loss": 0.4808, + "step": 4391 + }, + { + "epoch": 0.59, + "grad_norm": 0.8125, + "learning_rate": 0.0001986853144380224, + "loss": 0.3252, + "step": 4392 + }, + { + "epoch": 0.59, + "grad_norm": 0.58203125, + "learning_rate": 0.0001986834317067916, + "loss": 0.272, + "step": 4393 + }, + { + "epoch": 0.59, + "grad_norm": 0.8359375, + "learning_rate": 0.00019868154763735013, + "loss": 0.6768, + "step": 4394 + }, + { + "epoch": 0.59, + "grad_norm": 0.9609375, + "learning_rate": 0.0001986796622297235, + "loss": 0.7779, + "step": 4395 + }, + { + "epoch": 0.59, + "grad_norm": 0.62890625, + "learning_rate": 0.00019867777548393737, + "loss": 0.4744, + "step": 4396 + }, + { + "epoch": 0.59, + "grad_norm": 0.8125, + "learning_rate": 0.00019867588740001726, + "loss": 0.4971, + "step": 4397 + }, + { + "epoch": 0.59, + "grad_norm": 1.1484375, + "learning_rate": 0.00019867399797798878, + "loss": 0.4944, + "step": 4398 + }, + { + "epoch": 0.59, + "grad_norm": 0.81640625, + "learning_rate": 0.00019867210721787756, + "loss": 0.5047, + "step": 4399 + }, + { + "epoch": 0.59, + "grad_norm": 0.70703125, + "learning_rate": 0.00019867021511970926, + "loss": 0.3917, + "step": 4400 + }, + { + "epoch": 0.59, + "grad_norm": 0.71875, + "learning_rate": 0.00019866832168350946, + "loss": 0.5438, + "step": 4401 + }, + { + "epoch": 0.59, + "grad_norm": 0.75, + "learning_rate": 0.00019866642690930395, + "loss": 0.4952, + "step": 4402 + }, + { + "epoch": 0.59, + "grad_norm": 0.9765625, + "learning_rate": 0.00019866453079711834, + "loss": 0.7347, + "step": 4403 + }, + { + "epoch": 0.59, + "grad_norm": 0.73828125, + "learning_rate": 0.00019866263334697835, + "loss": 0.5725, + "step": 4404 + }, + { + "epoch": 0.59, + "grad_norm": 0.69140625, + "learning_rate": 0.0001986607345589098, + "loss": 0.5777, + "step": 4405 + }, + { + "epoch": 0.59, + "grad_norm": 0.66796875, + "learning_rate": 0.00019865883443293832, + "loss": 0.6705, + "step": 4406 + }, + { + "epoch": 0.59, + "grad_norm": 0.7265625, + "learning_rate": 0.00019865693296908972, + "loss": 0.4524, + "step": 4407 + }, + { + "epoch": 0.59, + "grad_norm": 0.58984375, + "learning_rate": 0.00019865503016738983, + "loss": 0.2943, + "step": 4408 + }, + { + "epoch": 0.59, + "grad_norm": 0.5, + "learning_rate": 0.00019865312602786436, + "loss": 0.4632, + "step": 4409 + }, + { + "epoch": 0.59, + "grad_norm": 0.71484375, + "learning_rate": 0.00019865122055053922, + "loss": 0.677, + "step": 4410 + }, + { + "epoch": 0.59, + "grad_norm": 0.68359375, + "learning_rate": 0.0001986493137354402, + "loss": 0.9216, + "step": 4411 + }, + { + "epoch": 0.59, + "grad_norm": 0.74609375, + "learning_rate": 0.00019864740558259318, + "loss": 0.5743, + "step": 4412 + }, + { + "epoch": 0.59, + "grad_norm": 0.6953125, + "learning_rate": 0.00019864549609202405, + "loss": 0.7016, + "step": 4413 + }, + { + "epoch": 0.59, + "grad_norm": 0.84765625, + "learning_rate": 0.00019864358526375866, + "loss": 0.5028, + "step": 4414 + }, + { + "epoch": 0.59, + "grad_norm": 0.91796875, + "learning_rate": 0.00019864167309782295, + "loss": 0.468, + "step": 4415 + }, + { + "epoch": 0.59, + "grad_norm": 0.92578125, + "learning_rate": 0.00019863975959424286, + "loss": 0.3806, + "step": 4416 + }, + { + "epoch": 0.59, + "grad_norm": 0.69921875, + "learning_rate": 0.0001986378447530443, + "loss": 0.5783, + "step": 4417 + }, + { + "epoch": 0.59, + "grad_norm": 0.578125, + "learning_rate": 0.00019863592857425325, + "loss": 0.409, + "step": 4418 + }, + { + "epoch": 0.59, + "grad_norm": 0.8125, + "learning_rate": 0.00019863401105789575, + "loss": 0.4565, + "step": 4419 + }, + { + "epoch": 0.59, + "grad_norm": 0.66015625, + "learning_rate": 0.0001986320922039977, + "loss": 0.5062, + "step": 4420 + }, + { + "epoch": 0.59, + "grad_norm": 0.65234375, + "learning_rate": 0.00019863017201258523, + "loss": 0.4095, + "step": 4421 + }, + { + "epoch": 0.59, + "grad_norm": 0.578125, + "learning_rate": 0.0001986282504836843, + "loss": 0.4803, + "step": 4422 + }, + { + "epoch": 0.59, + "grad_norm": 0.81640625, + "learning_rate": 0.00019862632761732097, + "loss": 0.5085, + "step": 4423 + }, + { + "epoch": 0.59, + "grad_norm": 0.640625, + "learning_rate": 0.0001986244034135214, + "loss": 0.8822, + "step": 4424 + }, + { + "epoch": 0.59, + "grad_norm": 0.70703125, + "learning_rate": 0.0001986224778723116, + "loss": 0.6469, + "step": 4425 + }, + { + "epoch": 0.59, + "grad_norm": 0.875, + "learning_rate": 0.00019862055099371769, + "loss": 0.769, + "step": 4426 + }, + { + "epoch": 0.59, + "grad_norm": 0.55078125, + "learning_rate": 0.00019861862277776581, + "loss": 0.6308, + "step": 4427 + }, + { + "epoch": 0.59, + "grad_norm": 0.7109375, + "learning_rate": 0.00019861669322448214, + "loss": 0.5693, + "step": 4428 + }, + { + "epoch": 0.59, + "grad_norm": 0.62890625, + "learning_rate": 0.0001986147623338928, + "loss": 0.5941, + "step": 4429 + }, + { + "epoch": 0.59, + "grad_norm": 0.5859375, + "learning_rate": 0.000198612830106024, + "loss": 0.232, + "step": 4430 + }, + { + "epoch": 0.59, + "grad_norm": 0.78515625, + "learning_rate": 0.0001986108965409019, + "loss": 0.7801, + "step": 4431 + }, + { + "epoch": 0.59, + "grad_norm": 0.89453125, + "learning_rate": 0.00019860896163855277, + "loss": 0.3958, + "step": 4432 + }, + { + "epoch": 0.59, + "grad_norm": 0.828125, + "learning_rate": 0.00019860702539900287, + "loss": 0.5189, + "step": 4433 + }, + { + "epoch": 0.59, + "grad_norm": 0.68359375, + "learning_rate": 0.0001986050878222784, + "loss": 0.5731, + "step": 4434 + }, + { + "epoch": 0.59, + "grad_norm": 0.8125, + "learning_rate": 0.00019860314890840562, + "loss": 0.4626, + "step": 4435 + }, + { + "epoch": 0.59, + "grad_norm": 0.61328125, + "learning_rate": 0.0001986012086574109, + "loss": 0.5826, + "step": 4436 + }, + { + "epoch": 0.59, + "grad_norm": 0.80859375, + "learning_rate": 0.0001985992670693205, + "loss": 0.533, + "step": 4437 + }, + { + "epoch": 0.59, + "grad_norm": 0.6875, + "learning_rate": 0.00019859732414416077, + "loss": 0.4957, + "step": 4438 + }, + { + "epoch": 0.59, + "grad_norm": 0.60546875, + "learning_rate": 0.000198595379881958, + "loss": 0.5933, + "step": 4439 + }, + { + "epoch": 0.59, + "grad_norm": 0.7109375, + "learning_rate": 0.00019859343428273865, + "loss": 0.4743, + "step": 4440 + }, + { + "epoch": 0.59, + "grad_norm": 0.3984375, + "learning_rate": 0.00019859148734652904, + "loss": 0.1779, + "step": 4441 + }, + { + "epoch": 0.59, + "grad_norm": 1.078125, + "learning_rate": 0.00019858953907335558, + "loss": 0.8402, + "step": 4442 + }, + { + "epoch": 0.59, + "grad_norm": 0.875, + "learning_rate": 0.0001985875894632447, + "loss": 0.4826, + "step": 4443 + }, + { + "epoch": 0.59, + "grad_norm": 0.5703125, + "learning_rate": 0.00019858563851622282, + "loss": 0.4709, + "step": 4444 + }, + { + "epoch": 0.59, + "grad_norm": 0.50390625, + "learning_rate": 0.00019858368623231642, + "loss": 0.6036, + "step": 4445 + }, + { + "epoch": 0.59, + "grad_norm": 0.76953125, + "learning_rate": 0.00019858173261155196, + "loss": 0.5986, + "step": 4446 + }, + { + "epoch": 0.59, + "grad_norm": 0.77734375, + "learning_rate": 0.00019857977765395596, + "loss": 0.5794, + "step": 4447 + }, + { + "epoch": 0.59, + "grad_norm": 0.5859375, + "learning_rate": 0.00019857782135955488, + "loss": 0.2584, + "step": 4448 + }, + { + "epoch": 0.59, + "grad_norm": 0.9921875, + "learning_rate": 0.00019857586372837527, + "loss": 0.5636, + "step": 4449 + }, + { + "epoch": 0.59, + "grad_norm": 0.77734375, + "learning_rate": 0.00019857390476044371, + "loss": 0.7561, + "step": 4450 + }, + { + "epoch": 0.59, + "grad_norm": 0.77734375, + "learning_rate": 0.0001985719444557867, + "loss": 0.6029, + "step": 4451 + }, + { + "epoch": 0.59, + "grad_norm": 0.73046875, + "learning_rate": 0.00019856998281443092, + "loss": 0.5934, + "step": 4452 + }, + { + "epoch": 0.59, + "grad_norm": 0.59375, + "learning_rate": 0.00019856801983640285, + "loss": 0.6444, + "step": 4453 + }, + { + "epoch": 0.59, + "grad_norm": 0.95703125, + "learning_rate": 0.0001985660555217292, + "loss": 0.5923, + "step": 4454 + }, + { + "epoch": 0.59, + "grad_norm": 0.55078125, + "learning_rate": 0.00019856408987043656, + "loss": 0.3376, + "step": 4455 + }, + { + "epoch": 0.59, + "grad_norm": 0.92578125, + "learning_rate": 0.00019856212288255163, + "loss": 0.5593, + "step": 4456 + }, + { + "epoch": 0.59, + "grad_norm": 1.0078125, + "learning_rate": 0.00019856015455810104, + "loss": 0.5927, + "step": 4457 + }, + { + "epoch": 0.59, + "grad_norm": 0.7421875, + "learning_rate": 0.0001985581848971115, + "loss": 0.6113, + "step": 4458 + }, + { + "epoch": 0.6, + "grad_norm": 1.0390625, + "learning_rate": 0.0001985562138996097, + "loss": 0.6074, + "step": 4459 + }, + { + "epoch": 0.6, + "grad_norm": 0.73828125, + "learning_rate": 0.00019855424156562244, + "loss": 0.6797, + "step": 4460 + }, + { + "epoch": 0.6, + "grad_norm": 0.59765625, + "learning_rate": 0.00019855226789517637, + "loss": 0.5009, + "step": 4461 + }, + { + "epoch": 0.6, + "grad_norm": 0.828125, + "learning_rate": 0.0001985502928882983, + "loss": 0.7943, + "step": 4462 + }, + { + "epoch": 0.6, + "grad_norm": 0.671875, + "learning_rate": 0.00019854831654501502, + "loss": 0.6016, + "step": 4463 + }, + { + "epoch": 0.6, + "grad_norm": 0.6328125, + "learning_rate": 0.00019854633886535333, + "loss": 0.3706, + "step": 4464 + }, + { + "epoch": 0.6, + "grad_norm": 0.94140625, + "learning_rate": 0.00019854435984934, + "loss": 0.5907, + "step": 4465 + }, + { + "epoch": 0.6, + "grad_norm": 0.7109375, + "learning_rate": 0.00019854237949700195, + "loss": 0.3877, + "step": 4466 + }, + { + "epoch": 0.6, + "grad_norm": 0.71484375, + "learning_rate": 0.00019854039780836598, + "loss": 0.5034, + "step": 4467 + }, + { + "epoch": 0.6, + "grad_norm": 0.5078125, + "learning_rate": 0.00019853841478345894, + "loss": 0.589, + "step": 4468 + }, + { + "epoch": 0.6, + "grad_norm": 0.494140625, + "learning_rate": 0.0001985364304223078, + "loss": 0.4831, + "step": 4469 + }, + { + "epoch": 0.6, + "grad_norm": 0.53125, + "learning_rate": 0.00019853444472493938, + "loss": 0.3315, + "step": 4470 + }, + { + "epoch": 0.6, + "grad_norm": 0.8984375, + "learning_rate": 0.00019853245769138066, + "loss": 0.4975, + "step": 4471 + }, + { + "epoch": 0.6, + "grad_norm": 0.77734375, + "learning_rate": 0.0001985304693216586, + "loss": 0.4869, + "step": 4472 + }, + { + "epoch": 0.6, + "grad_norm": 0.59765625, + "learning_rate": 0.00019852847961580013, + "loss": 0.5071, + "step": 4473 + }, + { + "epoch": 0.6, + "grad_norm": 0.87109375, + "learning_rate": 0.00019852648857383222, + "loss": 0.6263, + "step": 4474 + }, + { + "epoch": 0.6, + "grad_norm": 0.7890625, + "learning_rate": 0.00019852449619578192, + "loss": 0.4977, + "step": 4475 + }, + { + "epoch": 0.6, + "grad_norm": 0.62890625, + "learning_rate": 0.0001985225024816762, + "loss": 0.3103, + "step": 4476 + }, + { + "epoch": 0.6, + "grad_norm": 0.734375, + "learning_rate": 0.00019852050743154211, + "loss": 0.8756, + "step": 4477 + }, + { + "epoch": 0.6, + "grad_norm": 0.66015625, + "learning_rate": 0.00019851851104540674, + "loss": 0.7091, + "step": 4478 + }, + { + "epoch": 0.6, + "grad_norm": 0.68359375, + "learning_rate": 0.0001985165133232971, + "loss": 0.5593, + "step": 4479 + }, + { + "epoch": 0.6, + "grad_norm": 0.671875, + "learning_rate": 0.00019851451426524031, + "loss": 0.5209, + "step": 4480 + }, + { + "epoch": 0.6, + "grad_norm": 0.87109375, + "learning_rate": 0.00019851251387126352, + "loss": 0.4998, + "step": 4481 + }, + { + "epoch": 0.6, + "grad_norm": 1.03125, + "learning_rate": 0.0001985105121413938, + "loss": 0.3407, + "step": 4482 + }, + { + "epoch": 0.6, + "grad_norm": 0.64453125, + "learning_rate": 0.0001985085090756583, + "loss": 0.5047, + "step": 4483 + }, + { + "epoch": 0.6, + "grad_norm": 0.6015625, + "learning_rate": 0.0001985065046740842, + "loss": 0.3741, + "step": 4484 + }, + { + "epoch": 0.6, + "grad_norm": 0.87109375, + "learning_rate": 0.00019850449893669863, + "loss": 0.4111, + "step": 4485 + }, + { + "epoch": 0.6, + "grad_norm": 0.55078125, + "learning_rate": 0.0001985024918635289, + "loss": 0.6062, + "step": 4486 + }, + { + "epoch": 0.6, + "grad_norm": 0.5546875, + "learning_rate": 0.00019850048345460214, + "loss": 0.7024, + "step": 4487 + }, + { + "epoch": 0.6, + "grad_norm": 0.703125, + "learning_rate": 0.00019849847370994561, + "loss": 0.5385, + "step": 4488 + }, + { + "epoch": 0.6, + "grad_norm": 0.640625, + "learning_rate": 0.00019849646262958657, + "loss": 0.5367, + "step": 4489 + }, + { + "epoch": 0.6, + "grad_norm": 0.53515625, + "learning_rate": 0.00019849445021355227, + "loss": 0.4427, + "step": 4490 + }, + { + "epoch": 0.6, + "grad_norm": 0.69140625, + "learning_rate": 0.00019849243646187002, + "loss": 0.5273, + "step": 4491 + }, + { + "epoch": 0.6, + "grad_norm": 0.431640625, + "learning_rate": 0.00019849042137456708, + "loss": 0.3789, + "step": 4492 + }, + { + "epoch": 0.6, + "grad_norm": 0.7109375, + "learning_rate": 0.00019848840495167086, + "loss": 0.7845, + "step": 4493 + }, + { + "epoch": 0.6, + "grad_norm": 0.87890625, + "learning_rate": 0.00019848638719320864, + "loss": 0.6638, + "step": 4494 + }, + { + "epoch": 0.6, + "grad_norm": 0.484375, + "learning_rate": 0.00019848436809920782, + "loss": 0.5282, + "step": 4495 + }, + { + "epoch": 0.6, + "grad_norm": 0.80859375, + "learning_rate": 0.00019848234766969575, + "loss": 0.4025, + "step": 4496 + }, + { + "epoch": 0.6, + "grad_norm": 0.55859375, + "learning_rate": 0.00019848032590469986, + "loss": 0.367, + "step": 4497 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390625, + "learning_rate": 0.00019847830280424748, + "loss": 0.4201, + "step": 4498 + }, + { + "epoch": 0.6, + "grad_norm": 0.7421875, + "learning_rate": 0.00019847627836836617, + "loss": 0.5373, + "step": 4499 + }, + { + "epoch": 0.6, + "grad_norm": 0.78125, + "learning_rate": 0.00019847425259708333, + "loss": 0.4117, + "step": 4500 + }, + { + "epoch": 0.6, + "grad_norm": 0.6796875, + "learning_rate": 0.0001984722254904264, + "loss": 0.5728, + "step": 4501 + }, + { + "epoch": 0.6, + "grad_norm": 0.54296875, + "learning_rate": 0.0001984701970484229, + "loss": 0.4296, + "step": 4502 + }, + { + "epoch": 0.6, + "grad_norm": 0.470703125, + "learning_rate": 0.0001984681672711003, + "loss": 0.5164, + "step": 4503 + }, + { + "epoch": 0.6, + "grad_norm": 0.6953125, + "learning_rate": 0.0001984661361584862, + "loss": 0.4734, + "step": 4504 + }, + { + "epoch": 0.6, + "grad_norm": 0.7578125, + "learning_rate": 0.00019846410371060808, + "loss": 0.577, + "step": 4505 + }, + { + "epoch": 0.6, + "grad_norm": 0.66796875, + "learning_rate": 0.00019846206992749354, + "loss": 0.5218, + "step": 4506 + }, + { + "epoch": 0.6, + "grad_norm": 0.8046875, + "learning_rate": 0.0001984600348091701, + "loss": 0.5292, + "step": 4507 + }, + { + "epoch": 0.6, + "grad_norm": 1.2421875, + "learning_rate": 0.00019845799835566542, + "loss": 0.6063, + "step": 4508 + }, + { + "epoch": 0.6, + "grad_norm": 0.828125, + "learning_rate": 0.00019845596056700712, + "loss": 0.5252, + "step": 4509 + }, + { + "epoch": 0.6, + "grad_norm": 0.703125, + "learning_rate": 0.00019845392144322274, + "loss": 0.3251, + "step": 4510 + }, + { + "epoch": 0.6, + "grad_norm": 0.75390625, + "learning_rate": 0.00019845188098434005, + "loss": 0.312, + "step": 4511 + }, + { + "epoch": 0.6, + "grad_norm": 0.57421875, + "learning_rate": 0.00019844983919038668, + "loss": 0.5972, + "step": 4512 + }, + { + "epoch": 0.6, + "grad_norm": 0.50390625, + "learning_rate": 0.00019844779606139023, + "loss": 0.3446, + "step": 4513 + }, + { + "epoch": 0.6, + "grad_norm": 0.73046875, + "learning_rate": 0.00019844575159737855, + "loss": 0.6199, + "step": 4514 + }, + { + "epoch": 0.6, + "grad_norm": 0.8828125, + "learning_rate": 0.00019844370579837927, + "loss": 0.6162, + "step": 4515 + }, + { + "epoch": 0.6, + "grad_norm": 0.58984375, + "learning_rate": 0.00019844165866442017, + "loss": 0.5995, + "step": 4516 + }, + { + "epoch": 0.6, + "grad_norm": 0.78125, + "learning_rate": 0.000198439610195529, + "loss": 0.6139, + "step": 4517 + }, + { + "epoch": 0.6, + "grad_norm": 0.734375, + "learning_rate": 0.00019843756039173354, + "loss": 0.5931, + "step": 4518 + }, + { + "epoch": 0.6, + "grad_norm": 0.91796875, + "learning_rate": 0.00019843550925306155, + "loss": 0.5942, + "step": 4519 + }, + { + "epoch": 0.6, + "grad_norm": 0.90625, + "learning_rate": 0.00019843345677954092, + "loss": 0.546, + "step": 4520 + }, + { + "epoch": 0.6, + "grad_norm": 0.5078125, + "learning_rate": 0.00019843140297119943, + "loss": 0.388, + "step": 4521 + }, + { + "epoch": 0.6, + "grad_norm": 0.74609375, + "learning_rate": 0.00019842934782806496, + "loss": 0.3958, + "step": 4522 + }, + { + "epoch": 0.6, + "grad_norm": 0.61328125, + "learning_rate": 0.00019842729135016534, + "loss": 0.6046, + "step": 4523 + }, + { + "epoch": 0.6, + "grad_norm": 0.609375, + "learning_rate": 0.00019842523353752847, + "loss": 0.2977, + "step": 4524 + }, + { + "epoch": 0.6, + "grad_norm": 0.51953125, + "learning_rate": 0.0001984231743901823, + "loss": 0.4074, + "step": 4525 + }, + { + "epoch": 0.6, + "grad_norm": 0.62109375, + "learning_rate": 0.00019842111390815467, + "loss": 0.5027, + "step": 4526 + }, + { + "epoch": 0.6, + "grad_norm": 0.671875, + "learning_rate": 0.00019841905209147358, + "loss": 0.566, + "step": 4527 + }, + { + "epoch": 0.6, + "grad_norm": 0.8125, + "learning_rate": 0.00019841698894016699, + "loss": 0.4276, + "step": 4528 + }, + { + "epoch": 0.6, + "grad_norm": 0.76953125, + "learning_rate": 0.0001984149244542629, + "loss": 0.4538, + "step": 4529 + }, + { + "epoch": 0.6, + "grad_norm": 0.8046875, + "learning_rate": 0.00019841285863378923, + "loss": 0.387, + "step": 4530 + }, + { + "epoch": 0.6, + "grad_norm": 0.76171875, + "learning_rate": 0.000198410791478774, + "loss": 0.3276, + "step": 4531 + }, + { + "epoch": 0.6, + "grad_norm": 0.67578125, + "learning_rate": 0.00019840872298924536, + "loss": 0.5635, + "step": 4532 + }, + { + "epoch": 0.6, + "grad_norm": 0.5625, + "learning_rate": 0.00019840665316523125, + "loss": 0.568, + "step": 4533 + }, + { + "epoch": 0.61, + "grad_norm": 0.578125, + "learning_rate": 0.00019840458200675974, + "loss": 0.3528, + "step": 4534 + }, + { + "epoch": 0.61, + "grad_norm": 1.0546875, + "learning_rate": 0.00019840250951385895, + "loss": 0.4277, + "step": 4535 + }, + { + "epoch": 0.61, + "grad_norm": 0.73828125, + "learning_rate": 0.000198400435686557, + "loss": 0.4597, + "step": 4536 + }, + { + "epoch": 0.61, + "grad_norm": 0.61328125, + "learning_rate": 0.000198398360524882, + "loss": 0.5756, + "step": 4537 + }, + { + "epoch": 0.61, + "grad_norm": 0.9296875, + "learning_rate": 0.00019839628402886202, + "loss": 0.623, + "step": 4538 + }, + { + "epoch": 0.61, + "grad_norm": 0.6875, + "learning_rate": 0.00019839420619852533, + "loss": 0.4404, + "step": 4539 + }, + { + "epoch": 0.61, + "grad_norm": 0.6328125, + "learning_rate": 0.0001983921270339, + "loss": 0.5257, + "step": 4540 + }, + { + "epoch": 0.61, + "grad_norm": 0.66015625, + "learning_rate": 0.00019839004653501437, + "loss": 0.4917, + "step": 4541 + }, + { + "epoch": 0.61, + "grad_norm": 0.66015625, + "learning_rate": 0.0001983879647018965, + "loss": 0.3171, + "step": 4542 + }, + { + "epoch": 0.61, + "grad_norm": 0.6015625, + "learning_rate": 0.0001983858815345747, + "loss": 0.5075, + "step": 4543 + }, + { + "epoch": 0.61, + "grad_norm": 0.703125, + "learning_rate": 0.00019838379703307718, + "loss": 0.5738, + "step": 4544 + }, + { + "epoch": 0.61, + "grad_norm": 1.0, + "learning_rate": 0.00019838171119743224, + "loss": 0.6287, + "step": 4545 + }, + { + "epoch": 0.61, + "grad_norm": 0.703125, + "learning_rate": 0.00019837962402766815, + "loss": 0.6886, + "step": 4546 + }, + { + "epoch": 0.61, + "grad_norm": 1.09375, + "learning_rate": 0.00019837753552381324, + "loss": 0.5461, + "step": 4547 + }, + { + "epoch": 0.61, + "grad_norm": 0.8984375, + "learning_rate": 0.0001983754456858958, + "loss": 0.3996, + "step": 4548 + }, + { + "epoch": 0.61, + "grad_norm": 0.82421875, + "learning_rate": 0.00019837335451394422, + "loss": 0.4477, + "step": 4549 + }, + { + "epoch": 0.61, + "grad_norm": 0.66015625, + "learning_rate": 0.00019837126200798676, + "loss": 0.577, + "step": 4550 + }, + { + "epoch": 0.61, + "grad_norm": 0.8125, + "learning_rate": 0.00019836916816805186, + "loss": 0.6174, + "step": 4551 + }, + { + "epoch": 0.61, + "grad_norm": 0.59765625, + "learning_rate": 0.0001983670729941679, + "loss": 0.6742, + "step": 4552 + }, + { + "epoch": 0.61, + "grad_norm": 0.5703125, + "learning_rate": 0.00019836497648636334, + "loss": 0.3865, + "step": 4553 + }, + { + "epoch": 0.61, + "grad_norm": 0.53125, + "learning_rate": 0.00019836287864466657, + "loss": 0.566, + "step": 4554 + }, + { + "epoch": 0.61, + "grad_norm": 0.61328125, + "learning_rate": 0.00019836077946910598, + "loss": 0.692, + "step": 4555 + }, + { + "epoch": 0.61, + "grad_norm": 0.51953125, + "learning_rate": 0.00019835867895971014, + "loss": 0.4669, + "step": 4556 + }, + { + "epoch": 0.61, + "grad_norm": 0.7109375, + "learning_rate": 0.00019835657711650745, + "loss": 0.7417, + "step": 4557 + }, + { + "epoch": 0.61, + "grad_norm": 0.6640625, + "learning_rate": 0.0001983544739395265, + "loss": 0.5802, + "step": 4558 + }, + { + "epoch": 0.61, + "grad_norm": 0.83203125, + "learning_rate": 0.00019835236942879572, + "loss": 0.6438, + "step": 4559 + }, + { + "epoch": 0.61, + "grad_norm": 1.109375, + "learning_rate": 0.0001983502635843437, + "loss": 0.3893, + "step": 4560 + }, + { + "epoch": 0.61, + "grad_norm": 0.6640625, + "learning_rate": 0.000198348156406199, + "loss": 0.4281, + "step": 4561 + }, + { + "epoch": 0.61, + "grad_norm": 0.875, + "learning_rate": 0.00019834604789439016, + "loss": 0.5553, + "step": 4562 + }, + { + "epoch": 0.61, + "grad_norm": 0.6875, + "learning_rate": 0.0001983439380489458, + "loss": 0.751, + "step": 4563 + }, + { + "epoch": 0.61, + "grad_norm": 0.7578125, + "learning_rate": 0.0001983418268698945, + "loss": 0.5701, + "step": 4564 + }, + { + "epoch": 0.61, + "grad_norm": 0.81640625, + "learning_rate": 0.00019833971435726495, + "loss": 0.5743, + "step": 4565 + }, + { + "epoch": 0.61, + "grad_norm": 0.8125, + "learning_rate": 0.00019833760051108574, + "loss": 0.4652, + "step": 4566 + }, + { + "epoch": 0.61, + "grad_norm": 0.9921875, + "learning_rate": 0.00019833548533138554, + "loss": 0.6352, + "step": 4567 + }, + { + "epoch": 0.61, + "grad_norm": 0.69921875, + "learning_rate": 0.00019833336881819305, + "loss": 0.2995, + "step": 4568 + }, + { + "epoch": 0.61, + "grad_norm": 0.77734375, + "learning_rate": 0.000198331250971537, + "loss": 0.5773, + "step": 4569 + }, + { + "epoch": 0.61, + "grad_norm": 0.8125, + "learning_rate": 0.00019832913179144605, + "loss": 0.4103, + "step": 4570 + }, + { + "epoch": 0.61, + "grad_norm": 0.83203125, + "learning_rate": 0.000198327011277949, + "loss": 0.5141, + "step": 4571 + }, + { + "epoch": 0.61, + "grad_norm": 0.52734375, + "learning_rate": 0.00019832488943107452, + "loss": 0.3726, + "step": 4572 + }, + { + "epoch": 0.61, + "grad_norm": 0.7265625, + "learning_rate": 0.00019832276625085147, + "loss": 0.4446, + "step": 4573 + }, + { + "epoch": 0.61, + "grad_norm": 0.9609375, + "learning_rate": 0.0001983206417373086, + "loss": 0.6485, + "step": 4574 + }, + { + "epoch": 0.61, + "grad_norm": 0.58984375, + "learning_rate": 0.00019831851589047474, + "loss": 0.6689, + "step": 4575 + }, + { + "epoch": 0.61, + "grad_norm": 0.498046875, + "learning_rate": 0.00019831638871037867, + "loss": 0.3478, + "step": 4576 + }, + { + "epoch": 0.61, + "grad_norm": 0.78515625, + "learning_rate": 0.00019831426019704932, + "loss": 0.5326, + "step": 4577 + }, + { + "epoch": 0.61, + "grad_norm": 0.5859375, + "learning_rate": 0.00019831213035051545, + "loss": 0.5825, + "step": 4578 + }, + { + "epoch": 0.61, + "grad_norm": 0.91015625, + "learning_rate": 0.00019830999917080604, + "loss": 0.6364, + "step": 4579 + }, + { + "epoch": 0.61, + "grad_norm": 0.78125, + "learning_rate": 0.0001983078666579499, + "loss": 0.5017, + "step": 4580 + }, + { + "epoch": 0.61, + "grad_norm": 0.68359375, + "learning_rate": 0.00019830573281197605, + "loss": 0.3356, + "step": 4581 + }, + { + "epoch": 0.61, + "grad_norm": 0.765625, + "learning_rate": 0.00019830359763291333, + "loss": 0.5942, + "step": 4582 + }, + { + "epoch": 0.61, + "grad_norm": 0.671875, + "learning_rate": 0.00019830146112079073, + "loss": 0.7331, + "step": 4583 + }, + { + "epoch": 0.61, + "grad_norm": 0.8046875, + "learning_rate": 0.00019829932327563725, + "loss": 1.0551, + "step": 4584 + }, + { + "epoch": 0.61, + "grad_norm": 0.83984375, + "learning_rate": 0.00019829718409748186, + "loss": 0.4588, + "step": 4585 + }, + { + "epoch": 0.61, + "grad_norm": 0.59765625, + "learning_rate": 0.00019829504358635355, + "loss": 0.3976, + "step": 4586 + }, + { + "epoch": 0.61, + "grad_norm": 0.875, + "learning_rate": 0.00019829290174228138, + "loss": 0.6238, + "step": 4587 + }, + { + "epoch": 0.61, + "grad_norm": 0.6484375, + "learning_rate": 0.00019829075856529437, + "loss": 0.4236, + "step": 4588 + }, + { + "epoch": 0.61, + "grad_norm": 0.6015625, + "learning_rate": 0.0001982886140554216, + "loss": 0.344, + "step": 4589 + }, + { + "epoch": 0.61, + "grad_norm": 0.6484375, + "learning_rate": 0.00019828646821269213, + "loss": 0.4503, + "step": 4590 + }, + { + "epoch": 0.61, + "grad_norm": 0.65625, + "learning_rate": 0.00019828432103713507, + "loss": 0.7754, + "step": 4591 + }, + { + "epoch": 0.61, + "grad_norm": 0.640625, + "learning_rate": 0.00019828217252877958, + "loss": 0.4447, + "step": 4592 + }, + { + "epoch": 0.61, + "grad_norm": 1.046875, + "learning_rate": 0.00019828002268765468, + "loss": 0.5074, + "step": 4593 + }, + { + "epoch": 0.61, + "grad_norm": 0.6640625, + "learning_rate": 0.00019827787151378964, + "loss": 0.3568, + "step": 4594 + }, + { + "epoch": 0.61, + "grad_norm": 0.69140625, + "learning_rate": 0.00019827571900721358, + "loss": 0.5262, + "step": 4595 + }, + { + "epoch": 0.61, + "grad_norm": 0.6640625, + "learning_rate": 0.0001982735651679557, + "loss": 0.6168, + "step": 4596 + }, + { + "epoch": 0.61, + "grad_norm": 0.72265625, + "learning_rate": 0.0001982714099960452, + "loss": 0.8044, + "step": 4597 + }, + { + "epoch": 0.61, + "grad_norm": 0.67578125, + "learning_rate": 0.00019826925349151133, + "loss": 0.92, + "step": 4598 + }, + { + "epoch": 0.61, + "grad_norm": 0.8203125, + "learning_rate": 0.0001982670956543833, + "loss": 0.5806, + "step": 4599 + }, + { + "epoch": 0.61, + "grad_norm": 0.70703125, + "learning_rate": 0.00019826493648469038, + "loss": 0.3355, + "step": 4600 + }, + { + "epoch": 0.61, + "grad_norm": 0.84765625, + "learning_rate": 0.00019826277598246187, + "loss": 0.7043, + "step": 4601 + }, + { + "epoch": 0.61, + "grad_norm": 0.60546875, + "learning_rate": 0.00019826061414772705, + "loss": 0.622, + "step": 4602 + }, + { + "epoch": 0.61, + "grad_norm": 0.498046875, + "learning_rate": 0.0001982584509805152, + "loss": 0.3846, + "step": 4603 + }, + { + "epoch": 0.61, + "grad_norm": 0.7421875, + "learning_rate": 0.00019825628648085576, + "loss": 0.4306, + "step": 4604 + }, + { + "epoch": 0.61, + "grad_norm": 0.70703125, + "learning_rate": 0.00019825412064877797, + "loss": 0.7063, + "step": 4605 + }, + { + "epoch": 0.61, + "grad_norm": 0.7890625, + "learning_rate": 0.00019825195348431124, + "loss": 0.6283, + "step": 4606 + }, + { + "epoch": 0.61, + "grad_norm": 0.7734375, + "learning_rate": 0.00019824978498748499, + "loss": 0.4805, + "step": 4607 + }, + { + "epoch": 0.61, + "grad_norm": 0.6328125, + "learning_rate": 0.00019824761515832857, + "loss": 0.5796, + "step": 4608 + }, + { + "epoch": 0.62, + "grad_norm": 0.455078125, + "learning_rate": 0.00019824544399687146, + "loss": 0.4703, + "step": 4609 + }, + { + "epoch": 0.62, + "grad_norm": 0.6328125, + "learning_rate": 0.00019824327150314307, + "loss": 0.5141, + "step": 4610 + }, + { + "epoch": 0.62, + "grad_norm": 0.6953125, + "learning_rate": 0.00019824109767717287, + "loss": 1.041, + "step": 4611 + }, + { + "epoch": 0.62, + "grad_norm": 0.6328125, + "learning_rate": 0.0001982389225189903, + "loss": 0.4773, + "step": 4612 + }, + { + "epoch": 0.62, + "grad_norm": 0.6328125, + "learning_rate": 0.00019823674602862493, + "loss": 0.6789, + "step": 4613 + }, + { + "epoch": 0.62, + "grad_norm": 0.75390625, + "learning_rate": 0.00019823456820610623, + "loss": 0.47, + "step": 4614 + }, + { + "epoch": 0.62, + "grad_norm": 0.68359375, + "learning_rate": 0.00019823238905146374, + "loss": 0.4903, + "step": 4615 + }, + { + "epoch": 0.62, + "grad_norm": 1.0703125, + "learning_rate": 0.00019823020856472702, + "loss": 0.6208, + "step": 4616 + }, + { + "epoch": 0.62, + "grad_norm": 0.59375, + "learning_rate": 0.0001982280267459256, + "loss": 0.5442, + "step": 4617 + }, + { + "epoch": 0.62, + "grad_norm": 0.7109375, + "learning_rate": 0.00019822584359508908, + "loss": 0.6927, + "step": 4618 + }, + { + "epoch": 0.62, + "grad_norm": 0.70703125, + "learning_rate": 0.00019822365911224715, + "loss": 0.37, + "step": 4619 + }, + { + "epoch": 0.62, + "grad_norm": 0.8359375, + "learning_rate": 0.0001982214732974293, + "loss": 0.9078, + "step": 4620 + }, + { + "epoch": 0.62, + "grad_norm": 0.80859375, + "learning_rate": 0.00019821928615066525, + "loss": 0.2509, + "step": 4621 + }, + { + "epoch": 0.62, + "grad_norm": 0.671875, + "learning_rate": 0.00019821709767198464, + "loss": 0.3908, + "step": 4622 + }, + { + "epoch": 0.62, + "grad_norm": 0.67578125, + "learning_rate": 0.00019821490786141714, + "loss": 0.5319, + "step": 4623 + }, + { + "epoch": 0.62, + "grad_norm": 0.74609375, + "learning_rate": 0.0001982127167189925, + "loss": 0.5419, + "step": 4624 + }, + { + "epoch": 0.62, + "grad_norm": 0.625, + "learning_rate": 0.00019821052424474033, + "loss": 0.4684, + "step": 4625 + }, + { + "epoch": 0.62, + "grad_norm": 0.5078125, + "learning_rate": 0.00019820833043869047, + "loss": 0.4128, + "step": 4626 + }, + { + "epoch": 0.62, + "grad_norm": 0.625, + "learning_rate": 0.0001982061353008726, + "loss": 0.4161, + "step": 4627 + }, + { + "epoch": 0.62, + "grad_norm": 0.8515625, + "learning_rate": 0.00019820393883131652, + "loss": 0.3827, + "step": 4628 + }, + { + "epoch": 0.62, + "grad_norm": 0.515625, + "learning_rate": 0.00019820174103005197, + "loss": 0.5512, + "step": 4629 + }, + { + "epoch": 0.62, + "grad_norm": 0.5703125, + "learning_rate": 0.00019819954189710881, + "loss": 0.5894, + "step": 4630 + }, + { + "epoch": 0.62, + "grad_norm": 0.6875, + "learning_rate": 0.00019819734143251684, + "loss": 0.5724, + "step": 4631 + }, + { + "epoch": 0.62, + "grad_norm": 1.15625, + "learning_rate": 0.0001981951396363059, + "loss": 0.4285, + "step": 4632 + }, + { + "epoch": 0.62, + "grad_norm": 0.65625, + "learning_rate": 0.00019819293650850584, + "loss": 0.4647, + "step": 4633 + }, + { + "epoch": 0.62, + "grad_norm": 0.79296875, + "learning_rate": 0.0001981907320491465, + "loss": 0.5508, + "step": 4634 + }, + { + "epoch": 0.62, + "grad_norm": 0.671875, + "learning_rate": 0.00019818852625825789, + "loss": 0.8417, + "step": 4635 + }, + { + "epoch": 0.62, + "grad_norm": 0.53125, + "learning_rate": 0.0001981863191358698, + "loss": 0.4472, + "step": 4636 + }, + { + "epoch": 0.62, + "grad_norm": 0.765625, + "learning_rate": 0.0001981841106820122, + "loss": 0.7419, + "step": 4637 + }, + { + "epoch": 0.62, + "grad_norm": 0.6875, + "learning_rate": 0.00019818190089671508, + "loss": 0.3355, + "step": 4638 + }, + { + "epoch": 0.62, + "grad_norm": 0.609375, + "learning_rate": 0.00019817968978000833, + "loss": 0.3025, + "step": 4639 + }, + { + "epoch": 0.62, + "grad_norm": 0.81640625, + "learning_rate": 0.000198177477331922, + "loss": 0.3688, + "step": 4640 + }, + { + "epoch": 0.62, + "grad_norm": 0.671875, + "learning_rate": 0.00019817526355248607, + "loss": 0.4592, + "step": 4641 + }, + { + "epoch": 0.62, + "grad_norm": 0.59765625, + "learning_rate": 0.00019817304844173058, + "loss": 0.4894, + "step": 4642 + }, + { + "epoch": 0.62, + "grad_norm": 0.796875, + "learning_rate": 0.00019817083199968552, + "loss": 0.3269, + "step": 4643 + }, + { + "epoch": 0.62, + "grad_norm": 0.71484375, + "learning_rate": 0.00019816861422638094, + "loss": 0.4884, + "step": 4644 + }, + { + "epoch": 0.62, + "grad_norm": 0.8203125, + "learning_rate": 0.000198166395121847, + "loss": 0.6392, + "step": 4645 + }, + { + "epoch": 0.62, + "grad_norm": 0.7265625, + "learning_rate": 0.00019816417468611371, + "loss": 0.4206, + "step": 4646 + }, + { + "epoch": 0.62, + "grad_norm": 0.82421875, + "learning_rate": 0.00019816195291921123, + "loss": 0.8309, + "step": 4647 + }, + { + "epoch": 0.62, + "grad_norm": 0.90234375, + "learning_rate": 0.00019815972982116967, + "loss": 0.6765, + "step": 4648 + }, + { + "epoch": 0.62, + "grad_norm": 0.46875, + "learning_rate": 0.00019815750539201914, + "loss": 0.5158, + "step": 4649 + }, + { + "epoch": 0.62, + "grad_norm": 1.2421875, + "learning_rate": 0.00019815527963178985, + "loss": 0.4558, + "step": 4650 + }, + { + "epoch": 0.62, + "grad_norm": 0.55078125, + "learning_rate": 0.000198153052540512, + "loss": 0.5858, + "step": 4651 + }, + { + "epoch": 0.62, + "grad_norm": 0.625, + "learning_rate": 0.00019815082411821572, + "loss": 0.6688, + "step": 4652 + }, + { + "epoch": 0.62, + "grad_norm": 0.69921875, + "learning_rate": 0.0001981485943649313, + "loss": 0.3923, + "step": 4653 + }, + { + "epoch": 0.62, + "grad_norm": 0.65234375, + "learning_rate": 0.000198146363280689, + "loss": 0.4214, + "step": 4654 + }, + { + "epoch": 0.62, + "grad_norm": 0.7421875, + "learning_rate": 0.00019814413086551892, + "loss": 0.4727, + "step": 4655 + }, + { + "epoch": 0.62, + "grad_norm": 0.59765625, + "learning_rate": 0.00019814189711945152, + "loss": 0.5172, + "step": 4656 + }, + { + "epoch": 0.62, + "grad_norm": 0.578125, + "learning_rate": 0.00019813966204251698, + "loss": 0.431, + "step": 4657 + }, + { + "epoch": 0.62, + "grad_norm": 0.53125, + "learning_rate": 0.00019813742563474563, + "loss": 0.4402, + "step": 4658 + }, + { + "epoch": 0.62, + "grad_norm": 0.64453125, + "learning_rate": 0.0001981351878961678, + "loss": 0.5875, + "step": 4659 + }, + { + "epoch": 0.62, + "grad_norm": 0.8203125, + "learning_rate": 0.00019813294882681386, + "loss": 0.4608, + "step": 4660 + }, + { + "epoch": 0.62, + "grad_norm": 0.69921875, + "learning_rate": 0.00019813070842671416, + "loss": 0.5203, + "step": 4661 + }, + { + "epoch": 0.62, + "grad_norm": 0.462890625, + "learning_rate": 0.00019812846669589905, + "loss": 0.3762, + "step": 4662 + }, + { + "epoch": 0.62, + "grad_norm": 0.62890625, + "learning_rate": 0.00019812622363439894, + "loss": 0.6333, + "step": 4663 + }, + { + "epoch": 0.62, + "grad_norm": 0.5234375, + "learning_rate": 0.00019812397924224429, + "loss": 0.5267, + "step": 4664 + }, + { + "epoch": 0.62, + "grad_norm": 0.66015625, + "learning_rate": 0.0001981217335194655, + "loss": 0.6961, + "step": 4665 + }, + { + "epoch": 0.62, + "grad_norm": 0.97265625, + "learning_rate": 0.00019811948646609299, + "loss": 0.5931, + "step": 4666 + }, + { + "epoch": 0.62, + "grad_norm": 0.765625, + "learning_rate": 0.00019811723808215733, + "loss": 0.6818, + "step": 4667 + }, + { + "epoch": 0.62, + "grad_norm": 0.60546875, + "learning_rate": 0.0001981149883676889, + "loss": 0.3764, + "step": 4668 + }, + { + "epoch": 0.62, + "grad_norm": 0.6171875, + "learning_rate": 0.00019811273732271828, + "loss": 0.6692, + "step": 4669 + }, + { + "epoch": 0.62, + "grad_norm": 0.58984375, + "learning_rate": 0.000198110484947276, + "loss": 0.4608, + "step": 4670 + }, + { + "epoch": 0.62, + "grad_norm": 0.88671875, + "learning_rate": 0.00019810823124139255, + "loss": 0.6172, + "step": 4671 + }, + { + "epoch": 0.62, + "grad_norm": 0.65625, + "learning_rate": 0.00019810597620509852, + "loss": 0.5651, + "step": 4672 + }, + { + "epoch": 0.62, + "grad_norm": 0.59375, + "learning_rate": 0.00019810371983842447, + "loss": 0.4215, + "step": 4673 + }, + { + "epoch": 0.62, + "grad_norm": 0.796875, + "learning_rate": 0.00019810146214140105, + "loss": 0.6331, + "step": 4674 + }, + { + "epoch": 0.62, + "grad_norm": 0.52734375, + "learning_rate": 0.00019809920311405886, + "loss": 0.3957, + "step": 4675 + }, + { + "epoch": 0.62, + "grad_norm": 0.76171875, + "learning_rate": 0.00019809694275642846, + "loss": 0.5362, + "step": 4676 + }, + { + "epoch": 0.62, + "grad_norm": 0.5, + "learning_rate": 0.0001980946810685406, + "loss": 0.3363, + "step": 4677 + }, + { + "epoch": 0.62, + "grad_norm": 0.81640625, + "learning_rate": 0.0001980924180504259, + "loss": 0.4109, + "step": 4678 + }, + { + "epoch": 0.62, + "grad_norm": 0.6484375, + "learning_rate": 0.00019809015370211502, + "loss": 0.5277, + "step": 4679 + }, + { + "epoch": 0.62, + "grad_norm": 1.015625, + "learning_rate": 0.00019808788802363876, + "loss": 0.3863, + "step": 4680 + }, + { + "epoch": 0.62, + "grad_norm": 0.5625, + "learning_rate": 0.00019808562101502776, + "loss": 0.4096, + "step": 4681 + }, + { + "epoch": 0.62, + "grad_norm": 0.87890625, + "learning_rate": 0.00019808335267631276, + "loss": 0.8459, + "step": 4682 + }, + { + "epoch": 0.62, + "grad_norm": 0.640625, + "learning_rate": 0.0001980810830075246, + "loss": 0.6116, + "step": 4683 + }, + { + "epoch": 0.63, + "grad_norm": 1.09375, + "learning_rate": 0.00019807881200869395, + "loss": 0.4434, + "step": 4684 + }, + { + "epoch": 0.63, + "grad_norm": 0.578125, + "learning_rate": 0.00019807653967985172, + "loss": 0.7911, + "step": 4685 + }, + { + "epoch": 0.63, + "grad_norm": 0.5078125, + "learning_rate": 0.00019807426602102863, + "loss": 0.6335, + "step": 4686 + }, + { + "epoch": 0.63, + "grad_norm": 0.61328125, + "learning_rate": 0.00019807199103225552, + "loss": 0.5987, + "step": 4687 + }, + { + "epoch": 0.63, + "grad_norm": 0.9921875, + "learning_rate": 0.00019806971471356333, + "loss": 0.4114, + "step": 4688 + }, + { + "epoch": 0.63, + "grad_norm": 0.6328125, + "learning_rate": 0.00019806743706498282, + "loss": 0.386, + "step": 4689 + }, + { + "epoch": 0.63, + "grad_norm": 0.8671875, + "learning_rate": 0.00019806515808654498, + "loss": 0.8518, + "step": 4690 + }, + { + "epoch": 0.63, + "grad_norm": 0.6015625, + "learning_rate": 0.0001980628777782806, + "loss": 0.6839, + "step": 4691 + }, + { + "epoch": 0.63, + "grad_norm": 0.78515625, + "learning_rate": 0.0001980605961402207, + "loss": 0.5705, + "step": 4692 + }, + { + "epoch": 0.63, + "grad_norm": 0.71484375, + "learning_rate": 0.00019805831317239617, + "loss": 0.3087, + "step": 4693 + }, + { + "epoch": 0.63, + "grad_norm": 0.65234375, + "learning_rate": 0.00019805602887483802, + "loss": 0.429, + "step": 4694 + }, + { + "epoch": 0.63, + "grad_norm": 0.5625, + "learning_rate": 0.00019805374324757713, + "loss": 0.5835, + "step": 4695 + }, + { + "epoch": 0.63, + "grad_norm": 0.4140625, + "learning_rate": 0.00019805145629064454, + "loss": 0.3824, + "step": 4696 + }, + { + "epoch": 0.63, + "grad_norm": 0.6015625, + "learning_rate": 0.00019804916800407132, + "loss": 0.4902, + "step": 4697 + }, + { + "epoch": 0.63, + "grad_norm": 0.69140625, + "learning_rate": 0.00019804687838788842, + "loss": 0.4599, + "step": 4698 + }, + { + "epoch": 0.63, + "grad_norm": 0.7421875, + "learning_rate": 0.00019804458744212696, + "loss": 0.9428, + "step": 4699 + }, + { + "epoch": 0.63, + "grad_norm": 0.48828125, + "learning_rate": 0.00019804229516681793, + "loss": 0.4474, + "step": 4700 + }, + { + "epoch": 0.63, + "grad_norm": 0.8359375, + "learning_rate": 0.0001980400015619925, + "loss": 0.3207, + "step": 4701 + }, + { + "epoch": 0.63, + "grad_norm": 0.71484375, + "learning_rate": 0.0001980377066276817, + "loss": 0.6369, + "step": 4702 + }, + { + "epoch": 0.63, + "grad_norm": 0.69921875, + "learning_rate": 0.00019803541036391667, + "loss": 0.3889, + "step": 4703 + }, + { + "epoch": 0.63, + "grad_norm": 0.98046875, + "learning_rate": 0.00019803311277072857, + "loss": 0.907, + "step": 4704 + }, + { + "epoch": 0.63, + "grad_norm": 0.6796875, + "learning_rate": 0.00019803081384814851, + "loss": 0.5815, + "step": 4705 + }, + { + "epoch": 0.63, + "grad_norm": 0.890625, + "learning_rate": 0.00019802851359620777, + "loss": 0.6711, + "step": 4706 + }, + { + "epoch": 0.63, + "grad_norm": 0.69921875, + "learning_rate": 0.0001980262120149374, + "loss": 0.9502, + "step": 4707 + }, + { + "epoch": 0.63, + "grad_norm": 1.0234375, + "learning_rate": 0.0001980239091043687, + "loss": 0.4011, + "step": 4708 + }, + { + "epoch": 0.63, + "grad_norm": 0.67578125, + "learning_rate": 0.0001980216048645329, + "loss": 0.4293, + "step": 4709 + }, + { + "epoch": 0.63, + "grad_norm": 0.81640625, + "learning_rate": 0.0001980192992954612, + "loss": 0.3069, + "step": 4710 + }, + { + "epoch": 0.63, + "grad_norm": 0.609375, + "learning_rate": 0.00019801699239718493, + "loss": 0.4872, + "step": 4711 + }, + { + "epoch": 0.63, + "grad_norm": 0.63671875, + "learning_rate": 0.00019801468416973532, + "loss": 0.7447, + "step": 4712 + }, + { + "epoch": 0.63, + "grad_norm": 0.5859375, + "learning_rate": 0.00019801237461314372, + "loss": 0.5146, + "step": 4713 + }, + { + "epoch": 0.63, + "grad_norm": 0.609375, + "learning_rate": 0.00019801006372744136, + "loss": 0.4231, + "step": 4714 + }, + { + "epoch": 0.63, + "grad_norm": 0.7421875, + "learning_rate": 0.00019800775151265966, + "loss": 0.566, + "step": 4715 + }, + { + "epoch": 0.63, + "grad_norm": 0.703125, + "learning_rate": 0.00019800543796882994, + "loss": 0.6149, + "step": 4716 + }, + { + "epoch": 0.63, + "grad_norm": 0.5625, + "learning_rate": 0.00019800312309598356, + "loss": 0.518, + "step": 4717 + }, + { + "epoch": 0.63, + "grad_norm": 0.50390625, + "learning_rate": 0.000198000806894152, + "loss": 0.4265, + "step": 4718 + }, + { + "epoch": 0.63, + "grad_norm": 0.5546875, + "learning_rate": 0.00019799848936336654, + "loss": 0.395, + "step": 4719 + }, + { + "epoch": 0.63, + "grad_norm": 0.78125, + "learning_rate": 0.0001979961705036587, + "loss": 0.7323, + "step": 4720 + }, + { + "epoch": 0.63, + "grad_norm": 0.6796875, + "learning_rate": 0.0001979938503150599, + "loss": 0.6396, + "step": 4721 + }, + { + "epoch": 0.63, + "grad_norm": 0.84375, + "learning_rate": 0.00019799152879760155, + "loss": 0.929, + "step": 4722 + }, + { + "epoch": 0.63, + "grad_norm": 0.59765625, + "learning_rate": 0.00019798920595131522, + "loss": 0.5855, + "step": 4723 + }, + { + "epoch": 0.63, + "grad_norm": 0.671875, + "learning_rate": 0.00019798688177623234, + "loss": 0.2856, + "step": 4724 + }, + { + "epoch": 0.63, + "grad_norm": 0.78515625, + "learning_rate": 0.0001979845562723845, + "loss": 0.2678, + "step": 4725 + }, + { + "epoch": 0.63, + "grad_norm": 0.9765625, + "learning_rate": 0.00019798222943980315, + "loss": 0.8825, + "step": 4726 + }, + { + "epoch": 0.63, + "grad_norm": 0.734375, + "learning_rate": 0.0001979799012785199, + "loss": 0.4528, + "step": 4727 + }, + { + "epoch": 0.63, + "grad_norm": 0.58984375, + "learning_rate": 0.00019797757178856632, + "loss": 0.4395, + "step": 4728 + }, + { + "epoch": 0.63, + "grad_norm": 0.7734375, + "learning_rate": 0.00019797524096997398, + "loss": 0.3778, + "step": 4729 + }, + { + "epoch": 0.63, + "grad_norm": 0.478515625, + "learning_rate": 0.00019797290882277445, + "loss": 0.6311, + "step": 4730 + }, + { + "epoch": 0.63, + "grad_norm": 1.015625, + "learning_rate": 0.00019797057534699946, + "loss": 0.5341, + "step": 4731 + }, + { + "epoch": 0.63, + "grad_norm": 0.71875, + "learning_rate": 0.00019796824054268055, + "loss": 0.4318, + "step": 4732 + }, + { + "epoch": 0.63, + "grad_norm": 0.76171875, + "learning_rate": 0.00019796590440984945, + "loss": 0.8642, + "step": 4733 + }, + { + "epoch": 0.63, + "grad_norm": 0.73828125, + "learning_rate": 0.0001979635669485378, + "loss": 0.4784, + "step": 4734 + }, + { + "epoch": 0.63, + "grad_norm": 0.53515625, + "learning_rate": 0.00019796122815877732, + "loss": 0.673, + "step": 4735 + }, + { + "epoch": 0.63, + "grad_norm": 0.70703125, + "learning_rate": 0.00019795888804059966, + "loss": 0.4698, + "step": 4736 + }, + { + "epoch": 0.63, + "grad_norm": 0.52734375, + "learning_rate": 0.0001979565465940367, + "loss": 0.5288, + "step": 4737 + }, + { + "epoch": 0.63, + "grad_norm": 0.75390625, + "learning_rate": 0.00019795420381912002, + "loss": 0.463, + "step": 4738 + }, + { + "epoch": 0.63, + "grad_norm": 0.71875, + "learning_rate": 0.0001979518597158815, + "loss": 0.8693, + "step": 4739 + }, + { + "epoch": 0.63, + "grad_norm": 0.5, + "learning_rate": 0.00019794951428435292, + "loss": 0.2727, + "step": 4740 + }, + { + "epoch": 0.63, + "grad_norm": 0.82421875, + "learning_rate": 0.00019794716752456602, + "loss": 0.5036, + "step": 4741 + }, + { + "epoch": 0.63, + "grad_norm": 1.078125, + "learning_rate": 0.00019794481943655268, + "loss": 0.6972, + "step": 4742 + }, + { + "epoch": 0.63, + "grad_norm": 0.6875, + "learning_rate": 0.00019794247002034474, + "loss": 0.465, + "step": 4743 + }, + { + "epoch": 0.63, + "grad_norm": 0.93359375, + "learning_rate": 0.00019794011927597405, + "loss": 0.538, + "step": 4744 + }, + { + "epoch": 0.63, + "grad_norm": 0.44921875, + "learning_rate": 0.0001979377672034725, + "loss": 0.6008, + "step": 4745 + }, + { + "epoch": 0.63, + "grad_norm": 0.5703125, + "learning_rate": 0.0001979354138028719, + "loss": 0.3946, + "step": 4746 + }, + { + "epoch": 0.63, + "grad_norm": 0.52734375, + "learning_rate": 0.00019793305907420429, + "loss": 0.6768, + "step": 4747 + }, + { + "epoch": 0.63, + "grad_norm": 0.64453125, + "learning_rate": 0.00019793070301750155, + "loss": 0.5141, + "step": 4748 + }, + { + "epoch": 0.63, + "grad_norm": 0.85546875, + "learning_rate": 0.0001979283456327956, + "loss": 0.4415, + "step": 4749 + }, + { + "epoch": 0.63, + "grad_norm": 0.80078125, + "learning_rate": 0.00019792598692011846, + "loss": 0.4412, + "step": 4750 + }, + { + "epoch": 0.63, + "grad_norm": 0.4921875, + "learning_rate": 0.00019792362687950208, + "loss": 0.7021, + "step": 4751 + }, + { + "epoch": 0.63, + "grad_norm": 0.625, + "learning_rate": 0.00019792126551097848, + "loss": 0.4961, + "step": 4752 + }, + { + "epoch": 0.63, + "grad_norm": 0.703125, + "learning_rate": 0.00019791890281457963, + "loss": 0.4135, + "step": 4753 + }, + { + "epoch": 0.63, + "grad_norm": 0.83203125, + "learning_rate": 0.00019791653879033767, + "loss": 0.6837, + "step": 4754 + }, + { + "epoch": 0.63, + "grad_norm": 0.8359375, + "learning_rate": 0.00019791417343828456, + "loss": 0.4119, + "step": 4755 + }, + { + "epoch": 0.63, + "grad_norm": 0.8203125, + "learning_rate": 0.00019791180675845244, + "loss": 0.3696, + "step": 4756 + }, + { + "epoch": 0.63, + "grad_norm": 1.0078125, + "learning_rate": 0.00019790943875087338, + "loss": 0.7115, + "step": 4757 + }, + { + "epoch": 0.63, + "grad_norm": 0.80078125, + "learning_rate": 0.00019790706941557947, + "loss": 0.7303, + "step": 4758 + }, + { + "epoch": 0.64, + "grad_norm": 0.83984375, + "learning_rate": 0.0001979046987526029, + "loss": 0.6899, + "step": 4759 + }, + { + "epoch": 0.64, + "grad_norm": 1.1015625, + "learning_rate": 0.00019790232676197576, + "loss": 0.4971, + "step": 4760 + }, + { + "epoch": 0.64, + "grad_norm": 0.71484375, + "learning_rate": 0.00019789995344373024, + "loss": 0.539, + "step": 4761 + }, + { + "epoch": 0.64, + "grad_norm": 0.5859375, + "learning_rate": 0.0001978975787978985, + "loss": 0.271, + "step": 4762 + }, + { + "epoch": 0.64, + "grad_norm": 0.875, + "learning_rate": 0.00019789520282451276, + "loss": 0.51, + "step": 4763 + }, + { + "epoch": 0.64, + "grad_norm": 0.64453125, + "learning_rate": 0.00019789282552360528, + "loss": 0.4725, + "step": 4764 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 0.00019789044689520825, + "loss": 0.5177, + "step": 4765 + }, + { + "epoch": 0.64, + "grad_norm": 1.015625, + "learning_rate": 0.00019788806693935392, + "loss": 0.5724, + "step": 4766 + }, + { + "epoch": 0.64, + "grad_norm": 0.57421875, + "learning_rate": 0.00019788568565607458, + "loss": 0.6617, + "step": 4767 + }, + { + "epoch": 0.64, + "grad_norm": 0.74609375, + "learning_rate": 0.0001978833030454025, + "loss": 0.4872, + "step": 4768 + }, + { + "epoch": 0.64, + "grad_norm": 0.6015625, + "learning_rate": 0.00019788091910737004, + "loss": 0.4588, + "step": 4769 + }, + { + "epoch": 0.64, + "grad_norm": 0.62890625, + "learning_rate": 0.0001978785338420095, + "loss": 0.3847, + "step": 4770 + }, + { + "epoch": 0.64, + "grad_norm": 0.51171875, + "learning_rate": 0.0001978761472493532, + "loss": 0.4264, + "step": 4771 + }, + { + "epoch": 0.64, + "grad_norm": 0.69140625, + "learning_rate": 0.00019787375932943353, + "loss": 0.4705, + "step": 4772 + }, + { + "epoch": 0.64, + "grad_norm": 0.86328125, + "learning_rate": 0.00019787137008228287, + "loss": 0.5048, + "step": 4773 + }, + { + "epoch": 0.64, + "grad_norm": 0.609375, + "learning_rate": 0.00019786897950793364, + "loss": 0.362, + "step": 4774 + }, + { + "epoch": 0.64, + "grad_norm": 0.703125, + "learning_rate": 0.00019786658760641821, + "loss": 0.4319, + "step": 4775 + }, + { + "epoch": 0.64, + "grad_norm": 0.81640625, + "learning_rate": 0.00019786419437776905, + "loss": 0.553, + "step": 4776 + }, + { + "epoch": 0.64, + "grad_norm": 0.625, + "learning_rate": 0.0001978617998220186, + "loss": 0.7569, + "step": 4777 + }, + { + "epoch": 0.64, + "grad_norm": 0.57421875, + "learning_rate": 0.00019785940393919935, + "loss": 0.447, + "step": 4778 + }, + { + "epoch": 0.64, + "grad_norm": 0.59375, + "learning_rate": 0.0001978570067293438, + "loss": 0.8087, + "step": 4779 + }, + { + "epoch": 0.64, + "grad_norm": 0.76171875, + "learning_rate": 0.00019785460819248438, + "loss": 0.6214, + "step": 4780 + }, + { + "epoch": 0.64, + "grad_norm": 0.671875, + "learning_rate": 0.0001978522083286537, + "loss": 0.5368, + "step": 4781 + }, + { + "epoch": 0.64, + "grad_norm": 0.765625, + "learning_rate": 0.00019784980713788425, + "loss": 0.5035, + "step": 4782 + }, + { + "epoch": 0.64, + "grad_norm": 0.9375, + "learning_rate": 0.00019784740462020864, + "loss": 0.7944, + "step": 4783 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 0.00019784500077565944, + "loss": 0.3126, + "step": 4784 + }, + { + "epoch": 0.64, + "grad_norm": 0.79296875, + "learning_rate": 0.0001978425956042692, + "loss": 0.7333, + "step": 4785 + }, + { + "epoch": 0.64, + "grad_norm": 0.640625, + "learning_rate": 0.0001978401891060706, + "loss": 0.6576, + "step": 4786 + }, + { + "epoch": 0.64, + "grad_norm": 0.73046875, + "learning_rate": 0.0001978377812810962, + "loss": 0.3806, + "step": 4787 + }, + { + "epoch": 0.64, + "grad_norm": 0.5859375, + "learning_rate": 0.00019783537212937874, + "loss": 0.3528, + "step": 4788 + }, + { + "epoch": 0.64, + "grad_norm": 0.8203125, + "learning_rate": 0.00019783296165095082, + "loss": 0.4882, + "step": 4789 + }, + { + "epoch": 0.64, + "grad_norm": 0.77734375, + "learning_rate": 0.00019783054984584515, + "loss": 0.482, + "step": 4790 + }, + { + "epoch": 0.64, + "grad_norm": 0.69140625, + "learning_rate": 0.00019782813671409444, + "loss": 0.6258, + "step": 4791 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 0.00019782572225573143, + "loss": 0.7662, + "step": 4792 + }, + { + "epoch": 0.64, + "grad_norm": 0.671875, + "learning_rate": 0.0001978233064707888, + "loss": 0.7966, + "step": 4793 + }, + { + "epoch": 0.64, + "grad_norm": 0.8046875, + "learning_rate": 0.0001978208893592994, + "loss": 0.4127, + "step": 4794 + }, + { + "epoch": 0.64, + "grad_norm": 0.59765625, + "learning_rate": 0.0001978184709212959, + "loss": 0.3691, + "step": 4795 + }, + { + "epoch": 0.64, + "grad_norm": 0.7421875, + "learning_rate": 0.0001978160511568112, + "loss": 0.3439, + "step": 4796 + }, + { + "epoch": 0.64, + "grad_norm": 0.60546875, + "learning_rate": 0.00019781363006587805, + "loss": 0.3738, + "step": 4797 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 0.0001978112076485293, + "loss": 0.4889, + "step": 4798 + }, + { + "epoch": 0.64, + "grad_norm": 0.5703125, + "learning_rate": 0.00019780878390479782, + "loss": 0.5955, + "step": 4799 + }, + { + "epoch": 0.64, + "grad_norm": 0.5546875, + "learning_rate": 0.00019780635883471647, + "loss": 0.4021, + "step": 4800 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 0.00019780393243831808, + "loss": 0.3812, + "step": 4801 + }, + { + "epoch": 0.64, + "grad_norm": 0.7265625, + "learning_rate": 0.00019780150471563558, + "loss": 0.3541, + "step": 4802 + }, + { + "epoch": 0.64, + "grad_norm": 0.58203125, + "learning_rate": 0.00019779907566670196, + "loss": 0.5426, + "step": 4803 + }, + { + "epoch": 0.64, + "grad_norm": 0.4453125, + "learning_rate": 0.0001977966452915501, + "loss": 0.4749, + "step": 4804 + }, + { + "epoch": 0.64, + "grad_norm": 0.55859375, + "learning_rate": 0.00019779421359021292, + "loss": 0.4203, + "step": 4805 + }, + { + "epoch": 0.64, + "grad_norm": 0.76953125, + "learning_rate": 0.00019779178056272348, + "loss": 0.5383, + "step": 4806 + }, + { + "epoch": 0.64, + "grad_norm": 0.7578125, + "learning_rate": 0.00019778934620911473, + "loss": 0.6677, + "step": 4807 + }, + { + "epoch": 0.64, + "grad_norm": 0.5625, + "learning_rate": 0.00019778691052941966, + "loss": 0.2978, + "step": 4808 + }, + { + "epoch": 0.64, + "grad_norm": 0.609375, + "learning_rate": 0.00019778447352367137, + "loss": 0.3348, + "step": 4809 + }, + { + "epoch": 0.64, + "grad_norm": 0.55078125, + "learning_rate": 0.0001977820351919028, + "loss": 0.4974, + "step": 4810 + }, + { + "epoch": 0.64, + "grad_norm": 0.55078125, + "learning_rate": 0.00019777959553414712, + "loss": 0.4321, + "step": 4811 + }, + { + "epoch": 0.64, + "grad_norm": 1.03125, + "learning_rate": 0.00019777715455043736, + "loss": 0.3898, + "step": 4812 + }, + { + "epoch": 0.64, + "grad_norm": 0.59375, + "learning_rate": 0.00019777471224080662, + "loss": 0.4065, + "step": 4813 + }, + { + "epoch": 0.64, + "grad_norm": 0.7265625, + "learning_rate": 0.00019777226860528808, + "loss": 0.5228, + "step": 4814 + }, + { + "epoch": 0.64, + "grad_norm": 0.68359375, + "learning_rate": 0.00019776982364391477, + "loss": 0.6462, + "step": 4815 + }, + { + "epoch": 0.64, + "grad_norm": 0.953125, + "learning_rate": 0.00019776737735671993, + "loss": 0.7762, + "step": 4816 + }, + { + "epoch": 0.64, + "grad_norm": 0.7734375, + "learning_rate": 0.0001977649297437367, + "loss": 0.2658, + "step": 4817 + }, + { + "epoch": 0.64, + "grad_norm": 0.498046875, + "learning_rate": 0.00019776248080499825, + "loss": 0.3917, + "step": 4818 + }, + { + "epoch": 0.64, + "grad_norm": 0.96875, + "learning_rate": 0.00019776003054053788, + "loss": 0.3647, + "step": 4819 + }, + { + "epoch": 0.64, + "grad_norm": 0.451171875, + "learning_rate": 0.0001977575789503887, + "loss": 0.4587, + "step": 4820 + }, + { + "epoch": 0.64, + "grad_norm": 0.984375, + "learning_rate": 0.00019775512603458403, + "loss": 0.3705, + "step": 4821 + }, + { + "epoch": 0.64, + "grad_norm": 0.57421875, + "learning_rate": 0.00019775267179315714, + "loss": 0.5263, + "step": 4822 + }, + { + "epoch": 0.64, + "grad_norm": 0.890625, + "learning_rate": 0.00019775021622614122, + "loss": 0.259, + "step": 4823 + }, + { + "epoch": 0.64, + "grad_norm": 0.7578125, + "learning_rate": 0.00019774775933356967, + "loss": 0.5338, + "step": 4824 + }, + { + "epoch": 0.64, + "grad_norm": 0.5859375, + "learning_rate": 0.00019774530111547577, + "loss": 0.5902, + "step": 4825 + }, + { + "epoch": 0.64, + "grad_norm": 0.73828125, + "learning_rate": 0.00019774284157189284, + "loss": 0.5777, + "step": 4826 + }, + { + "epoch": 0.64, + "grad_norm": 0.73828125, + "learning_rate": 0.00019774038070285427, + "loss": 0.7249, + "step": 4827 + }, + { + "epoch": 0.64, + "grad_norm": 0.64453125, + "learning_rate": 0.0001977379185083934, + "loss": 0.5794, + "step": 4828 + }, + { + "epoch": 0.64, + "grad_norm": 0.45703125, + "learning_rate": 0.0001977354549885436, + "loss": 0.4503, + "step": 4829 + }, + { + "epoch": 0.64, + "grad_norm": 0.71875, + "learning_rate": 0.00019773299014333833, + "loss": 0.5418, + "step": 4830 + }, + { + "epoch": 0.64, + "grad_norm": 0.77734375, + "learning_rate": 0.000197730523972811, + "loss": 0.4356, + "step": 4831 + }, + { + "epoch": 0.64, + "grad_norm": 0.7734375, + "learning_rate": 0.00019772805647699502, + "loss": 0.8227, + "step": 4832 + }, + { + "epoch": 0.64, + "grad_norm": 0.60546875, + "learning_rate": 0.00019772558765592386, + "loss": 0.3087, + "step": 4833 + }, + { + "epoch": 0.65, + "grad_norm": 0.8828125, + "learning_rate": 0.00019772311750963103, + "loss": 0.4937, + "step": 4834 + }, + { + "epoch": 0.65, + "grad_norm": 0.50390625, + "learning_rate": 0.00019772064603815003, + "loss": 0.3999, + "step": 4835 + }, + { + "epoch": 0.65, + "grad_norm": 0.61328125, + "learning_rate": 0.0001977181732415143, + "loss": 0.5493, + "step": 4836 + }, + { + "epoch": 0.65, + "grad_norm": 0.83984375, + "learning_rate": 0.00019771569911975747, + "loss": 0.453, + "step": 4837 + }, + { + "epoch": 0.65, + "grad_norm": 1.0390625, + "learning_rate": 0.00019771322367291306, + "loss": 0.4376, + "step": 4838 + }, + { + "epoch": 0.65, + "grad_norm": 0.78125, + "learning_rate": 0.0001977107469010146, + "loss": 0.7174, + "step": 4839 + }, + { + "epoch": 0.65, + "grad_norm": 0.61328125, + "learning_rate": 0.0001977082688040957, + "loss": 0.4829, + "step": 4840 + }, + { + "epoch": 0.65, + "grad_norm": 0.796875, + "learning_rate": 0.00019770578938218996, + "loss": 0.5437, + "step": 4841 + }, + { + "epoch": 0.65, + "grad_norm": 0.515625, + "learning_rate": 0.000197703308635331, + "loss": 0.5805, + "step": 4842 + }, + { + "epoch": 0.65, + "grad_norm": 0.87109375, + "learning_rate": 0.0001977008265635525, + "loss": 0.509, + "step": 4843 + }, + { + "epoch": 0.65, + "grad_norm": 0.71484375, + "learning_rate": 0.00019769834316688806, + "loss": 0.3862, + "step": 4844 + }, + { + "epoch": 0.65, + "grad_norm": 0.71875, + "learning_rate": 0.0001976958584453714, + "loss": 0.4285, + "step": 4845 + }, + { + "epoch": 0.65, + "grad_norm": 0.78515625, + "learning_rate": 0.0001976933723990362, + "loss": 0.4768, + "step": 4846 + }, + { + "epoch": 0.65, + "grad_norm": 0.6171875, + "learning_rate": 0.00019769088502791617, + "loss": 0.5108, + "step": 4847 + }, + { + "epoch": 0.65, + "grad_norm": 0.640625, + "learning_rate": 0.00019768839633204505, + "loss": 0.3644, + "step": 4848 + }, + { + "epoch": 0.65, + "grad_norm": 0.55078125, + "learning_rate": 0.00019768590631145656, + "loss": 0.5364, + "step": 4849 + }, + { + "epoch": 0.65, + "grad_norm": 0.59375, + "learning_rate": 0.0001976834149661845, + "loss": 0.5753, + "step": 4850 + }, + { + "epoch": 0.65, + "grad_norm": 0.859375, + "learning_rate": 0.00019768092229626263, + "loss": 0.6416, + "step": 4851 + }, + { + "epoch": 0.65, + "grad_norm": 0.73828125, + "learning_rate": 0.00019767842830172475, + "loss": 0.5849, + "step": 4852 + }, + { + "epoch": 0.65, + "grad_norm": 0.72265625, + "learning_rate": 0.00019767593298260474, + "loss": 0.6303, + "step": 4853 + }, + { + "epoch": 0.65, + "grad_norm": 0.6796875, + "learning_rate": 0.00019767343633893635, + "loss": 0.5827, + "step": 4854 + }, + { + "epoch": 0.65, + "grad_norm": 0.65234375, + "learning_rate": 0.00019767093837075347, + "loss": 0.4666, + "step": 4855 + }, + { + "epoch": 0.65, + "grad_norm": 0.67578125, + "learning_rate": 0.00019766843907809, + "loss": 0.5743, + "step": 4856 + }, + { + "epoch": 0.65, + "grad_norm": 0.78515625, + "learning_rate": 0.00019766593846097982, + "loss": 0.4998, + "step": 4857 + }, + { + "epoch": 0.65, + "grad_norm": 0.75, + "learning_rate": 0.00019766343651945683, + "loss": 0.6221, + "step": 4858 + }, + { + "epoch": 0.65, + "grad_norm": 0.96875, + "learning_rate": 0.00019766093325355495, + "loss": 0.5674, + "step": 4859 + }, + { + "epoch": 0.65, + "grad_norm": 0.6484375, + "learning_rate": 0.00019765842866330816, + "loss": 0.5635, + "step": 4860 + }, + { + "epoch": 0.65, + "grad_norm": 0.734375, + "learning_rate": 0.00019765592274875035, + "loss": 0.619, + "step": 4861 + }, + { + "epoch": 0.65, + "grad_norm": 0.69140625, + "learning_rate": 0.0001976534155099156, + "loss": 0.4313, + "step": 4862 + }, + { + "epoch": 0.65, + "grad_norm": 0.72265625, + "learning_rate": 0.00019765090694683784, + "loss": 0.392, + "step": 4863 + }, + { + "epoch": 0.65, + "grad_norm": 0.71875, + "learning_rate": 0.00019764839705955112, + "loss": 0.2854, + "step": 4864 + }, + { + "epoch": 0.65, + "grad_norm": 0.8046875, + "learning_rate": 0.00019764588584808944, + "loss": 0.4624, + "step": 4865 + }, + { + "epoch": 0.65, + "grad_norm": 0.63671875, + "learning_rate": 0.0001976433733124869, + "loss": 0.3597, + "step": 4866 + }, + { + "epoch": 0.65, + "grad_norm": 0.69921875, + "learning_rate": 0.00019764085945277757, + "loss": 0.3101, + "step": 4867 + }, + { + "epoch": 0.65, + "grad_norm": 0.60546875, + "learning_rate": 0.00019763834426899553, + "loss": 0.4024, + "step": 4868 + }, + { + "epoch": 0.65, + "grad_norm": 0.466796875, + "learning_rate": 0.00019763582776117483, + "loss": 0.2403, + "step": 4869 + }, + { + "epoch": 0.65, + "grad_norm": 0.6796875, + "learning_rate": 0.00019763330992934966, + "loss": 0.3482, + "step": 4870 + }, + { + "epoch": 0.65, + "grad_norm": 0.90234375, + "learning_rate": 0.00019763079077355416, + "loss": 0.3382, + "step": 4871 + }, + { + "epoch": 0.65, + "grad_norm": 0.578125, + "learning_rate": 0.00019762827029382247, + "loss": 0.2188, + "step": 4872 + }, + { + "epoch": 0.65, + "grad_norm": 0.54296875, + "learning_rate": 0.00019762574849018877, + "loss": 0.358, + "step": 4873 + }, + { + "epoch": 0.65, + "grad_norm": 0.671875, + "learning_rate": 0.00019762322536268727, + "loss": 0.539, + "step": 4874 + }, + { + "epoch": 0.65, + "grad_norm": 0.6640625, + "learning_rate": 0.00019762070091135217, + "loss": 0.5918, + "step": 4875 + }, + { + "epoch": 0.65, + "grad_norm": 0.5703125, + "learning_rate": 0.00019761817513621775, + "loss": 0.3022, + "step": 4876 + }, + { + "epoch": 0.65, + "grad_norm": 0.7734375, + "learning_rate": 0.00019761564803731817, + "loss": 0.5448, + "step": 4877 + }, + { + "epoch": 0.65, + "grad_norm": 0.84375, + "learning_rate": 0.0001976131196146878, + "loss": 0.3897, + "step": 4878 + }, + { + "epoch": 0.65, + "grad_norm": 1.0, + "learning_rate": 0.00019761058986836086, + "loss": 0.3749, + "step": 4879 + }, + { + "epoch": 0.65, + "grad_norm": 0.6171875, + "learning_rate": 0.00019760805879837169, + "loss": 0.2857, + "step": 4880 + }, + { + "epoch": 0.65, + "grad_norm": 0.62890625, + "learning_rate": 0.0001976055264047546, + "loss": 0.4067, + "step": 4881 + }, + { + "epoch": 0.65, + "grad_norm": 0.625, + "learning_rate": 0.00019760299268754388, + "loss": 0.4378, + "step": 4882 + }, + { + "epoch": 0.65, + "grad_norm": 0.8203125, + "learning_rate": 0.00019760045764677397, + "loss": 0.4249, + "step": 4883 + }, + { + "epoch": 0.65, + "grad_norm": 0.80078125, + "learning_rate": 0.00019759792128247922, + "loss": 0.4353, + "step": 4884 + }, + { + "epoch": 0.65, + "grad_norm": 0.4765625, + "learning_rate": 0.00019759538359469402, + "loss": 0.3409, + "step": 4885 + }, + { + "epoch": 0.65, + "grad_norm": 0.765625, + "learning_rate": 0.00019759284458345278, + "loss": 0.295, + "step": 4886 + }, + { + "epoch": 0.65, + "grad_norm": 0.71484375, + "learning_rate": 0.00019759030424878994, + "loss": 0.5585, + "step": 4887 + }, + { + "epoch": 0.65, + "grad_norm": 0.8125, + "learning_rate": 0.00019758776259073992, + "loss": 0.6856, + "step": 4888 + }, + { + "epoch": 0.65, + "grad_norm": 0.859375, + "learning_rate": 0.0001975852196093372, + "loss": 0.5348, + "step": 4889 + }, + { + "epoch": 0.65, + "grad_norm": 0.80078125, + "learning_rate": 0.0001975826753046163, + "loss": 0.3715, + "step": 4890 + }, + { + "epoch": 0.65, + "grad_norm": 0.74609375, + "learning_rate": 0.0001975801296766117, + "loss": 0.7196, + "step": 4891 + }, + { + "epoch": 0.65, + "grad_norm": 0.609375, + "learning_rate": 0.00019757758272535793, + "loss": 0.3776, + "step": 4892 + }, + { + "epoch": 0.65, + "grad_norm": 0.90234375, + "learning_rate": 0.00019757503445088945, + "loss": 0.329, + "step": 4893 + }, + { + "epoch": 0.65, + "grad_norm": 0.4921875, + "learning_rate": 0.00019757248485324094, + "loss": 0.4113, + "step": 4894 + }, + { + "epoch": 0.65, + "grad_norm": 0.640625, + "learning_rate": 0.00019756993393244687, + "loss": 0.2915, + "step": 4895 + }, + { + "epoch": 0.65, + "grad_norm": 0.73046875, + "learning_rate": 0.00019756738168854188, + "loss": 0.5464, + "step": 4896 + }, + { + "epoch": 0.65, + "grad_norm": 0.484375, + "learning_rate": 0.00019756482812156062, + "loss": 0.324, + "step": 4897 + }, + { + "epoch": 0.65, + "grad_norm": 0.53125, + "learning_rate": 0.00019756227323153763, + "loss": 0.4705, + "step": 4898 + }, + { + "epoch": 0.65, + "grad_norm": 0.87109375, + "learning_rate": 0.00019755971701850765, + "loss": 0.393, + "step": 4899 + }, + { + "epoch": 0.65, + "grad_norm": 0.65625, + "learning_rate": 0.00019755715948250525, + "loss": 0.587, + "step": 4900 + }, + { + "epoch": 0.65, + "grad_norm": 0.828125, + "learning_rate": 0.00019755460062356517, + "loss": 0.3846, + "step": 4901 + }, + { + "epoch": 0.65, + "grad_norm": 0.60546875, + "learning_rate": 0.00019755204044172206, + "loss": 0.7162, + "step": 4902 + }, + { + "epoch": 0.65, + "grad_norm": 0.58984375, + "learning_rate": 0.0001975494789370107, + "loss": 0.6255, + "step": 4903 + }, + { + "epoch": 0.65, + "grad_norm": 0.84765625, + "learning_rate": 0.00019754691610946582, + "loss": 0.4159, + "step": 4904 + }, + { + "epoch": 0.65, + "grad_norm": 0.9296875, + "learning_rate": 0.00019754435195912213, + "loss": 0.5848, + "step": 4905 + }, + { + "epoch": 0.65, + "grad_norm": 0.6484375, + "learning_rate": 0.00019754178648601441, + "loss": 0.41, + "step": 4906 + }, + { + "epoch": 0.65, + "grad_norm": 0.875, + "learning_rate": 0.0001975392196901775, + "loss": 0.6849, + "step": 4907 + }, + { + "epoch": 0.65, + "grad_norm": 0.62109375, + "learning_rate": 0.00019753665157164617, + "loss": 0.4907, + "step": 4908 + }, + { + "epoch": 0.66, + "grad_norm": 0.74609375, + "learning_rate": 0.0001975340821304552, + "loss": 0.4895, + "step": 4909 + }, + { + "epoch": 0.66, + "grad_norm": 0.59375, + "learning_rate": 0.00019753151136663952, + "loss": 0.2698, + "step": 4910 + }, + { + "epoch": 0.66, + "grad_norm": 0.765625, + "learning_rate": 0.00019752893928023392, + "loss": 0.5896, + "step": 4911 + }, + { + "epoch": 0.66, + "grad_norm": 0.8984375, + "learning_rate": 0.00019752636587127333, + "loss": 0.6533, + "step": 4912 + }, + { + "epoch": 0.66, + "grad_norm": 0.53125, + "learning_rate": 0.00019752379113979263, + "loss": 0.5644, + "step": 4913 + }, + { + "epoch": 0.66, + "grad_norm": 0.56640625, + "learning_rate": 0.00019752121508582673, + "loss": 0.5974, + "step": 4914 + }, + { + "epoch": 0.66, + "grad_norm": 0.76171875, + "learning_rate": 0.00019751863770941058, + "loss": 0.4626, + "step": 4915 + }, + { + "epoch": 0.66, + "grad_norm": 0.69140625, + "learning_rate": 0.00019751605901057907, + "loss": 0.5464, + "step": 4916 + }, + { + "epoch": 0.66, + "grad_norm": 0.6953125, + "learning_rate": 0.00019751347898936726, + "loss": 0.6217, + "step": 4917 + }, + { + "epoch": 0.66, + "grad_norm": 0.765625, + "learning_rate": 0.00019751089764581008, + "loss": 0.669, + "step": 4918 + }, + { + "epoch": 0.66, + "grad_norm": 0.796875, + "learning_rate": 0.00019750831497994254, + "loss": 0.6627, + "step": 4919 + }, + { + "epoch": 0.66, + "grad_norm": 0.54296875, + "learning_rate": 0.00019750573099179967, + "loss": 0.5108, + "step": 4920 + }, + { + "epoch": 0.66, + "grad_norm": 0.8671875, + "learning_rate": 0.0001975031456814165, + "loss": 0.4505, + "step": 4921 + }, + { + "epoch": 0.66, + "grad_norm": 0.74609375, + "learning_rate": 0.0001975005590488281, + "loss": 0.4393, + "step": 4922 + }, + { + "epoch": 0.66, + "grad_norm": 0.52734375, + "learning_rate": 0.00019749797109406956, + "loss": 0.3228, + "step": 4923 + }, + { + "epoch": 0.66, + "grad_norm": 0.68359375, + "learning_rate": 0.00019749538181717594, + "loss": 0.5216, + "step": 4924 + }, + { + "epoch": 0.66, + "grad_norm": 0.62109375, + "learning_rate": 0.00019749279121818235, + "loss": 0.7536, + "step": 4925 + }, + { + "epoch": 0.66, + "grad_norm": 0.72265625, + "learning_rate": 0.000197490199297124, + "loss": 0.3647, + "step": 4926 + }, + { + "epoch": 0.66, + "grad_norm": 0.82421875, + "learning_rate": 0.00019748760605403594, + "loss": 0.8114, + "step": 4927 + }, + { + "epoch": 0.66, + "grad_norm": 0.609375, + "learning_rate": 0.00019748501148895338, + "loss": 0.6325, + "step": 4928 + }, + { + "epoch": 0.66, + "grad_norm": 0.796875, + "learning_rate": 0.0001974824156019115, + "loss": 0.4168, + "step": 4929 + }, + { + "epoch": 0.66, + "grad_norm": 0.546875, + "learning_rate": 0.0001974798183929455, + "loss": 0.3546, + "step": 4930 + }, + { + "epoch": 0.66, + "grad_norm": 0.6953125, + "learning_rate": 0.00019747721986209058, + "loss": 0.4992, + "step": 4931 + }, + { + "epoch": 0.66, + "grad_norm": 0.9296875, + "learning_rate": 0.00019747462000938205, + "loss": 0.406, + "step": 4932 + }, + { + "epoch": 0.66, + "grad_norm": 0.6953125, + "learning_rate": 0.00019747201883485506, + "loss": 0.4936, + "step": 4933 + }, + { + "epoch": 0.66, + "grad_norm": 0.443359375, + "learning_rate": 0.00019746941633854497, + "loss": 0.3883, + "step": 4934 + }, + { + "epoch": 0.66, + "grad_norm": 0.50390625, + "learning_rate": 0.00019746681252048702, + "loss": 0.4988, + "step": 4935 + }, + { + "epoch": 0.66, + "grad_norm": 0.4765625, + "learning_rate": 0.00019746420738071655, + "loss": 0.2918, + "step": 4936 + }, + { + "epoch": 0.66, + "grad_norm": 0.91015625, + "learning_rate": 0.00019746160091926887, + "loss": 0.7144, + "step": 4937 + }, + { + "epoch": 0.66, + "grad_norm": 0.65234375, + "learning_rate": 0.00019745899313617935, + "loss": 0.416, + "step": 4938 + }, + { + "epoch": 0.66, + "grad_norm": 0.671875, + "learning_rate": 0.00019745638403148333, + "loss": 0.3415, + "step": 4939 + }, + { + "epoch": 0.66, + "grad_norm": 0.625, + "learning_rate": 0.00019745377360521618, + "loss": 0.6626, + "step": 4940 + }, + { + "epoch": 0.66, + "grad_norm": 0.859375, + "learning_rate": 0.0001974511618574133, + "loss": 0.4628, + "step": 4941 + }, + { + "epoch": 0.66, + "grad_norm": 0.6796875, + "learning_rate": 0.00019744854878811014, + "loss": 0.4254, + "step": 4942 + }, + { + "epoch": 0.66, + "grad_norm": 0.87109375, + "learning_rate": 0.0001974459343973421, + "loss": 0.426, + "step": 4943 + }, + { + "epoch": 0.66, + "grad_norm": 0.76171875, + "learning_rate": 0.00019744331868514468, + "loss": 0.6941, + "step": 4944 + }, + { + "epoch": 0.66, + "grad_norm": 0.71484375, + "learning_rate": 0.0001974407016515533, + "loss": 0.5369, + "step": 4945 + }, + { + "epoch": 0.66, + "grad_norm": 0.80859375, + "learning_rate": 0.0001974380832966035, + "loss": 0.4879, + "step": 4946 + }, + { + "epoch": 0.66, + "grad_norm": 0.74609375, + "learning_rate": 0.0001974354636203307, + "loss": 0.6444, + "step": 4947 + }, + { + "epoch": 0.66, + "grad_norm": 0.85546875, + "learning_rate": 0.0001974328426227705, + "loss": 0.2676, + "step": 4948 + }, + { + "epoch": 0.66, + "grad_norm": 0.57421875, + "learning_rate": 0.00019743022030395846, + "loss": 0.7626, + "step": 4949 + }, + { + "epoch": 0.66, + "grad_norm": 0.68359375, + "learning_rate": 0.00019742759666393006, + "loss": 0.3918, + "step": 4950 + }, + { + "epoch": 0.66, + "grad_norm": 0.6171875, + "learning_rate": 0.00019742497170272094, + "loss": 0.6934, + "step": 4951 + }, + { + "epoch": 0.66, + "grad_norm": 0.72265625, + "learning_rate": 0.00019742234542036665, + "loss": 0.4373, + "step": 4952 + }, + { + "epoch": 0.66, + "grad_norm": 0.51953125, + "learning_rate": 0.00019741971781690287, + "loss": 0.2719, + "step": 4953 + }, + { + "epoch": 0.66, + "grad_norm": 0.74609375, + "learning_rate": 0.00019741708889236517, + "loss": 0.2866, + "step": 4954 + }, + { + "epoch": 0.66, + "grad_norm": 0.65625, + "learning_rate": 0.00019741445864678923, + "loss": 0.3097, + "step": 4955 + }, + { + "epoch": 0.66, + "grad_norm": 0.5625, + "learning_rate": 0.0001974118270802107, + "loss": 0.4347, + "step": 4956 + }, + { + "epoch": 0.66, + "grad_norm": 0.70703125, + "learning_rate": 0.0001974091941926653, + "loss": 0.6631, + "step": 4957 + }, + { + "epoch": 0.66, + "grad_norm": 0.69921875, + "learning_rate": 0.0001974065599841887, + "loss": 0.6871, + "step": 4958 + }, + { + "epoch": 0.66, + "grad_norm": 0.6796875, + "learning_rate": 0.00019740392445481662, + "loss": 0.5043, + "step": 4959 + }, + { + "epoch": 0.66, + "grad_norm": 0.77734375, + "learning_rate": 0.0001974012876045848, + "loss": 0.6037, + "step": 4960 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 0.000197398649433529, + "loss": 0.7534, + "step": 4961 + }, + { + "epoch": 0.66, + "grad_norm": 0.7890625, + "learning_rate": 0.00019739600994168505, + "loss": 0.322, + "step": 4962 + }, + { + "epoch": 0.66, + "grad_norm": 0.6015625, + "learning_rate": 0.0001973933691290887, + "loss": 0.399, + "step": 4963 + }, + { + "epoch": 0.66, + "grad_norm": 0.515625, + "learning_rate": 0.00019739072699577573, + "loss": 0.3913, + "step": 4964 + }, + { + "epoch": 0.66, + "grad_norm": 0.88671875, + "learning_rate": 0.00019738808354178197, + "loss": 0.3093, + "step": 4965 + }, + { + "epoch": 0.66, + "grad_norm": 0.53125, + "learning_rate": 0.00019738543876714334, + "loss": 0.4579, + "step": 4966 + }, + { + "epoch": 0.66, + "grad_norm": 0.55859375, + "learning_rate": 0.00019738279267189564, + "loss": 0.3216, + "step": 4967 + }, + { + "epoch": 0.66, + "grad_norm": 0.5234375, + "learning_rate": 0.00019738014525607478, + "loss": 0.3276, + "step": 4968 + }, + { + "epoch": 0.66, + "grad_norm": 0.8671875, + "learning_rate": 0.00019737749651971664, + "loss": 0.6157, + "step": 4969 + }, + { + "epoch": 0.66, + "grad_norm": 0.9375, + "learning_rate": 0.0001973748464628572, + "loss": 0.6743, + "step": 4970 + }, + { + "epoch": 0.66, + "grad_norm": 0.73828125, + "learning_rate": 0.00019737219508553228, + "loss": 0.576, + "step": 4971 + }, + { + "epoch": 0.66, + "grad_norm": 0.71484375, + "learning_rate": 0.00019736954238777792, + "loss": 0.6234, + "step": 4972 + }, + { + "epoch": 0.66, + "grad_norm": 0.62890625, + "learning_rate": 0.00019736688836963006, + "loss": 0.4107, + "step": 4973 + }, + { + "epoch": 0.66, + "grad_norm": 0.75390625, + "learning_rate": 0.00019736423303112473, + "loss": 0.575, + "step": 4974 + }, + { + "epoch": 0.66, + "grad_norm": 0.8828125, + "learning_rate": 0.00019736157637229794, + "loss": 0.5045, + "step": 4975 + }, + { + "epoch": 0.66, + "grad_norm": 0.57421875, + "learning_rate": 0.00019735891839318563, + "loss": 0.4621, + "step": 4976 + }, + { + "epoch": 0.66, + "grad_norm": 0.671875, + "learning_rate": 0.0001973562590938239, + "loss": 0.5524, + "step": 4977 + }, + { + "epoch": 0.66, + "grad_norm": 1.0625, + "learning_rate": 0.00019735359847424886, + "loss": 0.6282, + "step": 4978 + }, + { + "epoch": 0.66, + "grad_norm": 0.6484375, + "learning_rate": 0.00019735093653449655, + "loss": 0.4321, + "step": 4979 + }, + { + "epoch": 0.66, + "grad_norm": 0.83203125, + "learning_rate": 0.000197348273274603, + "loss": 0.3176, + "step": 4980 + }, + { + "epoch": 0.66, + "grad_norm": 0.6484375, + "learning_rate": 0.00019734560869460443, + "loss": 0.4599, + "step": 4981 + }, + { + "epoch": 0.66, + "grad_norm": 0.58984375, + "learning_rate": 0.00019734294279453692, + "loss": 0.6192, + "step": 4982 + }, + { + "epoch": 0.66, + "grad_norm": 0.71484375, + "learning_rate": 0.00019734027557443665, + "loss": 0.3972, + "step": 4983 + }, + { + "epoch": 0.67, + "grad_norm": 0.93359375, + "learning_rate": 0.00019733760703433974, + "loss": 0.5202, + "step": 4984 + }, + { + "epoch": 0.67, + "grad_norm": 0.6875, + "learning_rate": 0.00019733493717428245, + "loss": 0.6726, + "step": 4985 + }, + { + "epoch": 0.67, + "grad_norm": 1.3203125, + "learning_rate": 0.00019733226599430087, + "loss": 0.8523, + "step": 4986 + }, + { + "epoch": 0.67, + "grad_norm": 0.6953125, + "learning_rate": 0.00019732959349443136, + "loss": 0.4985, + "step": 4987 + }, + { + "epoch": 0.67, + "grad_norm": 0.61328125, + "learning_rate": 0.00019732691967471006, + "loss": 0.3506, + "step": 4988 + }, + { + "epoch": 0.67, + "grad_norm": 0.84765625, + "learning_rate": 0.00019732424453517328, + "loss": 0.6694, + "step": 4989 + }, + { + "epoch": 0.67, + "grad_norm": 0.578125, + "learning_rate": 0.0001973215680758573, + "loss": 0.5415, + "step": 4990 + }, + { + "epoch": 0.67, + "grad_norm": 0.6328125, + "learning_rate": 0.0001973188902967984, + "loss": 0.4063, + "step": 4991 + }, + { + "epoch": 0.67, + "grad_norm": 0.61328125, + "learning_rate": 0.00019731621119803286, + "loss": 0.3392, + "step": 4992 + }, + { + "epoch": 0.67, + "grad_norm": 0.6953125, + "learning_rate": 0.00019731353077959705, + "loss": 0.5562, + "step": 4993 + }, + { + "epoch": 0.67, + "grad_norm": 0.546875, + "learning_rate": 0.0001973108490415273, + "loss": 0.206, + "step": 4994 + }, + { + "epoch": 0.67, + "grad_norm": 0.7421875, + "learning_rate": 0.00019730816598386, + "loss": 0.762, + "step": 4995 + }, + { + "epoch": 0.67, + "grad_norm": 0.70703125, + "learning_rate": 0.0001973054816066315, + "loss": 0.6265, + "step": 4996 + }, + { + "epoch": 0.67, + "grad_norm": 0.7734375, + "learning_rate": 0.0001973027959098782, + "loss": 0.5952, + "step": 4997 + }, + { + "epoch": 0.67, + "grad_norm": 0.7421875, + "learning_rate": 0.0001973001088936366, + "loss": 0.5839, + "step": 4998 + }, + { + "epoch": 0.67, + "grad_norm": 0.9765625, + "learning_rate": 0.00019729742055794302, + "loss": 0.294, + "step": 4999 + }, + { + "epoch": 0.67, + "grad_norm": 0.56640625, + "learning_rate": 0.00019729473090283398, + "loss": 0.3592, + "step": 5000 + }, + { + "epoch": 0.67, + "grad_norm": 0.75390625, + "learning_rate": 0.00019729203992834598, + "loss": 0.5929, + "step": 5001 + }, + { + "epoch": 0.67, + "grad_norm": 0.546875, + "learning_rate": 0.00019728934763451545, + "loss": 0.5368, + "step": 5002 + }, + { + "epoch": 0.67, + "grad_norm": 0.8984375, + "learning_rate": 0.0001972866540213789, + "loss": 0.6459, + "step": 5003 + }, + { + "epoch": 0.67, + "grad_norm": 0.89453125, + "learning_rate": 0.00019728395908897296, + "loss": 0.4078, + "step": 5004 + }, + { + "epoch": 0.67, + "grad_norm": 0.640625, + "learning_rate": 0.00019728126283733403, + "loss": 0.4686, + "step": 5005 + }, + { + "epoch": 0.67, + "grad_norm": 0.72265625, + "learning_rate": 0.00019727856526649876, + "loss": 0.647, + "step": 5006 + }, + { + "epoch": 0.67, + "grad_norm": 0.5078125, + "learning_rate": 0.00019727586637650373, + "loss": 0.6002, + "step": 5007 + }, + { + "epoch": 0.67, + "grad_norm": 0.81640625, + "learning_rate": 0.0001972731661673855, + "loss": 0.3789, + "step": 5008 + }, + { + "epoch": 0.67, + "grad_norm": 0.5703125, + "learning_rate": 0.00019727046463918071, + "loss": 0.6069, + "step": 5009 + }, + { + "epoch": 0.67, + "grad_norm": 0.5625, + "learning_rate": 0.00019726776179192598, + "loss": 0.3866, + "step": 5010 + }, + { + "epoch": 0.67, + "grad_norm": 0.7109375, + "learning_rate": 0.000197265057625658, + "loss": 0.5587, + "step": 5011 + }, + { + "epoch": 0.67, + "grad_norm": 0.7421875, + "learning_rate": 0.00019726235214041342, + "loss": 0.2896, + "step": 5012 + }, + { + "epoch": 0.67, + "grad_norm": 0.6640625, + "learning_rate": 0.0001972596453362289, + "loss": 0.445, + "step": 5013 + }, + { + "epoch": 0.67, + "grad_norm": 0.7265625, + "learning_rate": 0.00019725693721314117, + "loss": 0.4851, + "step": 5014 + }, + { + "epoch": 0.67, + "grad_norm": 0.8125, + "learning_rate": 0.00019725422777118695, + "loss": 0.4042, + "step": 5015 + }, + { + "epoch": 0.67, + "grad_norm": 0.66796875, + "learning_rate": 0.000197251517010403, + "loss": 0.4559, + "step": 5016 + }, + { + "epoch": 0.67, + "grad_norm": 0.8046875, + "learning_rate": 0.00019724880493082606, + "loss": 0.3265, + "step": 5017 + }, + { + "epoch": 0.67, + "grad_norm": 0.5703125, + "learning_rate": 0.00019724609153249292, + "loss": 0.4034, + "step": 5018 + }, + { + "epoch": 0.67, + "grad_norm": 0.73046875, + "learning_rate": 0.00019724337681544034, + "loss": 0.3511, + "step": 5019 + }, + { + "epoch": 0.67, + "grad_norm": 0.6171875, + "learning_rate": 0.00019724066077970518, + "loss": 0.5885, + "step": 5020 + }, + { + "epoch": 0.67, + "grad_norm": 0.60546875, + "learning_rate": 0.00019723794342532425, + "loss": 0.2012, + "step": 5021 + }, + { + "epoch": 0.67, + "grad_norm": 0.625, + "learning_rate": 0.0001972352247523344, + "loss": 0.6054, + "step": 5022 + }, + { + "epoch": 0.67, + "grad_norm": 0.53515625, + "learning_rate": 0.00019723250476077247, + "loss": 0.31, + "step": 5023 + }, + { + "epoch": 0.67, + "grad_norm": 0.67578125, + "learning_rate": 0.0001972297834506754, + "loss": 0.4673, + "step": 5024 + }, + { + "epoch": 0.67, + "grad_norm": 0.70703125, + "learning_rate": 0.00019722706082208005, + "loss": 0.5244, + "step": 5025 + }, + { + "epoch": 0.67, + "grad_norm": 0.82421875, + "learning_rate": 0.00019722433687502337, + "loss": 0.4435, + "step": 5026 + }, + { + "epoch": 0.67, + "grad_norm": 0.6640625, + "learning_rate": 0.00019722161160954226, + "loss": 0.5744, + "step": 5027 + }, + { + "epoch": 0.67, + "grad_norm": 0.73828125, + "learning_rate": 0.00019721888502567374, + "loss": 0.7318, + "step": 5028 + }, + { + "epoch": 0.67, + "grad_norm": 0.80859375, + "learning_rate": 0.0001972161571234547, + "loss": 0.278, + "step": 5029 + }, + { + "epoch": 0.67, + "grad_norm": 0.68359375, + "learning_rate": 0.0001972134279029222, + "loss": 0.6182, + "step": 5030 + }, + { + "epoch": 0.67, + "grad_norm": 0.8671875, + "learning_rate": 0.00019721069736411325, + "loss": 0.6931, + "step": 5031 + }, + { + "epoch": 0.67, + "grad_norm": 0.87109375, + "learning_rate": 0.00019720796550706484, + "loss": 0.3853, + "step": 5032 + }, + { + "epoch": 0.67, + "grad_norm": 0.7265625, + "learning_rate": 0.00019720523233181402, + "loss": 0.4377, + "step": 5033 + }, + { + "epoch": 0.67, + "grad_norm": 0.640625, + "learning_rate": 0.00019720249783839787, + "loss": 0.6053, + "step": 5034 + }, + { + "epoch": 0.67, + "grad_norm": 0.68359375, + "learning_rate": 0.00019719976202685344, + "loss": 0.4901, + "step": 5035 + }, + { + "epoch": 0.67, + "grad_norm": 0.5859375, + "learning_rate": 0.00019719702489721786, + "loss": 0.4718, + "step": 5036 + }, + { + "epoch": 0.67, + "grad_norm": 0.58984375, + "learning_rate": 0.00019719428644952828, + "loss": 0.5147, + "step": 5037 + }, + { + "epoch": 0.67, + "grad_norm": 0.578125, + "learning_rate": 0.00019719154668382176, + "loss": 0.5993, + "step": 5038 + }, + { + "epoch": 0.67, + "grad_norm": 0.75, + "learning_rate": 0.0001971888056001355, + "loss": 0.8455, + "step": 5039 + }, + { + "epoch": 0.67, + "grad_norm": 0.6796875, + "learning_rate": 0.00019718606319850664, + "loss": 0.5951, + "step": 5040 + }, + { + "epoch": 0.67, + "grad_norm": 0.65625, + "learning_rate": 0.00019718331947897244, + "loss": 0.4584, + "step": 5041 + }, + { + "epoch": 0.67, + "grad_norm": 0.703125, + "learning_rate": 0.00019718057444157, + "loss": 0.3443, + "step": 5042 + }, + { + "epoch": 0.67, + "grad_norm": 0.61328125, + "learning_rate": 0.00019717782808633666, + "loss": 0.5712, + "step": 5043 + }, + { + "epoch": 0.67, + "grad_norm": 0.55859375, + "learning_rate": 0.00019717508041330956, + "loss": 0.3279, + "step": 5044 + }, + { + "epoch": 0.67, + "grad_norm": 1.0390625, + "learning_rate": 0.00019717233142252602, + "loss": 0.597, + "step": 5045 + }, + { + "epoch": 0.67, + "grad_norm": 0.69140625, + "learning_rate": 0.00019716958111402327, + "loss": 0.4848, + "step": 5046 + }, + { + "epoch": 0.67, + "grad_norm": 0.80859375, + "learning_rate": 0.0001971668294878387, + "loss": 0.5874, + "step": 5047 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 0.00019716407654400952, + "loss": 0.5075, + "step": 5048 + }, + { + "epoch": 0.67, + "grad_norm": 0.8203125, + "learning_rate": 0.00019716132228257308, + "loss": 0.375, + "step": 5049 + }, + { + "epoch": 0.67, + "grad_norm": 0.52734375, + "learning_rate": 0.0001971585667035668, + "loss": 0.3696, + "step": 5050 + }, + { + "epoch": 0.67, + "grad_norm": 0.71484375, + "learning_rate": 0.00019715580980702796, + "loss": 0.6814, + "step": 5051 + }, + { + "epoch": 0.67, + "grad_norm": 0.51171875, + "learning_rate": 0.00019715305159299402, + "loss": 0.5462, + "step": 5052 + }, + { + "epoch": 0.67, + "grad_norm": 1.1171875, + "learning_rate": 0.00019715029206150234, + "loss": 0.4823, + "step": 5053 + }, + { + "epoch": 0.67, + "grad_norm": 0.93359375, + "learning_rate": 0.00019714753121259033, + "loss": 0.5945, + "step": 5054 + }, + { + "epoch": 0.67, + "grad_norm": 0.61328125, + "learning_rate": 0.00019714476904629543, + "loss": 0.2404, + "step": 5055 + }, + { + "epoch": 0.67, + "grad_norm": 0.76953125, + "learning_rate": 0.00019714200556265516, + "loss": 0.5174, + "step": 5056 + }, + { + "epoch": 0.67, + "grad_norm": 0.69140625, + "learning_rate": 0.00019713924076170694, + "loss": 0.7433, + "step": 5057 + }, + { + "epoch": 0.67, + "grad_norm": 0.640625, + "learning_rate": 0.00019713647464348827, + "loss": 0.3047, + "step": 5058 + }, + { + "epoch": 0.68, + "grad_norm": 0.5234375, + "learning_rate": 0.00019713370720803662, + "loss": 0.3952, + "step": 5059 + }, + { + "epoch": 0.68, + "grad_norm": 0.78125, + "learning_rate": 0.0001971309384553896, + "loss": 0.4846, + "step": 5060 + }, + { + "epoch": 0.68, + "grad_norm": 0.640625, + "learning_rate": 0.00019712816838558475, + "loss": 0.4856, + "step": 5061 + }, + { + "epoch": 0.68, + "grad_norm": 0.7578125, + "learning_rate": 0.00019712539699865954, + "loss": 0.6179, + "step": 5062 + }, + { + "epoch": 0.68, + "grad_norm": 0.59375, + "learning_rate": 0.00019712262429465166, + "loss": 0.4075, + "step": 5063 + }, + { + "epoch": 0.68, + "grad_norm": 0.7578125, + "learning_rate": 0.0001971198502735986, + "loss": 0.6685, + "step": 5064 + }, + { + "epoch": 0.68, + "grad_norm": 0.64453125, + "learning_rate": 0.00019711707493553807, + "loss": 0.1952, + "step": 5065 + }, + { + "epoch": 0.68, + "grad_norm": 0.58984375, + "learning_rate": 0.00019711429828050769, + "loss": 0.6003, + "step": 5066 + }, + { + "epoch": 0.68, + "grad_norm": 0.625, + "learning_rate": 0.00019711152030854509, + "loss": 0.5252, + "step": 5067 + }, + { + "epoch": 0.68, + "grad_norm": 0.92578125, + "learning_rate": 0.00019710874101968795, + "loss": 0.4998, + "step": 5068 + }, + { + "epoch": 0.68, + "grad_norm": 0.69921875, + "learning_rate": 0.00019710596041397395, + "loss": 0.544, + "step": 5069 + }, + { + "epoch": 0.68, + "grad_norm": 0.65625, + "learning_rate": 0.00019710317849144083, + "loss": 0.3717, + "step": 5070 + }, + { + "epoch": 0.68, + "grad_norm": 0.69140625, + "learning_rate": 0.00019710039525212624, + "loss": 0.4507, + "step": 5071 + }, + { + "epoch": 0.68, + "grad_norm": 0.68359375, + "learning_rate": 0.00019709761069606802, + "loss": 0.3536, + "step": 5072 + }, + { + "epoch": 0.68, + "grad_norm": 0.94921875, + "learning_rate": 0.00019709482482330385, + "loss": 0.4265, + "step": 5073 + }, + { + "epoch": 0.68, + "grad_norm": 0.60546875, + "learning_rate": 0.00019709203763387155, + "loss": 0.5774, + "step": 5074 + }, + { + "epoch": 0.68, + "grad_norm": 0.8515625, + "learning_rate": 0.00019708924912780891, + "loss": 0.467, + "step": 5075 + }, + { + "epoch": 0.68, + "grad_norm": 0.65234375, + "learning_rate": 0.00019708645930515374, + "loss": 0.5671, + "step": 5076 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 0.00019708366816594385, + "loss": 0.4491, + "step": 5077 + }, + { + "epoch": 0.68, + "grad_norm": 0.5, + "learning_rate": 0.00019708087571021715, + "loss": 0.4464, + "step": 5078 + }, + { + "epoch": 0.68, + "grad_norm": 0.91015625, + "learning_rate": 0.00019707808193801144, + "loss": 0.495, + "step": 5079 + }, + { + "epoch": 0.68, + "grad_norm": 0.6640625, + "learning_rate": 0.00019707528684936466, + "loss": 0.4905, + "step": 5080 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 0.00019707249044431468, + "loss": 0.6338, + "step": 5081 + }, + { + "epoch": 0.68, + "grad_norm": 0.72265625, + "learning_rate": 0.00019706969272289943, + "loss": 0.4031, + "step": 5082 + }, + { + "epoch": 0.68, + "grad_norm": 0.67578125, + "learning_rate": 0.00019706689368515683, + "loss": 0.5272, + "step": 5083 + }, + { + "epoch": 0.68, + "grad_norm": 0.8203125, + "learning_rate": 0.0001970640933311249, + "loss": 0.5582, + "step": 5084 + }, + { + "epoch": 0.68, + "grad_norm": 0.67578125, + "learning_rate": 0.0001970612916608415, + "loss": 0.3092, + "step": 5085 + }, + { + "epoch": 0.68, + "grad_norm": 0.5546875, + "learning_rate": 0.00019705848867434473, + "loss": 0.6271, + "step": 5086 + }, + { + "epoch": 0.68, + "grad_norm": 0.55859375, + "learning_rate": 0.00019705568437167258, + "loss": 0.4729, + "step": 5087 + }, + { + "epoch": 0.68, + "grad_norm": 1.09375, + "learning_rate": 0.00019705287875286303, + "loss": 0.2898, + "step": 5088 + }, + { + "epoch": 0.68, + "grad_norm": 0.69921875, + "learning_rate": 0.00019705007181795416, + "loss": 0.6542, + "step": 5089 + }, + { + "epoch": 0.68, + "grad_norm": 0.64453125, + "learning_rate": 0.00019704726356698402, + "loss": 0.4251, + "step": 5090 + }, + { + "epoch": 0.68, + "grad_norm": 0.63671875, + "learning_rate": 0.0001970444539999907, + "loss": 0.7071, + "step": 5091 + }, + { + "epoch": 0.68, + "grad_norm": 0.71484375, + "learning_rate": 0.00019704164311701235, + "loss": 0.6279, + "step": 5092 + }, + { + "epoch": 0.68, + "grad_norm": 0.75390625, + "learning_rate": 0.000197038830918087, + "loss": 0.586, + "step": 5093 + }, + { + "epoch": 0.68, + "grad_norm": 0.64453125, + "learning_rate": 0.00019703601740325282, + "loss": 0.4297, + "step": 5094 + }, + { + "epoch": 0.68, + "grad_norm": 0.65625, + "learning_rate": 0.00019703320257254798, + "loss": 0.532, + "step": 5095 + }, + { + "epoch": 0.68, + "grad_norm": 0.56640625, + "learning_rate": 0.0001970303864260106, + "loss": 0.3147, + "step": 5096 + }, + { + "epoch": 0.68, + "grad_norm": 0.6328125, + "learning_rate": 0.00019702756896367894, + "loss": 0.5533, + "step": 5097 + }, + { + "epoch": 0.68, + "grad_norm": 0.75390625, + "learning_rate": 0.0001970247501855912, + "loss": 0.3705, + "step": 5098 + }, + { + "epoch": 0.68, + "grad_norm": 0.439453125, + "learning_rate": 0.00019702193009178554, + "loss": 0.3783, + "step": 5099 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 0.00019701910868230024, + "loss": 0.56, + "step": 5100 + }, + { + "epoch": 0.68, + "grad_norm": 0.60546875, + "learning_rate": 0.00019701628595717357, + "loss": 0.3265, + "step": 5101 + }, + { + "epoch": 0.68, + "grad_norm": 0.80859375, + "learning_rate": 0.00019701346191644382, + "loss": 0.5176, + "step": 5102 + }, + { + "epoch": 0.68, + "grad_norm": 0.6875, + "learning_rate": 0.00019701063656014925, + "loss": 0.8322, + "step": 5103 + }, + { + "epoch": 0.68, + "grad_norm": 0.4453125, + "learning_rate": 0.00019700780988832816, + "loss": 0.4023, + "step": 5104 + }, + { + "epoch": 0.68, + "grad_norm": 0.6484375, + "learning_rate": 0.00019700498190101896, + "loss": 0.4929, + "step": 5105 + }, + { + "epoch": 0.68, + "grad_norm": 0.7421875, + "learning_rate": 0.0001970021525982599, + "loss": 0.6203, + "step": 5106 + }, + { + "epoch": 0.68, + "grad_norm": 0.515625, + "learning_rate": 0.0001969993219800894, + "loss": 0.6383, + "step": 5107 + }, + { + "epoch": 0.68, + "grad_norm": 0.76171875, + "learning_rate": 0.00019699649004654583, + "loss": 0.3716, + "step": 5108 + }, + { + "epoch": 0.68, + "grad_norm": 0.6171875, + "learning_rate": 0.00019699365679766765, + "loss": 0.6568, + "step": 5109 + }, + { + "epoch": 0.68, + "grad_norm": 0.8828125, + "learning_rate": 0.0001969908222334932, + "loss": 0.4478, + "step": 5110 + }, + { + "epoch": 0.68, + "grad_norm": 0.5078125, + "learning_rate": 0.00019698798635406096, + "loss": 0.2714, + "step": 5111 + }, + { + "epoch": 0.68, + "grad_norm": 1.15625, + "learning_rate": 0.00019698514915940938, + "loss": 0.9193, + "step": 5112 + }, + { + "epoch": 0.68, + "grad_norm": 0.484375, + "learning_rate": 0.00019698231064957692, + "loss": 0.4244, + "step": 5113 + }, + { + "epoch": 0.68, + "grad_norm": 0.703125, + "learning_rate": 0.00019697947082460208, + "loss": 0.4901, + "step": 5114 + }, + { + "epoch": 0.68, + "grad_norm": 0.55859375, + "learning_rate": 0.00019697662968452343, + "loss": 0.6804, + "step": 5115 + }, + { + "epoch": 0.68, + "grad_norm": 0.84765625, + "learning_rate": 0.0001969737872293794, + "loss": 0.2735, + "step": 5116 + }, + { + "epoch": 0.68, + "grad_norm": 1.109375, + "learning_rate": 0.00019697094345920855, + "loss": 0.2919, + "step": 5117 + }, + { + "epoch": 0.68, + "grad_norm": 1.0078125, + "learning_rate": 0.00019696809837404952, + "loss": 0.5616, + "step": 5118 + }, + { + "epoch": 0.68, + "grad_norm": 0.51171875, + "learning_rate": 0.0001969652519739408, + "loss": 0.5993, + "step": 5119 + }, + { + "epoch": 0.68, + "grad_norm": 0.82421875, + "learning_rate": 0.00019696240425892109, + "loss": 0.7603, + "step": 5120 + }, + { + "epoch": 0.68, + "grad_norm": 0.68359375, + "learning_rate": 0.0001969595552290289, + "loss": 0.7239, + "step": 5121 + }, + { + "epoch": 0.68, + "grad_norm": 0.71875, + "learning_rate": 0.00019695670488430292, + "loss": 0.4387, + "step": 5122 + }, + { + "epoch": 0.68, + "grad_norm": 0.890625, + "learning_rate": 0.0001969538532247818, + "loss": 0.3841, + "step": 5123 + }, + { + "epoch": 0.68, + "grad_norm": 0.546875, + "learning_rate": 0.0001969510002505042, + "loss": 0.4755, + "step": 5124 + }, + { + "epoch": 0.68, + "grad_norm": 0.6640625, + "learning_rate": 0.00019694814596150884, + "loss": 0.6461, + "step": 5125 + }, + { + "epoch": 0.68, + "grad_norm": 0.65234375, + "learning_rate": 0.00019694529035783437, + "loss": 0.4592, + "step": 5126 + }, + { + "epoch": 0.68, + "grad_norm": 0.65234375, + "learning_rate": 0.00019694243343951957, + "loss": 0.4499, + "step": 5127 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 0.00019693957520660314, + "loss": 0.3177, + "step": 5128 + }, + { + "epoch": 0.68, + "grad_norm": 0.6171875, + "learning_rate": 0.00019693671565912383, + "loss": 0.639, + "step": 5129 + }, + { + "epoch": 0.68, + "grad_norm": 0.86328125, + "learning_rate": 0.00019693385479712048, + "loss": 0.643, + "step": 5130 + }, + { + "epoch": 0.68, + "grad_norm": 0.69140625, + "learning_rate": 0.00019693099262063185, + "loss": 0.2948, + "step": 5131 + }, + { + "epoch": 0.68, + "grad_norm": 0.64453125, + "learning_rate": 0.0001969281291296967, + "loss": 0.4076, + "step": 5132 + }, + { + "epoch": 0.68, + "grad_norm": 0.4921875, + "learning_rate": 0.00019692526432435394, + "loss": 0.6048, + "step": 5133 + }, + { + "epoch": 0.69, + "grad_norm": 0.84765625, + "learning_rate": 0.00019692239820464242, + "loss": 0.6052, + "step": 5134 + }, + { + "epoch": 0.69, + "grad_norm": 0.6171875, + "learning_rate": 0.00019691953077060094, + "loss": 0.5546, + "step": 5135 + }, + { + "epoch": 0.69, + "grad_norm": 0.97265625, + "learning_rate": 0.00019691666202226844, + "loss": 0.7046, + "step": 5136 + }, + { + "epoch": 0.69, + "grad_norm": 0.74609375, + "learning_rate": 0.0001969137919596838, + "loss": 0.4283, + "step": 5137 + }, + { + "epoch": 0.69, + "grad_norm": 0.55859375, + "learning_rate": 0.00019691092058288592, + "loss": 0.5396, + "step": 5138 + }, + { + "epoch": 0.69, + "grad_norm": 0.77734375, + "learning_rate": 0.00019690804789191378, + "loss": 0.5362, + "step": 5139 + }, + { + "epoch": 0.69, + "grad_norm": 0.53515625, + "learning_rate": 0.0001969051738868063, + "loss": 0.6407, + "step": 5140 + }, + { + "epoch": 0.69, + "grad_norm": 0.6640625, + "learning_rate": 0.00019690229856760246, + "loss": 0.7439, + "step": 5141 + }, + { + "epoch": 0.69, + "grad_norm": 0.61328125, + "learning_rate": 0.0001968994219343413, + "loss": 0.5075, + "step": 5142 + }, + { + "epoch": 0.69, + "grad_norm": 0.6328125, + "learning_rate": 0.00019689654398706176, + "loss": 0.9348, + "step": 5143 + }, + { + "epoch": 0.69, + "grad_norm": 0.85546875, + "learning_rate": 0.00019689366472580293, + "loss": 0.7936, + "step": 5144 + }, + { + "epoch": 0.69, + "grad_norm": 0.640625, + "learning_rate": 0.0001968907841506038, + "loss": 0.4304, + "step": 5145 + }, + { + "epoch": 0.69, + "grad_norm": 0.77734375, + "learning_rate": 0.0001968879022615034, + "loss": 0.3897, + "step": 5146 + }, + { + "epoch": 0.69, + "grad_norm": 0.9296875, + "learning_rate": 0.00019688501905854096, + "loss": 0.7676, + "step": 5147 + }, + { + "epoch": 0.69, + "grad_norm": 0.76953125, + "learning_rate": 0.00019688213454175543, + "loss": 0.6948, + "step": 5148 + }, + { + "epoch": 0.69, + "grad_norm": 0.71484375, + "learning_rate": 0.00019687924871118596, + "loss": 0.5105, + "step": 5149 + }, + { + "epoch": 0.69, + "grad_norm": 0.68359375, + "learning_rate": 0.0001968763615668717, + "loss": 0.538, + "step": 5150 + }, + { + "epoch": 0.69, + "grad_norm": 0.703125, + "learning_rate": 0.00019687347310885182, + "loss": 0.5623, + "step": 5151 + }, + { + "epoch": 0.69, + "grad_norm": 0.66015625, + "learning_rate": 0.0001968705833371655, + "loss": 0.4658, + "step": 5152 + }, + { + "epoch": 0.69, + "grad_norm": 0.76953125, + "learning_rate": 0.00019686769225185185, + "loss": 0.5471, + "step": 5153 + }, + { + "epoch": 0.69, + "grad_norm": 0.66796875, + "learning_rate": 0.00019686479985295017, + "loss": 0.3185, + "step": 5154 + }, + { + "epoch": 0.69, + "grad_norm": 0.80859375, + "learning_rate": 0.0001968619061404996, + "loss": 0.5975, + "step": 5155 + }, + { + "epoch": 0.69, + "grad_norm": 0.55078125, + "learning_rate": 0.0001968590111145394, + "loss": 0.5996, + "step": 5156 + }, + { + "epoch": 0.69, + "grad_norm": 0.5703125, + "learning_rate": 0.0001968561147751089, + "loss": 0.3047, + "step": 5157 + }, + { + "epoch": 0.69, + "grad_norm": 0.4296875, + "learning_rate": 0.0001968532171222473, + "loss": 0.2699, + "step": 5158 + }, + { + "epoch": 0.69, + "grad_norm": 0.578125, + "learning_rate": 0.00019685031815599394, + "loss": 0.6146, + "step": 5159 + }, + { + "epoch": 0.69, + "grad_norm": 0.83203125, + "learning_rate": 0.00019684741787638808, + "loss": 0.4851, + "step": 5160 + }, + { + "epoch": 0.69, + "grad_norm": 0.84765625, + "learning_rate": 0.00019684451628346905, + "loss": 0.4066, + "step": 5161 + }, + { + "epoch": 0.69, + "grad_norm": 0.6484375, + "learning_rate": 0.00019684161337727628, + "loss": 0.4667, + "step": 5162 + }, + { + "epoch": 0.69, + "grad_norm": 0.7734375, + "learning_rate": 0.00019683870915784902, + "loss": 0.3541, + "step": 5163 + }, + { + "epoch": 0.69, + "grad_norm": 0.640625, + "learning_rate": 0.00019683580362522674, + "loss": 0.6931, + "step": 5164 + }, + { + "epoch": 0.69, + "grad_norm": 0.8125, + "learning_rate": 0.00019683289677944882, + "loss": 0.5995, + "step": 5165 + }, + { + "epoch": 0.69, + "grad_norm": 0.490234375, + "learning_rate": 0.00019682998862055468, + "loss": 0.3784, + "step": 5166 + }, + { + "epoch": 0.69, + "grad_norm": 0.8515625, + "learning_rate": 0.00019682707914858374, + "loss": 0.5189, + "step": 5167 + }, + { + "epoch": 0.69, + "grad_norm": 0.6328125, + "learning_rate": 0.00019682416836357548, + "loss": 0.4215, + "step": 5168 + }, + { + "epoch": 0.69, + "grad_norm": 0.58984375, + "learning_rate": 0.00019682125626556936, + "loss": 0.2761, + "step": 5169 + }, + { + "epoch": 0.69, + "grad_norm": 0.73828125, + "learning_rate": 0.00019681834285460484, + "loss": 0.2624, + "step": 5170 + }, + { + "epoch": 0.69, + "grad_norm": 0.61328125, + "learning_rate": 0.00019681542813072145, + "loss": 0.4586, + "step": 5171 + }, + { + "epoch": 0.69, + "grad_norm": 0.44921875, + "learning_rate": 0.00019681251209395872, + "loss": 0.5189, + "step": 5172 + }, + { + "epoch": 0.69, + "grad_norm": 0.59375, + "learning_rate": 0.00019680959474435621, + "loss": 0.6826, + "step": 5173 + }, + { + "epoch": 0.69, + "grad_norm": 0.8671875, + "learning_rate": 0.00019680667608195344, + "loss": 0.7222, + "step": 5174 + }, + { + "epoch": 0.69, + "grad_norm": 0.77734375, + "learning_rate": 0.00019680375610679006, + "loss": 0.4585, + "step": 5175 + }, + { + "epoch": 0.69, + "grad_norm": 0.84375, + "learning_rate": 0.00019680083481890556, + "loss": 0.8245, + "step": 5176 + }, + { + "epoch": 0.69, + "grad_norm": 0.55859375, + "learning_rate": 0.00019679791221833965, + "loss": 0.6222, + "step": 5177 + }, + { + "epoch": 0.69, + "grad_norm": 0.67578125, + "learning_rate": 0.0001967949883051319, + "loss": 0.4832, + "step": 5178 + }, + { + "epoch": 0.69, + "grad_norm": 0.7109375, + "learning_rate": 0.000196792063079322, + "loss": 0.4695, + "step": 5179 + }, + { + "epoch": 0.69, + "grad_norm": 0.6015625, + "learning_rate": 0.0001967891365409496, + "loss": 0.3348, + "step": 5180 + }, + { + "epoch": 0.69, + "grad_norm": 0.62109375, + "learning_rate": 0.0001967862086900544, + "loss": 0.3451, + "step": 5181 + }, + { + "epoch": 0.69, + "grad_norm": 0.62890625, + "learning_rate": 0.0001967832795266761, + "loss": 0.6394, + "step": 5182 + }, + { + "epoch": 0.69, + "grad_norm": 0.70703125, + "learning_rate": 0.0001967803490508544, + "loss": 0.8886, + "step": 5183 + }, + { + "epoch": 0.69, + "grad_norm": 0.5390625, + "learning_rate": 0.00019677741726262905, + "loss": 0.392, + "step": 5184 + }, + { + "epoch": 0.69, + "grad_norm": 0.73828125, + "learning_rate": 0.00019677448416203982, + "loss": 0.4988, + "step": 5185 + }, + { + "epoch": 0.69, + "grad_norm": 0.59375, + "learning_rate": 0.00019677154974912647, + "loss": 0.6296, + "step": 5186 + }, + { + "epoch": 0.69, + "grad_norm": 0.5390625, + "learning_rate": 0.00019676861402392879, + "loss": 0.4948, + "step": 5187 + }, + { + "epoch": 0.69, + "grad_norm": 0.56640625, + "learning_rate": 0.0001967656769864866, + "loss": 0.3323, + "step": 5188 + }, + { + "epoch": 0.69, + "grad_norm": 0.59375, + "learning_rate": 0.00019676273863683977, + "loss": 0.5113, + "step": 5189 + }, + { + "epoch": 0.69, + "grad_norm": 0.72265625, + "learning_rate": 0.00019675979897502806, + "loss": 0.3348, + "step": 5190 + }, + { + "epoch": 0.69, + "grad_norm": 0.87890625, + "learning_rate": 0.00019675685800109137, + "loss": 0.5396, + "step": 5191 + }, + { + "epoch": 0.69, + "grad_norm": 0.98828125, + "learning_rate": 0.0001967539157150696, + "loss": 0.5207, + "step": 5192 + }, + { + "epoch": 0.69, + "grad_norm": 0.390625, + "learning_rate": 0.00019675097211700263, + "loss": 0.2113, + "step": 5193 + }, + { + "epoch": 0.69, + "grad_norm": 0.56640625, + "learning_rate": 0.0001967480272069304, + "loss": 0.3363, + "step": 5194 + }, + { + "epoch": 0.69, + "grad_norm": 0.5234375, + "learning_rate": 0.00019674508098489282, + "loss": 0.4893, + "step": 5195 + }, + { + "epoch": 0.69, + "grad_norm": 0.58203125, + "learning_rate": 0.00019674213345092985, + "loss": 0.3595, + "step": 5196 + }, + { + "epoch": 0.69, + "grad_norm": 0.48828125, + "learning_rate": 0.00019673918460508144, + "loss": 0.4564, + "step": 5197 + }, + { + "epoch": 0.69, + "grad_norm": 0.86328125, + "learning_rate": 0.00019673623444738762, + "loss": 0.2788, + "step": 5198 + }, + { + "epoch": 0.69, + "grad_norm": 0.65234375, + "learning_rate": 0.00019673328297788837, + "loss": 0.4192, + "step": 5199 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 0.00019673033019662375, + "loss": 0.3608, + "step": 5200 + }, + { + "epoch": 0.69, + "grad_norm": 0.9375, + "learning_rate": 0.00019672737610363375, + "loss": 0.3753, + "step": 5201 + }, + { + "epoch": 0.69, + "grad_norm": 0.65625, + "learning_rate": 0.00019672442069895845, + "loss": 0.2246, + "step": 5202 + }, + { + "epoch": 0.69, + "grad_norm": 0.80859375, + "learning_rate": 0.00019672146398263794, + "loss": 0.6448, + "step": 5203 + }, + { + "epoch": 0.69, + "grad_norm": 0.625, + "learning_rate": 0.00019671850595471226, + "loss": 0.4963, + "step": 5204 + }, + { + "epoch": 0.69, + "grad_norm": 0.921875, + "learning_rate": 0.0001967155466152216, + "loss": 0.3408, + "step": 5205 + }, + { + "epoch": 0.69, + "grad_norm": 0.6640625, + "learning_rate": 0.00019671258596420604, + "loss": 0.7031, + "step": 5206 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 0.0001967096240017058, + "loss": 0.3832, + "step": 5207 + }, + { + "epoch": 0.69, + "grad_norm": 0.48828125, + "learning_rate": 0.0001967066607277609, + "loss": 0.4805, + "step": 5208 + }, + { + "epoch": 0.7, + "grad_norm": 0.55859375, + "learning_rate": 0.00019670369614241168, + "loss": 0.4189, + "step": 5209 + }, + { + "epoch": 0.7, + "grad_norm": 1.015625, + "learning_rate": 0.00019670073024569829, + "loss": 0.3174, + "step": 5210 + }, + { + "epoch": 0.7, + "grad_norm": 0.69140625, + "learning_rate": 0.00019669776303766088, + "loss": 0.3431, + "step": 5211 + }, + { + "epoch": 0.7, + "grad_norm": 0.51953125, + "learning_rate": 0.00019669479451833976, + "loss": 0.4126, + "step": 5212 + }, + { + "epoch": 0.7, + "grad_norm": 0.77734375, + "learning_rate": 0.0001966918246877752, + "loss": 0.4543, + "step": 5213 + }, + { + "epoch": 0.7, + "grad_norm": 1.0859375, + "learning_rate": 0.0001966888535460074, + "loss": 0.5126, + "step": 5214 + }, + { + "epoch": 0.7, + "grad_norm": 0.6328125, + "learning_rate": 0.00019668588109307672, + "loss": 0.5283, + "step": 5215 + }, + { + "epoch": 0.7, + "grad_norm": 0.6640625, + "learning_rate": 0.0001966829073290234, + "loss": 0.4384, + "step": 5216 + }, + { + "epoch": 0.7, + "grad_norm": 0.66796875, + "learning_rate": 0.00019667993225388786, + "loss": 0.4983, + "step": 5217 + }, + { + "epoch": 0.7, + "grad_norm": 0.7421875, + "learning_rate": 0.00019667695586771035, + "loss": 0.538, + "step": 5218 + }, + { + "epoch": 0.7, + "grad_norm": 0.68359375, + "learning_rate": 0.0001966739781705313, + "loss": 0.3811, + "step": 5219 + }, + { + "epoch": 0.7, + "grad_norm": 0.671875, + "learning_rate": 0.00019667099916239103, + "loss": 0.6029, + "step": 5220 + }, + { + "epoch": 0.7, + "grad_norm": 0.55078125, + "learning_rate": 0.00019666801884333, + "loss": 0.3247, + "step": 5221 + }, + { + "epoch": 0.7, + "grad_norm": 0.859375, + "learning_rate": 0.00019666503721338857, + "loss": 0.5898, + "step": 5222 + }, + { + "epoch": 0.7, + "grad_norm": 0.546875, + "learning_rate": 0.0001966620542726072, + "loss": 0.6816, + "step": 5223 + }, + { + "epoch": 0.7, + "grad_norm": 0.61328125, + "learning_rate": 0.00019665907002102632, + "loss": 0.4142, + "step": 5224 + }, + { + "epoch": 0.7, + "grad_norm": 0.421875, + "learning_rate": 0.00019665608445868643, + "loss": 0.3954, + "step": 5225 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 0.00019665309758562803, + "loss": 0.6494, + "step": 5226 + }, + { + "epoch": 0.7, + "grad_norm": 0.88671875, + "learning_rate": 0.00019665010940189158, + "loss": 0.3786, + "step": 5227 + }, + { + "epoch": 0.7, + "grad_norm": 0.62890625, + "learning_rate": 0.0001966471199075176, + "loss": 0.4343, + "step": 5228 + }, + { + "epoch": 0.7, + "grad_norm": 0.82421875, + "learning_rate": 0.00019664412910254666, + "loss": 0.2956, + "step": 5229 + }, + { + "epoch": 0.7, + "grad_norm": 0.70703125, + "learning_rate": 0.0001966411369870193, + "loss": 0.4672, + "step": 5230 + }, + { + "epoch": 0.7, + "grad_norm": 0.73046875, + "learning_rate": 0.0001966381435609761, + "loss": 0.493, + "step": 5231 + }, + { + "epoch": 0.7, + "grad_norm": 0.4765625, + "learning_rate": 0.00019663514882445763, + "loss": 0.3672, + "step": 5232 + }, + { + "epoch": 0.7, + "grad_norm": 0.53125, + "learning_rate": 0.00019663215277750457, + "loss": 0.751, + "step": 5233 + }, + { + "epoch": 0.7, + "grad_norm": 0.89453125, + "learning_rate": 0.00019662915542015746, + "loss": 0.5395, + "step": 5234 + }, + { + "epoch": 0.7, + "grad_norm": 0.57421875, + "learning_rate": 0.000196626156752457, + "loss": 0.3829, + "step": 5235 + }, + { + "epoch": 0.7, + "grad_norm": 0.5625, + "learning_rate": 0.00019662315677444384, + "loss": 0.6703, + "step": 5236 + }, + { + "epoch": 0.7, + "grad_norm": 0.71875, + "learning_rate": 0.00019662015548615864, + "loss": 0.3831, + "step": 5237 + }, + { + "epoch": 0.7, + "grad_norm": 0.9140625, + "learning_rate": 0.00019661715288764214, + "loss": 0.4223, + "step": 5238 + }, + { + "epoch": 0.7, + "grad_norm": 0.65234375, + "learning_rate": 0.00019661414897893505, + "loss": 0.6062, + "step": 5239 + }, + { + "epoch": 0.7, + "grad_norm": 0.5, + "learning_rate": 0.00019661114376007808, + "loss": 0.5101, + "step": 5240 + }, + { + "epoch": 0.7, + "grad_norm": 0.71875, + "learning_rate": 0.00019660813723111203, + "loss": 0.2141, + "step": 5241 + }, + { + "epoch": 0.7, + "grad_norm": 0.9765625, + "learning_rate": 0.00019660512939207757, + "loss": 0.6181, + "step": 5242 + }, + { + "epoch": 0.7, + "grad_norm": 0.59375, + "learning_rate": 0.0001966021202430156, + "loss": 0.556, + "step": 5243 + }, + { + "epoch": 0.7, + "grad_norm": 1.1953125, + "learning_rate": 0.00019659910978396688, + "loss": 0.4224, + "step": 5244 + }, + { + "epoch": 0.7, + "grad_norm": 0.96875, + "learning_rate": 0.00019659609801497222, + "loss": 0.7802, + "step": 5245 + }, + { + "epoch": 0.7, + "grad_norm": 0.59765625, + "learning_rate": 0.00019659308493607248, + "loss": 0.4342, + "step": 5246 + }, + { + "epoch": 0.7, + "grad_norm": 1.0, + "learning_rate": 0.00019659007054730851, + "loss": 0.2336, + "step": 5247 + }, + { + "epoch": 0.7, + "grad_norm": 0.451171875, + "learning_rate": 0.0001965870548487212, + "loss": 0.4243, + "step": 5248 + }, + { + "epoch": 0.7, + "grad_norm": 0.55859375, + "learning_rate": 0.00019658403784035143, + "loss": 0.4129, + "step": 5249 + }, + { + "epoch": 0.7, + "grad_norm": 0.640625, + "learning_rate": 0.00019658101952224014, + "loss": 0.5853, + "step": 5250 + }, + { + "epoch": 0.7, + "grad_norm": 0.59765625, + "learning_rate": 0.0001965779998944282, + "loss": 0.2454, + "step": 5251 + }, + { + "epoch": 0.7, + "grad_norm": 0.69921875, + "learning_rate": 0.00019657497895695666, + "loss": 0.3182, + "step": 5252 + }, + { + "epoch": 0.7, + "grad_norm": 0.5078125, + "learning_rate": 0.00019657195670986637, + "loss": 0.4616, + "step": 5253 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 0.00019656893315319837, + "loss": 0.4502, + "step": 5254 + }, + { + "epoch": 0.7, + "grad_norm": 0.76953125, + "learning_rate": 0.00019656590828699369, + "loss": 0.3399, + "step": 5255 + }, + { + "epoch": 0.7, + "grad_norm": 0.57421875, + "learning_rate": 0.0001965628821112933, + "loss": 0.5862, + "step": 5256 + }, + { + "epoch": 0.7, + "grad_norm": 0.7265625, + "learning_rate": 0.00019655985462613824, + "loss": 0.4445, + "step": 5257 + }, + { + "epoch": 0.7, + "grad_norm": 0.75390625, + "learning_rate": 0.0001965568258315696, + "loss": 0.3367, + "step": 5258 + }, + { + "epoch": 0.7, + "grad_norm": 0.63671875, + "learning_rate": 0.00019655379572762846, + "loss": 0.3746, + "step": 5259 + }, + { + "epoch": 0.7, + "grad_norm": 1.1328125, + "learning_rate": 0.00019655076431435582, + "loss": 0.5427, + "step": 5260 + }, + { + "epoch": 0.7, + "grad_norm": 1.0546875, + "learning_rate": 0.00019654773159179288, + "loss": 0.3791, + "step": 5261 + }, + { + "epoch": 0.7, + "grad_norm": 0.82421875, + "learning_rate": 0.00019654469755998072, + "loss": 0.3835, + "step": 5262 + }, + { + "epoch": 0.7, + "grad_norm": 0.78515625, + "learning_rate": 0.00019654166221896052, + "loss": 0.7919, + "step": 5263 + }, + { + "epoch": 0.7, + "grad_norm": 0.8125, + "learning_rate": 0.00019653862556877344, + "loss": 0.5961, + "step": 5264 + }, + { + "epoch": 0.7, + "grad_norm": 0.61328125, + "learning_rate": 0.00019653558760946062, + "loss": 0.4953, + "step": 5265 + }, + { + "epoch": 0.7, + "grad_norm": 0.625, + "learning_rate": 0.00019653254834106325, + "loss": 0.4074, + "step": 5266 + }, + { + "epoch": 0.7, + "grad_norm": 0.7890625, + "learning_rate": 0.0001965295077636226, + "loss": 0.6033, + "step": 5267 + }, + { + "epoch": 0.7, + "grad_norm": 0.578125, + "learning_rate": 0.00019652646587717986, + "loss": 0.459, + "step": 5268 + }, + { + "epoch": 0.7, + "grad_norm": 0.5859375, + "learning_rate": 0.00019652342268177631, + "loss": 0.4277, + "step": 5269 + }, + { + "epoch": 0.7, + "grad_norm": 0.62890625, + "learning_rate": 0.00019652037817745316, + "loss": 0.3755, + "step": 5270 + }, + { + "epoch": 0.7, + "grad_norm": 0.6640625, + "learning_rate": 0.0001965173323642518, + "loss": 0.3229, + "step": 5271 + }, + { + "epoch": 0.7, + "grad_norm": 0.78125, + "learning_rate": 0.0001965142852422134, + "loss": 0.4612, + "step": 5272 + }, + { + "epoch": 0.7, + "grad_norm": 0.828125, + "learning_rate": 0.00019651123681137936, + "loss": 0.8815, + "step": 5273 + }, + { + "epoch": 0.7, + "grad_norm": 0.61328125, + "learning_rate": 0.00019650818707179105, + "loss": 0.5943, + "step": 5274 + }, + { + "epoch": 0.7, + "grad_norm": 0.490234375, + "learning_rate": 0.00019650513602348976, + "loss": 0.5042, + "step": 5275 + }, + { + "epoch": 0.7, + "grad_norm": 0.859375, + "learning_rate": 0.0001965020836665169, + "loss": 0.6108, + "step": 5276 + }, + { + "epoch": 0.7, + "grad_norm": 0.84375, + "learning_rate": 0.00019649903000091383, + "loss": 0.4228, + "step": 5277 + }, + { + "epoch": 0.7, + "grad_norm": 0.5390625, + "learning_rate": 0.000196495975026722, + "loss": 0.3796, + "step": 5278 + }, + { + "epoch": 0.7, + "grad_norm": 0.51953125, + "learning_rate": 0.00019649291874398282, + "loss": 0.7985, + "step": 5279 + }, + { + "epoch": 0.7, + "grad_norm": 0.83203125, + "learning_rate": 0.0001964898611527377, + "loss": 0.3708, + "step": 5280 + }, + { + "epoch": 0.7, + "grad_norm": 0.578125, + "learning_rate": 0.00019648680225302817, + "loss": 0.5375, + "step": 5281 + }, + { + "epoch": 0.7, + "grad_norm": 0.83984375, + "learning_rate": 0.00019648374204489567, + "loss": 0.8353, + "step": 5282 + }, + { + "epoch": 0.7, + "grad_norm": 1.0390625, + "learning_rate": 0.0001964806805283817, + "loss": 0.5761, + "step": 5283 + }, + { + "epoch": 0.71, + "grad_norm": 0.49609375, + "learning_rate": 0.00019647761770352778, + "loss": 0.2665, + "step": 5284 + }, + { + "epoch": 0.71, + "grad_norm": 0.63671875, + "learning_rate": 0.00019647455357037544, + "loss": 0.3957, + "step": 5285 + }, + { + "epoch": 0.71, + "grad_norm": 0.6796875, + "learning_rate": 0.00019647148812896624, + "loss": 0.5882, + "step": 5286 + }, + { + "epoch": 0.71, + "grad_norm": 1.0390625, + "learning_rate": 0.00019646842137934178, + "loss": 0.9272, + "step": 5287 + }, + { + "epoch": 0.71, + "grad_norm": 0.7265625, + "learning_rate": 0.00019646535332154357, + "loss": 0.2597, + "step": 5288 + }, + { + "epoch": 0.71, + "grad_norm": 0.75, + "learning_rate": 0.00019646228395561329, + "loss": 0.5549, + "step": 5289 + }, + { + "epoch": 0.71, + "grad_norm": 0.80078125, + "learning_rate": 0.0001964592132815925, + "loss": 0.4334, + "step": 5290 + }, + { + "epoch": 0.71, + "grad_norm": 0.55859375, + "learning_rate": 0.0001964561412995229, + "loss": 0.3398, + "step": 5291 + }, + { + "epoch": 0.71, + "grad_norm": 0.62890625, + "learning_rate": 0.00019645306800944608, + "loss": 0.5258, + "step": 5292 + }, + { + "epoch": 0.71, + "grad_norm": 0.63671875, + "learning_rate": 0.00019644999341140377, + "loss": 0.65, + "step": 5293 + }, + { + "epoch": 0.71, + "grad_norm": 0.52734375, + "learning_rate": 0.00019644691750543767, + "loss": 0.5158, + "step": 5294 + }, + { + "epoch": 0.71, + "grad_norm": 0.9140625, + "learning_rate": 0.00019644384029158943, + "loss": 0.3182, + "step": 5295 + }, + { + "epoch": 0.71, + "grad_norm": 0.63671875, + "learning_rate": 0.00019644076176990086, + "loss": 0.4144, + "step": 5296 + }, + { + "epoch": 0.71, + "grad_norm": 0.5703125, + "learning_rate": 0.00019643768194041366, + "loss": 0.4541, + "step": 5297 + }, + { + "epoch": 0.71, + "grad_norm": 0.64453125, + "learning_rate": 0.00019643460080316958, + "loss": 0.534, + "step": 5298 + }, + { + "epoch": 0.71, + "grad_norm": 0.7109375, + "learning_rate": 0.00019643151835821044, + "loss": 0.4476, + "step": 5299 + }, + { + "epoch": 0.71, + "grad_norm": 0.71875, + "learning_rate": 0.000196428434605578, + "loss": 0.6604, + "step": 5300 + }, + { + "epoch": 0.71, + "grad_norm": 0.625, + "learning_rate": 0.0001964253495453141, + "loss": 0.4289, + "step": 5301 + }, + { + "epoch": 0.71, + "grad_norm": 0.6796875, + "learning_rate": 0.0001964222631774606, + "loss": 0.3167, + "step": 5302 + }, + { + "epoch": 0.71, + "grad_norm": 0.63671875, + "learning_rate": 0.00019641917550205931, + "loss": 0.3842, + "step": 5303 + }, + { + "epoch": 0.71, + "grad_norm": 0.6875, + "learning_rate": 0.00019641608651915212, + "loss": 0.4327, + "step": 5304 + }, + { + "epoch": 0.71, + "grad_norm": 0.7421875, + "learning_rate": 0.00019641299622878092, + "loss": 0.4555, + "step": 5305 + }, + { + "epoch": 0.71, + "grad_norm": 0.9296875, + "learning_rate": 0.00019640990463098761, + "loss": 0.4877, + "step": 5306 + }, + { + "epoch": 0.71, + "grad_norm": 0.78515625, + "learning_rate": 0.00019640681172581413, + "loss": 0.533, + "step": 5307 + }, + { + "epoch": 0.71, + "grad_norm": 0.67578125, + "learning_rate": 0.0001964037175133024, + "loss": 0.6791, + "step": 5308 + }, + { + "epoch": 0.71, + "grad_norm": 0.61328125, + "learning_rate": 0.0001964006219934944, + "loss": 0.3759, + "step": 5309 + }, + { + "epoch": 0.71, + "grad_norm": 0.87890625, + "learning_rate": 0.00019639752516643205, + "loss": 0.3194, + "step": 5310 + }, + { + "epoch": 0.71, + "grad_norm": 0.609375, + "learning_rate": 0.00019639442703215743, + "loss": 0.3827, + "step": 5311 + }, + { + "epoch": 0.71, + "grad_norm": 0.76171875, + "learning_rate": 0.00019639132759071253, + "loss": 0.5956, + "step": 5312 + }, + { + "epoch": 0.71, + "grad_norm": 0.86328125, + "learning_rate": 0.00019638822684213934, + "loss": 0.321, + "step": 5313 + }, + { + "epoch": 0.71, + "grad_norm": 0.671875, + "learning_rate": 0.00019638512478647993, + "loss": 0.3894, + "step": 5314 + }, + { + "epoch": 0.71, + "grad_norm": 0.921875, + "learning_rate": 0.00019638202142377637, + "loss": 0.69, + "step": 5315 + }, + { + "epoch": 0.71, + "grad_norm": 0.61328125, + "learning_rate": 0.00019637891675407075, + "loss": 0.7137, + "step": 5316 + }, + { + "epoch": 0.71, + "grad_norm": 0.7578125, + "learning_rate": 0.00019637581077740516, + "loss": 0.3755, + "step": 5317 + }, + { + "epoch": 0.71, + "grad_norm": 0.828125, + "learning_rate": 0.00019637270349382173, + "loss": 0.5943, + "step": 5318 + }, + { + "epoch": 0.71, + "grad_norm": 0.61328125, + "learning_rate": 0.0001963695949033626, + "loss": 0.3674, + "step": 5319 + }, + { + "epoch": 0.71, + "grad_norm": 0.64453125, + "learning_rate": 0.0001963664850060699, + "loss": 0.8068, + "step": 5320 + }, + { + "epoch": 0.71, + "grad_norm": 0.486328125, + "learning_rate": 0.0001963633738019858, + "loss": 0.4837, + "step": 5321 + }, + { + "epoch": 0.71, + "grad_norm": 0.51953125, + "learning_rate": 0.0001963602612911525, + "loss": 0.3696, + "step": 5322 + }, + { + "epoch": 0.71, + "grad_norm": 0.671875, + "learning_rate": 0.00019635714747361224, + "loss": 0.5537, + "step": 5323 + }, + { + "epoch": 0.71, + "grad_norm": 0.61328125, + "learning_rate": 0.0001963540323494072, + "loss": 0.7427, + "step": 5324 + }, + { + "epoch": 0.71, + "grad_norm": 0.5078125, + "learning_rate": 0.00019635091591857967, + "loss": 0.4567, + "step": 5325 + }, + { + "epoch": 0.71, + "grad_norm": 0.6796875, + "learning_rate": 0.00019634779818117183, + "loss": 0.4188, + "step": 5326 + }, + { + "epoch": 0.71, + "grad_norm": 0.5, + "learning_rate": 0.00019634467913722605, + "loss": 0.2437, + "step": 5327 + }, + { + "epoch": 0.71, + "grad_norm": 0.55078125, + "learning_rate": 0.0001963415587867846, + "loss": 0.4626, + "step": 5328 + }, + { + "epoch": 0.71, + "grad_norm": 0.60546875, + "learning_rate": 0.00019633843712988974, + "loss": 0.6517, + "step": 5329 + }, + { + "epoch": 0.71, + "grad_norm": 0.75, + "learning_rate": 0.00019633531416658387, + "loss": 0.554, + "step": 5330 + }, + { + "epoch": 0.71, + "grad_norm": 0.6953125, + "learning_rate": 0.0001963321898969093, + "loss": 0.5162, + "step": 5331 + }, + { + "epoch": 0.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00019632906432090846, + "loss": 0.4732, + "step": 5332 + }, + { + "epoch": 0.71, + "grad_norm": 0.609375, + "learning_rate": 0.00019632593743862362, + "loss": 0.7344, + "step": 5333 + }, + { + "epoch": 0.71, + "grad_norm": 0.703125, + "learning_rate": 0.00019632280925009727, + "loss": 0.3747, + "step": 5334 + }, + { + "epoch": 0.71, + "grad_norm": 0.7265625, + "learning_rate": 0.0001963196797553718, + "loss": 0.3505, + "step": 5335 + }, + { + "epoch": 0.71, + "grad_norm": 0.70703125, + "learning_rate": 0.00019631654895448968, + "loss": 0.4654, + "step": 5336 + }, + { + "epoch": 0.71, + "grad_norm": 0.58984375, + "learning_rate": 0.00019631341684749328, + "loss": 0.2638, + "step": 5337 + }, + { + "epoch": 0.71, + "grad_norm": 1.0234375, + "learning_rate": 0.0001963102834344252, + "loss": 0.5254, + "step": 5338 + }, + { + "epoch": 0.71, + "grad_norm": 0.7421875, + "learning_rate": 0.00019630714871532784, + "loss": 0.6633, + "step": 5339 + }, + { + "epoch": 0.71, + "grad_norm": 1.109375, + "learning_rate": 0.00019630401269024375, + "loss": 0.633, + "step": 5340 + }, + { + "epoch": 0.71, + "grad_norm": 0.66796875, + "learning_rate": 0.0001963008753592154, + "loss": 0.5208, + "step": 5341 + }, + { + "epoch": 0.71, + "grad_norm": 0.859375, + "learning_rate": 0.0001962977367222854, + "loss": 0.6091, + "step": 5342 + }, + { + "epoch": 0.71, + "grad_norm": 0.50390625, + "learning_rate": 0.00019629459677949628, + "loss": 0.3234, + "step": 5343 + }, + { + "epoch": 0.71, + "grad_norm": 0.65625, + "learning_rate": 0.00019629145553089065, + "loss": 0.5182, + "step": 5344 + }, + { + "epoch": 0.71, + "grad_norm": 0.62109375, + "learning_rate": 0.00019628831297651106, + "loss": 0.391, + "step": 5345 + }, + { + "epoch": 0.71, + "grad_norm": 0.71484375, + "learning_rate": 0.00019628516911640015, + "loss": 0.4294, + "step": 5346 + }, + { + "epoch": 0.71, + "grad_norm": 0.984375, + "learning_rate": 0.00019628202395060055, + "loss": 0.6467, + "step": 5347 + }, + { + "epoch": 0.71, + "grad_norm": 0.88671875, + "learning_rate": 0.00019627887747915494, + "loss": 0.5412, + "step": 5348 + }, + { + "epoch": 0.71, + "grad_norm": 0.828125, + "learning_rate": 0.00019627572970210594, + "loss": 0.4546, + "step": 5349 + }, + { + "epoch": 0.71, + "grad_norm": 0.859375, + "learning_rate": 0.00019627258061949622, + "loss": 0.3988, + "step": 5350 + }, + { + "epoch": 0.71, + "grad_norm": 0.78125, + "learning_rate": 0.00019626943023136857, + "loss": 0.5869, + "step": 5351 + }, + { + "epoch": 0.71, + "grad_norm": 0.67578125, + "learning_rate": 0.00019626627853776564, + "loss": 0.495, + "step": 5352 + }, + { + "epoch": 0.71, + "grad_norm": 0.734375, + "learning_rate": 0.0001962631255387302, + "loss": 0.5519, + "step": 5353 + }, + { + "epoch": 0.71, + "grad_norm": 0.60546875, + "learning_rate": 0.000196259971234305, + "loss": 0.4813, + "step": 5354 + }, + { + "epoch": 0.71, + "grad_norm": 0.59375, + "learning_rate": 0.0001962568156245328, + "loss": 0.5403, + "step": 5355 + }, + { + "epoch": 0.71, + "grad_norm": 0.52734375, + "learning_rate": 0.00019625365870945639, + "loss": 0.6109, + "step": 5356 + }, + { + "epoch": 0.71, + "grad_norm": 0.52734375, + "learning_rate": 0.0001962505004891186, + "loss": 0.171, + "step": 5357 + }, + { + "epoch": 0.71, + "grad_norm": 0.765625, + "learning_rate": 0.00019624734096356224, + "loss": 0.3926, + "step": 5358 + }, + { + "epoch": 0.72, + "grad_norm": 0.56640625, + "learning_rate": 0.00019624418013283022, + "loss": 0.4198, + "step": 5359 + }, + { + "epoch": 0.72, + "grad_norm": 0.59765625, + "learning_rate": 0.0001962410179969653, + "loss": 0.4344, + "step": 5360 + }, + { + "epoch": 0.72, + "grad_norm": 0.7265625, + "learning_rate": 0.0001962378545560104, + "loss": 0.4131, + "step": 5361 + }, + { + "epoch": 0.72, + "grad_norm": 0.8203125, + "learning_rate": 0.00019623468981000848, + "loss": 0.5442, + "step": 5362 + }, + { + "epoch": 0.72, + "grad_norm": 0.83984375, + "learning_rate": 0.00019623152375900237, + "loss": 0.3693, + "step": 5363 + }, + { + "epoch": 0.72, + "grad_norm": 0.84375, + "learning_rate": 0.00019622835640303502, + "loss": 0.3362, + "step": 5364 + }, + { + "epoch": 0.72, + "grad_norm": 0.48828125, + "learning_rate": 0.0001962251877421494, + "loss": 0.3869, + "step": 5365 + }, + { + "epoch": 0.72, + "grad_norm": 0.78125, + "learning_rate": 0.00019622201777638848, + "loss": 0.6521, + "step": 5366 + }, + { + "epoch": 0.72, + "grad_norm": 0.49609375, + "learning_rate": 0.00019621884650579527, + "loss": 0.3299, + "step": 5367 + }, + { + "epoch": 0.72, + "grad_norm": 0.77734375, + "learning_rate": 0.00019621567393041277, + "loss": 0.5989, + "step": 5368 + }, + { + "epoch": 0.72, + "grad_norm": 0.5546875, + "learning_rate": 0.00019621250005028392, + "loss": 0.7462, + "step": 5369 + }, + { + "epoch": 0.72, + "grad_norm": 1.0234375, + "learning_rate": 0.00019620932486545182, + "loss": 0.7576, + "step": 5370 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 0.0001962061483759596, + "loss": 0.6079, + "step": 5371 + }, + { + "epoch": 0.72, + "grad_norm": 0.66015625, + "learning_rate": 0.0001962029705818502, + "loss": 0.637, + "step": 5372 + }, + { + "epoch": 0.72, + "grad_norm": 1.0546875, + "learning_rate": 0.00019619979148316678, + "loss": 0.5784, + "step": 5373 + }, + { + "epoch": 0.72, + "grad_norm": 0.5859375, + "learning_rate": 0.00019619661107995247, + "loss": 0.2805, + "step": 5374 + }, + { + "epoch": 0.72, + "grad_norm": 0.5078125, + "learning_rate": 0.00019619342937225037, + "loss": 0.6309, + "step": 5375 + }, + { + "epoch": 0.72, + "grad_norm": 0.435546875, + "learning_rate": 0.00019619024636010363, + "loss": 0.3183, + "step": 5376 + }, + { + "epoch": 0.72, + "grad_norm": 0.61328125, + "learning_rate": 0.0001961870620435554, + "loss": 0.4171, + "step": 5377 + }, + { + "epoch": 0.72, + "grad_norm": 0.5625, + "learning_rate": 0.0001961838764226489, + "loss": 0.362, + "step": 5378 + }, + { + "epoch": 0.72, + "grad_norm": 0.81640625, + "learning_rate": 0.00019618068949742726, + "loss": 0.6475, + "step": 5379 + }, + { + "epoch": 0.72, + "grad_norm": 0.70703125, + "learning_rate": 0.0001961775012679338, + "loss": 0.6132, + "step": 5380 + }, + { + "epoch": 0.72, + "grad_norm": 0.7421875, + "learning_rate": 0.00019617431173421168, + "loss": 0.4203, + "step": 5381 + }, + { + "epoch": 0.72, + "grad_norm": 0.53515625, + "learning_rate": 0.00019617112089630415, + "loss": 0.3235, + "step": 5382 + }, + { + "epoch": 0.72, + "grad_norm": 0.6328125, + "learning_rate": 0.00019616792875425451, + "loss": 0.4972, + "step": 5383 + }, + { + "epoch": 0.72, + "grad_norm": 0.734375, + "learning_rate": 0.00019616473530810602, + "loss": 0.4194, + "step": 5384 + }, + { + "epoch": 0.72, + "grad_norm": 0.65234375, + "learning_rate": 0.00019616154055790202, + "loss": 0.5535, + "step": 5385 + }, + { + "epoch": 0.72, + "grad_norm": 0.953125, + "learning_rate": 0.00019615834450368582, + "loss": 0.5368, + "step": 5386 + }, + { + "epoch": 0.72, + "grad_norm": 0.435546875, + "learning_rate": 0.00019615514714550075, + "loss": 0.2437, + "step": 5387 + }, + { + "epoch": 0.72, + "grad_norm": 0.91015625, + "learning_rate": 0.00019615194848339015, + "loss": 0.4729, + "step": 5388 + }, + { + "epoch": 0.72, + "grad_norm": 0.484375, + "learning_rate": 0.00019614874851739744, + "loss": 0.4883, + "step": 5389 + }, + { + "epoch": 0.72, + "grad_norm": 0.498046875, + "learning_rate": 0.000196145547247566, + "loss": 0.3981, + "step": 5390 + }, + { + "epoch": 0.72, + "grad_norm": 0.76171875, + "learning_rate": 0.00019614234467393922, + "loss": 0.8018, + "step": 5391 + }, + { + "epoch": 0.72, + "grad_norm": 0.5859375, + "learning_rate": 0.00019613914079656054, + "loss": 0.3148, + "step": 5392 + }, + { + "epoch": 0.72, + "grad_norm": 0.515625, + "learning_rate": 0.00019613593561547342, + "loss": 0.688, + "step": 5393 + }, + { + "epoch": 0.72, + "grad_norm": 0.84765625, + "learning_rate": 0.0001961327291307213, + "loss": 0.4149, + "step": 5394 + }, + { + "epoch": 0.72, + "grad_norm": 0.89453125, + "learning_rate": 0.00019612952134234768, + "loss": 0.534, + "step": 5395 + }, + { + "epoch": 0.72, + "grad_norm": 0.81640625, + "learning_rate": 0.00019612631225039606, + "loss": 0.415, + "step": 5396 + }, + { + "epoch": 0.72, + "grad_norm": 0.58203125, + "learning_rate": 0.00019612310185490993, + "loss": 0.3197, + "step": 5397 + }, + { + "epoch": 0.72, + "grad_norm": 0.56640625, + "learning_rate": 0.00019611989015593288, + "loss": 0.3741, + "step": 5398 + }, + { + "epoch": 0.72, + "grad_norm": 0.796875, + "learning_rate": 0.0001961166771535084, + "loss": 0.4743, + "step": 5399 + }, + { + "epoch": 0.72, + "grad_norm": 0.55078125, + "learning_rate": 0.00019611346284768014, + "loss": 0.4778, + "step": 5400 + }, + { + "epoch": 0.72, + "grad_norm": 0.59765625, + "learning_rate": 0.00019611024723849158, + "loss": 0.3864, + "step": 5401 + }, + { + "epoch": 0.72, + "grad_norm": 0.484375, + "learning_rate": 0.00019610703032598643, + "loss": 0.5944, + "step": 5402 + }, + { + "epoch": 0.72, + "grad_norm": 0.71875, + "learning_rate": 0.00019610381211020825, + "loss": 0.702, + "step": 5403 + }, + { + "epoch": 0.72, + "grad_norm": 0.5, + "learning_rate": 0.0001961005925912007, + "loss": 0.4804, + "step": 5404 + }, + { + "epoch": 0.72, + "grad_norm": 0.640625, + "learning_rate": 0.00019609737176900743, + "loss": 0.2871, + "step": 5405 + }, + { + "epoch": 0.72, + "grad_norm": 0.42578125, + "learning_rate": 0.0001960941496436721, + "loss": 0.3488, + "step": 5406 + }, + { + "epoch": 0.72, + "grad_norm": 0.67578125, + "learning_rate": 0.00019609092621523849, + "loss": 0.3346, + "step": 5407 + }, + { + "epoch": 0.72, + "grad_norm": 0.58984375, + "learning_rate": 0.0001960877014837502, + "loss": 0.2792, + "step": 5408 + }, + { + "epoch": 0.72, + "grad_norm": 0.67578125, + "learning_rate": 0.000196084475449251, + "loss": 0.5441, + "step": 5409 + }, + { + "epoch": 0.72, + "grad_norm": 0.640625, + "learning_rate": 0.0001960812481117847, + "loss": 0.4986, + "step": 5410 + }, + { + "epoch": 0.72, + "grad_norm": 0.5078125, + "learning_rate": 0.00019607801947139496, + "loss": 0.5371, + "step": 5411 + }, + { + "epoch": 0.72, + "grad_norm": 0.5390625, + "learning_rate": 0.00019607478952812565, + "loss": 0.4428, + "step": 5412 + }, + { + "epoch": 0.72, + "grad_norm": 0.50390625, + "learning_rate": 0.00019607155828202052, + "loss": 0.2833, + "step": 5413 + }, + { + "epoch": 0.72, + "grad_norm": 0.578125, + "learning_rate": 0.00019606832573312342, + "loss": 0.5066, + "step": 5414 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 0.00019606509188147813, + "loss": 0.305, + "step": 5415 + }, + { + "epoch": 0.72, + "grad_norm": 0.734375, + "learning_rate": 0.00019606185672712856, + "loss": 0.4178, + "step": 5416 + }, + { + "epoch": 0.72, + "grad_norm": 0.447265625, + "learning_rate": 0.00019605862027011856, + "loss": 0.1863, + "step": 5417 + }, + { + "epoch": 0.72, + "grad_norm": 0.9375, + "learning_rate": 0.000196055382510492, + "loss": 0.5671, + "step": 5418 + }, + { + "epoch": 0.72, + "grad_norm": 0.97265625, + "learning_rate": 0.00019605214344829285, + "loss": 0.5388, + "step": 5419 + }, + { + "epoch": 0.72, + "grad_norm": 0.6640625, + "learning_rate": 0.00019604890308356495, + "loss": 0.6234, + "step": 5420 + }, + { + "epoch": 0.72, + "grad_norm": 0.7109375, + "learning_rate": 0.0001960456614163523, + "loss": 0.3675, + "step": 5421 + }, + { + "epoch": 0.72, + "grad_norm": 0.7734375, + "learning_rate": 0.00019604241844669882, + "loss": 0.7028, + "step": 5422 + }, + { + "epoch": 0.72, + "grad_norm": 0.765625, + "learning_rate": 0.0001960391741746485, + "loss": 0.5508, + "step": 5423 + }, + { + "epoch": 0.72, + "grad_norm": 0.51953125, + "learning_rate": 0.00019603592860024537, + "loss": 0.6821, + "step": 5424 + }, + { + "epoch": 0.72, + "grad_norm": 0.546875, + "learning_rate": 0.00019603268172353343, + "loss": 0.3407, + "step": 5425 + }, + { + "epoch": 0.72, + "grad_norm": 0.74609375, + "learning_rate": 0.00019602943354455666, + "loss": 0.5022, + "step": 5426 + }, + { + "epoch": 0.72, + "grad_norm": 0.58984375, + "learning_rate": 0.00019602618406335914, + "loss": 0.5682, + "step": 5427 + }, + { + "epoch": 0.72, + "grad_norm": 0.59375, + "learning_rate": 0.00019602293327998494, + "loss": 0.5307, + "step": 5428 + }, + { + "epoch": 0.72, + "grad_norm": 0.57421875, + "learning_rate": 0.00019601968119447813, + "loss": 0.5454, + "step": 5429 + }, + { + "epoch": 0.72, + "grad_norm": 0.72265625, + "learning_rate": 0.00019601642780688283, + "loss": 0.3927, + "step": 5430 + }, + { + "epoch": 0.72, + "grad_norm": 1.1875, + "learning_rate": 0.00019601317311724312, + "loss": 0.4175, + "step": 5431 + }, + { + "epoch": 0.72, + "grad_norm": 1.046875, + "learning_rate": 0.0001960099171256032, + "loss": 0.2181, + "step": 5432 + }, + { + "epoch": 0.72, + "grad_norm": 0.7109375, + "learning_rate": 0.00019600665983200715, + "loss": 0.4023, + "step": 5433 + }, + { + "epoch": 0.73, + "grad_norm": 0.625, + "learning_rate": 0.0001960034012364992, + "loss": 0.3641, + "step": 5434 + }, + { + "epoch": 0.73, + "grad_norm": 0.58984375, + "learning_rate": 0.00019600014133912353, + "loss": 0.6136, + "step": 5435 + }, + { + "epoch": 0.73, + "grad_norm": 0.70703125, + "learning_rate": 0.0001959968801399243, + "loss": 0.1981, + "step": 5436 + }, + { + "epoch": 0.73, + "grad_norm": 0.6328125, + "learning_rate": 0.00019599361763894576, + "loss": 0.5007, + "step": 5437 + }, + { + "epoch": 0.73, + "grad_norm": 0.49609375, + "learning_rate": 0.00019599035383623218, + "loss": 0.6624, + "step": 5438 + }, + { + "epoch": 0.73, + "grad_norm": 0.73828125, + "learning_rate": 0.00019598708873182777, + "loss": 0.6023, + "step": 5439 + }, + { + "epoch": 0.73, + "grad_norm": 0.49609375, + "learning_rate": 0.0001959838223257769, + "loss": 0.2008, + "step": 5440 + }, + { + "epoch": 0.73, + "grad_norm": 0.56640625, + "learning_rate": 0.0001959805546181237, + "loss": 0.6665, + "step": 5441 + }, + { + "epoch": 0.73, + "grad_norm": 0.9453125, + "learning_rate": 0.00019597728560891264, + "loss": 0.3475, + "step": 5442 + }, + { + "epoch": 0.73, + "grad_norm": 0.91015625, + "learning_rate": 0.00019597401529818797, + "loss": 0.6526, + "step": 5443 + }, + { + "epoch": 0.73, + "grad_norm": 0.66796875, + "learning_rate": 0.00019597074368599406, + "loss": 0.4207, + "step": 5444 + }, + { + "epoch": 0.73, + "grad_norm": 0.56640625, + "learning_rate": 0.00019596747077237528, + "loss": 0.3782, + "step": 5445 + }, + { + "epoch": 0.73, + "grad_norm": 0.44140625, + "learning_rate": 0.000195964196557376, + "loss": 0.5449, + "step": 5446 + }, + { + "epoch": 0.73, + "grad_norm": 0.55078125, + "learning_rate": 0.0001959609210410406, + "loss": 0.439, + "step": 5447 + }, + { + "epoch": 0.73, + "grad_norm": 0.6796875, + "learning_rate": 0.00019595764422341357, + "loss": 0.2969, + "step": 5448 + }, + { + "epoch": 0.73, + "grad_norm": 0.5546875, + "learning_rate": 0.0001959543661045393, + "loss": 0.3398, + "step": 5449 + }, + { + "epoch": 0.73, + "grad_norm": 0.71484375, + "learning_rate": 0.0001959510866844622, + "loss": 0.6242, + "step": 5450 + }, + { + "epoch": 0.73, + "grad_norm": 0.65625, + "learning_rate": 0.0001959478059632268, + "loss": 0.7048, + "step": 5451 + }, + { + "epoch": 0.73, + "grad_norm": 0.875, + "learning_rate": 0.00019594452394087756, + "loss": 0.4142, + "step": 5452 + }, + { + "epoch": 0.73, + "grad_norm": 0.546875, + "learning_rate": 0.00019594124061745902, + "loss": 0.4691, + "step": 5453 + }, + { + "epoch": 0.73, + "grad_norm": 0.66796875, + "learning_rate": 0.00019593795599301568, + "loss": 0.5015, + "step": 5454 + }, + { + "epoch": 0.73, + "grad_norm": 0.67578125, + "learning_rate": 0.00019593467006759208, + "loss": 0.4658, + "step": 5455 + }, + { + "epoch": 0.73, + "grad_norm": 0.66796875, + "learning_rate": 0.00019593138284123277, + "loss": 0.4767, + "step": 5456 + }, + { + "epoch": 0.73, + "grad_norm": 0.6640625, + "learning_rate": 0.00019592809431398235, + "loss": 0.4316, + "step": 5457 + }, + { + "epoch": 0.73, + "grad_norm": 0.57421875, + "learning_rate": 0.00019592480448588542, + "loss": 0.2177, + "step": 5458 + }, + { + "epoch": 0.73, + "grad_norm": 0.84375, + "learning_rate": 0.00019592151335698656, + "loss": 0.4189, + "step": 5459 + }, + { + "epoch": 0.73, + "grad_norm": 0.71875, + "learning_rate": 0.00019591822092733044, + "loss": 0.4681, + "step": 5460 + }, + { + "epoch": 0.73, + "grad_norm": 0.65234375, + "learning_rate": 0.00019591492719696165, + "loss": 0.4478, + "step": 5461 + }, + { + "epoch": 0.73, + "grad_norm": 0.5, + "learning_rate": 0.0001959116321659249, + "loss": 0.2292, + "step": 5462 + }, + { + "epoch": 0.73, + "grad_norm": 0.77734375, + "learning_rate": 0.00019590833583426487, + "loss": 0.5708, + "step": 5463 + }, + { + "epoch": 0.73, + "grad_norm": 0.66015625, + "learning_rate": 0.00019590503820202625, + "loss": 0.3602, + "step": 5464 + }, + { + "epoch": 0.73, + "grad_norm": 0.58984375, + "learning_rate": 0.00019590173926925376, + "loss": 0.544, + "step": 5465 + }, + { + "epoch": 0.73, + "grad_norm": 0.98046875, + "learning_rate": 0.00019589843903599215, + "loss": 0.3788, + "step": 5466 + }, + { + "epoch": 0.73, + "grad_norm": 0.88671875, + "learning_rate": 0.00019589513750228614, + "loss": 0.5342, + "step": 5467 + }, + { + "epoch": 0.73, + "grad_norm": 0.79296875, + "learning_rate": 0.00019589183466818053, + "loss": 0.6084, + "step": 5468 + }, + { + "epoch": 0.73, + "grad_norm": 0.56640625, + "learning_rate": 0.00019588853053372008, + "loss": 0.3827, + "step": 5469 + }, + { + "epoch": 0.73, + "grad_norm": 0.80078125, + "learning_rate": 0.00019588522509894968, + "loss": 0.4154, + "step": 5470 + }, + { + "epoch": 0.73, + "grad_norm": 0.71875, + "learning_rate": 0.000195881918363914, + "loss": 0.7027, + "step": 5471 + }, + { + "epoch": 0.73, + "grad_norm": 0.54296875, + "learning_rate": 0.00019587861032865803, + "loss": 0.4798, + "step": 5472 + }, + { + "epoch": 0.73, + "grad_norm": 0.703125, + "learning_rate": 0.00019587530099322655, + "loss": 0.5212, + "step": 5473 + }, + { + "epoch": 0.73, + "grad_norm": 0.6171875, + "learning_rate": 0.00019587199035766447, + "loss": 0.371, + "step": 5474 + }, + { + "epoch": 0.73, + "grad_norm": 0.90625, + "learning_rate": 0.00019586867842201665, + "loss": 0.6805, + "step": 5475 + }, + { + "epoch": 0.73, + "grad_norm": 0.78125, + "learning_rate": 0.00019586536518632802, + "loss": 0.2932, + "step": 5476 + }, + { + "epoch": 0.73, + "grad_norm": 0.64453125, + "learning_rate": 0.00019586205065064353, + "loss": 0.1905, + "step": 5477 + }, + { + "epoch": 0.73, + "grad_norm": 0.33984375, + "learning_rate": 0.00019585873481500808, + "loss": 0.291, + "step": 5478 + }, + { + "epoch": 0.73, + "grad_norm": 0.6640625, + "learning_rate": 0.00019585541767946668, + "loss": 0.3724, + "step": 5479 + }, + { + "epoch": 0.73, + "grad_norm": 0.703125, + "learning_rate": 0.00019585209924406432, + "loss": 0.4847, + "step": 5480 + }, + { + "epoch": 0.73, + "grad_norm": 0.6484375, + "learning_rate": 0.00019584877950884594, + "loss": 0.5925, + "step": 5481 + }, + { + "epoch": 0.73, + "grad_norm": 0.55078125, + "learning_rate": 0.00019584545847385664, + "loss": 0.3664, + "step": 5482 + }, + { + "epoch": 0.73, + "grad_norm": 0.69921875, + "learning_rate": 0.00019584213613914138, + "loss": 0.4439, + "step": 5483 + }, + { + "epoch": 0.73, + "grad_norm": 0.71875, + "learning_rate": 0.00019583881250474522, + "loss": 0.283, + "step": 5484 + }, + { + "epoch": 0.73, + "grad_norm": 0.58203125, + "learning_rate": 0.0001958354875707133, + "loss": 0.4173, + "step": 5485 + }, + { + "epoch": 0.73, + "grad_norm": 0.70703125, + "learning_rate": 0.00019583216133709064, + "loss": 0.3448, + "step": 5486 + }, + { + "epoch": 0.73, + "grad_norm": 0.609375, + "learning_rate": 0.00019582883380392236, + "loss": 0.6195, + "step": 5487 + }, + { + "epoch": 0.73, + "grad_norm": 0.71484375, + "learning_rate": 0.00019582550497125362, + "loss": 0.7441, + "step": 5488 + }, + { + "epoch": 0.73, + "grad_norm": 0.59375, + "learning_rate": 0.0001958221748391295, + "loss": 0.3745, + "step": 5489 + }, + { + "epoch": 0.73, + "grad_norm": 0.734375, + "learning_rate": 0.0001958188434075952, + "loss": 0.4468, + "step": 5490 + }, + { + "epoch": 0.73, + "grad_norm": 0.703125, + "learning_rate": 0.0001958155106766959, + "loss": 0.656, + "step": 5491 + }, + { + "epoch": 0.73, + "grad_norm": 0.71875, + "learning_rate": 0.00019581217664647676, + "loss": 0.508, + "step": 5492 + }, + { + "epoch": 0.73, + "grad_norm": 0.8125, + "learning_rate": 0.00019580884131698302, + "loss": 0.4962, + "step": 5493 + }, + { + "epoch": 0.73, + "grad_norm": 0.53125, + "learning_rate": 0.00019580550468825992, + "loss": 0.475, + "step": 5494 + }, + { + "epoch": 0.73, + "grad_norm": 0.8203125, + "learning_rate": 0.00019580216676035267, + "loss": 0.3327, + "step": 5495 + }, + { + "epoch": 0.73, + "grad_norm": 0.5546875, + "learning_rate": 0.00019579882753330657, + "loss": 0.4922, + "step": 5496 + }, + { + "epoch": 0.73, + "grad_norm": 0.578125, + "learning_rate": 0.00019579548700716684, + "loss": 0.6946, + "step": 5497 + }, + { + "epoch": 0.73, + "grad_norm": 0.66796875, + "learning_rate": 0.00019579214518197888, + "loss": 0.2563, + "step": 5498 + }, + { + "epoch": 0.73, + "grad_norm": 0.4765625, + "learning_rate": 0.00019578880205778793, + "loss": 0.345, + "step": 5499 + }, + { + "epoch": 0.73, + "grad_norm": 0.71875, + "learning_rate": 0.00019578545763463935, + "loss": 0.3249, + "step": 5500 + }, + { + "epoch": 0.73, + "grad_norm": 0.71484375, + "learning_rate": 0.0001957821119125785, + "loss": 0.5168, + "step": 5501 + }, + { + "epoch": 0.73, + "grad_norm": 0.7890625, + "learning_rate": 0.00019577876489165072, + "loss": 0.3765, + "step": 5502 + }, + { + "epoch": 0.73, + "grad_norm": 0.70703125, + "learning_rate": 0.0001957754165719014, + "loss": 0.4471, + "step": 5503 + }, + { + "epoch": 0.73, + "grad_norm": 0.94921875, + "learning_rate": 0.000195772066953376, + "loss": 0.659, + "step": 5504 + }, + { + "epoch": 0.73, + "grad_norm": 0.6640625, + "learning_rate": 0.0001957687160361199, + "loss": 0.1906, + "step": 5505 + }, + { + "epoch": 0.73, + "grad_norm": 0.6875, + "learning_rate": 0.00019576536382017853, + "loss": 0.4668, + "step": 5506 + }, + { + "epoch": 0.73, + "grad_norm": 0.69921875, + "learning_rate": 0.0001957620103055974, + "loss": 1.0594, + "step": 5507 + }, + { + "epoch": 0.73, + "grad_norm": 0.6640625, + "learning_rate": 0.00019575865549242193, + "loss": 0.5981, + "step": 5508 + }, + { + "epoch": 0.74, + "grad_norm": 0.84765625, + "learning_rate": 0.0001957552993806976, + "loss": 0.2835, + "step": 5509 + }, + { + "epoch": 0.74, + "grad_norm": 0.5078125, + "learning_rate": 0.00019575194197047, + "loss": 0.4975, + "step": 5510 + }, + { + "epoch": 0.74, + "grad_norm": 0.55859375, + "learning_rate": 0.0001957485832617846, + "loss": 0.5896, + "step": 5511 + }, + { + "epoch": 0.74, + "grad_norm": 0.7890625, + "learning_rate": 0.00019574522325468697, + "loss": 0.6649, + "step": 5512 + }, + { + "epoch": 0.74, + "grad_norm": 0.828125, + "learning_rate": 0.00019574186194922267, + "loss": 0.5108, + "step": 5513 + }, + { + "epoch": 0.74, + "grad_norm": 0.5625, + "learning_rate": 0.00019573849934543725, + "loss": 0.3849, + "step": 5514 + }, + { + "epoch": 0.74, + "grad_norm": 0.6015625, + "learning_rate": 0.00019573513544337634, + "loss": 0.3941, + "step": 5515 + }, + { + "epoch": 0.74, + "grad_norm": 0.7421875, + "learning_rate": 0.00019573177024308555, + "loss": 0.6686, + "step": 5516 + }, + { + "epoch": 0.74, + "grad_norm": 0.80078125, + "learning_rate": 0.00019572840374461052, + "loss": 0.4805, + "step": 5517 + }, + { + "epoch": 0.74, + "grad_norm": 1.140625, + "learning_rate": 0.00019572503594799689, + "loss": 0.9886, + "step": 5518 + }, + { + "epoch": 0.74, + "grad_norm": 0.6875, + "learning_rate": 0.00019572166685329035, + "loss": 0.789, + "step": 5519 + }, + { + "epoch": 0.74, + "grad_norm": 0.734375, + "learning_rate": 0.00019571829646053657, + "loss": 0.2894, + "step": 5520 + }, + { + "epoch": 0.74, + "grad_norm": 0.609375, + "learning_rate": 0.00019571492476978125, + "loss": 0.2463, + "step": 5521 + }, + { + "epoch": 0.74, + "grad_norm": 0.6328125, + "learning_rate": 0.00019571155178107013, + "loss": 0.6101, + "step": 5522 + }, + { + "epoch": 0.74, + "grad_norm": 0.8046875, + "learning_rate": 0.0001957081774944489, + "loss": 0.3438, + "step": 5523 + }, + { + "epoch": 0.74, + "grad_norm": 0.7109375, + "learning_rate": 0.00019570480190996342, + "loss": 0.4452, + "step": 5524 + }, + { + "epoch": 0.74, + "grad_norm": 0.5703125, + "learning_rate": 0.0001957014250276594, + "loss": 0.5503, + "step": 5525 + }, + { + "epoch": 0.74, + "grad_norm": 0.486328125, + "learning_rate": 0.0001956980468475826, + "loss": 0.4404, + "step": 5526 + }, + { + "epoch": 0.74, + "grad_norm": 0.5703125, + "learning_rate": 0.00019569466736977888, + "loss": 0.6247, + "step": 5527 + }, + { + "epoch": 0.74, + "grad_norm": 0.734375, + "learning_rate": 0.00019569128659429407, + "loss": 0.5378, + "step": 5528 + }, + { + "epoch": 0.74, + "grad_norm": 0.78515625, + "learning_rate": 0.00019568790452117397, + "loss": 0.5003, + "step": 5529 + }, + { + "epoch": 0.74, + "grad_norm": 0.8515625, + "learning_rate": 0.0001956845211504645, + "loss": 0.3147, + "step": 5530 + }, + { + "epoch": 0.74, + "grad_norm": 0.58203125, + "learning_rate": 0.0001956811364822115, + "loss": 0.4041, + "step": 5531 + }, + { + "epoch": 0.74, + "grad_norm": 1.0078125, + "learning_rate": 0.00019567775051646086, + "loss": 0.5808, + "step": 5532 + }, + { + "epoch": 0.74, + "grad_norm": 0.6015625, + "learning_rate": 0.00019567436325325854, + "loss": 0.2743, + "step": 5533 + }, + { + "epoch": 0.74, + "grad_norm": 0.6484375, + "learning_rate": 0.00019567097469265046, + "loss": 0.5404, + "step": 5534 + }, + { + "epoch": 0.74, + "grad_norm": 0.60546875, + "learning_rate": 0.00019566758483468254, + "loss": 0.5105, + "step": 5535 + }, + { + "epoch": 0.74, + "grad_norm": 0.71875, + "learning_rate": 0.0001956641936794008, + "loss": 0.6164, + "step": 5536 + }, + { + "epoch": 0.74, + "grad_norm": 0.6875, + "learning_rate": 0.00019566080122685118, + "loss": 0.4402, + "step": 5537 + }, + { + "epoch": 0.74, + "grad_norm": 0.609375, + "learning_rate": 0.00019565740747707968, + "loss": 0.4696, + "step": 5538 + }, + { + "epoch": 0.74, + "grad_norm": 0.9609375, + "learning_rate": 0.00019565401243013236, + "loss": 0.4854, + "step": 5539 + }, + { + "epoch": 0.74, + "grad_norm": 0.5546875, + "learning_rate": 0.00019565061608605526, + "loss": 0.194, + "step": 5540 + }, + { + "epoch": 0.74, + "grad_norm": 0.4921875, + "learning_rate": 0.0001956472184448944, + "loss": 0.2614, + "step": 5541 + }, + { + "epoch": 0.74, + "grad_norm": 0.5546875, + "learning_rate": 0.00019564381950669586, + "loss": 0.5397, + "step": 5542 + }, + { + "epoch": 0.74, + "grad_norm": 0.6015625, + "learning_rate": 0.00019564041927150577, + "loss": 0.4425, + "step": 5543 + }, + { + "epoch": 0.74, + "grad_norm": 0.3515625, + "learning_rate": 0.00019563701773937016, + "loss": 0.1905, + "step": 5544 + }, + { + "epoch": 0.74, + "grad_norm": 0.8671875, + "learning_rate": 0.00019563361491033529, + "loss": 0.4158, + "step": 5545 + }, + { + "epoch": 0.74, + "grad_norm": 0.44140625, + "learning_rate": 0.00019563021078444716, + "loss": 0.3328, + "step": 5546 + }, + { + "epoch": 0.74, + "grad_norm": 0.83203125, + "learning_rate": 0.00019562680536175205, + "loss": 0.4412, + "step": 5547 + }, + { + "epoch": 0.74, + "grad_norm": 0.88671875, + "learning_rate": 0.00019562339864229603, + "loss": 0.4644, + "step": 5548 + }, + { + "epoch": 0.74, + "grad_norm": 0.65234375, + "learning_rate": 0.00019561999062612539, + "loss": 0.6266, + "step": 5549 + }, + { + "epoch": 0.74, + "grad_norm": 0.458984375, + "learning_rate": 0.0001956165813132863, + "loss": 0.4247, + "step": 5550 + }, + { + "epoch": 0.74, + "grad_norm": 0.6640625, + "learning_rate": 0.000195613170703825, + "loss": 0.5083, + "step": 5551 + }, + { + "epoch": 0.74, + "grad_norm": 0.6171875, + "learning_rate": 0.00019560975879778772, + "loss": 0.6389, + "step": 5552 + }, + { + "epoch": 0.74, + "grad_norm": 0.828125, + "learning_rate": 0.0001956063455952208, + "loss": 0.4783, + "step": 5553 + }, + { + "epoch": 0.74, + "grad_norm": 0.953125, + "learning_rate": 0.00019560293109617042, + "loss": 0.5299, + "step": 5554 + }, + { + "epoch": 0.74, + "grad_norm": 0.51953125, + "learning_rate": 0.00019559951530068298, + "loss": 0.3029, + "step": 5555 + }, + { + "epoch": 0.74, + "grad_norm": 0.76953125, + "learning_rate": 0.00019559609820880472, + "loss": 0.5306, + "step": 5556 + }, + { + "epoch": 0.74, + "grad_norm": 0.515625, + "learning_rate": 0.00019559267982058204, + "loss": 0.4053, + "step": 5557 + }, + { + "epoch": 0.74, + "grad_norm": 0.70703125, + "learning_rate": 0.00019558926013606127, + "loss": 0.3475, + "step": 5558 + }, + { + "epoch": 0.74, + "grad_norm": 0.6484375, + "learning_rate": 0.00019558583915528876, + "loss": 0.4425, + "step": 5559 + }, + { + "epoch": 0.74, + "grad_norm": 0.61328125, + "learning_rate": 0.00019558241687831095, + "loss": 0.7048, + "step": 5560 + }, + { + "epoch": 0.74, + "grad_norm": 0.5625, + "learning_rate": 0.00019557899330517421, + "loss": 0.5596, + "step": 5561 + }, + { + "epoch": 0.74, + "grad_norm": 0.7421875, + "learning_rate": 0.00019557556843592498, + "loss": 0.5465, + "step": 5562 + }, + { + "epoch": 0.74, + "grad_norm": 0.77734375, + "learning_rate": 0.0001955721422706097, + "loss": 0.3827, + "step": 5563 + }, + { + "epoch": 0.74, + "grad_norm": 0.875, + "learning_rate": 0.00019556871480927486, + "loss": 0.4044, + "step": 5564 + }, + { + "epoch": 0.74, + "grad_norm": 0.546875, + "learning_rate": 0.0001955652860519669, + "loss": 0.4166, + "step": 5565 + }, + { + "epoch": 0.74, + "grad_norm": 0.6640625, + "learning_rate": 0.0001955618559987323, + "loss": 0.3332, + "step": 5566 + }, + { + "epoch": 0.74, + "grad_norm": 0.51171875, + "learning_rate": 0.00019555842464961763, + "loss": 0.6017, + "step": 5567 + }, + { + "epoch": 0.74, + "grad_norm": 0.52734375, + "learning_rate": 0.00019555499200466937, + "loss": 0.3955, + "step": 5568 + }, + { + "epoch": 0.74, + "grad_norm": 0.74609375, + "learning_rate": 0.0001955515580639341, + "loss": 0.5491, + "step": 5569 + }, + { + "epoch": 0.74, + "grad_norm": 0.5625, + "learning_rate": 0.00019554812282745841, + "loss": 0.4229, + "step": 5570 + }, + { + "epoch": 0.74, + "grad_norm": 0.81640625, + "learning_rate": 0.00019554468629528882, + "loss": 0.5599, + "step": 5571 + }, + { + "epoch": 0.74, + "grad_norm": 0.609375, + "learning_rate": 0.00019554124846747194, + "loss": 0.3081, + "step": 5572 + }, + { + "epoch": 0.74, + "grad_norm": 0.6328125, + "learning_rate": 0.00019553780934405444, + "loss": 0.4057, + "step": 5573 + }, + { + "epoch": 0.74, + "grad_norm": 0.7734375, + "learning_rate": 0.00019553436892508295, + "loss": 0.4148, + "step": 5574 + }, + { + "epoch": 0.74, + "grad_norm": 0.6875, + "learning_rate": 0.00019553092721060405, + "loss": 0.8655, + "step": 5575 + }, + { + "epoch": 0.74, + "grad_norm": 0.5859375, + "learning_rate": 0.0001955274842006645, + "loss": 0.4824, + "step": 5576 + }, + { + "epoch": 0.74, + "grad_norm": 0.890625, + "learning_rate": 0.000195524039895311, + "loss": 0.7734, + "step": 5577 + }, + { + "epoch": 0.74, + "grad_norm": 0.6015625, + "learning_rate": 0.00019552059429459013, + "loss": 0.6296, + "step": 5578 + }, + { + "epoch": 0.74, + "grad_norm": 0.52734375, + "learning_rate": 0.00019551714739854875, + "loss": 0.6073, + "step": 5579 + }, + { + "epoch": 0.74, + "grad_norm": 0.73828125, + "learning_rate": 0.00019551369920723356, + "loss": 0.5127, + "step": 5580 + }, + { + "epoch": 0.74, + "grad_norm": 0.50390625, + "learning_rate": 0.00019551024972069126, + "loss": 0.415, + "step": 5581 + }, + { + "epoch": 0.74, + "grad_norm": 0.6171875, + "learning_rate": 0.0001955067989389687, + "loss": 0.4092, + "step": 5582 + }, + { + "epoch": 0.74, + "grad_norm": 0.484375, + "learning_rate": 0.00019550334686211263, + "loss": 0.35, + "step": 5583 + }, + { + "epoch": 0.75, + "grad_norm": 0.6875, + "learning_rate": 0.0001954998934901699, + "loss": 0.624, + "step": 5584 + }, + { + "epoch": 0.75, + "grad_norm": 0.82421875, + "learning_rate": 0.00019549643882318734, + "loss": 0.7557, + "step": 5585 + }, + { + "epoch": 0.75, + "grad_norm": 0.58984375, + "learning_rate": 0.00019549298286121176, + "loss": 0.6339, + "step": 5586 + }, + { + "epoch": 0.75, + "grad_norm": 0.62890625, + "learning_rate": 0.00019548952560429004, + "loss": 0.6438, + "step": 5587 + }, + { + "epoch": 0.75, + "grad_norm": 0.76171875, + "learning_rate": 0.00019548606705246908, + "loss": 0.3206, + "step": 5588 + }, + { + "epoch": 0.75, + "grad_norm": 0.58984375, + "learning_rate": 0.00019548260720579576, + "loss": 0.7091, + "step": 5589 + }, + { + "epoch": 0.75, + "grad_norm": 0.56640625, + "learning_rate": 0.000195479146064317, + "loss": 0.4613, + "step": 5590 + }, + { + "epoch": 0.75, + "grad_norm": 0.53125, + "learning_rate": 0.00019547568362807979, + "loss": 0.3449, + "step": 5591 + }, + { + "epoch": 0.75, + "grad_norm": 0.5546875, + "learning_rate": 0.00019547221989713097, + "loss": 0.5338, + "step": 5592 + }, + { + "epoch": 0.75, + "grad_norm": 1.1328125, + "learning_rate": 0.00019546875487151758, + "loss": 0.4048, + "step": 5593 + }, + { + "epoch": 0.75, + "grad_norm": 0.99609375, + "learning_rate": 0.00019546528855128665, + "loss": 0.4541, + "step": 5594 + }, + { + "epoch": 0.75, + "grad_norm": 0.76953125, + "learning_rate": 0.0001954618209364851, + "loss": 0.8179, + "step": 5595 + }, + { + "epoch": 0.75, + "grad_norm": 0.53515625, + "learning_rate": 0.00019545835202716, + "loss": 0.3675, + "step": 5596 + }, + { + "epoch": 0.75, + "grad_norm": 0.52734375, + "learning_rate": 0.00019545488182335838, + "loss": 0.3339, + "step": 5597 + }, + { + "epoch": 0.75, + "grad_norm": 0.8828125, + "learning_rate": 0.0001954514103251273, + "loss": 0.4468, + "step": 5598 + }, + { + "epoch": 0.75, + "grad_norm": 0.6796875, + "learning_rate": 0.00019544793753251388, + "loss": 0.4066, + "step": 5599 + }, + { + "epoch": 0.75, + "grad_norm": 0.62890625, + "learning_rate": 0.0001954444634455651, + "loss": 0.5137, + "step": 5600 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 0.00019544098806432815, + "loss": 0.3291, + "step": 5601 + }, + { + "epoch": 0.75, + "grad_norm": 0.45703125, + "learning_rate": 0.00019543751138885018, + "loss": 0.3258, + "step": 5602 + }, + { + "epoch": 0.75, + "grad_norm": 0.86328125, + "learning_rate": 0.00019543403341917829, + "loss": 0.4427, + "step": 5603 + }, + { + "epoch": 0.75, + "grad_norm": 0.5546875, + "learning_rate": 0.00019543055415535967, + "loss": 0.6865, + "step": 5604 + }, + { + "epoch": 0.75, + "grad_norm": 0.51953125, + "learning_rate": 0.00019542707359744145, + "loss": 0.282, + "step": 5605 + }, + { + "epoch": 0.75, + "grad_norm": 0.482421875, + "learning_rate": 0.00019542359174547092, + "loss": 0.3271, + "step": 5606 + }, + { + "epoch": 0.75, + "grad_norm": 0.49609375, + "learning_rate": 0.0001954201085994952, + "loss": 0.3576, + "step": 5607 + }, + { + "epoch": 0.75, + "grad_norm": 0.55078125, + "learning_rate": 0.0001954166241595616, + "loss": 0.3478, + "step": 5608 + }, + { + "epoch": 0.75, + "grad_norm": 0.4140625, + "learning_rate": 0.0001954131384257173, + "loss": 0.4213, + "step": 5609 + }, + { + "epoch": 0.75, + "grad_norm": 0.7265625, + "learning_rate": 0.00019540965139800965, + "loss": 0.6289, + "step": 5610 + }, + { + "epoch": 0.75, + "grad_norm": 0.77734375, + "learning_rate": 0.0001954061630764859, + "loss": 0.4333, + "step": 5611 + }, + { + "epoch": 0.75, + "grad_norm": 0.71875, + "learning_rate": 0.0001954026734611933, + "loss": 0.6392, + "step": 5612 + }, + { + "epoch": 0.75, + "grad_norm": 0.8984375, + "learning_rate": 0.00019539918255217923, + "loss": 0.3845, + "step": 5613 + }, + { + "epoch": 0.75, + "grad_norm": 0.734375, + "learning_rate": 0.00019539569034949104, + "loss": 0.5266, + "step": 5614 + }, + { + "epoch": 0.75, + "grad_norm": 0.51171875, + "learning_rate": 0.00019539219685317606, + "loss": 0.2581, + "step": 5615 + }, + { + "epoch": 0.75, + "grad_norm": 0.54296875, + "learning_rate": 0.00019538870206328165, + "loss": 0.518, + "step": 5616 + }, + { + "epoch": 0.75, + "grad_norm": 0.486328125, + "learning_rate": 0.00019538520597985523, + "loss": 0.273, + "step": 5617 + }, + { + "epoch": 0.75, + "grad_norm": 0.5859375, + "learning_rate": 0.0001953817086029442, + "loss": 0.3336, + "step": 5618 + }, + { + "epoch": 0.75, + "grad_norm": 0.58203125, + "learning_rate": 0.00019537820993259598, + "loss": 0.1854, + "step": 5619 + }, + { + "epoch": 0.75, + "grad_norm": 0.5859375, + "learning_rate": 0.00019537470996885804, + "loss": 0.6281, + "step": 5620 + }, + { + "epoch": 0.75, + "grad_norm": 0.76171875, + "learning_rate": 0.0001953712087117778, + "loss": 0.4063, + "step": 5621 + }, + { + "epoch": 0.75, + "grad_norm": 0.76953125, + "learning_rate": 0.00019536770616140276, + "loss": 0.4999, + "step": 5622 + }, + { + "epoch": 0.75, + "grad_norm": 1.2421875, + "learning_rate": 0.00019536420231778046, + "loss": 0.5068, + "step": 5623 + }, + { + "epoch": 0.75, + "grad_norm": 0.765625, + "learning_rate": 0.00019536069718095834, + "loss": 0.5309, + "step": 5624 + }, + { + "epoch": 0.75, + "grad_norm": 0.609375, + "learning_rate": 0.00019535719075098394, + "loss": 0.1997, + "step": 5625 + }, + { + "epoch": 0.75, + "grad_norm": 0.703125, + "learning_rate": 0.00019535368302790485, + "loss": 0.3705, + "step": 5626 + }, + { + "epoch": 0.75, + "grad_norm": 0.6875, + "learning_rate": 0.00019535017401176865, + "loss": 0.2893, + "step": 5627 + }, + { + "epoch": 0.75, + "grad_norm": 0.5859375, + "learning_rate": 0.00019534666370262283, + "loss": 0.4261, + "step": 5628 + }, + { + "epoch": 0.75, + "grad_norm": 0.6640625, + "learning_rate": 0.00019534315210051513, + "loss": 0.2887, + "step": 5629 + }, + { + "epoch": 0.75, + "grad_norm": 0.80859375, + "learning_rate": 0.00019533963920549306, + "loss": 0.2574, + "step": 5630 + }, + { + "epoch": 0.75, + "grad_norm": 0.486328125, + "learning_rate": 0.0001953361250176043, + "loss": 0.3521, + "step": 5631 + }, + { + "epoch": 0.75, + "grad_norm": 0.5859375, + "learning_rate": 0.00019533260953689648, + "loss": 0.6018, + "step": 5632 + }, + { + "epoch": 0.75, + "grad_norm": 0.8671875, + "learning_rate": 0.00019532909276341734, + "loss": 0.3491, + "step": 5633 + }, + { + "epoch": 0.75, + "grad_norm": 0.53515625, + "learning_rate": 0.00019532557469721448, + "loss": 0.2559, + "step": 5634 + }, + { + "epoch": 0.75, + "grad_norm": 0.63671875, + "learning_rate": 0.00019532205533833566, + "loss": 0.6489, + "step": 5635 + }, + { + "epoch": 0.75, + "grad_norm": 0.8828125, + "learning_rate": 0.0001953185346868286, + "loss": 0.4736, + "step": 5636 + }, + { + "epoch": 0.75, + "grad_norm": 0.484375, + "learning_rate": 0.000195315012742741, + "loss": 0.3413, + "step": 5637 + }, + { + "epoch": 0.75, + "grad_norm": 0.58203125, + "learning_rate": 0.00019531148950612068, + "loss": 0.5068, + "step": 5638 + }, + { + "epoch": 0.75, + "grad_norm": 0.609375, + "learning_rate": 0.0001953079649770154, + "loss": 0.6058, + "step": 5639 + }, + { + "epoch": 0.75, + "grad_norm": 0.5546875, + "learning_rate": 0.00019530443915547293, + "loss": 0.4243, + "step": 5640 + }, + { + "epoch": 0.75, + "grad_norm": 0.51953125, + "learning_rate": 0.00019530091204154112, + "loss": 0.6116, + "step": 5641 + }, + { + "epoch": 0.75, + "grad_norm": 0.66796875, + "learning_rate": 0.00019529738363526774, + "loss": 0.4972, + "step": 5642 + }, + { + "epoch": 0.75, + "grad_norm": 0.7109375, + "learning_rate": 0.00019529385393670073, + "loss": 0.3089, + "step": 5643 + }, + { + "epoch": 0.75, + "grad_norm": 0.546875, + "learning_rate": 0.00019529032294588787, + "loss": 0.5326, + "step": 5644 + }, + { + "epoch": 0.75, + "grad_norm": 0.58984375, + "learning_rate": 0.00019528679066287708, + "loss": 0.5768, + "step": 5645 + }, + { + "epoch": 0.75, + "grad_norm": 0.8359375, + "learning_rate": 0.00019528325708771625, + "loss": 0.5804, + "step": 5646 + }, + { + "epoch": 0.75, + "grad_norm": 0.7109375, + "learning_rate": 0.0001952797222204533, + "loss": 0.3889, + "step": 5647 + }, + { + "epoch": 0.75, + "grad_norm": 0.70703125, + "learning_rate": 0.00019527618606113615, + "loss": 0.2965, + "step": 5648 + }, + { + "epoch": 0.75, + "grad_norm": 0.5625, + "learning_rate": 0.0001952726486098128, + "loss": 0.3481, + "step": 5649 + }, + { + "epoch": 0.75, + "grad_norm": 0.6796875, + "learning_rate": 0.0001952691098665312, + "loss": 0.8792, + "step": 5650 + }, + { + "epoch": 0.75, + "grad_norm": 0.6875, + "learning_rate": 0.00019526556983133928, + "loss": 0.6775, + "step": 5651 + }, + { + "epoch": 0.75, + "grad_norm": 0.95703125, + "learning_rate": 0.00019526202850428512, + "loss": 0.5779, + "step": 5652 + }, + { + "epoch": 0.75, + "grad_norm": 0.90625, + "learning_rate": 0.00019525848588541672, + "loss": 0.6699, + "step": 5653 + }, + { + "epoch": 0.75, + "grad_norm": 0.7109375, + "learning_rate": 0.0001952549419747821, + "loss": 0.6402, + "step": 5654 + }, + { + "epoch": 0.75, + "grad_norm": 0.63671875, + "learning_rate": 0.00019525139677242933, + "loss": 0.5035, + "step": 5655 + }, + { + "epoch": 0.75, + "grad_norm": 0.67578125, + "learning_rate": 0.00019524785027840648, + "loss": 0.2613, + "step": 5656 + }, + { + "epoch": 0.75, + "grad_norm": 0.859375, + "learning_rate": 0.00019524430249276168, + "loss": 0.5917, + "step": 5657 + }, + { + "epoch": 0.76, + "grad_norm": 0.86328125, + "learning_rate": 0.000195240753415543, + "loss": 0.4929, + "step": 5658 + }, + { + "epoch": 0.76, + "grad_norm": 0.69921875, + "learning_rate": 0.00019523720304679857, + "loss": 0.5378, + "step": 5659 + }, + { + "epoch": 0.76, + "grad_norm": 0.6953125, + "learning_rate": 0.00019523365138657655, + "loss": 0.5897, + "step": 5660 + }, + { + "epoch": 0.76, + "grad_norm": 0.7734375, + "learning_rate": 0.0001952300984349251, + "loss": 0.3731, + "step": 5661 + }, + { + "epoch": 0.76, + "grad_norm": 0.6875, + "learning_rate": 0.0001952265441918924, + "loss": 0.3272, + "step": 5662 + }, + { + "epoch": 0.76, + "grad_norm": 0.7890625, + "learning_rate": 0.0001952229886575266, + "loss": 0.3749, + "step": 5663 + }, + { + "epoch": 0.76, + "grad_norm": 0.59375, + "learning_rate": 0.00019521943183187603, + "loss": 0.6086, + "step": 5664 + }, + { + "epoch": 0.76, + "grad_norm": 0.65625, + "learning_rate": 0.0001952158737149888, + "loss": 0.528, + "step": 5665 + }, + { + "epoch": 0.76, + "grad_norm": 0.8515625, + "learning_rate": 0.00019521231430691323, + "loss": 0.3306, + "step": 5666 + }, + { + "epoch": 0.76, + "grad_norm": 0.57421875, + "learning_rate": 0.00019520875360769759, + "loss": 0.6862, + "step": 5667 + }, + { + "epoch": 0.76, + "grad_norm": 0.55078125, + "learning_rate": 0.00019520519161739014, + "loss": 0.6303, + "step": 5668 + }, + { + "epoch": 0.76, + "grad_norm": 0.6484375, + "learning_rate": 0.0001952016283360392, + "loss": 0.5129, + "step": 5669 + }, + { + "epoch": 0.76, + "grad_norm": 0.6015625, + "learning_rate": 0.00019519806376369305, + "loss": 0.3222, + "step": 5670 + }, + { + "epoch": 0.76, + "grad_norm": 0.77734375, + "learning_rate": 0.00019519449790040007, + "loss": 0.3873, + "step": 5671 + }, + { + "epoch": 0.76, + "grad_norm": 0.6171875, + "learning_rate": 0.0001951909307462086, + "loss": 0.8263, + "step": 5672 + }, + { + "epoch": 0.76, + "grad_norm": 0.51953125, + "learning_rate": 0.00019518736230116703, + "loss": 0.3618, + "step": 5673 + }, + { + "epoch": 0.76, + "grad_norm": 0.65625, + "learning_rate": 0.0001951837925653237, + "loss": 0.6203, + "step": 5674 + }, + { + "epoch": 0.76, + "grad_norm": 0.734375, + "learning_rate": 0.0001951802215387271, + "loss": 0.5497, + "step": 5675 + }, + { + "epoch": 0.76, + "grad_norm": 0.5703125, + "learning_rate": 0.0001951766492214256, + "loss": 0.2975, + "step": 5676 + }, + { + "epoch": 0.76, + "grad_norm": 0.56640625, + "learning_rate": 0.00019517307561346762, + "loss": 0.7645, + "step": 5677 + }, + { + "epoch": 0.76, + "grad_norm": 0.6171875, + "learning_rate": 0.00019516950071490167, + "loss": 0.4087, + "step": 5678 + }, + { + "epoch": 0.76, + "grad_norm": 0.6171875, + "learning_rate": 0.0001951659245257762, + "loss": 0.729, + "step": 5679 + }, + { + "epoch": 0.76, + "grad_norm": 0.625, + "learning_rate": 0.00019516234704613976, + "loss": 0.4073, + "step": 5680 + }, + { + "epoch": 0.76, + "grad_norm": 0.53515625, + "learning_rate": 0.00019515876827604079, + "loss": 0.5706, + "step": 5681 + }, + { + "epoch": 0.76, + "grad_norm": 0.68359375, + "learning_rate": 0.00019515518821552785, + "loss": 0.2537, + "step": 5682 + }, + { + "epoch": 0.76, + "grad_norm": 0.62109375, + "learning_rate": 0.00019515160686464948, + "loss": 0.4098, + "step": 5683 + }, + { + "epoch": 0.76, + "grad_norm": 0.94140625, + "learning_rate": 0.00019514802422345427, + "loss": 0.5135, + "step": 5684 + }, + { + "epoch": 0.76, + "grad_norm": 0.5546875, + "learning_rate": 0.00019514444029199076, + "loss": 0.5672, + "step": 5685 + }, + { + "epoch": 0.76, + "grad_norm": 0.73046875, + "learning_rate": 0.0001951408550703076, + "loss": 0.6378, + "step": 5686 + }, + { + "epoch": 0.76, + "grad_norm": 0.84375, + "learning_rate": 0.00019513726855845337, + "loss": 0.5256, + "step": 5687 + }, + { + "epoch": 0.76, + "grad_norm": 0.90625, + "learning_rate": 0.00019513368075647675, + "loss": 0.4333, + "step": 5688 + }, + { + "epoch": 0.76, + "grad_norm": 0.546875, + "learning_rate": 0.00019513009166442633, + "loss": 0.3672, + "step": 5689 + }, + { + "epoch": 0.76, + "grad_norm": 0.5546875, + "learning_rate": 0.0001951265012823508, + "loss": 0.4528, + "step": 5690 + }, + { + "epoch": 0.76, + "grad_norm": 0.66796875, + "learning_rate": 0.0001951229096102989, + "loss": 0.3721, + "step": 5691 + }, + { + "epoch": 0.76, + "grad_norm": 0.7890625, + "learning_rate": 0.00019511931664831927, + "loss": 0.7227, + "step": 5692 + }, + { + "epoch": 0.76, + "grad_norm": 0.76953125, + "learning_rate": 0.00019511572239646068, + "loss": 0.3736, + "step": 5693 + }, + { + "epoch": 0.76, + "grad_norm": 0.421875, + "learning_rate": 0.00019511212685477186, + "loss": 0.4153, + "step": 5694 + }, + { + "epoch": 0.76, + "grad_norm": 0.59765625, + "learning_rate": 0.00019510853002330152, + "loss": 0.4541, + "step": 5695 + }, + { + "epoch": 0.76, + "grad_norm": 0.6796875, + "learning_rate": 0.0001951049319020985, + "loss": 0.5588, + "step": 5696 + }, + { + "epoch": 0.76, + "grad_norm": 0.40234375, + "learning_rate": 0.00019510133249121155, + "loss": 0.2825, + "step": 5697 + }, + { + "epoch": 0.76, + "grad_norm": 0.49609375, + "learning_rate": 0.0001950977317906895, + "loss": 0.4441, + "step": 5698 + }, + { + "epoch": 0.76, + "grad_norm": 0.65234375, + "learning_rate": 0.00019509412980058116, + "loss": 0.4387, + "step": 5699 + }, + { + "epoch": 0.76, + "grad_norm": 0.69921875, + "learning_rate": 0.0001950905265209354, + "loss": 0.3242, + "step": 5700 + }, + { + "epoch": 0.76, + "grad_norm": 0.93359375, + "learning_rate": 0.00019508692195180109, + "loss": 0.4174, + "step": 5701 + }, + { + "epoch": 0.76, + "grad_norm": 0.796875, + "learning_rate": 0.00019508331609322706, + "loss": 0.6201, + "step": 5702 + }, + { + "epoch": 0.76, + "grad_norm": 0.54296875, + "learning_rate": 0.00019507970894526227, + "loss": 0.8189, + "step": 5703 + }, + { + "epoch": 0.76, + "grad_norm": 0.447265625, + "learning_rate": 0.00019507610050795558, + "loss": 0.3485, + "step": 5704 + }, + { + "epoch": 0.76, + "grad_norm": 0.80859375, + "learning_rate": 0.00019507249078135598, + "loss": 0.4272, + "step": 5705 + }, + { + "epoch": 0.76, + "grad_norm": 0.59375, + "learning_rate": 0.00019506887976551232, + "loss": 0.3403, + "step": 5706 + }, + { + "epoch": 0.76, + "grad_norm": 0.52734375, + "learning_rate": 0.00019506526746047368, + "loss": 0.41, + "step": 5707 + }, + { + "epoch": 0.76, + "grad_norm": 0.7578125, + "learning_rate": 0.00019506165386628903, + "loss": 0.512, + "step": 5708 + }, + { + "epoch": 0.76, + "grad_norm": 0.51953125, + "learning_rate": 0.00019505803898300732, + "loss": 0.6097, + "step": 5709 + }, + { + "epoch": 0.76, + "grad_norm": 0.51171875, + "learning_rate": 0.0001950544228106776, + "loss": 0.4537, + "step": 5710 + }, + { + "epoch": 0.76, + "grad_norm": 1.1171875, + "learning_rate": 0.00019505080534934887, + "loss": 0.525, + "step": 5711 + }, + { + "epoch": 0.76, + "grad_norm": 0.56640625, + "learning_rate": 0.00019504718659907024, + "loss": 0.45, + "step": 5712 + }, + { + "epoch": 0.76, + "grad_norm": 0.5703125, + "learning_rate": 0.00019504356655989077, + "loss": 0.2388, + "step": 5713 + }, + { + "epoch": 0.76, + "grad_norm": 0.546875, + "learning_rate": 0.0001950399452318595, + "loss": 0.4104, + "step": 5714 + }, + { + "epoch": 0.76, + "grad_norm": 0.83203125, + "learning_rate": 0.00019503632261502562, + "loss": 0.9538, + "step": 5715 + }, + { + "epoch": 0.76, + "grad_norm": 0.56640625, + "learning_rate": 0.0001950326987094382, + "loss": 0.7577, + "step": 5716 + }, + { + "epoch": 0.76, + "grad_norm": 0.6171875, + "learning_rate": 0.00019502907351514636, + "loss": 0.5263, + "step": 5717 + }, + { + "epoch": 0.76, + "grad_norm": 0.55078125, + "learning_rate": 0.00019502544703219933, + "loss": 0.3273, + "step": 5718 + }, + { + "epoch": 0.76, + "grad_norm": 0.625, + "learning_rate": 0.00019502181926064625, + "loss": 0.5089, + "step": 5719 + }, + { + "epoch": 0.76, + "grad_norm": 0.65234375, + "learning_rate": 0.00019501819020053632, + "loss": 0.6197, + "step": 5720 + }, + { + "epoch": 0.76, + "grad_norm": 0.66796875, + "learning_rate": 0.00019501455985191873, + "loss": 0.3538, + "step": 5721 + }, + { + "epoch": 0.76, + "grad_norm": 0.6796875, + "learning_rate": 0.00019501092821484273, + "loss": 0.5881, + "step": 5722 + }, + { + "epoch": 0.76, + "grad_norm": 0.6953125, + "learning_rate": 0.00019500729528935758, + "loss": 0.357, + "step": 5723 + }, + { + "epoch": 0.76, + "grad_norm": 0.65234375, + "learning_rate": 0.00019500366107551252, + "loss": 0.7873, + "step": 5724 + }, + { + "epoch": 0.76, + "grad_norm": 0.58984375, + "learning_rate": 0.00019500002557335682, + "loss": 0.4363, + "step": 5725 + }, + { + "epoch": 0.76, + "grad_norm": 0.56640625, + "learning_rate": 0.00019499638878293983, + "loss": 0.6991, + "step": 5726 + }, + { + "epoch": 0.76, + "grad_norm": 0.51171875, + "learning_rate": 0.00019499275070431081, + "loss": 0.4831, + "step": 5727 + }, + { + "epoch": 0.76, + "grad_norm": 0.78515625, + "learning_rate": 0.0001949891113375192, + "loss": 0.2997, + "step": 5728 + }, + { + "epoch": 0.76, + "grad_norm": 0.36328125, + "learning_rate": 0.0001949854706826142, + "loss": 0.3431, + "step": 5729 + }, + { + "epoch": 0.76, + "grad_norm": 0.6796875, + "learning_rate": 0.0001949818287396453, + "loss": 0.2987, + "step": 5730 + }, + { + "epoch": 0.76, + "grad_norm": 0.6953125, + "learning_rate": 0.00019497818550866182, + "loss": 0.5495, + "step": 5731 + }, + { + "epoch": 0.76, + "grad_norm": 0.9375, + "learning_rate": 0.0001949745409897132, + "loss": 0.4931, + "step": 5732 + }, + { + "epoch": 0.77, + "grad_norm": 0.9453125, + "learning_rate": 0.00019497089518284884, + "loss": 0.464, + "step": 5733 + }, + { + "epoch": 0.77, + "grad_norm": 0.578125, + "learning_rate": 0.00019496724808811818, + "loss": 0.4914, + "step": 5734 + }, + { + "epoch": 0.77, + "grad_norm": 0.6953125, + "learning_rate": 0.00019496359970557072, + "loss": 0.4247, + "step": 5735 + }, + { + "epoch": 0.77, + "grad_norm": 0.60546875, + "learning_rate": 0.00019495995003525587, + "loss": 0.7654, + "step": 5736 + }, + { + "epoch": 0.77, + "grad_norm": 0.51171875, + "learning_rate": 0.00019495629907722317, + "loss": 0.717, + "step": 5737 + }, + { + "epoch": 0.77, + "grad_norm": 0.55859375, + "learning_rate": 0.0001949526468315221, + "loss": 0.5534, + "step": 5738 + }, + { + "epoch": 0.77, + "grad_norm": 0.6015625, + "learning_rate": 0.00019494899329820223, + "loss": 0.9019, + "step": 5739 + }, + { + "epoch": 0.77, + "grad_norm": 0.85546875, + "learning_rate": 0.00019494533847731304, + "loss": 0.635, + "step": 5740 + }, + { + "epoch": 0.77, + "grad_norm": 0.8671875, + "learning_rate": 0.00019494168236890413, + "loss": 0.5088, + "step": 5741 + }, + { + "epoch": 0.77, + "grad_norm": 0.3984375, + "learning_rate": 0.00019493802497302508, + "loss": 0.498, + "step": 5742 + }, + { + "epoch": 0.77, + "grad_norm": 0.578125, + "learning_rate": 0.00019493436628972548, + "loss": 0.4445, + "step": 5743 + }, + { + "epoch": 0.77, + "grad_norm": 0.6953125, + "learning_rate": 0.00019493070631905493, + "loss": 0.8641, + "step": 5744 + }, + { + "epoch": 0.77, + "grad_norm": 0.55078125, + "learning_rate": 0.0001949270450610631, + "loss": 0.7432, + "step": 5745 + }, + { + "epoch": 0.77, + "grad_norm": 0.8984375, + "learning_rate": 0.00019492338251579958, + "loss": 0.4503, + "step": 5746 + }, + { + "epoch": 0.77, + "grad_norm": 0.64453125, + "learning_rate": 0.00019491971868331408, + "loss": 0.4276, + "step": 5747 + }, + { + "epoch": 0.77, + "grad_norm": 0.64453125, + "learning_rate": 0.00019491605356365633, + "loss": 0.3007, + "step": 5748 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 0.00019491238715687591, + "loss": 0.4828, + "step": 5749 + }, + { + "epoch": 0.77, + "grad_norm": 1.1875, + "learning_rate": 0.00019490871946302264, + "loss": 0.5355, + "step": 5750 + }, + { + "epoch": 0.77, + "grad_norm": 0.78515625, + "learning_rate": 0.00019490505048214623, + "loss": 0.2433, + "step": 5751 + }, + { + "epoch": 0.77, + "grad_norm": 0.93359375, + "learning_rate": 0.0001949013802142964, + "loss": 0.4936, + "step": 5752 + }, + { + "epoch": 0.77, + "grad_norm": 0.65625, + "learning_rate": 0.00019489770865952294, + "loss": 0.3531, + "step": 5753 + }, + { + "epoch": 0.77, + "grad_norm": 0.447265625, + "learning_rate": 0.0001948940358178757, + "loss": 0.3922, + "step": 5754 + }, + { + "epoch": 0.77, + "grad_norm": 0.5859375, + "learning_rate": 0.00019489036168940436, + "loss": 0.4639, + "step": 5755 + }, + { + "epoch": 0.77, + "grad_norm": 0.67578125, + "learning_rate": 0.00019488668627415886, + "loss": 0.4086, + "step": 5756 + }, + { + "epoch": 0.77, + "grad_norm": 0.625, + "learning_rate": 0.000194883009572189, + "loss": 0.6103, + "step": 5757 + }, + { + "epoch": 0.77, + "grad_norm": 0.67578125, + "learning_rate": 0.0001948793315835446, + "loss": 0.3612, + "step": 5758 + }, + { + "epoch": 0.77, + "grad_norm": 0.75, + "learning_rate": 0.00019487565230827563, + "loss": 0.5705, + "step": 5759 + }, + { + "epoch": 0.77, + "grad_norm": 0.59765625, + "learning_rate": 0.00019487197174643188, + "loss": 0.285, + "step": 5760 + }, + { + "epoch": 0.77, + "grad_norm": 0.5859375, + "learning_rate": 0.0001948682898980633, + "loss": 0.511, + "step": 5761 + }, + { + "epoch": 0.77, + "grad_norm": 0.52734375, + "learning_rate": 0.00019486460676321982, + "loss": 0.5052, + "step": 5762 + }, + { + "epoch": 0.77, + "grad_norm": 0.5, + "learning_rate": 0.00019486092234195142, + "loss": 0.3816, + "step": 5763 + }, + { + "epoch": 0.77, + "grad_norm": 0.6640625, + "learning_rate": 0.000194857236634308, + "loss": 0.5484, + "step": 5764 + }, + { + "epoch": 0.77, + "grad_norm": 0.8203125, + "learning_rate": 0.0001948535496403396, + "loss": 0.4929, + "step": 5765 + }, + { + "epoch": 0.77, + "grad_norm": 0.404296875, + "learning_rate": 0.00019484986136009618, + "loss": 0.2803, + "step": 5766 + }, + { + "epoch": 0.77, + "grad_norm": 0.625, + "learning_rate": 0.00019484617179362776, + "loss": 0.2953, + "step": 5767 + }, + { + "epoch": 0.77, + "grad_norm": 0.50390625, + "learning_rate": 0.0001948424809409844, + "loss": 0.5176, + "step": 5768 + }, + { + "epoch": 0.77, + "grad_norm": 0.88671875, + "learning_rate": 0.00019483878880221608, + "loss": 0.3771, + "step": 5769 + }, + { + "epoch": 0.77, + "grad_norm": 0.494140625, + "learning_rate": 0.00019483509537737294, + "loss": 0.3866, + "step": 5770 + }, + { + "epoch": 0.77, + "grad_norm": 0.6015625, + "learning_rate": 0.00019483140066650507, + "loss": 0.401, + "step": 5771 + }, + { + "epoch": 0.77, + "grad_norm": 0.9609375, + "learning_rate": 0.00019482770466966253, + "loss": 0.5727, + "step": 5772 + }, + { + "epoch": 0.77, + "grad_norm": 0.72265625, + "learning_rate": 0.00019482400738689545, + "loss": 0.6485, + "step": 5773 + }, + { + "epoch": 0.77, + "grad_norm": 0.62109375, + "learning_rate": 0.00019482030881825394, + "loss": 0.2482, + "step": 5774 + }, + { + "epoch": 0.77, + "grad_norm": 0.6171875, + "learning_rate": 0.00019481660896378825, + "loss": 0.261, + "step": 5775 + }, + { + "epoch": 0.77, + "grad_norm": 0.5625, + "learning_rate": 0.00019481290782354845, + "loss": 0.7953, + "step": 5776 + }, + { + "epoch": 0.77, + "grad_norm": 0.5546875, + "learning_rate": 0.0001948092053975848, + "loss": 0.3922, + "step": 5777 + }, + { + "epoch": 0.77, + "grad_norm": 0.72265625, + "learning_rate": 0.0001948055016859474, + "loss": 0.4446, + "step": 5778 + }, + { + "epoch": 0.77, + "grad_norm": 1.1484375, + "learning_rate": 0.00019480179668868663, + "loss": 0.5707, + "step": 5779 + }, + { + "epoch": 0.77, + "grad_norm": 0.6328125, + "learning_rate": 0.0001947980904058526, + "loss": 0.6253, + "step": 5780 + }, + { + "epoch": 0.77, + "grad_norm": 0.67578125, + "learning_rate": 0.00019479438283749568, + "loss": 0.455, + "step": 5781 + }, + { + "epoch": 0.77, + "grad_norm": 0.875, + "learning_rate": 0.00019479067398366605, + "loss": 0.4165, + "step": 5782 + }, + { + "epoch": 0.77, + "grad_norm": 0.6796875, + "learning_rate": 0.00019478696384441407, + "loss": 0.4636, + "step": 5783 + }, + { + "epoch": 0.77, + "grad_norm": 0.53125, + "learning_rate": 0.00019478325241979, + "loss": 0.6057, + "step": 5784 + }, + { + "epoch": 0.77, + "grad_norm": 0.4375, + "learning_rate": 0.0001947795397098442, + "loss": 0.4347, + "step": 5785 + }, + { + "epoch": 0.77, + "grad_norm": 0.86328125, + "learning_rate": 0.00019477582571462705, + "loss": 0.6904, + "step": 5786 + }, + { + "epoch": 0.77, + "grad_norm": 0.71484375, + "learning_rate": 0.00019477211043418882, + "loss": 0.4777, + "step": 5787 + }, + { + "epoch": 0.77, + "grad_norm": 0.58984375, + "learning_rate": 0.00019476839386857998, + "loss": 1.242, + "step": 5788 + }, + { + "epoch": 0.77, + "grad_norm": 0.5703125, + "learning_rate": 0.00019476467601785088, + "loss": 0.4284, + "step": 5789 + }, + { + "epoch": 0.77, + "grad_norm": 0.58203125, + "learning_rate": 0.000194760956882052, + "loss": 0.4953, + "step": 5790 + }, + { + "epoch": 0.77, + "grad_norm": 1.3359375, + "learning_rate": 0.00019475723646123367, + "loss": 0.5438, + "step": 5791 + }, + { + "epoch": 0.77, + "grad_norm": 0.390625, + "learning_rate": 0.00019475351475544642, + "loss": 0.3, + "step": 5792 + }, + { + "epoch": 0.77, + "grad_norm": 0.62890625, + "learning_rate": 0.00019474979176474071, + "loss": 0.4574, + "step": 5793 + }, + { + "epoch": 0.77, + "grad_norm": 0.69140625, + "learning_rate": 0.00019474606748916701, + "loss": 0.402, + "step": 5794 + }, + { + "epoch": 0.77, + "grad_norm": 0.63671875, + "learning_rate": 0.0001947423419287758, + "loss": 0.4047, + "step": 5795 + }, + { + "epoch": 0.77, + "grad_norm": 0.98828125, + "learning_rate": 0.00019473861508361766, + "loss": 0.4041, + "step": 5796 + }, + { + "epoch": 0.77, + "grad_norm": 0.65625, + "learning_rate": 0.0001947348869537431, + "loss": 0.6069, + "step": 5797 + }, + { + "epoch": 0.77, + "grad_norm": 0.6796875, + "learning_rate": 0.00019473115753920265, + "loss": 0.4538, + "step": 5798 + }, + { + "epoch": 0.77, + "grad_norm": 0.8046875, + "learning_rate": 0.00019472742684004693, + "loss": 0.4867, + "step": 5799 + }, + { + "epoch": 0.77, + "grad_norm": 0.65625, + "learning_rate": 0.00019472369485632647, + "loss": 0.2549, + "step": 5800 + }, + { + "epoch": 0.77, + "grad_norm": 0.57421875, + "learning_rate": 0.00019471996158809194, + "loss": 0.5217, + "step": 5801 + }, + { + "epoch": 0.77, + "grad_norm": 0.78515625, + "learning_rate": 0.00019471622703539393, + "loss": 0.5445, + "step": 5802 + }, + { + "epoch": 0.77, + "grad_norm": 0.57421875, + "learning_rate": 0.0001947124911982831, + "loss": 0.3176, + "step": 5803 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 0.0001947087540768101, + "loss": 0.3549, + "step": 5804 + }, + { + "epoch": 0.77, + "grad_norm": 0.69921875, + "learning_rate": 0.00019470501567102558, + "loss": 0.3008, + "step": 5805 + }, + { + "epoch": 0.77, + "grad_norm": 0.5859375, + "learning_rate": 0.00019470127598098032, + "loss": 0.6207, + "step": 5806 + }, + { + "epoch": 0.77, + "grad_norm": 0.5703125, + "learning_rate": 0.00019469753500672495, + "loss": 0.3439, + "step": 5807 + }, + { + "epoch": 0.78, + "grad_norm": 0.8515625, + "learning_rate": 0.00019469379274831026, + "loss": 0.8837, + "step": 5808 + }, + { + "epoch": 0.78, + "grad_norm": 0.484375, + "learning_rate": 0.00019469004920578693, + "loss": 0.4908, + "step": 5809 + }, + { + "epoch": 0.78, + "grad_norm": 0.92578125, + "learning_rate": 0.00019468630437920577, + "loss": 0.3956, + "step": 5810 + }, + { + "epoch": 0.78, + "grad_norm": 0.61328125, + "learning_rate": 0.00019468255826861754, + "loss": 0.6922, + "step": 5811 + }, + { + "epoch": 0.78, + "grad_norm": 0.65234375, + "learning_rate": 0.0001946788108740731, + "loss": 0.4475, + "step": 5812 + }, + { + "epoch": 0.78, + "grad_norm": 0.494140625, + "learning_rate": 0.00019467506219562317, + "loss": 0.254, + "step": 5813 + }, + { + "epoch": 0.78, + "grad_norm": 0.46484375, + "learning_rate": 0.00019467131223331867, + "loss": 0.2399, + "step": 5814 + }, + { + "epoch": 0.78, + "grad_norm": 0.53515625, + "learning_rate": 0.0001946675609872104, + "loss": 0.3221, + "step": 5815 + }, + { + "epoch": 0.78, + "grad_norm": 0.65625, + "learning_rate": 0.00019466380845734925, + "loss": 0.4226, + "step": 5816 + }, + { + "epoch": 0.78, + "grad_norm": 0.63671875, + "learning_rate": 0.0001946600546437861, + "loss": 0.5571, + "step": 5817 + }, + { + "epoch": 0.78, + "grad_norm": 0.66015625, + "learning_rate": 0.00019465629954657185, + "loss": 0.6896, + "step": 5818 + }, + { + "epoch": 0.78, + "grad_norm": 0.5234375, + "learning_rate": 0.00019465254316575743, + "loss": 0.3342, + "step": 5819 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 0.00019464878550139379, + "loss": 0.381, + "step": 5820 + }, + { + "epoch": 0.78, + "grad_norm": 0.80078125, + "learning_rate": 0.00019464502655353185, + "loss": 0.4282, + "step": 5821 + }, + { + "epoch": 0.78, + "grad_norm": 0.64453125, + "learning_rate": 0.0001946412663222226, + "loss": 0.6529, + "step": 5822 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 0.00019463750480751708, + "loss": 0.4934, + "step": 5823 + }, + { + "epoch": 0.78, + "grad_norm": 0.5703125, + "learning_rate": 0.00019463374200946626, + "loss": 0.3696, + "step": 5824 + }, + { + "epoch": 0.78, + "grad_norm": 0.60546875, + "learning_rate": 0.00019462997792812111, + "loss": 0.7305, + "step": 5825 + }, + { + "epoch": 0.78, + "grad_norm": 0.421875, + "learning_rate": 0.00019462621256353274, + "loss": 0.3595, + "step": 5826 + }, + { + "epoch": 0.78, + "grad_norm": 0.58203125, + "learning_rate": 0.00019462244591575222, + "loss": 0.3745, + "step": 5827 + }, + { + "epoch": 0.78, + "grad_norm": 0.57421875, + "learning_rate": 0.0001946186779848306, + "loss": 0.3525, + "step": 5828 + }, + { + "epoch": 0.78, + "grad_norm": 0.57421875, + "learning_rate": 0.00019461490877081898, + "loss": 0.3344, + "step": 5829 + }, + { + "epoch": 0.78, + "grad_norm": 0.51953125, + "learning_rate": 0.0001946111382737685, + "loss": 0.4116, + "step": 5830 + }, + { + "epoch": 0.78, + "grad_norm": 0.51953125, + "learning_rate": 0.00019460736649373024, + "loss": 0.5356, + "step": 5831 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 0.00019460359343075533, + "loss": 0.2717, + "step": 5832 + }, + { + "epoch": 0.78, + "grad_norm": 0.5234375, + "learning_rate": 0.00019459981908489504, + "loss": 0.2922, + "step": 5833 + }, + { + "epoch": 0.78, + "grad_norm": 0.6953125, + "learning_rate": 0.00019459604345620045, + "loss": 0.4553, + "step": 5834 + }, + { + "epoch": 0.78, + "grad_norm": 0.484375, + "learning_rate": 0.0001945922665447228, + "loss": 0.5406, + "step": 5835 + }, + { + "epoch": 0.78, + "grad_norm": 0.67578125, + "learning_rate": 0.00019458848835051336, + "loss": 0.4853, + "step": 5836 + }, + { + "epoch": 0.78, + "grad_norm": 0.94921875, + "learning_rate": 0.00019458470887362325, + "loss": 0.5197, + "step": 5837 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 0.00019458092811410382, + "loss": 0.5638, + "step": 5838 + }, + { + "epoch": 0.78, + "grad_norm": 0.78125, + "learning_rate": 0.00019457714607200627, + "loss": 0.3277, + "step": 5839 + }, + { + "epoch": 0.78, + "grad_norm": 0.51171875, + "learning_rate": 0.00019457336274738193, + "loss": 0.3537, + "step": 5840 + }, + { + "epoch": 0.78, + "grad_norm": 0.6171875, + "learning_rate": 0.0001945695781402821, + "loss": 0.4064, + "step": 5841 + }, + { + "epoch": 0.78, + "grad_norm": 0.78515625, + "learning_rate": 0.00019456579225075814, + "loss": 0.4906, + "step": 5842 + }, + { + "epoch": 0.78, + "grad_norm": 0.625, + "learning_rate": 0.00019456200507886128, + "loss": 0.2617, + "step": 5843 + }, + { + "epoch": 0.78, + "grad_norm": 0.63671875, + "learning_rate": 0.00019455821662464295, + "loss": 0.2565, + "step": 5844 + }, + { + "epoch": 0.78, + "grad_norm": 0.71875, + "learning_rate": 0.00019455442688815454, + "loss": 0.5705, + "step": 5845 + }, + { + "epoch": 0.78, + "grad_norm": 0.443359375, + "learning_rate": 0.00019455063586944741, + "loss": 0.6333, + "step": 5846 + }, + { + "epoch": 0.78, + "grad_norm": 0.50390625, + "learning_rate": 0.000194546843568573, + "loss": 0.6629, + "step": 5847 + }, + { + "epoch": 0.78, + "grad_norm": 0.68359375, + "learning_rate": 0.00019454304998558266, + "loss": 0.6827, + "step": 5848 + }, + { + "epoch": 0.78, + "grad_norm": 0.6953125, + "learning_rate": 0.0001945392551205279, + "loss": 0.3059, + "step": 5849 + }, + { + "epoch": 0.78, + "grad_norm": 0.76171875, + "learning_rate": 0.00019453545897346017, + "loss": 0.3621, + "step": 5850 + }, + { + "epoch": 0.78, + "grad_norm": 0.58203125, + "learning_rate": 0.00019453166154443095, + "loss": 0.3054, + "step": 5851 + }, + { + "epoch": 0.78, + "grad_norm": 0.609375, + "learning_rate": 0.00019452786283349175, + "loss": 0.4209, + "step": 5852 + }, + { + "epoch": 0.78, + "grad_norm": 0.60546875, + "learning_rate": 0.00019452406284069406, + "loss": 0.5298, + "step": 5853 + }, + { + "epoch": 0.78, + "grad_norm": 0.6328125, + "learning_rate": 0.00019452026156608935, + "loss": 0.3766, + "step": 5854 + }, + { + "epoch": 0.78, + "grad_norm": 0.609375, + "learning_rate": 0.0001945164590097293, + "loss": 0.2185, + "step": 5855 + }, + { + "epoch": 0.78, + "grad_norm": 0.6015625, + "learning_rate": 0.00019451265517166536, + "loss": 0.4464, + "step": 5856 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 0.0001945088500519492, + "loss": 0.4024, + "step": 5857 + }, + { + "epoch": 0.78, + "grad_norm": 1.1875, + "learning_rate": 0.00019450504365063233, + "loss": 0.335, + "step": 5858 + }, + { + "epoch": 0.78, + "grad_norm": 0.671875, + "learning_rate": 0.00019450123596776646, + "loss": 0.1812, + "step": 5859 + }, + { + "epoch": 0.78, + "grad_norm": 0.640625, + "learning_rate": 0.00019449742700340317, + "loss": 0.3648, + "step": 5860 + }, + { + "epoch": 0.78, + "grad_norm": 0.6640625, + "learning_rate": 0.0001944936167575941, + "loss": 0.6052, + "step": 5861 + }, + { + "epoch": 0.78, + "grad_norm": 0.69921875, + "learning_rate": 0.00019448980523039097, + "loss": 0.5501, + "step": 5862 + }, + { + "epoch": 0.78, + "grad_norm": 0.91796875, + "learning_rate": 0.0001944859924218454, + "loss": 0.4921, + "step": 5863 + }, + { + "epoch": 0.78, + "grad_norm": 0.546875, + "learning_rate": 0.00019448217833200918, + "loss": 0.5122, + "step": 5864 + }, + { + "epoch": 0.78, + "grad_norm": 1.03125, + "learning_rate": 0.00019447836296093392, + "loss": 0.862, + "step": 5865 + }, + { + "epoch": 0.78, + "grad_norm": 0.671875, + "learning_rate": 0.0001944745463086715, + "loss": 0.5683, + "step": 5866 + }, + { + "epoch": 0.78, + "grad_norm": 0.73828125, + "learning_rate": 0.00019447072837527353, + "loss": 0.4446, + "step": 5867 + }, + { + "epoch": 0.78, + "grad_norm": 0.8515625, + "learning_rate": 0.0001944669091607919, + "loss": 0.4351, + "step": 5868 + }, + { + "epoch": 0.78, + "grad_norm": 0.63671875, + "learning_rate": 0.00019446308866527832, + "loss": 0.4731, + "step": 5869 + }, + { + "epoch": 0.78, + "grad_norm": 0.6484375, + "learning_rate": 0.00019445926688878466, + "loss": 0.6826, + "step": 5870 + }, + { + "epoch": 0.78, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944554438313627, + "loss": 0.5792, + "step": 5871 + }, + { + "epoch": 0.78, + "grad_norm": 0.62109375, + "learning_rate": 0.0001944516194930643, + "loss": 0.3906, + "step": 5872 + }, + { + "epoch": 0.78, + "grad_norm": 0.671875, + "learning_rate": 0.00019444779387394132, + "loss": 0.4848, + "step": 5873 + }, + { + "epoch": 0.78, + "grad_norm": 0.5703125, + "learning_rate": 0.00019444396697404566, + "loss": 0.4207, + "step": 5874 + }, + { + "epoch": 0.78, + "grad_norm": 0.625, + "learning_rate": 0.00019444013879342916, + "loss": 0.4682, + "step": 5875 + }, + { + "epoch": 0.78, + "grad_norm": 0.55859375, + "learning_rate": 0.00019443630933214378, + "loss": 0.5466, + "step": 5876 + }, + { + "epoch": 0.78, + "grad_norm": 0.59375, + "learning_rate": 0.00019443247859024146, + "loss": 0.5727, + "step": 5877 + }, + { + "epoch": 0.78, + "grad_norm": 0.69140625, + "learning_rate": 0.00019442864656777408, + "loss": 0.6721, + "step": 5878 + }, + { + "epoch": 0.78, + "grad_norm": 0.61328125, + "learning_rate": 0.0001944248132647937, + "loss": 0.3222, + "step": 5879 + }, + { + "epoch": 0.78, + "grad_norm": 0.62890625, + "learning_rate": 0.00019442097868135224, + "loss": 0.3764, + "step": 5880 + }, + { + "epoch": 0.78, + "grad_norm": 0.6328125, + "learning_rate": 0.00019441714281750167, + "loss": 0.4676, + "step": 5881 + }, + { + "epoch": 0.78, + "grad_norm": 0.69921875, + "learning_rate": 0.00019441330567329405, + "loss": 0.5558, + "step": 5882 + }, + { + "epoch": 0.79, + "grad_norm": 0.671875, + "learning_rate": 0.00019440946724878146, + "loss": 0.3361, + "step": 5883 + }, + { + "epoch": 0.79, + "grad_norm": 0.5625, + "learning_rate": 0.00019440562754401586, + "loss": 0.5578, + "step": 5884 + }, + { + "epoch": 0.79, + "grad_norm": 0.5390625, + "learning_rate": 0.00019440178655904936, + "loss": 0.2432, + "step": 5885 + }, + { + "epoch": 0.79, + "grad_norm": 0.6484375, + "learning_rate": 0.0001943979442939341, + "loss": 0.337, + "step": 5886 + }, + { + "epoch": 0.79, + "grad_norm": 0.58203125, + "learning_rate": 0.00019439410074872208, + "loss": 0.4748, + "step": 5887 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 0.00019439025592346549, + "loss": 0.7166, + "step": 5888 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 0.00019438640981821645, + "loss": 0.4449, + "step": 5889 + }, + { + "epoch": 0.79, + "grad_norm": 0.51171875, + "learning_rate": 0.0001943825624330271, + "loss": 0.834, + "step": 5890 + }, + { + "epoch": 0.79, + "grad_norm": 0.70703125, + "learning_rate": 0.00019437871376794965, + "loss": 0.6826, + "step": 5891 + }, + { + "epoch": 0.79, + "grad_norm": 0.6015625, + "learning_rate": 0.00019437486382303627, + "loss": 0.444, + "step": 5892 + }, + { + "epoch": 0.79, + "grad_norm": 0.51171875, + "learning_rate": 0.00019437101259833916, + "loss": 0.3627, + "step": 5893 + }, + { + "epoch": 0.79, + "grad_norm": 0.6640625, + "learning_rate": 0.00019436716009391054, + "loss": 0.4822, + "step": 5894 + }, + { + "epoch": 0.79, + "grad_norm": 0.94140625, + "learning_rate": 0.00019436330630980269, + "loss": 0.4389, + "step": 5895 + }, + { + "epoch": 0.79, + "grad_norm": 0.71875, + "learning_rate": 0.00019435945124606782, + "loss": 0.341, + "step": 5896 + }, + { + "epoch": 0.79, + "grad_norm": 0.44921875, + "learning_rate": 0.00019435559490275825, + "loss": 0.5626, + "step": 5897 + }, + { + "epoch": 0.79, + "grad_norm": 0.486328125, + "learning_rate": 0.00019435173727992626, + "loss": 0.5648, + "step": 5898 + }, + { + "epoch": 0.79, + "grad_norm": 0.515625, + "learning_rate": 0.00019434787837762415, + "loss": 0.3387, + "step": 5899 + }, + { + "epoch": 0.79, + "grad_norm": 0.58203125, + "learning_rate": 0.00019434401819590425, + "loss": 0.4322, + "step": 5900 + }, + { + "epoch": 0.79, + "grad_norm": 0.41796875, + "learning_rate": 0.0001943401567348189, + "loss": 0.4058, + "step": 5901 + }, + { + "epoch": 0.79, + "grad_norm": 0.56640625, + "learning_rate": 0.0001943362939944205, + "loss": 0.498, + "step": 5902 + }, + { + "epoch": 0.79, + "grad_norm": 0.546875, + "learning_rate": 0.0001943324299747614, + "loss": 0.458, + "step": 5903 + }, + { + "epoch": 0.79, + "grad_norm": 0.5859375, + "learning_rate": 0.00019432856467589403, + "loss": 0.465, + "step": 5904 + }, + { + "epoch": 0.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00019432469809787075, + "loss": 0.4482, + "step": 5905 + }, + { + "epoch": 0.79, + "grad_norm": 0.64453125, + "learning_rate": 0.00019432083024074404, + "loss": 0.2906, + "step": 5906 + }, + { + "epoch": 0.79, + "grad_norm": 0.59765625, + "learning_rate": 0.00019431696110456634, + "loss": 0.3413, + "step": 5907 + }, + { + "epoch": 0.79, + "grad_norm": 0.6796875, + "learning_rate": 0.0001943130906893901, + "loss": 0.4023, + "step": 5908 + }, + { + "epoch": 0.79, + "grad_norm": 0.93359375, + "learning_rate": 0.00019430921899526787, + "loss": 0.3853, + "step": 5909 + }, + { + "epoch": 0.79, + "grad_norm": 0.91796875, + "learning_rate": 0.00019430534602225206, + "loss": 0.3721, + "step": 5910 + }, + { + "epoch": 0.79, + "grad_norm": 0.796875, + "learning_rate": 0.0001943014717703952, + "loss": 0.3972, + "step": 5911 + }, + { + "epoch": 0.79, + "grad_norm": 0.67578125, + "learning_rate": 0.00019429759623974991, + "loss": 0.3843, + "step": 5912 + }, + { + "epoch": 0.79, + "grad_norm": 0.6015625, + "learning_rate": 0.00019429371943036872, + "loss": 0.5554, + "step": 5913 + }, + { + "epoch": 0.79, + "grad_norm": 0.59765625, + "learning_rate": 0.00019428984134230412, + "loss": 0.3337, + "step": 5914 + }, + { + "epoch": 0.79, + "grad_norm": 0.484375, + "learning_rate": 0.00019428596197560878, + "loss": 0.3389, + "step": 5915 + }, + { + "epoch": 0.79, + "grad_norm": 0.458984375, + "learning_rate": 0.0001942820813303353, + "loss": 0.3557, + "step": 5916 + }, + { + "epoch": 0.79, + "grad_norm": 0.80859375, + "learning_rate": 0.00019427819940653627, + "loss": 0.4385, + "step": 5917 + }, + { + "epoch": 0.79, + "grad_norm": 0.7421875, + "learning_rate": 0.00019427431620426435, + "loss": 0.9413, + "step": 5918 + }, + { + "epoch": 0.79, + "grad_norm": 0.734375, + "learning_rate": 0.00019427043172357217, + "loss": 0.6692, + "step": 5919 + }, + { + "epoch": 0.79, + "grad_norm": 0.875, + "learning_rate": 0.0001942665459645125, + "loss": 0.4145, + "step": 5920 + }, + { + "epoch": 0.79, + "grad_norm": 0.58984375, + "learning_rate": 0.0001942626589271379, + "loss": 0.7403, + "step": 5921 + }, + { + "epoch": 0.79, + "grad_norm": 0.388671875, + "learning_rate": 0.0001942587706115012, + "loss": 0.3134, + "step": 5922 + }, + { + "epoch": 0.79, + "grad_norm": 0.7421875, + "learning_rate": 0.00019425488101765503, + "loss": 0.7419, + "step": 5923 + }, + { + "epoch": 0.79, + "grad_norm": 0.75, + "learning_rate": 0.0001942509901456522, + "loss": 0.5071, + "step": 5924 + }, + { + "epoch": 0.79, + "grad_norm": 0.76953125, + "learning_rate": 0.00019424709799554545, + "loss": 0.4259, + "step": 5925 + }, + { + "epoch": 0.79, + "grad_norm": 0.5078125, + "learning_rate": 0.00019424320456738756, + "loss": 0.6238, + "step": 5926 + }, + { + "epoch": 0.79, + "grad_norm": 0.703125, + "learning_rate": 0.00019423930986123137, + "loss": 0.4136, + "step": 5927 + }, + { + "epoch": 0.79, + "grad_norm": 0.61328125, + "learning_rate": 0.00019423541387712963, + "loss": 0.3403, + "step": 5928 + }, + { + "epoch": 0.79, + "grad_norm": 0.5859375, + "learning_rate": 0.00019423151661513517, + "loss": 0.9566, + "step": 5929 + }, + { + "epoch": 0.79, + "grad_norm": 0.69921875, + "learning_rate": 0.0001942276180753009, + "loss": 0.5326, + "step": 5930 + }, + { + "epoch": 0.79, + "grad_norm": 0.51953125, + "learning_rate": 0.00019422371825767964, + "loss": 0.6099, + "step": 5931 + }, + { + "epoch": 0.79, + "grad_norm": 0.58984375, + "learning_rate": 0.0001942198171623243, + "loss": 0.4964, + "step": 5932 + }, + { + "epoch": 0.79, + "grad_norm": 0.50390625, + "learning_rate": 0.00019421591478928775, + "loss": 0.2774, + "step": 5933 + }, + { + "epoch": 0.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00019421201113862296, + "loss": 0.3792, + "step": 5934 + }, + { + "epoch": 0.79, + "grad_norm": 0.67578125, + "learning_rate": 0.0001942081062103828, + "loss": 0.7295, + "step": 5935 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 0.00019420420000462027, + "loss": 0.5261, + "step": 5936 + }, + { + "epoch": 0.79, + "grad_norm": 0.56640625, + "learning_rate": 0.00019420029252138833, + "loss": 0.4261, + "step": 5937 + }, + { + "epoch": 0.79, + "grad_norm": 0.5625, + "learning_rate": 0.00019419638376073998, + "loss": 0.6019, + "step": 5938 + }, + { + "epoch": 0.79, + "grad_norm": 0.50390625, + "learning_rate": 0.00019419247372272819, + "loss": 0.4217, + "step": 5939 + }, + { + "epoch": 0.79, + "grad_norm": 0.6875, + "learning_rate": 0.00019418856240740602, + "loss": 0.4279, + "step": 5940 + }, + { + "epoch": 0.79, + "grad_norm": 0.6640625, + "learning_rate": 0.00019418464981482648, + "loss": 0.2861, + "step": 5941 + }, + { + "epoch": 0.79, + "grad_norm": 0.6171875, + "learning_rate": 0.00019418073594504266, + "loss": 0.8289, + "step": 5942 + }, + { + "epoch": 0.79, + "grad_norm": 0.85546875, + "learning_rate": 0.00019417682079810757, + "loss": 0.4771, + "step": 5943 + }, + { + "epoch": 0.79, + "grad_norm": 0.81640625, + "learning_rate": 0.0001941729043740744, + "loss": 0.3965, + "step": 5944 + }, + { + "epoch": 0.79, + "grad_norm": 0.86328125, + "learning_rate": 0.0001941689866729962, + "loss": 0.428, + "step": 5945 + }, + { + "epoch": 0.79, + "grad_norm": 0.6328125, + "learning_rate": 0.00019416506769492608, + "loss": 0.6581, + "step": 5946 + }, + { + "epoch": 0.79, + "grad_norm": 0.48828125, + "learning_rate": 0.0001941611474399172, + "loss": 0.4456, + "step": 5947 + }, + { + "epoch": 0.79, + "grad_norm": 0.83203125, + "learning_rate": 0.00019415722590802276, + "loss": 0.4581, + "step": 5948 + }, + { + "epoch": 0.79, + "grad_norm": 0.81640625, + "learning_rate": 0.0001941533030992959, + "loss": 0.4627, + "step": 5949 + }, + { + "epoch": 0.79, + "grad_norm": 0.83203125, + "learning_rate": 0.00019414937901378982, + "loss": 0.4336, + "step": 5950 + }, + { + "epoch": 0.79, + "grad_norm": 0.80859375, + "learning_rate": 0.00019414545365155773, + "loss": 0.3537, + "step": 5951 + }, + { + "epoch": 0.79, + "grad_norm": 0.62109375, + "learning_rate": 0.00019414152701265286, + "loss": 0.5726, + "step": 5952 + }, + { + "epoch": 0.79, + "grad_norm": 0.53125, + "learning_rate": 0.00019413759909712848, + "loss": 0.2559, + "step": 5953 + }, + { + "epoch": 0.79, + "grad_norm": 0.62109375, + "learning_rate": 0.00019413366990503783, + "loss": 0.3226, + "step": 5954 + }, + { + "epoch": 0.79, + "grad_norm": 0.80859375, + "learning_rate": 0.0001941297394364342, + "loss": 0.5601, + "step": 5955 + }, + { + "epoch": 0.79, + "grad_norm": 0.4921875, + "learning_rate": 0.0001941258076913709, + "loss": 0.4991, + "step": 5956 + }, + { + "epoch": 0.79, + "grad_norm": 0.56640625, + "learning_rate": 0.0001941218746699012, + "loss": 0.6351, + "step": 5957 + }, + { + "epoch": 0.8, + "grad_norm": 0.65234375, + "learning_rate": 0.00019411794037207854, + "loss": 0.4449, + "step": 5958 + }, + { + "epoch": 0.8, + "grad_norm": 0.69921875, + "learning_rate": 0.00019411400479795617, + "loss": 0.601, + "step": 5959 + }, + { + "epoch": 0.8, + "grad_norm": 0.48828125, + "learning_rate": 0.0001941100679475875, + "loss": 0.239, + "step": 5960 + }, + { + "epoch": 0.8, + "grad_norm": 0.4375, + "learning_rate": 0.0001941061298210259, + "loss": 0.4293, + "step": 5961 + }, + { + "epoch": 0.8, + "grad_norm": 0.60546875, + "learning_rate": 0.00019410219041832475, + "loss": 0.6337, + "step": 5962 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 0.00019409824973953756, + "loss": 0.3371, + "step": 5963 + }, + { + "epoch": 0.8, + "grad_norm": 0.87109375, + "learning_rate": 0.00019409430778471768, + "loss": 0.5482, + "step": 5964 + }, + { + "epoch": 0.8, + "grad_norm": 0.45703125, + "learning_rate": 0.00019409036455391863, + "loss": 0.5744, + "step": 5965 + }, + { + "epoch": 0.8, + "grad_norm": 0.58984375, + "learning_rate": 0.00019408642004719383, + "loss": 0.3601, + "step": 5966 + }, + { + "epoch": 0.8, + "grad_norm": 0.80078125, + "learning_rate": 0.00019408247426459677, + "loss": 0.7575, + "step": 5967 + }, + { + "epoch": 0.8, + "grad_norm": 0.75390625, + "learning_rate": 0.000194078527206181, + "loss": 0.3985, + "step": 5968 + }, + { + "epoch": 0.8, + "grad_norm": 0.73828125, + "learning_rate": 0.00019407457887200003, + "loss": 1.0151, + "step": 5969 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 0.00019407062926210736, + "loss": 0.4532, + "step": 5970 + }, + { + "epoch": 0.8, + "grad_norm": 0.439453125, + "learning_rate": 0.0001940666783765566, + "loss": 0.2379, + "step": 5971 + }, + { + "epoch": 0.8, + "grad_norm": 0.7578125, + "learning_rate": 0.00019406272621540132, + "loss": 0.7467, + "step": 5972 + }, + { + "epoch": 0.8, + "grad_norm": 0.68359375, + "learning_rate": 0.0001940587727786951, + "loss": 0.4783, + "step": 5973 + }, + { + "epoch": 0.8, + "grad_norm": 0.74609375, + "learning_rate": 0.00019405481806649154, + "loss": 0.6249, + "step": 5974 + }, + { + "epoch": 0.8, + "grad_norm": 0.7734375, + "learning_rate": 0.0001940508620788443, + "loss": 0.7004, + "step": 5975 + }, + { + "epoch": 0.8, + "grad_norm": 0.79296875, + "learning_rate": 0.000194046904815807, + "loss": 0.7071, + "step": 5976 + }, + { + "epoch": 0.8, + "grad_norm": 0.66015625, + "learning_rate": 0.00019404294627743332, + "loss": 0.2711, + "step": 5977 + }, + { + "epoch": 0.8, + "grad_norm": 0.6953125, + "learning_rate": 0.00019403898646377693, + "loss": 0.858, + "step": 5978 + }, + { + "epoch": 0.8, + "grad_norm": 0.74609375, + "learning_rate": 0.00019403502537489151, + "loss": 0.4308, + "step": 5979 + }, + { + "epoch": 0.8, + "grad_norm": 0.359375, + "learning_rate": 0.0001940310630108308, + "loss": 0.2722, + "step": 5980 + }, + { + "epoch": 0.8, + "grad_norm": 0.6171875, + "learning_rate": 0.00019402709937164857, + "loss": 0.3115, + "step": 5981 + }, + { + "epoch": 0.8, + "grad_norm": 0.60546875, + "learning_rate": 0.00019402313445739848, + "loss": 0.9378, + "step": 5982 + }, + { + "epoch": 0.8, + "grad_norm": 0.453125, + "learning_rate": 0.00019401916826813434, + "loss": 0.4048, + "step": 5983 + }, + { + "epoch": 0.8, + "grad_norm": 0.875, + "learning_rate": 0.00019401520080390995, + "loss": 0.3472, + "step": 5984 + }, + { + "epoch": 0.8, + "grad_norm": 0.7578125, + "learning_rate": 0.0001940112320647791, + "loss": 0.7063, + "step": 5985 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390625, + "learning_rate": 0.00019400726205079561, + "loss": 0.4004, + "step": 5986 + }, + { + "epoch": 0.8, + "grad_norm": 0.91796875, + "learning_rate": 0.00019400329076201334, + "loss": 0.4985, + "step": 5987 + }, + { + "epoch": 0.8, + "grad_norm": 0.56640625, + "learning_rate": 0.0001939993181984861, + "loss": 0.2537, + "step": 5988 + }, + { + "epoch": 0.8, + "grad_norm": 0.47265625, + "learning_rate": 0.00019399534436026773, + "loss": 0.3091, + "step": 5989 + }, + { + "epoch": 0.8, + "grad_norm": 0.91015625, + "learning_rate": 0.0001939913692474122, + "loss": 0.7597, + "step": 5990 + }, + { + "epoch": 0.8, + "grad_norm": 0.447265625, + "learning_rate": 0.0001939873928599734, + "loss": 0.2299, + "step": 5991 + }, + { + "epoch": 0.8, + "grad_norm": 0.515625, + "learning_rate": 0.0001939834151980052, + "loss": 0.3686, + "step": 5992 + }, + { + "epoch": 0.8, + "grad_norm": 0.59375, + "learning_rate": 0.0001939794362615616, + "loss": 0.6743, + "step": 5993 + }, + { + "epoch": 0.8, + "grad_norm": 0.83203125, + "learning_rate": 0.00019397545605069651, + "loss": 0.4249, + "step": 5994 + }, + { + "epoch": 0.8, + "grad_norm": 0.462890625, + "learning_rate": 0.00019397147456546392, + "loss": 0.3812, + "step": 5995 + }, + { + "epoch": 0.8, + "grad_norm": 0.4453125, + "learning_rate": 0.00019396749180591782, + "loss": 0.3839, + "step": 5996 + }, + { + "epoch": 0.8, + "grad_norm": 0.482421875, + "learning_rate": 0.00019396350777211224, + "loss": 0.3347, + "step": 5997 + }, + { + "epoch": 0.8, + "grad_norm": 0.5703125, + "learning_rate": 0.00019395952246410118, + "loss": 0.4756, + "step": 5998 + }, + { + "epoch": 0.8, + "grad_norm": 0.69140625, + "learning_rate": 0.0001939555358819387, + "loss": 0.5163, + "step": 5999 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390625, + "learning_rate": 0.00019395154802567886, + "loss": 0.3395, + "step": 6000 + }, + { + "epoch": 0.8, + "grad_norm": 0.6328125, + "learning_rate": 0.00019394755889537573, + "loss": 0.497, + "step": 6001 + }, + { + "epoch": 0.8, + "grad_norm": 0.6171875, + "learning_rate": 0.00019394356849108336, + "loss": 0.8044, + "step": 6002 + }, + { + "epoch": 0.8, + "grad_norm": 0.470703125, + "learning_rate": 0.00019393957681285598, + "loss": 0.6723, + "step": 6003 + }, + { + "epoch": 0.8, + "grad_norm": 0.294921875, + "learning_rate": 0.0001939355838607476, + "loss": 0.189, + "step": 6004 + }, + { + "epoch": 0.8, + "grad_norm": 0.58984375, + "learning_rate": 0.00019393158963481244, + "loss": 0.2531, + "step": 6005 + }, + { + "epoch": 0.8, + "grad_norm": 0.75390625, + "learning_rate": 0.0001939275941351046, + "loss": 0.2908, + "step": 6006 + }, + { + "epoch": 0.8, + "grad_norm": 0.62890625, + "learning_rate": 0.00019392359736167833, + "loss": 0.6416, + "step": 6007 + }, + { + "epoch": 0.8, + "grad_norm": 0.57421875, + "learning_rate": 0.0001939195993145878, + "loss": 0.5967, + "step": 6008 + }, + { + "epoch": 0.8, + "grad_norm": 1.03125, + "learning_rate": 0.00019391559999388722, + "loss": 0.5207, + "step": 6009 + }, + { + "epoch": 0.8, + "grad_norm": 0.64453125, + "learning_rate": 0.00019391159939963085, + "loss": 0.3256, + "step": 6010 + }, + { + "epoch": 0.8, + "grad_norm": 0.578125, + "learning_rate": 0.0001939075975318729, + "loss": 0.4177, + "step": 6011 + }, + { + "epoch": 0.8, + "grad_norm": 0.83984375, + "learning_rate": 0.00019390359439066764, + "loss": 0.3522, + "step": 6012 + }, + { + "epoch": 0.8, + "grad_norm": 0.60546875, + "learning_rate": 0.00019389958997606942, + "loss": 0.2355, + "step": 6013 + }, + { + "epoch": 0.8, + "grad_norm": 0.60546875, + "learning_rate": 0.00019389558428813247, + "loss": 0.5459, + "step": 6014 + }, + { + "epoch": 0.8, + "grad_norm": 0.76171875, + "learning_rate": 0.00019389157732691115, + "loss": 0.3352, + "step": 6015 + }, + { + "epoch": 0.8, + "grad_norm": 0.76953125, + "learning_rate": 0.00019388756909245974, + "loss": 0.4083, + "step": 6016 + }, + { + "epoch": 0.8, + "grad_norm": 0.482421875, + "learning_rate": 0.00019388355958483268, + "loss": 0.3937, + "step": 6017 + }, + { + "epoch": 0.8, + "grad_norm": 0.75, + "learning_rate": 0.00019387954880408428, + "loss": 0.4468, + "step": 6018 + }, + { + "epoch": 0.8, + "grad_norm": 0.59375, + "learning_rate": 0.00019387553675026897, + "loss": 0.7642, + "step": 6019 + }, + { + "epoch": 0.8, + "grad_norm": 0.53515625, + "learning_rate": 0.0001938715234234411, + "loss": 0.5369, + "step": 6020 + }, + { + "epoch": 0.8, + "grad_norm": 0.50390625, + "learning_rate": 0.00019386750882365513, + "loss": 0.5787, + "step": 6021 + }, + { + "epoch": 0.8, + "grad_norm": 0.4375, + "learning_rate": 0.0001938634929509655, + "loss": 0.2516, + "step": 6022 + }, + { + "epoch": 0.8, + "grad_norm": 0.51171875, + "learning_rate": 0.00019385947580542666, + "loss": 0.2874, + "step": 6023 + }, + { + "epoch": 0.8, + "grad_norm": 0.65234375, + "learning_rate": 0.00019385545738709312, + "loss": 0.3989, + "step": 6024 + }, + { + "epoch": 0.8, + "grad_norm": 0.5078125, + "learning_rate": 0.00019385143769601933, + "loss": 0.3174, + "step": 6025 + }, + { + "epoch": 0.8, + "grad_norm": 0.7109375, + "learning_rate": 0.0001938474167322598, + "loss": 0.3743, + "step": 6026 + }, + { + "epoch": 0.8, + "grad_norm": 0.69140625, + "learning_rate": 0.00019384339449586907, + "loss": 0.4296, + "step": 6027 + }, + { + "epoch": 0.8, + "grad_norm": 0.58203125, + "learning_rate": 0.0001938393709869017, + "loss": 0.532, + "step": 6028 + }, + { + "epoch": 0.8, + "grad_norm": 0.50390625, + "learning_rate": 0.0001938353462054122, + "loss": 0.3833, + "step": 6029 + }, + { + "epoch": 0.8, + "grad_norm": 0.72265625, + "learning_rate": 0.0001938313201514552, + "loss": 0.6453, + "step": 6030 + }, + { + "epoch": 0.8, + "grad_norm": 0.66015625, + "learning_rate": 0.00019382729282508526, + "loss": 0.2608, + "step": 6031 + }, + { + "epoch": 0.8, + "grad_norm": 0.59765625, + "learning_rate": 0.00019382326422635705, + "loss": 0.5945, + "step": 6032 + }, + { + "epoch": 0.81, + "grad_norm": 0.70703125, + "learning_rate": 0.00019381923435532515, + "loss": 0.7676, + "step": 6033 + }, + { + "epoch": 0.81, + "grad_norm": 0.416015625, + "learning_rate": 0.0001938152032120442, + "loss": 0.3169, + "step": 6034 + }, + { + "epoch": 0.81, + "grad_norm": 0.53125, + "learning_rate": 0.0001938111707965689, + "loss": 0.3222, + "step": 6035 + }, + { + "epoch": 0.81, + "grad_norm": 0.6953125, + "learning_rate": 0.0001938071371089539, + "loss": 0.5103, + "step": 6036 + }, + { + "epoch": 0.81, + "grad_norm": 0.64453125, + "learning_rate": 0.00019380310214925394, + "loss": 0.4616, + "step": 6037 + }, + { + "epoch": 0.81, + "grad_norm": 0.640625, + "learning_rate": 0.00019379906591752367, + "loss": 0.4039, + "step": 6038 + }, + { + "epoch": 0.81, + "grad_norm": 0.66796875, + "learning_rate": 0.00019379502841381793, + "loss": 0.3611, + "step": 6039 + }, + { + "epoch": 0.81, + "grad_norm": 0.66015625, + "learning_rate": 0.00019379098963819135, + "loss": 0.5367, + "step": 6040 + }, + { + "epoch": 0.81, + "grad_norm": 0.48828125, + "learning_rate": 0.0001937869495906988, + "loss": 0.4519, + "step": 6041 + }, + { + "epoch": 0.81, + "grad_norm": 0.62109375, + "learning_rate": 0.00019378290827139498, + "loss": 0.4307, + "step": 6042 + }, + { + "epoch": 0.81, + "grad_norm": 0.91796875, + "learning_rate": 0.00019377886568033476, + "loss": 0.6392, + "step": 6043 + }, + { + "epoch": 0.81, + "grad_norm": 0.796875, + "learning_rate": 0.00019377482181757292, + "loss": 0.4647, + "step": 6044 + }, + { + "epoch": 0.81, + "grad_norm": 0.5, + "learning_rate": 0.00019377077668316433, + "loss": 0.3138, + "step": 6045 + }, + { + "epoch": 0.81, + "grad_norm": 0.5, + "learning_rate": 0.00019376673027716383, + "loss": 0.5342, + "step": 6046 + }, + { + "epoch": 0.81, + "grad_norm": 0.62109375, + "learning_rate": 0.00019376268259962626, + "loss": 0.4893, + "step": 6047 + }, + { + "epoch": 0.81, + "grad_norm": 0.65234375, + "learning_rate": 0.00019375863365060655, + "loss": 0.5555, + "step": 6048 + }, + { + "epoch": 0.81, + "grad_norm": 0.8046875, + "learning_rate": 0.0001937545834301596, + "loss": 0.3463, + "step": 6049 + }, + { + "epoch": 0.81, + "grad_norm": 0.66015625, + "learning_rate": 0.00019375053193834032, + "loss": 0.6811, + "step": 6050 + }, + { + "epoch": 0.81, + "grad_norm": 0.5703125, + "learning_rate": 0.00019374647917520368, + "loss": 0.1806, + "step": 6051 + }, + { + "epoch": 0.81, + "grad_norm": 0.72265625, + "learning_rate": 0.0001937424251408046, + "loss": 0.5323, + "step": 6052 + }, + { + "epoch": 0.81, + "grad_norm": 0.72265625, + "learning_rate": 0.00019373836983519805, + "loss": 0.3262, + "step": 6053 + }, + { + "epoch": 0.81, + "grad_norm": 0.65234375, + "learning_rate": 0.00019373431325843906, + "loss": 0.5833, + "step": 6054 + }, + { + "epoch": 0.81, + "grad_norm": 1.0859375, + "learning_rate": 0.00019373025541058266, + "loss": 0.6956, + "step": 6055 + }, + { + "epoch": 0.81, + "grad_norm": 0.45703125, + "learning_rate": 0.00019372619629168378, + "loss": 0.6853, + "step": 6056 + }, + { + "epoch": 0.81, + "grad_norm": 0.63671875, + "learning_rate": 0.00019372213590179756, + "loss": 0.4502, + "step": 6057 + }, + { + "epoch": 0.81, + "grad_norm": 0.494140625, + "learning_rate": 0.00019371807424097904, + "loss": 0.5847, + "step": 6058 + }, + { + "epoch": 0.81, + "grad_norm": 0.6953125, + "learning_rate": 0.00019371401130928323, + "loss": 0.4247, + "step": 6059 + }, + { + "epoch": 0.81, + "grad_norm": 0.8515625, + "learning_rate": 0.00019370994710676533, + "loss": 0.3347, + "step": 6060 + }, + { + "epoch": 0.81, + "grad_norm": 0.62890625, + "learning_rate": 0.0001937058816334804, + "loss": 0.5071, + "step": 6061 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 0.0001937018148894836, + "loss": 0.2027, + "step": 6062 + }, + { + "epoch": 0.81, + "grad_norm": 0.54296875, + "learning_rate": 0.00019369774687483, + "loss": 0.4342, + "step": 6063 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 0.00019369367758957485, + "loss": 0.3928, + "step": 6064 + }, + { + "epoch": 0.81, + "grad_norm": 0.5859375, + "learning_rate": 0.00019368960703377325, + "loss": 0.3328, + "step": 6065 + }, + { + "epoch": 0.81, + "grad_norm": 0.75390625, + "learning_rate": 0.0001936855352074805, + "loss": 0.6485, + "step": 6066 + }, + { + "epoch": 0.81, + "grad_norm": 0.67578125, + "learning_rate": 0.00019368146211075175, + "loss": 0.5223, + "step": 6067 + }, + { + "epoch": 0.81, + "grad_norm": 0.6015625, + "learning_rate": 0.00019367738774364227, + "loss": 0.5099, + "step": 6068 + }, + { + "epoch": 0.81, + "grad_norm": 0.5703125, + "learning_rate": 0.00019367331210620728, + "loss": 0.3222, + "step": 6069 + }, + { + "epoch": 0.81, + "grad_norm": 0.64453125, + "learning_rate": 0.00019366923519850202, + "loss": 0.6144, + "step": 6070 + }, + { + "epoch": 0.81, + "grad_norm": 0.59375, + "learning_rate": 0.00019366515702058185, + "loss": 0.8109, + "step": 6071 + }, + { + "epoch": 0.81, + "grad_norm": 0.6796875, + "learning_rate": 0.00019366107757250203, + "loss": 0.3865, + "step": 6072 + }, + { + "epoch": 0.81, + "grad_norm": 0.5625, + "learning_rate": 0.0001936569968543179, + "loss": 0.5514, + "step": 6073 + }, + { + "epoch": 0.81, + "grad_norm": 0.41015625, + "learning_rate": 0.0001936529148660848, + "loss": 0.3313, + "step": 6074 + }, + { + "epoch": 0.81, + "grad_norm": 0.5234375, + "learning_rate": 0.000193648831607858, + "loss": 0.575, + "step": 6075 + }, + { + "epoch": 0.81, + "grad_norm": 0.75, + "learning_rate": 0.000193644747079693, + "loss": 0.4318, + "step": 6076 + }, + { + "epoch": 0.81, + "grad_norm": 0.5703125, + "learning_rate": 0.0001936406612816451, + "loss": 0.5112, + "step": 6077 + }, + { + "epoch": 0.81, + "grad_norm": 0.6015625, + "learning_rate": 0.00019363657421376975, + "loss": 0.717, + "step": 6078 + }, + { + "epoch": 0.81, + "grad_norm": 0.71484375, + "learning_rate": 0.00019363248587612235, + "loss": 0.2367, + "step": 6079 + }, + { + "epoch": 0.81, + "grad_norm": 0.87109375, + "learning_rate": 0.00019362839626875834, + "loss": 0.4129, + "step": 6080 + }, + { + "epoch": 0.81, + "grad_norm": 0.85546875, + "learning_rate": 0.0001936243053917332, + "loss": 0.6875, + "step": 6081 + }, + { + "epoch": 0.81, + "grad_norm": 0.546875, + "learning_rate": 0.0001936202132451024, + "loss": 0.51, + "step": 6082 + }, + { + "epoch": 0.81, + "grad_norm": 0.76953125, + "learning_rate": 0.00019361611982892141, + "loss": 0.5351, + "step": 6083 + }, + { + "epoch": 0.81, + "grad_norm": 0.62890625, + "learning_rate": 0.00019361202514324574, + "loss": 0.4214, + "step": 6084 + }, + { + "epoch": 0.81, + "grad_norm": 0.58984375, + "learning_rate": 0.00019360792918813097, + "loss": 0.3751, + "step": 6085 + }, + { + "epoch": 0.81, + "grad_norm": 0.6171875, + "learning_rate": 0.0001936038319636326, + "loss": 0.4076, + "step": 6086 + }, + { + "epoch": 0.81, + "grad_norm": 0.40625, + "learning_rate": 0.00019359973346980613, + "loss": 0.4933, + "step": 6087 + }, + { + "epoch": 0.81, + "grad_norm": 0.8359375, + "learning_rate": 0.00019359563370670725, + "loss": 0.4546, + "step": 6088 + }, + { + "epoch": 0.81, + "grad_norm": 0.46484375, + "learning_rate": 0.00019359153267439153, + "loss": 0.3886, + "step": 6089 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 0.00019358743037291453, + "loss": 0.352, + "step": 6090 + }, + { + "epoch": 0.81, + "grad_norm": 0.77734375, + "learning_rate": 0.00019358332680233188, + "loss": 0.4064, + "step": 6091 + }, + { + "epoch": 0.81, + "grad_norm": 0.5859375, + "learning_rate": 0.00019357922196269932, + "loss": 0.3434, + "step": 6092 + }, + { + "epoch": 0.81, + "grad_norm": 0.62890625, + "learning_rate": 0.0001935751158540724, + "loss": 0.687, + "step": 6093 + }, + { + "epoch": 0.81, + "grad_norm": 0.55078125, + "learning_rate": 0.00019357100847650687, + "loss": 0.4967, + "step": 6094 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 0.00019356689983005842, + "loss": 0.8931, + "step": 6095 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 0.00019356278991478277, + "loss": 0.4057, + "step": 6096 + }, + { + "epoch": 0.81, + "grad_norm": 0.53515625, + "learning_rate": 0.0001935586787307356, + "loss": 0.3783, + "step": 6097 + }, + { + "epoch": 0.81, + "grad_norm": 0.56640625, + "learning_rate": 0.00019355456627797275, + "loss": 0.4825, + "step": 6098 + }, + { + "epoch": 0.81, + "grad_norm": 0.7890625, + "learning_rate": 0.00019355045255654992, + "loss": 0.7602, + "step": 6099 + }, + { + "epoch": 0.81, + "grad_norm": 0.78125, + "learning_rate": 0.00019354633756652286, + "loss": 0.3166, + "step": 6100 + }, + { + "epoch": 0.81, + "grad_norm": 0.5078125, + "learning_rate": 0.0001935422213079475, + "loss": 0.465, + "step": 6101 + }, + { + "epoch": 0.81, + "grad_norm": 0.421875, + "learning_rate": 0.00019353810378087952, + "loss": 0.3508, + "step": 6102 + }, + { + "epoch": 0.81, + "grad_norm": 0.50390625, + "learning_rate": 0.00019353398498537488, + "loss": 0.3715, + "step": 6103 + }, + { + "epoch": 0.81, + "grad_norm": 0.61328125, + "learning_rate": 0.00019352986492148934, + "loss": 0.483, + "step": 6104 + }, + { + "epoch": 0.81, + "grad_norm": 1.0, + "learning_rate": 0.00019352574358927883, + "loss": 0.3787, + "step": 6105 + }, + { + "epoch": 0.81, + "grad_norm": 0.53515625, + "learning_rate": 0.0001935216209887992, + "loss": 0.3453, + "step": 6106 + }, + { + "epoch": 0.81, + "grad_norm": 0.609375, + "learning_rate": 0.00019351749712010635, + "loss": 0.755, + "step": 6107 + }, + { + "epoch": 0.82, + "grad_norm": 0.6328125, + "learning_rate": 0.00019351337198325624, + "loss": 0.8287, + "step": 6108 + }, + { + "epoch": 0.82, + "grad_norm": 0.734375, + "learning_rate": 0.00019350924557830478, + "loss": 0.4475, + "step": 6109 + }, + { + "epoch": 0.82, + "grad_norm": 0.6015625, + "learning_rate": 0.00019350511790530795, + "loss": 0.402, + "step": 6110 + }, + { + "epoch": 0.82, + "grad_norm": 0.5390625, + "learning_rate": 0.0001935009889643217, + "loss": 0.2723, + "step": 6111 + }, + { + "epoch": 0.82, + "grad_norm": 1.125, + "learning_rate": 0.00019349685875540203, + "loss": 0.4941, + "step": 6112 + }, + { + "epoch": 0.82, + "grad_norm": 0.53515625, + "learning_rate": 0.00019349272727860496, + "loss": 0.331, + "step": 6113 + }, + { + "epoch": 0.82, + "grad_norm": 0.6015625, + "learning_rate": 0.00019348859453398646, + "loss": 0.3168, + "step": 6114 + }, + { + "epoch": 0.82, + "grad_norm": 0.62890625, + "learning_rate": 0.00019348446052160268, + "loss": 0.7964, + "step": 6115 + }, + { + "epoch": 0.82, + "grad_norm": 0.8203125, + "learning_rate": 0.00019348032524150956, + "loss": 0.5758, + "step": 6116 + }, + { + "epoch": 0.82, + "grad_norm": 0.6484375, + "learning_rate": 0.00019347618869376327, + "loss": 0.6657, + "step": 6117 + }, + { + "epoch": 0.82, + "grad_norm": 0.57421875, + "learning_rate": 0.0001934720508784199, + "loss": 0.4438, + "step": 6118 + }, + { + "epoch": 0.82, + "grad_norm": 0.6875, + "learning_rate": 0.00019346791179553546, + "loss": 0.4913, + "step": 6119 + }, + { + "epoch": 0.82, + "grad_norm": 0.578125, + "learning_rate": 0.00019346377144516619, + "loss": 0.3681, + "step": 6120 + }, + { + "epoch": 0.82, + "grad_norm": 0.515625, + "learning_rate": 0.00019345962982736818, + "loss": 0.4853, + "step": 6121 + }, + { + "epoch": 0.82, + "grad_norm": 0.83203125, + "learning_rate": 0.00019345548694219762, + "loss": 0.431, + "step": 6122 + }, + { + "epoch": 0.82, + "grad_norm": 0.58984375, + "learning_rate": 0.00019345134278971065, + "loss": 0.3049, + "step": 6123 + }, + { + "epoch": 0.82, + "grad_norm": 0.640625, + "learning_rate": 0.00019344719736996352, + "loss": 0.3716, + "step": 6124 + }, + { + "epoch": 0.82, + "grad_norm": 0.60546875, + "learning_rate": 0.00019344305068301243, + "loss": 0.2041, + "step": 6125 + }, + { + "epoch": 0.82, + "grad_norm": 0.55859375, + "learning_rate": 0.00019343890272891358, + "loss": 0.3038, + "step": 6126 + }, + { + "epoch": 0.82, + "grad_norm": 0.8203125, + "learning_rate": 0.00019343475350772326, + "loss": 0.3466, + "step": 6127 + }, + { + "epoch": 0.82, + "grad_norm": 0.65234375, + "learning_rate": 0.0001934306030194977, + "loss": 0.739, + "step": 6128 + }, + { + "epoch": 0.82, + "grad_norm": 0.41796875, + "learning_rate": 0.0001934264512642932, + "loss": 0.221, + "step": 6129 + }, + { + "epoch": 0.82, + "grad_norm": 0.66015625, + "learning_rate": 0.00019342229824216608, + "loss": 0.6789, + "step": 6130 + }, + { + "epoch": 0.82, + "grad_norm": 0.5859375, + "learning_rate": 0.0001934181439531726, + "loss": 0.3501, + "step": 6131 + }, + { + "epoch": 0.82, + "grad_norm": 0.431640625, + "learning_rate": 0.00019341398839736919, + "loss": 0.2642, + "step": 6132 + }, + { + "epoch": 0.82, + "grad_norm": 0.7890625, + "learning_rate": 0.00019340983157481207, + "loss": 0.5489, + "step": 6133 + }, + { + "epoch": 0.82, + "grad_norm": 0.90234375, + "learning_rate": 0.00019340567348555772, + "loss": 0.6884, + "step": 6134 + }, + { + "epoch": 0.82, + "grad_norm": 0.55078125, + "learning_rate": 0.0001934015141296625, + "loss": 0.334, + "step": 6135 + }, + { + "epoch": 0.82, + "grad_norm": 0.62890625, + "learning_rate": 0.0001933973535071828, + "loss": 0.5635, + "step": 6136 + }, + { + "epoch": 0.82, + "grad_norm": 0.7890625, + "learning_rate": 0.00019339319161817503, + "loss": 0.4638, + "step": 6137 + }, + { + "epoch": 0.82, + "grad_norm": 0.71875, + "learning_rate": 0.00019338902846269566, + "loss": 0.3409, + "step": 6138 + }, + { + "epoch": 0.82, + "grad_norm": 0.76953125, + "learning_rate": 0.00019338486404080115, + "loss": 0.4955, + "step": 6139 + }, + { + "epoch": 0.82, + "grad_norm": 0.5078125, + "learning_rate": 0.0001933806983525479, + "loss": 0.2985, + "step": 6140 + }, + { + "epoch": 0.82, + "grad_norm": 0.59765625, + "learning_rate": 0.00019337653139799246, + "loss": 0.4837, + "step": 6141 + }, + { + "epoch": 0.82, + "grad_norm": 0.60546875, + "learning_rate": 0.00019337236317719134, + "loss": 0.2787, + "step": 6142 + }, + { + "epoch": 0.82, + "grad_norm": 0.66796875, + "learning_rate": 0.00019336819369020105, + "loss": 0.4033, + "step": 6143 + }, + { + "epoch": 0.82, + "grad_norm": 0.75, + "learning_rate": 0.00019336402293707811, + "loss": 0.2921, + "step": 6144 + }, + { + "epoch": 0.82, + "grad_norm": 0.443359375, + "learning_rate": 0.00019335985091787912, + "loss": 0.5313, + "step": 6145 + }, + { + "epoch": 0.82, + "grad_norm": 0.62890625, + "learning_rate": 0.00019335567763266063, + "loss": 0.5661, + "step": 6146 + }, + { + "epoch": 0.82, + "grad_norm": 0.57421875, + "learning_rate": 0.00019335150308147925, + "loss": 0.6206, + "step": 6147 + }, + { + "epoch": 0.82, + "grad_norm": 0.7734375, + "learning_rate": 0.00019334732726439152, + "loss": 0.5556, + "step": 6148 + }, + { + "epoch": 0.82, + "grad_norm": 0.4765625, + "learning_rate": 0.00019334315018145417, + "loss": 0.252, + "step": 6149 + }, + { + "epoch": 0.82, + "grad_norm": 0.890625, + "learning_rate": 0.0001933389718327238, + "loss": 0.3983, + "step": 6150 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 0.00019333479221825706, + "loss": 0.4866, + "step": 6151 + }, + { + "epoch": 0.82, + "grad_norm": 0.65234375, + "learning_rate": 0.00019333061133811063, + "loss": 0.2306, + "step": 6152 + }, + { + "epoch": 0.82, + "grad_norm": 0.4375, + "learning_rate": 0.0001933264291923412, + "loss": 0.4179, + "step": 6153 + }, + { + "epoch": 0.82, + "grad_norm": 0.67578125, + "learning_rate": 0.0001933222457810055, + "loss": 0.3893, + "step": 6154 + }, + { + "epoch": 0.82, + "grad_norm": 0.6953125, + "learning_rate": 0.00019331806110416027, + "loss": 0.4344, + "step": 6155 + }, + { + "epoch": 0.82, + "grad_norm": 0.640625, + "learning_rate": 0.00019331387516186222, + "loss": 0.2737, + "step": 6156 + }, + { + "epoch": 0.82, + "grad_norm": 0.80078125, + "learning_rate": 0.00019330968795416815, + "loss": 0.4087, + "step": 6157 + }, + { + "epoch": 0.82, + "grad_norm": 0.4921875, + "learning_rate": 0.00019330549948113484, + "loss": 0.5263, + "step": 6158 + }, + { + "epoch": 0.82, + "grad_norm": 0.515625, + "learning_rate": 0.00019330130974281904, + "loss": 0.4349, + "step": 6159 + }, + { + "epoch": 0.82, + "grad_norm": 0.61328125, + "learning_rate": 0.0001932971187392776, + "loss": 0.4144, + "step": 6160 + }, + { + "epoch": 0.82, + "grad_norm": 0.703125, + "learning_rate": 0.00019329292647056736, + "loss": 0.5501, + "step": 6161 + }, + { + "epoch": 0.82, + "grad_norm": 0.67578125, + "learning_rate": 0.00019328873293674516, + "loss": 0.4831, + "step": 6162 + }, + { + "epoch": 0.82, + "grad_norm": 0.46484375, + "learning_rate": 0.0001932845381378679, + "loss": 0.4197, + "step": 6163 + }, + { + "epoch": 0.82, + "grad_norm": 1.25, + "learning_rate": 0.0001932803420739924, + "loss": 0.5146, + "step": 6164 + }, + { + "epoch": 0.82, + "grad_norm": 0.875, + "learning_rate": 0.0001932761447451756, + "loss": 0.2419, + "step": 6165 + }, + { + "epoch": 0.82, + "grad_norm": 0.55859375, + "learning_rate": 0.0001932719461514744, + "loss": 0.2455, + "step": 6166 + }, + { + "epoch": 0.82, + "grad_norm": 0.3984375, + "learning_rate": 0.0001932677462929458, + "loss": 0.1921, + "step": 6167 + }, + { + "epoch": 0.82, + "grad_norm": 0.6015625, + "learning_rate": 0.00019326354516964665, + "loss": 0.5317, + "step": 6168 + }, + { + "epoch": 0.82, + "grad_norm": 0.4296875, + "learning_rate": 0.00019325934278163398, + "loss": 0.2513, + "step": 6169 + }, + { + "epoch": 0.82, + "grad_norm": 0.6015625, + "learning_rate": 0.00019325513912896478, + "loss": 0.6186, + "step": 6170 + }, + { + "epoch": 0.82, + "grad_norm": 0.73046875, + "learning_rate": 0.00019325093421169605, + "loss": 0.4336, + "step": 6171 + }, + { + "epoch": 0.82, + "grad_norm": 0.765625, + "learning_rate": 0.00019324672802988478, + "loss": 0.3359, + "step": 6172 + }, + { + "epoch": 0.82, + "grad_norm": 0.5859375, + "learning_rate": 0.00019324252058358806, + "loss": 0.8855, + "step": 6173 + }, + { + "epoch": 0.82, + "grad_norm": 0.55078125, + "learning_rate": 0.0001932383118728629, + "loss": 0.4281, + "step": 6174 + }, + { + "epoch": 0.82, + "grad_norm": 0.5625, + "learning_rate": 0.00019323410189776638, + "loss": 0.5171, + "step": 6175 + }, + { + "epoch": 0.82, + "grad_norm": 0.6484375, + "learning_rate": 0.0001932298906583556, + "loss": 0.8271, + "step": 6176 + }, + { + "epoch": 0.82, + "grad_norm": 0.62109375, + "learning_rate": 0.0001932256781546877, + "loss": 0.3822, + "step": 6177 + }, + { + "epoch": 0.82, + "grad_norm": 0.6875, + "learning_rate": 0.00019322146438681976, + "loss": 0.5889, + "step": 6178 + }, + { + "epoch": 0.82, + "grad_norm": 0.56640625, + "learning_rate": 0.0001932172493548089, + "loss": 0.4019, + "step": 6179 + }, + { + "epoch": 0.82, + "grad_norm": 0.74609375, + "learning_rate": 0.00019321303305871237, + "loss": 0.2647, + "step": 6180 + }, + { + "epoch": 0.82, + "grad_norm": 0.59765625, + "learning_rate": 0.00019320881549858725, + "loss": 0.6663, + "step": 6181 + }, + { + "epoch": 0.82, + "grad_norm": 0.64453125, + "learning_rate": 0.00019320459667449078, + "loss": 0.5707, + "step": 6182 + }, + { + "epoch": 0.83, + "grad_norm": 0.77734375, + "learning_rate": 0.0001932003765864802, + "loss": 0.3513, + "step": 6183 + }, + { + "epoch": 0.83, + "grad_norm": 0.53515625, + "learning_rate": 0.00019319615523461265, + "loss": 0.4034, + "step": 6184 + }, + { + "epoch": 0.83, + "grad_norm": 0.62109375, + "learning_rate": 0.00019319193261894542, + "loss": 0.4729, + "step": 6185 + }, + { + "epoch": 0.83, + "grad_norm": 0.71875, + "learning_rate": 0.00019318770873953577, + "loss": 0.3884, + "step": 6186 + }, + { + "epoch": 0.83, + "grad_norm": 0.57421875, + "learning_rate": 0.000193183483596441, + "loss": 0.7167, + "step": 6187 + }, + { + "epoch": 0.83, + "grad_norm": 0.72265625, + "learning_rate": 0.00019317925718971842, + "loss": 0.4737, + "step": 6188 + }, + { + "epoch": 0.83, + "grad_norm": 0.5546875, + "learning_rate": 0.00019317502951942525, + "loss": 0.5345, + "step": 6189 + }, + { + "epoch": 0.83, + "grad_norm": 0.61328125, + "learning_rate": 0.0001931708005856189, + "loss": 0.5811, + "step": 6190 + }, + { + "epoch": 0.83, + "grad_norm": 0.5390625, + "learning_rate": 0.00019316657038835673, + "loss": 0.4489, + "step": 6191 + }, + { + "epoch": 0.83, + "grad_norm": 0.69140625, + "learning_rate": 0.00019316233892769603, + "loss": 0.5427, + "step": 6192 + }, + { + "epoch": 0.83, + "grad_norm": 0.59765625, + "learning_rate": 0.00019315810620369425, + "loss": 0.5521, + "step": 6193 + }, + { + "epoch": 0.83, + "grad_norm": 0.8359375, + "learning_rate": 0.00019315387221640874, + "loss": 0.2594, + "step": 6194 + }, + { + "epoch": 0.83, + "grad_norm": 0.65625, + "learning_rate": 0.00019314963696589693, + "loss": 0.519, + "step": 6195 + }, + { + "epoch": 0.83, + "grad_norm": 0.44140625, + "learning_rate": 0.00019314540045221626, + "loss": 0.3474, + "step": 6196 + }, + { + "epoch": 0.83, + "grad_norm": 0.58203125, + "learning_rate": 0.0001931411626754242, + "loss": 0.8688, + "step": 6197 + }, + { + "epoch": 0.83, + "grad_norm": 0.62890625, + "learning_rate": 0.00019313692363557823, + "loss": 0.3631, + "step": 6198 + }, + { + "epoch": 0.83, + "grad_norm": 0.52734375, + "learning_rate": 0.00019313268333273574, + "loss": 0.6506, + "step": 6199 + }, + { + "epoch": 0.83, + "grad_norm": 0.70703125, + "learning_rate": 0.00019312844176695433, + "loss": 0.4048, + "step": 6200 + }, + { + "epoch": 0.83, + "grad_norm": 0.81640625, + "learning_rate": 0.00019312419893829146, + "loss": 0.6019, + "step": 6201 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 0.00019311995484680468, + "loss": 0.3121, + "step": 6202 + }, + { + "epoch": 0.83, + "grad_norm": 0.578125, + "learning_rate": 0.00019311570949255157, + "loss": 0.3364, + "step": 6203 + }, + { + "epoch": 0.83, + "grad_norm": 0.6953125, + "learning_rate": 0.00019311146287558968, + "loss": 0.3419, + "step": 6204 + }, + { + "epoch": 0.83, + "grad_norm": 0.7890625, + "learning_rate": 0.00019310721499597654, + "loss": 0.4325, + "step": 6205 + }, + { + "epoch": 0.83, + "grad_norm": 0.75390625, + "learning_rate": 0.00019310296585376988, + "loss": 0.6241, + "step": 6206 + }, + { + "epoch": 0.83, + "grad_norm": 0.69140625, + "learning_rate": 0.00019309871544902722, + "loss": 0.5444, + "step": 6207 + }, + { + "epoch": 0.83, + "grad_norm": 0.498046875, + "learning_rate": 0.00019309446378180622, + "loss": 0.5055, + "step": 6208 + }, + { + "epoch": 0.83, + "grad_norm": 0.4921875, + "learning_rate": 0.00019309021085216457, + "loss": 0.3958, + "step": 6209 + }, + { + "epoch": 0.83, + "grad_norm": 0.68359375, + "learning_rate": 0.0001930859566601599, + "loss": 0.3815, + "step": 6210 + }, + { + "epoch": 0.83, + "grad_norm": 0.55859375, + "learning_rate": 0.0001930817012058499, + "loss": 0.4122, + "step": 6211 + }, + { + "epoch": 0.83, + "grad_norm": 0.58984375, + "learning_rate": 0.0001930774444892923, + "loss": 0.6765, + "step": 6212 + }, + { + "epoch": 0.83, + "grad_norm": 0.76953125, + "learning_rate": 0.00019307318651054481, + "loss": 0.2727, + "step": 6213 + }, + { + "epoch": 0.83, + "grad_norm": 0.63671875, + "learning_rate": 0.0001930689272696652, + "loss": 0.6926, + "step": 6214 + }, + { + "epoch": 0.83, + "grad_norm": 1.0078125, + "learning_rate": 0.0001930646667667112, + "loss": 0.4379, + "step": 6215 + }, + { + "epoch": 0.83, + "grad_norm": 0.8125, + "learning_rate": 0.00019306040500174058, + "loss": 0.328, + "step": 6216 + }, + { + "epoch": 0.83, + "grad_norm": 0.5703125, + "learning_rate": 0.00019305614197481116, + "loss": 0.6071, + "step": 6217 + }, + { + "epoch": 0.83, + "grad_norm": 0.58984375, + "learning_rate": 0.0001930518776859807, + "loss": 0.5484, + "step": 6218 + }, + { + "epoch": 0.83, + "grad_norm": 0.498046875, + "learning_rate": 0.00019304761213530708, + "loss": 0.4261, + "step": 6219 + }, + { + "epoch": 0.83, + "grad_norm": 0.5625, + "learning_rate": 0.00019304334532284814, + "loss": 0.3904, + "step": 6220 + }, + { + "epoch": 0.83, + "grad_norm": 0.6875, + "learning_rate": 0.00019303907724866167, + "loss": 0.5659, + "step": 6221 + }, + { + "epoch": 0.83, + "grad_norm": 0.46484375, + "learning_rate": 0.00019303480791280563, + "loss": 0.2247, + "step": 6222 + }, + { + "epoch": 0.83, + "grad_norm": 0.68359375, + "learning_rate": 0.0001930305373153379, + "loss": 0.5378, + "step": 6223 + }, + { + "epoch": 0.83, + "grad_norm": 0.984375, + "learning_rate": 0.00019302626545631636, + "loss": 0.5133, + "step": 6224 + }, + { + "epoch": 0.83, + "grad_norm": 0.57421875, + "learning_rate": 0.00019302199233579896, + "loss": 0.4461, + "step": 6225 + }, + { + "epoch": 0.83, + "grad_norm": 0.66796875, + "learning_rate": 0.00019301771795384365, + "loss": 0.4038, + "step": 6226 + }, + { + "epoch": 0.83, + "grad_norm": 0.578125, + "learning_rate": 0.00019301344231050839, + "loss": 0.3178, + "step": 6227 + }, + { + "epoch": 0.83, + "grad_norm": 0.69921875, + "learning_rate": 0.00019300916540585113, + "loss": 0.5036, + "step": 6228 + }, + { + "epoch": 0.83, + "grad_norm": 0.7734375, + "learning_rate": 0.00019300488723992992, + "loss": 0.4384, + "step": 6229 + }, + { + "epoch": 0.83, + "grad_norm": 0.515625, + "learning_rate": 0.00019300060781280275, + "loss": 0.4139, + "step": 6230 + }, + { + "epoch": 0.83, + "grad_norm": 0.58984375, + "learning_rate": 0.00019299632712452763, + "loss": 0.3579, + "step": 6231 + }, + { + "epoch": 0.83, + "grad_norm": 0.828125, + "learning_rate": 0.00019299204517516264, + "loss": 0.3344, + "step": 6232 + }, + { + "epoch": 0.83, + "grad_norm": 0.66796875, + "learning_rate": 0.00019298776196476585, + "loss": 0.4188, + "step": 6233 + }, + { + "epoch": 0.83, + "grad_norm": 0.50390625, + "learning_rate": 0.0001929834774933953, + "loss": 0.4162, + "step": 6234 + }, + { + "epoch": 0.83, + "grad_norm": 0.478515625, + "learning_rate": 0.00019297919176110913, + "loss": 0.343, + "step": 6235 + }, + { + "epoch": 0.83, + "grad_norm": 0.56640625, + "learning_rate": 0.00019297490476796544, + "loss": 0.9118, + "step": 6236 + }, + { + "epoch": 0.83, + "grad_norm": 0.64453125, + "learning_rate": 0.00019297061651402236, + "loss": 0.588, + "step": 6237 + }, + { + "epoch": 0.83, + "grad_norm": 0.890625, + "learning_rate": 0.0001929663269993381, + "loss": 0.4533, + "step": 6238 + }, + { + "epoch": 0.83, + "grad_norm": 0.59765625, + "learning_rate": 0.00019296203622397076, + "loss": 0.3466, + "step": 6239 + }, + { + "epoch": 0.83, + "grad_norm": 0.796875, + "learning_rate": 0.00019295774418797855, + "loss": 0.72, + "step": 6240 + }, + { + "epoch": 0.83, + "grad_norm": 0.515625, + "learning_rate": 0.00019295345089141963, + "loss": 0.5052, + "step": 6241 + }, + { + "epoch": 0.83, + "grad_norm": 0.66796875, + "learning_rate": 0.0001929491563343523, + "loss": 0.4978, + "step": 6242 + }, + { + "epoch": 0.83, + "grad_norm": 0.61328125, + "learning_rate": 0.00019294486051683472, + "loss": 0.5973, + "step": 6243 + }, + { + "epoch": 0.83, + "grad_norm": 0.78515625, + "learning_rate": 0.00019294056343892522, + "loss": 0.4441, + "step": 6244 + }, + { + "epoch": 0.83, + "grad_norm": 0.62109375, + "learning_rate": 0.000192936265100682, + "loss": 0.6448, + "step": 6245 + }, + { + "epoch": 0.83, + "grad_norm": 0.61328125, + "learning_rate": 0.00019293196550216338, + "loss": 0.4322, + "step": 6246 + }, + { + "epoch": 0.83, + "grad_norm": 0.5234375, + "learning_rate": 0.0001929276646434277, + "loss": 0.2526, + "step": 6247 + }, + { + "epoch": 0.83, + "grad_norm": 0.494140625, + "learning_rate": 0.0001929233625245332, + "loss": 0.478, + "step": 6248 + }, + { + "epoch": 0.83, + "grad_norm": 0.49609375, + "learning_rate": 0.0001929190591455383, + "loss": 0.4481, + "step": 6249 + }, + { + "epoch": 0.83, + "grad_norm": 0.7109375, + "learning_rate": 0.0001929147545065013, + "loss": 0.5622, + "step": 6250 + }, + { + "epoch": 0.83, + "grad_norm": 0.5, + "learning_rate": 0.0001929104486074806, + "loss": 0.499, + "step": 6251 + }, + { + "epoch": 0.83, + "grad_norm": 0.6640625, + "learning_rate": 0.0001929061414485346, + "loss": 0.6108, + "step": 6252 + }, + { + "epoch": 0.83, + "grad_norm": 0.65234375, + "learning_rate": 0.0001929018330297217, + "loss": 0.5276, + "step": 6253 + }, + { + "epoch": 0.83, + "grad_norm": 0.5546875, + "learning_rate": 0.0001928975233511003, + "loss": 0.4732, + "step": 6254 + }, + { + "epoch": 0.83, + "grad_norm": 0.69140625, + "learning_rate": 0.0001928932124127289, + "loss": 0.4671, + "step": 6255 + }, + { + "epoch": 0.83, + "grad_norm": 0.56640625, + "learning_rate": 0.0001928889002146659, + "loss": 0.4309, + "step": 6256 + }, + { + "epoch": 0.83, + "grad_norm": 0.62109375, + "learning_rate": 0.0001928845867569698, + "loss": 0.4077, + "step": 6257 + }, + { + "epoch": 0.84, + "grad_norm": 0.69921875, + "learning_rate": 0.00019288027203969907, + "loss": 0.3927, + "step": 6258 + }, + { + "epoch": 0.84, + "grad_norm": 0.4296875, + "learning_rate": 0.00019287595606291225, + "loss": 0.2785, + "step": 6259 + }, + { + "epoch": 0.84, + "grad_norm": 0.3984375, + "learning_rate": 0.00019287163882666787, + "loss": 0.2726, + "step": 6260 + }, + { + "epoch": 0.84, + "grad_norm": 1.4765625, + "learning_rate": 0.00019286732033102442, + "loss": 0.7002, + "step": 6261 + }, + { + "epoch": 0.84, + "grad_norm": 0.609375, + "learning_rate": 0.00019286300057604056, + "loss": 0.4281, + "step": 6262 + }, + { + "epoch": 0.84, + "grad_norm": 0.640625, + "learning_rate": 0.0001928586795617748, + "loss": 0.3377, + "step": 6263 + }, + { + "epoch": 0.84, + "grad_norm": 0.640625, + "learning_rate": 0.00019285435728828572, + "loss": 0.3413, + "step": 6264 + }, + { + "epoch": 0.84, + "grad_norm": 0.57421875, + "learning_rate": 0.000192850033755632, + "loss": 0.3249, + "step": 6265 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 0.00019284570896387223, + "loss": 0.4096, + "step": 6266 + }, + { + "epoch": 0.84, + "grad_norm": 0.89453125, + "learning_rate": 0.00019284138291306502, + "loss": 0.7773, + "step": 6267 + }, + { + "epoch": 0.84, + "grad_norm": 0.58203125, + "learning_rate": 0.0001928370556032691, + "loss": 0.4891, + "step": 6268 + }, + { + "epoch": 0.84, + "grad_norm": 0.453125, + "learning_rate": 0.00019283272703454312, + "loss": 0.2834, + "step": 6269 + }, + { + "epoch": 0.84, + "grad_norm": 0.703125, + "learning_rate": 0.00019282839720694578, + "loss": 0.5388, + "step": 6270 + }, + { + "epoch": 0.84, + "grad_norm": 0.59765625, + "learning_rate": 0.00019282406612053577, + "loss": 0.3634, + "step": 6271 + }, + { + "epoch": 0.84, + "grad_norm": 0.5625, + "learning_rate": 0.0001928197337753719, + "loss": 0.5575, + "step": 6272 + }, + { + "epoch": 0.84, + "grad_norm": 0.54296875, + "learning_rate": 0.00019281540017151283, + "loss": 0.4724, + "step": 6273 + }, + { + "epoch": 0.84, + "grad_norm": 0.5, + "learning_rate": 0.00019281106530901737, + "loss": 0.4315, + "step": 6274 + }, + { + "epoch": 0.84, + "grad_norm": 0.66796875, + "learning_rate": 0.00019280672918794433, + "loss": 0.5509, + "step": 6275 + }, + { + "epoch": 0.84, + "grad_norm": 0.53125, + "learning_rate": 0.00019280239180835247, + "loss": 0.3237, + "step": 6276 + }, + { + "epoch": 0.84, + "grad_norm": 0.6171875, + "learning_rate": 0.0001927980531703006, + "loss": 0.551, + "step": 6277 + }, + { + "epoch": 0.84, + "grad_norm": 0.8515625, + "learning_rate": 0.0001927937132738476, + "loss": 0.4195, + "step": 6278 + }, + { + "epoch": 0.84, + "grad_norm": 0.75390625, + "learning_rate": 0.00019278937211905225, + "loss": 0.4436, + "step": 6279 + }, + { + "epoch": 0.84, + "grad_norm": 0.83203125, + "learning_rate": 0.0001927850297059735, + "loss": 0.6889, + "step": 6280 + }, + { + "epoch": 0.84, + "grad_norm": 0.55859375, + "learning_rate": 0.00019278068603467016, + "loss": 0.3516, + "step": 6281 + }, + { + "epoch": 0.84, + "grad_norm": 0.76953125, + "learning_rate": 0.00019277634110520119, + "loss": 0.5778, + "step": 6282 + }, + { + "epoch": 0.84, + "grad_norm": 0.54296875, + "learning_rate": 0.0001927719949176255, + "loss": 0.7671, + "step": 6283 + }, + { + "epoch": 0.84, + "grad_norm": 0.78125, + "learning_rate": 0.00019276764747200203, + "loss": 0.5788, + "step": 6284 + }, + { + "epoch": 0.84, + "grad_norm": 0.58203125, + "learning_rate": 0.0001927632987683897, + "loss": 0.2937, + "step": 6285 + }, + { + "epoch": 0.84, + "grad_norm": 0.56640625, + "learning_rate": 0.00019275894880684747, + "loss": 0.5393, + "step": 6286 + }, + { + "epoch": 0.84, + "grad_norm": 0.60546875, + "learning_rate": 0.0001927545975874344, + "loss": 0.3682, + "step": 6287 + }, + { + "epoch": 0.84, + "grad_norm": 0.5, + "learning_rate": 0.0001927502451102095, + "loss": 0.3796, + "step": 6288 + }, + { + "epoch": 0.84, + "grad_norm": 0.5234375, + "learning_rate": 0.00019274589137523168, + "loss": 0.5273, + "step": 6289 + }, + { + "epoch": 0.84, + "grad_norm": 0.478515625, + "learning_rate": 0.00019274153638256008, + "loss": 0.4124, + "step": 6290 + }, + { + "epoch": 0.84, + "grad_norm": 0.498046875, + "learning_rate": 0.0001927371801322537, + "loss": 0.2708, + "step": 6291 + }, + { + "epoch": 0.84, + "grad_norm": 0.69140625, + "learning_rate": 0.00019273282262437168, + "loss": 0.6551, + "step": 6292 + }, + { + "epoch": 0.84, + "grad_norm": 0.5234375, + "learning_rate": 0.00019272846385897304, + "loss": 0.1978, + "step": 6293 + }, + { + "epoch": 0.84, + "grad_norm": 0.62109375, + "learning_rate": 0.00019272410383611694, + "loss": 0.3056, + "step": 6294 + }, + { + "epoch": 0.84, + "grad_norm": 0.326171875, + "learning_rate": 0.00019271974255586244, + "loss": 0.2746, + "step": 6295 + }, + { + "epoch": 0.84, + "grad_norm": 0.447265625, + "learning_rate": 0.0001927153800182688, + "loss": 0.2535, + "step": 6296 + }, + { + "epoch": 0.84, + "grad_norm": 0.58984375, + "learning_rate": 0.00019271101622339503, + "loss": 0.2889, + "step": 6297 + }, + { + "epoch": 0.84, + "grad_norm": 0.62109375, + "learning_rate": 0.0001927066511713004, + "loss": 0.3192, + "step": 6298 + }, + { + "epoch": 0.84, + "grad_norm": 0.7890625, + "learning_rate": 0.00019270228486204408, + "loss": 0.3556, + "step": 6299 + }, + { + "epoch": 0.84, + "grad_norm": 1.171875, + "learning_rate": 0.00019269791729568532, + "loss": 0.5101, + "step": 6300 + }, + { + "epoch": 0.84, + "grad_norm": 0.99609375, + "learning_rate": 0.00019269354847228324, + "loss": 0.3926, + "step": 6301 + }, + { + "epoch": 0.84, + "grad_norm": 0.83203125, + "learning_rate": 0.00019268917839189719, + "loss": 0.4564, + "step": 6302 + }, + { + "epoch": 0.84, + "grad_norm": 0.44140625, + "learning_rate": 0.0001926848070545864, + "loss": 0.3928, + "step": 6303 + }, + { + "epoch": 0.84, + "grad_norm": 0.61328125, + "learning_rate": 0.00019268043446041012, + "loss": 0.3447, + "step": 6304 + }, + { + "epoch": 0.84, + "grad_norm": 0.53125, + "learning_rate": 0.00019267606060942768, + "loss": 0.5755, + "step": 6305 + }, + { + "epoch": 0.84, + "grad_norm": 0.470703125, + "learning_rate": 0.00019267168550169837, + "loss": 0.2753, + "step": 6306 + }, + { + "epoch": 0.84, + "grad_norm": 0.3828125, + "learning_rate": 0.00019266730913728157, + "loss": 0.2356, + "step": 6307 + }, + { + "epoch": 0.84, + "grad_norm": 0.6171875, + "learning_rate": 0.00019266293151623654, + "loss": 0.4115, + "step": 6308 + }, + { + "epoch": 0.84, + "grad_norm": 0.42578125, + "learning_rate": 0.0001926585526386227, + "loss": 0.507, + "step": 6309 + }, + { + "epoch": 0.84, + "grad_norm": 0.46484375, + "learning_rate": 0.00019265417250449943, + "loss": 0.2591, + "step": 6310 + }, + { + "epoch": 0.84, + "grad_norm": 0.609375, + "learning_rate": 0.00019264979111392608, + "loss": 0.4289, + "step": 6311 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 0.00019264540846696213, + "loss": 0.4387, + "step": 6312 + }, + { + "epoch": 0.84, + "grad_norm": 0.451171875, + "learning_rate": 0.000192641024563667, + "loss": 0.3722, + "step": 6313 + }, + { + "epoch": 0.84, + "grad_norm": 0.443359375, + "learning_rate": 0.00019263663940410005, + "loss": 0.1896, + "step": 6314 + }, + { + "epoch": 0.84, + "grad_norm": 0.71484375, + "learning_rate": 0.0001926322529883209, + "loss": 0.4504, + "step": 6315 + }, + { + "epoch": 0.84, + "grad_norm": 0.671875, + "learning_rate": 0.0001926278653163889, + "loss": 0.2636, + "step": 6316 + }, + { + "epoch": 0.84, + "grad_norm": 0.453125, + "learning_rate": 0.0001926234763883636, + "loss": 0.2757, + "step": 6317 + }, + { + "epoch": 0.84, + "grad_norm": 0.60546875, + "learning_rate": 0.00019261908620430454, + "loss": 0.5851, + "step": 6318 + }, + { + "epoch": 0.84, + "grad_norm": 0.4296875, + "learning_rate": 0.0001926146947642712, + "loss": 0.308, + "step": 6319 + }, + { + "epoch": 0.84, + "grad_norm": 0.5, + "learning_rate": 0.00019261030206832317, + "loss": 0.1617, + "step": 6320 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 0.00019260590811652001, + "loss": 0.4058, + "step": 6321 + }, + { + "epoch": 0.84, + "grad_norm": 0.59765625, + "learning_rate": 0.0001926015129089213, + "loss": 0.6722, + "step": 6322 + }, + { + "epoch": 0.84, + "grad_norm": 0.51953125, + "learning_rate": 0.00019259711644558665, + "loss": 0.7037, + "step": 6323 + }, + { + "epoch": 0.84, + "grad_norm": 0.39453125, + "learning_rate": 0.00019259271872657565, + "loss": 0.3355, + "step": 6324 + }, + { + "epoch": 0.84, + "grad_norm": 0.51171875, + "learning_rate": 0.00019258831975194798, + "loss": 0.5406, + "step": 6325 + }, + { + "epoch": 0.84, + "grad_norm": 0.6796875, + "learning_rate": 0.00019258391952176328, + "loss": 0.4337, + "step": 6326 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 0.0001925795180360812, + "loss": 0.6661, + "step": 6327 + }, + { + "epoch": 0.84, + "grad_norm": 0.7421875, + "learning_rate": 0.00019257511529496144, + "loss": 0.4351, + "step": 6328 + }, + { + "epoch": 0.84, + "grad_norm": 0.486328125, + "learning_rate": 0.00019257071129846369, + "loss": 0.2082, + "step": 6329 + }, + { + "epoch": 0.84, + "grad_norm": 0.80859375, + "learning_rate": 0.00019256630604664769, + "loss": 0.5228, + "step": 6330 + }, + { + "epoch": 0.84, + "grad_norm": 0.578125, + "learning_rate": 0.00019256189953957317, + "loss": 0.3529, + "step": 6331 + }, + { + "epoch": 0.84, + "grad_norm": 0.55859375, + "learning_rate": 0.0001925574917772999, + "loss": 0.176, + "step": 6332 + }, + { + "epoch": 0.85, + "grad_norm": 0.439453125, + "learning_rate": 0.0001925530827598876, + "loss": 0.3151, + "step": 6333 + }, + { + "epoch": 0.85, + "grad_norm": 0.6328125, + "learning_rate": 0.00019254867248739613, + "loss": 0.4836, + "step": 6334 + }, + { + "epoch": 0.85, + "grad_norm": 0.65625, + "learning_rate": 0.00019254426095988527, + "loss": 0.6508, + "step": 6335 + }, + { + "epoch": 0.85, + "grad_norm": 0.921875, + "learning_rate": 0.0001925398481774148, + "loss": 0.4585, + "step": 6336 + }, + { + "epoch": 0.85, + "grad_norm": 0.75390625, + "learning_rate": 0.00019253543414004466, + "loss": 0.5567, + "step": 6337 + }, + { + "epoch": 0.85, + "grad_norm": 0.71875, + "learning_rate": 0.0001925310188478346, + "loss": 0.5676, + "step": 6338 + }, + { + "epoch": 0.85, + "grad_norm": 0.859375, + "learning_rate": 0.00019252660230084453, + "loss": 0.5283, + "step": 6339 + }, + { + "epoch": 0.85, + "grad_norm": 0.6171875, + "learning_rate": 0.00019252218449913438, + "loss": 0.5804, + "step": 6340 + }, + { + "epoch": 0.85, + "grad_norm": 0.66015625, + "learning_rate": 0.000192517765442764, + "loss": 0.5839, + "step": 6341 + }, + { + "epoch": 0.85, + "grad_norm": 0.50390625, + "learning_rate": 0.00019251334513179335, + "loss": 0.3768, + "step": 6342 + }, + { + "epoch": 0.85, + "grad_norm": 0.734375, + "learning_rate": 0.00019250892356628238, + "loss": 0.475, + "step": 6343 + }, + { + "epoch": 0.85, + "grad_norm": 0.5625, + "learning_rate": 0.00019250450074629103, + "loss": 0.3846, + "step": 6344 + }, + { + "epoch": 0.85, + "grad_norm": 0.77734375, + "learning_rate": 0.00019250007667187928, + "loss": 0.6484, + "step": 6345 + }, + { + "epoch": 0.85, + "grad_norm": 0.68359375, + "learning_rate": 0.00019249565134310713, + "loss": 0.465, + "step": 6346 + }, + { + "epoch": 0.85, + "grad_norm": 0.73046875, + "learning_rate": 0.00019249122476003457, + "loss": 0.6269, + "step": 6347 + }, + { + "epoch": 0.85, + "grad_norm": 0.6171875, + "learning_rate": 0.00019248679692272165, + "loss": 0.3158, + "step": 6348 + }, + { + "epoch": 0.85, + "grad_norm": 0.69140625, + "learning_rate": 0.0001924823678312284, + "loss": 0.5491, + "step": 6349 + }, + { + "epoch": 0.85, + "grad_norm": 0.54296875, + "learning_rate": 0.0001924779374856149, + "loss": 0.6047, + "step": 6350 + }, + { + "epoch": 0.85, + "grad_norm": 0.87109375, + "learning_rate": 0.0001924735058859412, + "loss": 0.2936, + "step": 6351 + }, + { + "epoch": 0.85, + "grad_norm": 0.6953125, + "learning_rate": 0.0001924690730322674, + "loss": 0.315, + "step": 6352 + }, + { + "epoch": 0.85, + "grad_norm": 0.453125, + "learning_rate": 0.00019246463892465366, + "loss": 0.5713, + "step": 6353 + }, + { + "epoch": 0.85, + "grad_norm": 0.59375, + "learning_rate": 0.00019246020356316005, + "loss": 0.5527, + "step": 6354 + }, + { + "epoch": 0.85, + "grad_norm": 0.58203125, + "learning_rate": 0.00019245576694784674, + "loss": 0.2856, + "step": 6355 + }, + { + "epoch": 0.85, + "grad_norm": 0.72265625, + "learning_rate": 0.0001924513290787739, + "loss": 0.8966, + "step": 6356 + }, + { + "epoch": 0.85, + "grad_norm": 0.8125, + "learning_rate": 0.0001924468899560017, + "loss": 0.5834, + "step": 6357 + }, + { + "epoch": 0.85, + "grad_norm": 0.70703125, + "learning_rate": 0.00019244244957959032, + "loss": 0.5063, + "step": 6358 + }, + { + "epoch": 0.85, + "grad_norm": 0.6640625, + "learning_rate": 0.00019243800794960003, + "loss": 0.4818, + "step": 6359 + }, + { + "epoch": 0.85, + "grad_norm": 0.51953125, + "learning_rate": 0.000192433565066091, + "loss": 0.3733, + "step": 6360 + }, + { + "epoch": 0.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00019242912092912352, + "loss": 0.342, + "step": 6361 + }, + { + "epoch": 0.85, + "grad_norm": 0.69140625, + "learning_rate": 0.00019242467553875784, + "loss": 0.623, + "step": 6362 + }, + { + "epoch": 0.85, + "grad_norm": 0.6484375, + "learning_rate": 0.00019242022889505423, + "loss": 0.5122, + "step": 6363 + }, + { + "epoch": 0.85, + "grad_norm": 0.498046875, + "learning_rate": 0.000192415780998073, + "loss": 0.4665, + "step": 6364 + }, + { + "epoch": 0.85, + "grad_norm": 0.5703125, + "learning_rate": 0.00019241133184787448, + "loss": 0.2548, + "step": 6365 + }, + { + "epoch": 0.85, + "grad_norm": 0.51171875, + "learning_rate": 0.00019240688144451896, + "loss": 0.3932, + "step": 6366 + }, + { + "epoch": 0.85, + "grad_norm": 0.51171875, + "learning_rate": 0.00019240242978806686, + "loss": 0.2216, + "step": 6367 + }, + { + "epoch": 0.85, + "grad_norm": 0.56640625, + "learning_rate": 0.0001923979768785785, + "loss": 0.5147, + "step": 6368 + }, + { + "epoch": 0.85, + "grad_norm": 0.76171875, + "learning_rate": 0.00019239352271611425, + "loss": 0.5, + "step": 6369 + }, + { + "epoch": 0.85, + "grad_norm": 0.7265625, + "learning_rate": 0.00019238906730073457, + "loss": 0.5507, + "step": 6370 + }, + { + "epoch": 0.85, + "grad_norm": 0.494140625, + "learning_rate": 0.00019238461063249984, + "loss": 0.5526, + "step": 6371 + }, + { + "epoch": 0.85, + "grad_norm": 0.58203125, + "learning_rate": 0.00019238015271147047, + "loss": 0.5232, + "step": 6372 + }, + { + "epoch": 0.85, + "grad_norm": 0.5234375, + "learning_rate": 0.00019237569353770695, + "loss": 0.4572, + "step": 6373 + }, + { + "epoch": 0.85, + "grad_norm": 0.68359375, + "learning_rate": 0.00019237123311126975, + "loss": 0.4905, + "step": 6374 + }, + { + "epoch": 0.85, + "grad_norm": 0.5703125, + "learning_rate": 0.00019236677143221934, + "loss": 0.5824, + "step": 6375 + }, + { + "epoch": 0.85, + "grad_norm": 0.546875, + "learning_rate": 0.00019236230850061623, + "loss": 0.4431, + "step": 6376 + }, + { + "epoch": 0.85, + "grad_norm": 0.69921875, + "learning_rate": 0.0001923578443165209, + "loss": 0.3363, + "step": 6377 + }, + { + "epoch": 0.85, + "grad_norm": 0.4609375, + "learning_rate": 0.00019235337887999398, + "loss": 0.3859, + "step": 6378 + }, + { + "epoch": 0.85, + "grad_norm": 0.6015625, + "learning_rate": 0.00019234891219109597, + "loss": 0.5068, + "step": 6379 + }, + { + "epoch": 0.85, + "grad_norm": 0.474609375, + "learning_rate": 0.0001923444442498874, + "loss": 0.7556, + "step": 6380 + }, + { + "epoch": 0.85, + "grad_norm": 0.671875, + "learning_rate": 0.00019233997505642894, + "loss": 0.3593, + "step": 6381 + }, + { + "epoch": 0.85, + "grad_norm": 0.875, + "learning_rate": 0.00019233550461078113, + "loss": 0.4696, + "step": 6382 + }, + { + "epoch": 0.85, + "grad_norm": 0.65625, + "learning_rate": 0.00019233103291300464, + "loss": 0.5966, + "step": 6383 + }, + { + "epoch": 0.85, + "grad_norm": 0.498046875, + "learning_rate": 0.00019232655996316006, + "loss": 0.4865, + "step": 6384 + }, + { + "epoch": 0.85, + "grad_norm": 0.48828125, + "learning_rate": 0.00019232208576130808, + "loss": 0.4697, + "step": 6385 + }, + { + "epoch": 0.85, + "grad_norm": 0.609375, + "learning_rate": 0.00019231761030750937, + "loss": 0.3108, + "step": 6386 + }, + { + "epoch": 0.85, + "grad_norm": 0.55078125, + "learning_rate": 0.0001923131336018246, + "loss": 0.5874, + "step": 6387 + }, + { + "epoch": 0.85, + "grad_norm": 0.458984375, + "learning_rate": 0.0001923086556443145, + "loss": 0.3311, + "step": 6388 + }, + { + "epoch": 0.85, + "grad_norm": 0.63671875, + "learning_rate": 0.0001923041764350398, + "loss": 0.4872, + "step": 6389 + }, + { + "epoch": 0.85, + "grad_norm": 0.83984375, + "learning_rate": 0.0001922996959740612, + "loss": 0.7976, + "step": 6390 + }, + { + "epoch": 0.85, + "grad_norm": 0.5234375, + "learning_rate": 0.0001922952142614395, + "loss": 0.5668, + "step": 6391 + }, + { + "epoch": 0.85, + "grad_norm": 0.494140625, + "learning_rate": 0.00019229073129723546, + "loss": 0.4492, + "step": 6392 + }, + { + "epoch": 0.85, + "grad_norm": 0.37109375, + "learning_rate": 0.00019228624708150988, + "loss": 0.3783, + "step": 6393 + }, + { + "epoch": 0.85, + "grad_norm": 0.54296875, + "learning_rate": 0.0001922817616143236, + "loss": 0.6712, + "step": 6394 + }, + { + "epoch": 0.85, + "grad_norm": 0.6328125, + "learning_rate": 0.00019227727489573733, + "loss": 0.7517, + "step": 6395 + }, + { + "epoch": 0.85, + "grad_norm": 0.6015625, + "learning_rate": 0.00019227278692581203, + "loss": 0.5717, + "step": 6396 + }, + { + "epoch": 0.85, + "grad_norm": 0.66796875, + "learning_rate": 0.0001922682977046085, + "loss": 0.4737, + "step": 6397 + }, + { + "epoch": 0.85, + "grad_norm": 0.53515625, + "learning_rate": 0.00019226380723218767, + "loss": 0.3293, + "step": 6398 + }, + { + "epoch": 0.85, + "grad_norm": 0.55859375, + "learning_rate": 0.00019225931550861036, + "loss": 0.2877, + "step": 6399 + }, + { + "epoch": 0.85, + "grad_norm": 0.6171875, + "learning_rate": 0.00019225482253393755, + "loss": 0.5201, + "step": 6400 + }, + { + "epoch": 0.85, + "grad_norm": 0.5703125, + "learning_rate": 0.00019225032830823011, + "loss": 0.389, + "step": 6401 + }, + { + "epoch": 0.85, + "grad_norm": 0.58203125, + "learning_rate": 0.00019224583283154902, + "loss": 0.6889, + "step": 6402 + }, + { + "epoch": 0.85, + "grad_norm": 0.55078125, + "learning_rate": 0.00019224133610395522, + "loss": 0.4957, + "step": 6403 + }, + { + "epoch": 0.85, + "grad_norm": 0.44921875, + "learning_rate": 0.00019223683812550976, + "loss": 0.3364, + "step": 6404 + }, + { + "epoch": 0.85, + "grad_norm": 0.447265625, + "learning_rate": 0.0001922323388962735, + "loss": 0.3167, + "step": 6405 + }, + { + "epoch": 0.85, + "grad_norm": 0.74609375, + "learning_rate": 0.0001922278384163076, + "loss": 0.171, + "step": 6406 + }, + { + "epoch": 0.85, + "grad_norm": 0.70703125, + "learning_rate": 0.00019222333668567299, + "loss": 0.5755, + "step": 6407 + }, + { + "epoch": 0.86, + "grad_norm": 0.41015625, + "learning_rate": 0.00019221883370443074, + "loss": 0.1541, + "step": 6408 + }, + { + "epoch": 0.86, + "grad_norm": 0.8046875, + "learning_rate": 0.00019221432947264192, + "loss": 0.4885, + "step": 6409 + }, + { + "epoch": 0.86, + "grad_norm": 0.5625, + "learning_rate": 0.00019220982399036763, + "loss": 0.7673, + "step": 6410 + }, + { + "epoch": 0.86, + "grad_norm": 0.53515625, + "learning_rate": 0.0001922053172576689, + "loss": 0.3758, + "step": 6411 + }, + { + "epoch": 0.86, + "grad_norm": 0.71875, + "learning_rate": 0.00019220080927460695, + "loss": 0.565, + "step": 6412 + }, + { + "epoch": 0.86, + "grad_norm": 0.7421875, + "learning_rate": 0.00019219630004124283, + "loss": 0.4808, + "step": 6413 + }, + { + "epoch": 0.86, + "grad_norm": 0.58203125, + "learning_rate": 0.00019219178955763767, + "loss": 0.5926, + "step": 6414 + }, + { + "epoch": 0.86, + "grad_norm": 0.6484375, + "learning_rate": 0.00019218727782385273, + "loss": 0.3921, + "step": 6415 + }, + { + "epoch": 0.86, + "grad_norm": 0.4921875, + "learning_rate": 0.0001921827648399491, + "loss": 0.4069, + "step": 6416 + }, + { + "epoch": 0.86, + "grad_norm": 0.5078125, + "learning_rate": 0.00019217825060598807, + "loss": 0.2605, + "step": 6417 + }, + { + "epoch": 0.86, + "grad_norm": 0.58984375, + "learning_rate": 0.00019217373512203073, + "loss": 0.4529, + "step": 6418 + }, + { + "epoch": 0.86, + "grad_norm": 0.765625, + "learning_rate": 0.00019216921838813845, + "loss": 0.6013, + "step": 6419 + }, + { + "epoch": 0.86, + "grad_norm": 0.76953125, + "learning_rate": 0.00019216470040437237, + "loss": 0.3518, + "step": 6420 + }, + { + "epoch": 0.86, + "grad_norm": 0.63671875, + "learning_rate": 0.0001921601811707938, + "loss": 0.594, + "step": 6421 + }, + { + "epoch": 0.86, + "grad_norm": 0.486328125, + "learning_rate": 0.00019215566068746408, + "loss": 0.3126, + "step": 6422 + }, + { + "epoch": 0.86, + "grad_norm": 0.77734375, + "learning_rate": 0.0001921511389544444, + "loss": 0.5604, + "step": 6423 + }, + { + "epoch": 0.86, + "grad_norm": 0.85546875, + "learning_rate": 0.00019214661597179614, + "loss": 0.3962, + "step": 6424 + }, + { + "epoch": 0.86, + "grad_norm": 0.68359375, + "learning_rate": 0.00019214209173958063, + "loss": 0.7971, + "step": 6425 + }, + { + "epoch": 0.86, + "grad_norm": 0.578125, + "learning_rate": 0.00019213756625785925, + "loss": 0.3425, + "step": 6426 + }, + { + "epoch": 0.86, + "grad_norm": 0.69921875, + "learning_rate": 0.00019213303952669332, + "loss": 0.4096, + "step": 6427 + }, + { + "epoch": 0.86, + "grad_norm": 0.84765625, + "learning_rate": 0.00019212851154614424, + "loss": 0.7705, + "step": 6428 + }, + { + "epoch": 0.86, + "grad_norm": 0.80859375, + "learning_rate": 0.00019212398231627343, + "loss": 0.4574, + "step": 6429 + }, + { + "epoch": 0.86, + "grad_norm": 0.609375, + "learning_rate": 0.00019211945183714225, + "loss": 0.2986, + "step": 6430 + }, + { + "epoch": 0.86, + "grad_norm": 0.7109375, + "learning_rate": 0.00019211492010881226, + "loss": 0.3076, + "step": 6431 + }, + { + "epoch": 0.86, + "grad_norm": 0.78125, + "learning_rate": 0.00019211038713134478, + "loss": 0.4052, + "step": 6432 + }, + { + "epoch": 0.86, + "grad_norm": 0.4609375, + "learning_rate": 0.00019210585290480134, + "loss": 0.6449, + "step": 6433 + }, + { + "epoch": 0.86, + "grad_norm": 0.69140625, + "learning_rate": 0.00019210131742924342, + "loss": 0.3978, + "step": 6434 + }, + { + "epoch": 0.86, + "grad_norm": 0.796875, + "learning_rate": 0.00019209678070473252, + "loss": 0.4506, + "step": 6435 + }, + { + "epoch": 0.86, + "grad_norm": 0.94921875, + "learning_rate": 0.00019209224273133017, + "loss": 0.4357, + "step": 6436 + }, + { + "epoch": 0.86, + "grad_norm": 0.470703125, + "learning_rate": 0.0001920877035090979, + "loss": 0.3559, + "step": 6437 + }, + { + "epoch": 0.86, + "grad_norm": 0.5390625, + "learning_rate": 0.0001920831630380973, + "loss": 0.5466, + "step": 6438 + }, + { + "epoch": 0.86, + "grad_norm": 0.69921875, + "learning_rate": 0.00019207862131838987, + "loss": 0.4704, + "step": 6439 + }, + { + "epoch": 0.86, + "grad_norm": 0.455078125, + "learning_rate": 0.00019207407835003726, + "loss": 0.6363, + "step": 6440 + }, + { + "epoch": 0.86, + "grad_norm": 0.392578125, + "learning_rate": 0.00019206953413310106, + "loss": 0.3695, + "step": 6441 + }, + { + "epoch": 0.86, + "grad_norm": 0.64453125, + "learning_rate": 0.00019206498866764288, + "loss": 0.4298, + "step": 6442 + }, + { + "epoch": 0.86, + "grad_norm": 0.78125, + "learning_rate": 0.00019206044195372438, + "loss": 0.5602, + "step": 6443 + }, + { + "epoch": 0.86, + "grad_norm": 0.47265625, + "learning_rate": 0.00019205589399140716, + "loss": 0.3003, + "step": 6444 + }, + { + "epoch": 0.86, + "grad_norm": 0.58203125, + "learning_rate": 0.000192051344780753, + "loss": 0.3247, + "step": 6445 + }, + { + "epoch": 0.86, + "grad_norm": 0.59765625, + "learning_rate": 0.00019204679432182348, + "loss": 0.5268, + "step": 6446 + }, + { + "epoch": 0.86, + "grad_norm": 0.703125, + "learning_rate": 0.00019204224261468037, + "loss": 0.6033, + "step": 6447 + }, + { + "epoch": 0.86, + "grad_norm": 0.734375, + "learning_rate": 0.0001920376896593854, + "loss": 0.1589, + "step": 6448 + }, + { + "epoch": 0.86, + "grad_norm": 0.70703125, + "learning_rate": 0.00019203313545600024, + "loss": 0.5433, + "step": 6449 + }, + { + "epoch": 0.86, + "grad_norm": 1.0234375, + "learning_rate": 0.00019202858000458672, + "loss": 0.6829, + "step": 6450 + }, + { + "epoch": 0.86, + "grad_norm": 0.63671875, + "learning_rate": 0.0001920240233052066, + "loss": 0.5544, + "step": 6451 + }, + { + "epoch": 0.86, + "grad_norm": 0.7109375, + "learning_rate": 0.00019201946535792166, + "loss": 0.3573, + "step": 6452 + }, + { + "epoch": 0.86, + "grad_norm": 0.83984375, + "learning_rate": 0.0001920149061627937, + "loss": 0.4853, + "step": 6453 + }, + { + "epoch": 0.86, + "grad_norm": 0.7734375, + "learning_rate": 0.00019201034571988458, + "loss": 0.2792, + "step": 6454 + }, + { + "epoch": 0.86, + "grad_norm": 0.73828125, + "learning_rate": 0.0001920057840292561, + "loss": 0.5155, + "step": 6455 + }, + { + "epoch": 0.86, + "grad_norm": 0.6640625, + "learning_rate": 0.00019200122109097012, + "loss": 0.8605, + "step": 6456 + }, + { + "epoch": 0.86, + "grad_norm": 0.890625, + "learning_rate": 0.00019199665690508856, + "loss": 0.5233, + "step": 6457 + }, + { + "epoch": 0.86, + "grad_norm": 0.5625, + "learning_rate": 0.0001919920914716733, + "loss": 0.6845, + "step": 6458 + }, + { + "epoch": 0.86, + "grad_norm": 0.6328125, + "learning_rate": 0.00019198752479078623, + "loss": 0.4561, + "step": 6459 + }, + { + "epoch": 0.86, + "grad_norm": 0.80078125, + "learning_rate": 0.0001919829568624893, + "loss": 0.4064, + "step": 6460 + }, + { + "epoch": 0.86, + "grad_norm": 0.455078125, + "learning_rate": 0.00019197838768684442, + "loss": 0.2561, + "step": 6461 + }, + { + "epoch": 0.86, + "grad_norm": 0.62890625, + "learning_rate": 0.00019197381726391357, + "loss": 0.2327, + "step": 6462 + }, + { + "epoch": 0.86, + "grad_norm": 0.6796875, + "learning_rate": 0.00019196924559375874, + "loss": 0.478, + "step": 6463 + }, + { + "epoch": 0.86, + "grad_norm": 0.369140625, + "learning_rate": 0.00019196467267644193, + "loss": 0.2169, + "step": 6464 + }, + { + "epoch": 0.86, + "grad_norm": 0.69140625, + "learning_rate": 0.00019196009851202511, + "loss": 0.3432, + "step": 6465 + }, + { + "epoch": 0.86, + "grad_norm": 0.90625, + "learning_rate": 0.00019195552310057035, + "loss": 0.2967, + "step": 6466 + }, + { + "epoch": 0.86, + "grad_norm": 0.373046875, + "learning_rate": 0.00019195094644213967, + "loss": 0.361, + "step": 6467 + }, + { + "epoch": 0.86, + "grad_norm": 0.5703125, + "learning_rate": 0.00019194636853679512, + "loss": 0.3954, + "step": 6468 + }, + { + "epoch": 0.86, + "grad_norm": 0.578125, + "learning_rate": 0.00019194178938459886, + "loss": 0.3166, + "step": 6469 + }, + { + "epoch": 0.86, + "grad_norm": 0.5546875, + "learning_rate": 0.0001919372089856129, + "loss": 0.4136, + "step": 6470 + }, + { + "epoch": 0.86, + "grad_norm": 0.51953125, + "learning_rate": 0.00019193262733989937, + "loss": 0.4276, + "step": 6471 + }, + { + "epoch": 0.86, + "grad_norm": 0.69140625, + "learning_rate": 0.0001919280444475204, + "loss": 0.2413, + "step": 6472 + }, + { + "epoch": 0.86, + "grad_norm": 0.44921875, + "learning_rate": 0.00019192346030853818, + "loss": 0.2706, + "step": 6473 + }, + { + "epoch": 0.86, + "grad_norm": 0.54296875, + "learning_rate": 0.00019191887492301483, + "loss": 0.4144, + "step": 6474 + }, + { + "epoch": 0.86, + "grad_norm": 0.640625, + "learning_rate": 0.00019191428829101254, + "loss": 0.285, + "step": 6475 + }, + { + "epoch": 0.86, + "grad_norm": 0.58203125, + "learning_rate": 0.00019190970041259352, + "loss": 0.5064, + "step": 6476 + }, + { + "epoch": 0.86, + "grad_norm": 0.6171875, + "learning_rate": 0.00019190511128781997, + "loss": 0.5183, + "step": 6477 + }, + { + "epoch": 0.86, + "grad_norm": 0.55859375, + "learning_rate": 0.0001919005209167541, + "loss": 0.3961, + "step": 6478 + }, + { + "epoch": 0.86, + "grad_norm": 0.59765625, + "learning_rate": 0.00019189592929945822, + "loss": 0.5337, + "step": 6479 + }, + { + "epoch": 0.86, + "grad_norm": 0.55859375, + "learning_rate": 0.00019189133643599454, + "loss": 0.3121, + "step": 6480 + }, + { + "epoch": 0.86, + "grad_norm": 0.451171875, + "learning_rate": 0.00019188674232642537, + "loss": 0.3626, + "step": 6481 + }, + { + "epoch": 0.86, + "grad_norm": 0.56640625, + "learning_rate": 0.000191882146970813, + "loss": 0.4318, + "step": 6482 + }, + { + "epoch": 0.87, + "grad_norm": 0.58203125, + "learning_rate": 0.00019187755036921978, + "loss": 0.3512, + "step": 6483 + }, + { + "epoch": 0.87, + "grad_norm": 0.63671875, + "learning_rate": 0.00019187295252170796, + "loss": 0.4732, + "step": 6484 + }, + { + "epoch": 0.87, + "grad_norm": 0.578125, + "learning_rate": 0.00019186835342833997, + "loss": 0.2621, + "step": 6485 + }, + { + "epoch": 0.87, + "grad_norm": 0.515625, + "learning_rate": 0.00019186375308917814, + "loss": 0.6368, + "step": 6486 + }, + { + "epoch": 0.87, + "grad_norm": 0.828125, + "learning_rate": 0.00019185915150428487, + "loss": 0.5629, + "step": 6487 + }, + { + "epoch": 0.87, + "grad_norm": 0.78125, + "learning_rate": 0.00019185454867372251, + "loss": 0.7818, + "step": 6488 + }, + { + "epoch": 0.87, + "grad_norm": 0.5078125, + "learning_rate": 0.00019184994459755356, + "loss": 0.1841, + "step": 6489 + }, + { + "epoch": 0.87, + "grad_norm": 0.62890625, + "learning_rate": 0.00019184533927584042, + "loss": 0.5448, + "step": 6490 + }, + { + "epoch": 0.87, + "grad_norm": 0.9140625, + "learning_rate": 0.0001918407327086455, + "loss": 0.5175, + "step": 6491 + }, + { + "epoch": 0.87, + "grad_norm": 0.51953125, + "learning_rate": 0.0001918361248960313, + "loss": 0.4255, + "step": 6492 + }, + { + "epoch": 0.87, + "grad_norm": 0.53515625, + "learning_rate": 0.00019183151583806034, + "loss": 0.3575, + "step": 6493 + }, + { + "epoch": 0.87, + "grad_norm": 0.69921875, + "learning_rate": 0.00019182690553479507, + "loss": 0.7323, + "step": 6494 + }, + { + "epoch": 0.87, + "grad_norm": 0.46484375, + "learning_rate": 0.000191822293986298, + "loss": 0.3371, + "step": 6495 + }, + { + "epoch": 0.87, + "grad_norm": 0.625, + "learning_rate": 0.00019181768119263173, + "loss": 0.5152, + "step": 6496 + }, + { + "epoch": 0.87, + "grad_norm": 0.427734375, + "learning_rate": 0.00019181306715385877, + "loss": 0.3847, + "step": 6497 + }, + { + "epoch": 0.87, + "grad_norm": 0.58984375, + "learning_rate": 0.00019180845187004167, + "loss": 0.4458, + "step": 6498 + }, + { + "epoch": 0.87, + "grad_norm": 0.5546875, + "learning_rate": 0.00019180383534124306, + "loss": 0.6019, + "step": 6499 + }, + { + "epoch": 0.87, + "grad_norm": 0.6875, + "learning_rate": 0.00019179921756752556, + "loss": 0.7681, + "step": 6500 + }, + { + "epoch": 0.87, + "grad_norm": 0.8359375, + "learning_rate": 0.0001917945985489517, + "loss": 0.1725, + "step": 6501 + }, + { + "epoch": 0.87, + "grad_norm": 0.671875, + "learning_rate": 0.00019178997828558418, + "loss": 0.6644, + "step": 6502 + }, + { + "epoch": 0.87, + "grad_norm": 0.5859375, + "learning_rate": 0.00019178535677748566, + "loss": 0.2793, + "step": 6503 + }, + { + "epoch": 0.87, + "grad_norm": 0.58984375, + "learning_rate": 0.00019178073402471876, + "loss": 0.5595, + "step": 6504 + }, + { + "epoch": 0.87, + "grad_norm": 0.6171875, + "learning_rate": 0.00019177611002734623, + "loss": 0.2791, + "step": 6505 + }, + { + "epoch": 0.87, + "grad_norm": 0.427734375, + "learning_rate": 0.00019177148478543074, + "loss": 0.251, + "step": 6506 + }, + { + "epoch": 0.87, + "grad_norm": 0.53125, + "learning_rate": 0.00019176685829903501, + "loss": 0.563, + "step": 6507 + }, + { + "epoch": 0.87, + "grad_norm": 0.6953125, + "learning_rate": 0.00019176223056822182, + "loss": 0.451, + "step": 6508 + }, + { + "epoch": 0.87, + "grad_norm": 0.7578125, + "learning_rate": 0.00019175760159305388, + "loss": 0.323, + "step": 6509 + }, + { + "epoch": 0.87, + "grad_norm": 0.7421875, + "learning_rate": 0.00019175297137359395, + "loss": 0.5129, + "step": 6510 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 0.00019174833990990485, + "loss": 0.3817, + "step": 6511 + }, + { + "epoch": 0.87, + "grad_norm": 0.63671875, + "learning_rate": 0.00019174370720204936, + "loss": 0.4003, + "step": 6512 + }, + { + "epoch": 0.87, + "grad_norm": 0.4140625, + "learning_rate": 0.00019173907325009036, + "loss": 0.5004, + "step": 6513 + }, + { + "epoch": 0.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00019173443805409063, + "loss": 0.512, + "step": 6514 + }, + { + "epoch": 0.87, + "grad_norm": 0.61328125, + "learning_rate": 0.00019172980161411305, + "loss": 0.6413, + "step": 6515 + }, + { + "epoch": 0.87, + "grad_norm": 0.875, + "learning_rate": 0.00019172516393022046, + "loss": 0.6678, + "step": 6516 + }, + { + "epoch": 0.87, + "grad_norm": 0.609375, + "learning_rate": 0.00019172052500247578, + "loss": 0.4051, + "step": 6517 + }, + { + "epoch": 0.87, + "grad_norm": 0.59765625, + "learning_rate": 0.00019171588483094196, + "loss": 0.4962, + "step": 6518 + }, + { + "epoch": 0.87, + "grad_norm": 0.59765625, + "learning_rate": 0.00019171124341568185, + "loss": 0.5242, + "step": 6519 + }, + { + "epoch": 0.87, + "grad_norm": 0.546875, + "learning_rate": 0.00019170660075675842, + "loss": 0.306, + "step": 6520 + }, + { + "epoch": 0.87, + "grad_norm": 0.6328125, + "learning_rate": 0.00019170195685423464, + "loss": 0.5214, + "step": 6521 + }, + { + "epoch": 0.87, + "grad_norm": 0.55859375, + "learning_rate": 0.00019169731170817346, + "loss": 0.405, + "step": 6522 + }, + { + "epoch": 0.87, + "grad_norm": 0.94921875, + "learning_rate": 0.00019169266531863786, + "loss": 0.5796, + "step": 6523 + }, + { + "epoch": 0.87, + "grad_norm": 0.671875, + "learning_rate": 0.0001916880176856909, + "loss": 0.4018, + "step": 6524 + }, + { + "epoch": 0.87, + "grad_norm": 0.59765625, + "learning_rate": 0.00019168336880939557, + "loss": 0.4545, + "step": 6525 + }, + { + "epoch": 0.87, + "grad_norm": 0.435546875, + "learning_rate": 0.00019167871868981494, + "loss": 0.2316, + "step": 6526 + }, + { + "epoch": 0.87, + "grad_norm": 0.6484375, + "learning_rate": 0.00019167406732701202, + "loss": 0.4142, + "step": 6527 + }, + { + "epoch": 0.87, + "grad_norm": 0.66796875, + "learning_rate": 0.0001916694147210499, + "loss": 0.2345, + "step": 6528 + }, + { + "epoch": 0.87, + "grad_norm": 0.578125, + "learning_rate": 0.0001916647608719917, + "loss": 0.3876, + "step": 6529 + }, + { + "epoch": 0.87, + "grad_norm": 0.546875, + "learning_rate": 0.0001916601057799005, + "loss": 0.5539, + "step": 6530 + }, + { + "epoch": 0.87, + "grad_norm": 0.515625, + "learning_rate": 0.00019165544944483944, + "loss": 0.3996, + "step": 6531 + }, + { + "epoch": 0.87, + "grad_norm": 0.71484375, + "learning_rate": 0.00019165079186687166, + "loss": 0.522, + "step": 6532 + }, + { + "epoch": 0.87, + "grad_norm": 0.9140625, + "learning_rate": 0.00019164613304606035, + "loss": 0.5953, + "step": 6533 + }, + { + "epoch": 0.87, + "grad_norm": 0.447265625, + "learning_rate": 0.0001916414729824686, + "loss": 0.2606, + "step": 6534 + }, + { + "epoch": 0.87, + "grad_norm": 0.515625, + "learning_rate": 0.00019163681167615974, + "loss": 0.333, + "step": 6535 + }, + { + "epoch": 0.87, + "grad_norm": 0.67578125, + "learning_rate": 0.00019163214912719685, + "loss": 0.5169, + "step": 6536 + }, + { + "epoch": 0.87, + "grad_norm": 0.55859375, + "learning_rate": 0.00019162748533564324, + "loss": 0.4092, + "step": 6537 + }, + { + "epoch": 0.87, + "grad_norm": 0.62890625, + "learning_rate": 0.00019162282030156207, + "loss": 0.4525, + "step": 6538 + }, + { + "epoch": 0.87, + "grad_norm": 0.98046875, + "learning_rate": 0.00019161815402501668, + "loss": 0.5642, + "step": 6539 + }, + { + "epoch": 0.87, + "grad_norm": 0.68359375, + "learning_rate": 0.00019161348650607033, + "loss": 0.2837, + "step": 6540 + }, + { + "epoch": 0.87, + "grad_norm": 0.61328125, + "learning_rate": 0.00019160881774478629, + "loss": 0.6151, + "step": 6541 + }, + { + "epoch": 0.87, + "grad_norm": 0.87109375, + "learning_rate": 0.00019160414774122792, + "loss": 0.3968, + "step": 6542 + }, + { + "epoch": 0.87, + "grad_norm": 0.96875, + "learning_rate": 0.00019159947649545852, + "loss": 0.5053, + "step": 6543 + }, + { + "epoch": 0.87, + "grad_norm": 0.54296875, + "learning_rate": 0.00019159480400754138, + "loss": 0.2972, + "step": 6544 + }, + { + "epoch": 0.87, + "grad_norm": 0.337890625, + "learning_rate": 0.00019159013027753995, + "loss": 0.2371, + "step": 6545 + }, + { + "epoch": 0.87, + "grad_norm": 0.70703125, + "learning_rate": 0.00019158545530551755, + "loss": 0.3535, + "step": 6546 + }, + { + "epoch": 0.87, + "grad_norm": 0.5078125, + "learning_rate": 0.0001915807790915376, + "loss": 0.4977, + "step": 6547 + }, + { + "epoch": 0.87, + "grad_norm": 0.55078125, + "learning_rate": 0.0001915761016356635, + "loss": 0.5691, + "step": 6548 + }, + { + "epoch": 0.87, + "grad_norm": 0.65234375, + "learning_rate": 0.00019157142293795874, + "loss": 0.6455, + "step": 6549 + }, + { + "epoch": 0.87, + "grad_norm": 0.62890625, + "learning_rate": 0.00019156674299848666, + "loss": 0.727, + "step": 6550 + }, + { + "epoch": 0.87, + "grad_norm": 0.62890625, + "learning_rate": 0.0001915620618173108, + "loss": 0.3194, + "step": 6551 + }, + { + "epoch": 0.87, + "grad_norm": 0.5859375, + "learning_rate": 0.0001915573793944946, + "loss": 0.5093, + "step": 6552 + }, + { + "epoch": 0.87, + "grad_norm": 0.66015625, + "learning_rate": 0.0001915526957301016, + "loss": 0.6068, + "step": 6553 + }, + { + "epoch": 0.87, + "grad_norm": 0.72265625, + "learning_rate": 0.0001915480108241953, + "loss": 0.5921, + "step": 6554 + }, + { + "epoch": 0.87, + "grad_norm": 0.53515625, + "learning_rate": 0.00019154332467683917, + "loss": 0.3686, + "step": 6555 + }, + { + "epoch": 0.87, + "grad_norm": 0.47265625, + "learning_rate": 0.00019153863728809683, + "loss": 0.6899, + "step": 6556 + }, + { + "epoch": 0.87, + "grad_norm": 0.81640625, + "learning_rate": 0.00019153394865803182, + "loss": 0.7711, + "step": 6557 + }, + { + "epoch": 0.88, + "grad_norm": 0.56640625, + "learning_rate": 0.0001915292587867077, + "loss": 0.6408, + "step": 6558 + }, + { + "epoch": 0.88, + "grad_norm": 0.3828125, + "learning_rate": 0.0001915245676741881, + "loss": 0.5103, + "step": 6559 + }, + { + "epoch": 0.88, + "grad_norm": 0.6015625, + "learning_rate": 0.00019151987532053664, + "loss": 0.4483, + "step": 6560 + }, + { + "epoch": 0.88, + "grad_norm": 0.466796875, + "learning_rate": 0.00019151518172581692, + "loss": 0.5174, + "step": 6561 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 0.0001915104868900926, + "loss": 0.4958, + "step": 6562 + }, + { + "epoch": 0.88, + "grad_norm": 0.55859375, + "learning_rate": 0.00019150579081342737, + "loss": 0.3368, + "step": 6563 + }, + { + "epoch": 0.88, + "grad_norm": 0.52734375, + "learning_rate": 0.00019150109349588487, + "loss": 0.5155, + "step": 6564 + }, + { + "epoch": 0.88, + "grad_norm": 0.54296875, + "learning_rate": 0.0001914963949375288, + "loss": 0.6872, + "step": 6565 + }, + { + "epoch": 0.88, + "grad_norm": 0.546875, + "learning_rate": 0.0001914916951384229, + "loss": 0.289, + "step": 6566 + }, + { + "epoch": 0.88, + "grad_norm": 0.9375, + "learning_rate": 0.00019148699409863093, + "loss": 0.6107, + "step": 6567 + }, + { + "epoch": 0.88, + "grad_norm": 0.443359375, + "learning_rate": 0.00019148229181821657, + "loss": 0.3632, + "step": 6568 + }, + { + "epoch": 0.88, + "grad_norm": 0.74609375, + "learning_rate": 0.00019147758829724364, + "loss": 0.8127, + "step": 6569 + }, + { + "epoch": 0.88, + "grad_norm": 0.470703125, + "learning_rate": 0.00019147288353577589, + "loss": 0.4501, + "step": 6570 + }, + { + "epoch": 0.88, + "grad_norm": 0.546875, + "learning_rate": 0.00019146817753387714, + "loss": 0.5837, + "step": 6571 + }, + { + "epoch": 0.88, + "grad_norm": 0.796875, + "learning_rate": 0.0001914634702916112, + "loss": 0.4703, + "step": 6572 + }, + { + "epoch": 0.88, + "grad_norm": 0.5703125, + "learning_rate": 0.00019145876180904188, + "loss": 0.27, + "step": 6573 + }, + { + "epoch": 0.88, + "grad_norm": 0.62890625, + "learning_rate": 0.00019145405208623306, + "loss": 0.5834, + "step": 6574 + }, + { + "epoch": 0.88, + "grad_norm": 0.69921875, + "learning_rate": 0.00019144934112324863, + "loss": 0.5188, + "step": 6575 + }, + { + "epoch": 0.88, + "grad_norm": 0.8671875, + "learning_rate": 0.0001914446289201524, + "loss": 0.7447, + "step": 6576 + }, + { + "epoch": 0.88, + "grad_norm": 0.58203125, + "learning_rate": 0.00019143991547700834, + "loss": 0.5217, + "step": 6577 + }, + { + "epoch": 0.88, + "grad_norm": 0.66015625, + "learning_rate": 0.0001914352007938803, + "loss": 0.4622, + "step": 6578 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 0.00019143048487083232, + "loss": 0.4376, + "step": 6579 + }, + { + "epoch": 0.88, + "grad_norm": 0.7421875, + "learning_rate": 0.00019142576770792824, + "loss": 0.4732, + "step": 6580 + }, + { + "epoch": 0.88, + "grad_norm": 0.8125, + "learning_rate": 0.0001914210493052321, + "loss": 0.7462, + "step": 6581 + }, + { + "epoch": 0.88, + "grad_norm": 0.61328125, + "learning_rate": 0.00019141632966280782, + "loss": 0.4444, + "step": 6582 + }, + { + "epoch": 0.88, + "grad_norm": 0.5234375, + "learning_rate": 0.00019141160878071947, + "loss": 0.5765, + "step": 6583 + }, + { + "epoch": 0.88, + "grad_norm": 0.49609375, + "learning_rate": 0.000191406886659031, + "loss": 0.4668, + "step": 6584 + }, + { + "epoch": 0.88, + "grad_norm": 0.5546875, + "learning_rate": 0.00019140216329780653, + "loss": 0.9482, + "step": 6585 + }, + { + "epoch": 0.88, + "grad_norm": 0.6875, + "learning_rate": 0.00019139743869711005, + "loss": 0.3755, + "step": 6586 + }, + { + "epoch": 0.88, + "grad_norm": 0.82421875, + "learning_rate": 0.00019139271285700563, + "loss": 0.7232, + "step": 6587 + }, + { + "epoch": 0.88, + "grad_norm": 0.4453125, + "learning_rate": 0.00019138798577755738, + "loss": 0.4259, + "step": 6588 + }, + { + "epoch": 0.88, + "grad_norm": 0.55859375, + "learning_rate": 0.0001913832574588294, + "loss": 0.4199, + "step": 6589 + }, + { + "epoch": 0.88, + "grad_norm": 0.4765625, + "learning_rate": 0.00019137852790088576, + "loss": 0.477, + "step": 6590 + }, + { + "epoch": 0.88, + "grad_norm": 0.5, + "learning_rate": 0.0001913737971037907, + "loss": 0.5059, + "step": 6591 + }, + { + "epoch": 0.88, + "grad_norm": 0.62109375, + "learning_rate": 0.00019136906506760822, + "loss": 0.4377, + "step": 6592 + }, + { + "epoch": 0.88, + "grad_norm": 0.74609375, + "learning_rate": 0.00019136433179240265, + "loss": 0.8042, + "step": 6593 + }, + { + "epoch": 0.88, + "grad_norm": 0.6171875, + "learning_rate": 0.0001913595972782381, + "loss": 0.5504, + "step": 6594 + }, + { + "epoch": 0.88, + "grad_norm": 0.48828125, + "learning_rate": 0.00019135486152517875, + "loss": 0.4285, + "step": 6595 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 0.00019135012453328884, + "loss": 0.4267, + "step": 6596 + }, + { + "epoch": 0.88, + "grad_norm": 0.486328125, + "learning_rate": 0.00019134538630263264, + "loss": 0.3652, + "step": 6597 + }, + { + "epoch": 0.88, + "grad_norm": 0.52734375, + "learning_rate": 0.00019134064683327436, + "loss": 0.6011, + "step": 6598 + }, + { + "epoch": 0.88, + "grad_norm": 0.53125, + "learning_rate": 0.00019133590612527827, + "loss": 0.503, + "step": 6599 + }, + { + "epoch": 0.88, + "grad_norm": 0.50390625, + "learning_rate": 0.0001913311641787087, + "loss": 0.4232, + "step": 6600 + }, + { + "epoch": 0.88, + "grad_norm": 0.58203125, + "learning_rate": 0.00019132642099362995, + "loss": 0.6351, + "step": 6601 + }, + { + "epoch": 0.88, + "grad_norm": 0.5703125, + "learning_rate": 0.00019132167657010627, + "loss": 0.3269, + "step": 6602 + }, + { + "epoch": 0.88, + "grad_norm": 0.78125, + "learning_rate": 0.00019131693090820208, + "loss": 0.5519, + "step": 6603 + }, + { + "epoch": 0.88, + "grad_norm": 0.796875, + "learning_rate": 0.00019131218400798166, + "loss": 0.629, + "step": 6604 + }, + { + "epoch": 0.88, + "grad_norm": 0.423828125, + "learning_rate": 0.00019130743586950945, + "loss": 0.4072, + "step": 6605 + }, + { + "epoch": 0.88, + "grad_norm": 1.0234375, + "learning_rate": 0.0001913026864928498, + "loss": 0.511, + "step": 6606 + }, + { + "epoch": 0.88, + "grad_norm": 0.578125, + "learning_rate": 0.00019129793587806715, + "loss": 0.52, + "step": 6607 + }, + { + "epoch": 0.88, + "grad_norm": 0.40234375, + "learning_rate": 0.0001912931840252259, + "loss": 0.3775, + "step": 6608 + }, + { + "epoch": 0.88, + "grad_norm": 0.90234375, + "learning_rate": 0.00019128843093439043, + "loss": 0.2976, + "step": 6609 + }, + { + "epoch": 0.88, + "grad_norm": 0.41015625, + "learning_rate": 0.00019128367660562528, + "loss": 0.284, + "step": 6610 + }, + { + "epoch": 0.88, + "grad_norm": 0.5859375, + "learning_rate": 0.00019127892103899488, + "loss": 0.6108, + "step": 6611 + }, + { + "epoch": 0.88, + "grad_norm": 0.58984375, + "learning_rate": 0.00019127416423456376, + "loss": 0.483, + "step": 6612 + }, + { + "epoch": 0.88, + "grad_norm": 0.63671875, + "learning_rate": 0.00019126940619239636, + "loss": 0.4452, + "step": 6613 + }, + { + "epoch": 0.88, + "grad_norm": 0.75390625, + "learning_rate": 0.00019126464691255723, + "loss": 0.351, + "step": 6614 + }, + { + "epoch": 0.88, + "grad_norm": 0.4375, + "learning_rate": 0.00019125988639511095, + "loss": 0.3215, + "step": 6615 + }, + { + "epoch": 0.88, + "grad_norm": 0.40234375, + "learning_rate": 0.00019125512464012202, + "loss": 0.382, + "step": 6616 + }, + { + "epoch": 0.88, + "grad_norm": 0.93359375, + "learning_rate": 0.000191250361647655, + "loss": 0.3565, + "step": 6617 + }, + { + "epoch": 0.88, + "grad_norm": 0.63671875, + "learning_rate": 0.00019124559741777458, + "loss": 0.5279, + "step": 6618 + }, + { + "epoch": 0.88, + "grad_norm": 0.578125, + "learning_rate": 0.00019124083195054526, + "loss": 0.4853, + "step": 6619 + }, + { + "epoch": 0.88, + "grad_norm": 0.4921875, + "learning_rate": 0.00019123606524603168, + "loss": 0.3237, + "step": 6620 + }, + { + "epoch": 0.88, + "grad_norm": 0.57421875, + "learning_rate": 0.00019123129730429852, + "loss": 0.5958, + "step": 6621 + }, + { + "epoch": 0.88, + "grad_norm": 0.47265625, + "learning_rate": 0.00019122652812541042, + "loss": 0.347, + "step": 6622 + }, + { + "epoch": 0.88, + "grad_norm": 0.6484375, + "learning_rate": 0.00019122175770943205, + "loss": 0.367, + "step": 6623 + }, + { + "epoch": 0.88, + "grad_norm": 1.1796875, + "learning_rate": 0.00019121698605642807, + "loss": 0.3315, + "step": 6624 + }, + { + "epoch": 0.88, + "grad_norm": 0.578125, + "learning_rate": 0.00019121221316646328, + "loss": 0.3099, + "step": 6625 + }, + { + "epoch": 0.88, + "grad_norm": 0.515625, + "learning_rate": 0.0001912074390396023, + "loss": 0.6996, + "step": 6626 + }, + { + "epoch": 0.88, + "grad_norm": 0.59765625, + "learning_rate": 0.0001912026636759099, + "loss": 0.6409, + "step": 6627 + }, + { + "epoch": 0.88, + "grad_norm": 0.7109375, + "learning_rate": 0.00019119788707545087, + "loss": 0.3372, + "step": 6628 + }, + { + "epoch": 0.88, + "grad_norm": 0.671875, + "learning_rate": 0.00019119310923828997, + "loss": 0.2676, + "step": 6629 + }, + { + "epoch": 0.88, + "grad_norm": 0.6640625, + "learning_rate": 0.00019118833016449192, + "loss": 0.3032, + "step": 6630 + }, + { + "epoch": 0.88, + "grad_norm": 0.8671875, + "learning_rate": 0.00019118354985412167, + "loss": 0.3389, + "step": 6631 + }, + { + "epoch": 0.88, + "grad_norm": 0.56640625, + "learning_rate": 0.0001911787683072439, + "loss": 0.3267, + "step": 6632 + }, + { + "epoch": 0.89, + "grad_norm": 0.515625, + "learning_rate": 0.00019117398552392356, + "loss": 0.4041, + "step": 6633 + }, + { + "epoch": 0.89, + "grad_norm": 0.65625, + "learning_rate": 0.00019116920150422543, + "loss": 0.7211, + "step": 6634 + }, + { + "epoch": 0.89, + "grad_norm": 0.765625, + "learning_rate": 0.0001911644162482144, + "loss": 0.4416, + "step": 6635 + }, + { + "epoch": 0.89, + "grad_norm": 0.5078125, + "learning_rate": 0.00019115962975595544, + "loss": 0.2655, + "step": 6636 + }, + { + "epoch": 0.89, + "grad_norm": 0.41015625, + "learning_rate": 0.00019115484202751338, + "loss": 0.4158, + "step": 6637 + }, + { + "epoch": 0.89, + "grad_norm": 0.88671875, + "learning_rate": 0.0001911500530629531, + "loss": 0.3839, + "step": 6638 + }, + { + "epoch": 0.89, + "grad_norm": 0.8046875, + "learning_rate": 0.00019114526286233966, + "loss": 0.3243, + "step": 6639 + }, + { + "epoch": 0.89, + "grad_norm": 0.5625, + "learning_rate": 0.00019114047142573793, + "loss": 0.4176, + "step": 6640 + }, + { + "epoch": 0.89, + "grad_norm": 0.65234375, + "learning_rate": 0.00019113567875321292, + "loss": 0.4008, + "step": 6641 + }, + { + "epoch": 0.89, + "grad_norm": 0.4921875, + "learning_rate": 0.00019113088484482962, + "loss": 0.3288, + "step": 6642 + }, + { + "epoch": 0.89, + "grad_norm": 0.5390625, + "learning_rate": 0.00019112608970065303, + "loss": 0.4823, + "step": 6643 + }, + { + "epoch": 0.89, + "grad_norm": 0.55859375, + "learning_rate": 0.00019112129332074819, + "loss": 0.4956, + "step": 6644 + }, + { + "epoch": 0.89, + "grad_norm": 0.447265625, + "learning_rate": 0.0001911164957051801, + "loss": 0.3155, + "step": 6645 + }, + { + "epoch": 0.89, + "grad_norm": 0.625, + "learning_rate": 0.00019111169685401386, + "loss": 0.4512, + "step": 6646 + }, + { + "epoch": 0.89, + "grad_norm": 0.73046875, + "learning_rate": 0.00019110689676731454, + "loss": 0.5677, + "step": 6647 + }, + { + "epoch": 0.89, + "grad_norm": 0.6328125, + "learning_rate": 0.00019110209544514724, + "loss": 0.3234, + "step": 6648 + }, + { + "epoch": 0.89, + "grad_norm": 0.8671875, + "learning_rate": 0.00019109729288757708, + "loss": 0.3655, + "step": 6649 + }, + { + "epoch": 0.89, + "grad_norm": 0.53515625, + "learning_rate": 0.00019109248909466914, + "loss": 0.3295, + "step": 6650 + }, + { + "epoch": 0.89, + "grad_norm": 0.55859375, + "learning_rate": 0.00019108768406648856, + "loss": 0.3996, + "step": 6651 + }, + { + "epoch": 0.89, + "grad_norm": 0.828125, + "learning_rate": 0.00019108287780310055, + "loss": 0.3218, + "step": 6652 + }, + { + "epoch": 0.89, + "grad_norm": 0.4375, + "learning_rate": 0.00019107807030457026, + "loss": 0.5939, + "step": 6653 + }, + { + "epoch": 0.89, + "grad_norm": 0.5, + "learning_rate": 0.00019107326157096285, + "loss": 0.3382, + "step": 6654 + }, + { + "epoch": 0.89, + "grad_norm": 0.953125, + "learning_rate": 0.0001910684516023436, + "loss": 0.4215, + "step": 6655 + }, + { + "epoch": 0.89, + "grad_norm": 0.65234375, + "learning_rate": 0.00019106364039877768, + "loss": 0.5859, + "step": 6656 + }, + { + "epoch": 0.89, + "grad_norm": 0.71484375, + "learning_rate": 0.00019105882796033037, + "loss": 0.3683, + "step": 6657 + }, + { + "epoch": 0.89, + "grad_norm": 0.6015625, + "learning_rate": 0.0001910540142870669, + "loss": 0.3893, + "step": 6658 + }, + { + "epoch": 0.89, + "grad_norm": 1.140625, + "learning_rate": 0.00019104919937905254, + "loss": 0.5421, + "step": 6659 + }, + { + "epoch": 0.89, + "grad_norm": 0.59765625, + "learning_rate": 0.00019104438323635264, + "loss": 0.5898, + "step": 6660 + }, + { + "epoch": 0.89, + "grad_norm": 0.435546875, + "learning_rate": 0.00019103956585903245, + "loss": 0.5883, + "step": 6661 + }, + { + "epoch": 0.89, + "grad_norm": 0.5234375, + "learning_rate": 0.00019103474724715733, + "loss": 0.5735, + "step": 6662 + }, + { + "epoch": 0.89, + "grad_norm": 0.546875, + "learning_rate": 0.00019102992740079258, + "loss": 0.3893, + "step": 6663 + }, + { + "epoch": 0.89, + "grad_norm": 0.466796875, + "learning_rate": 0.00019102510632000363, + "loss": 0.4888, + "step": 6664 + }, + { + "epoch": 0.89, + "grad_norm": 0.46484375, + "learning_rate": 0.00019102028400485582, + "loss": 0.5336, + "step": 6665 + }, + { + "epoch": 0.89, + "grad_norm": 0.4375, + "learning_rate": 0.00019101546045541452, + "loss": 0.5543, + "step": 6666 + }, + { + "epoch": 0.89, + "grad_norm": 0.73046875, + "learning_rate": 0.00019101063567174514, + "loss": 0.5359, + "step": 6667 + }, + { + "epoch": 0.89, + "grad_norm": 0.77734375, + "learning_rate": 0.00019100580965391317, + "loss": 0.6139, + "step": 6668 + }, + { + "epoch": 0.89, + "grad_norm": 0.482421875, + "learning_rate": 0.00019100098240198402, + "loss": 0.3797, + "step": 6669 + }, + { + "epoch": 0.89, + "grad_norm": 0.47265625, + "learning_rate": 0.00019099615391602313, + "loss": 0.4175, + "step": 6670 + }, + { + "epoch": 0.89, + "grad_norm": 0.80078125, + "learning_rate": 0.00019099132419609597, + "loss": 0.667, + "step": 6671 + }, + { + "epoch": 0.89, + "grad_norm": 0.77734375, + "learning_rate": 0.00019098649324226808, + "loss": 0.5803, + "step": 6672 + }, + { + "epoch": 0.89, + "grad_norm": 0.59375, + "learning_rate": 0.00019098166105460494, + "loss": 0.4357, + "step": 6673 + }, + { + "epoch": 0.89, + "grad_norm": 0.55859375, + "learning_rate": 0.00019097682763317207, + "loss": 0.4578, + "step": 6674 + }, + { + "epoch": 0.89, + "grad_norm": 0.318359375, + "learning_rate": 0.00019097199297803507, + "loss": 0.1895, + "step": 6675 + }, + { + "epoch": 0.89, + "grad_norm": 0.56640625, + "learning_rate": 0.00019096715708925946, + "loss": 0.4974, + "step": 6676 + }, + { + "epoch": 0.89, + "grad_norm": 0.6484375, + "learning_rate": 0.00019096231996691077, + "loss": 0.499, + "step": 6677 + }, + { + "epoch": 0.89, + "grad_norm": 0.5, + "learning_rate": 0.00019095748161105467, + "loss": 0.2657, + "step": 6678 + }, + { + "epoch": 0.89, + "grad_norm": 0.56640625, + "learning_rate": 0.00019095264202175673, + "loss": 0.5755, + "step": 6679 + }, + { + "epoch": 0.89, + "grad_norm": 0.68359375, + "learning_rate": 0.0001909478011990826, + "loss": 0.4774, + "step": 6680 + }, + { + "epoch": 0.89, + "grad_norm": 0.310546875, + "learning_rate": 0.00019094295914309791, + "loss": 0.1531, + "step": 6681 + }, + { + "epoch": 0.89, + "grad_norm": 0.859375, + "learning_rate": 0.00019093811585386834, + "loss": 0.3337, + "step": 6682 + }, + { + "epoch": 0.89, + "grad_norm": 0.79296875, + "learning_rate": 0.00019093327133145956, + "loss": 0.2037, + "step": 6683 + }, + { + "epoch": 0.89, + "grad_norm": 0.625, + "learning_rate": 0.00019092842557593724, + "loss": 0.2637, + "step": 6684 + }, + { + "epoch": 0.89, + "grad_norm": 0.984375, + "learning_rate": 0.0001909235785873671, + "loss": 0.4132, + "step": 6685 + }, + { + "epoch": 0.89, + "grad_norm": 0.5, + "learning_rate": 0.0001909187303658149, + "loss": 0.742, + "step": 6686 + }, + { + "epoch": 0.89, + "grad_norm": 0.72265625, + "learning_rate": 0.00019091388091134638, + "loss": 0.751, + "step": 6687 + }, + { + "epoch": 0.89, + "grad_norm": 0.5390625, + "learning_rate": 0.00019090903022402729, + "loss": 0.511, + "step": 6688 + }, + { + "epoch": 0.89, + "grad_norm": 0.390625, + "learning_rate": 0.00019090417830392337, + "loss": 0.1811, + "step": 6689 + }, + { + "epoch": 0.89, + "grad_norm": 0.6015625, + "learning_rate": 0.00019089932515110048, + "loss": 0.6415, + "step": 6690 + }, + { + "epoch": 0.89, + "grad_norm": 0.80859375, + "learning_rate": 0.0001908944707656244, + "loss": 0.4552, + "step": 6691 + }, + { + "epoch": 0.89, + "grad_norm": 0.474609375, + "learning_rate": 0.00019088961514756092, + "loss": 0.5817, + "step": 6692 + }, + { + "epoch": 0.89, + "grad_norm": 0.388671875, + "learning_rate": 0.00019088475829697596, + "loss": 0.4388, + "step": 6693 + }, + { + "epoch": 0.89, + "grad_norm": 0.76953125, + "learning_rate": 0.00019087990021393532, + "loss": 0.4855, + "step": 6694 + }, + { + "epoch": 0.89, + "grad_norm": 0.65625, + "learning_rate": 0.00019087504089850495, + "loss": 0.4944, + "step": 6695 + }, + { + "epoch": 0.89, + "grad_norm": 0.416015625, + "learning_rate": 0.00019087018035075068, + "loss": 0.257, + "step": 6696 + }, + { + "epoch": 0.89, + "grad_norm": 0.5234375, + "learning_rate": 0.00019086531857073847, + "loss": 0.5935, + "step": 6697 + }, + { + "epoch": 0.89, + "grad_norm": 0.33984375, + "learning_rate": 0.00019086045555853417, + "loss": 0.1217, + "step": 6698 + }, + { + "epoch": 0.89, + "grad_norm": 0.5390625, + "learning_rate": 0.00019085559131420382, + "loss": 0.3436, + "step": 6699 + }, + { + "epoch": 0.89, + "grad_norm": 0.640625, + "learning_rate": 0.00019085072583781333, + "loss": 0.3147, + "step": 6700 + }, + { + "epoch": 0.89, + "grad_norm": 0.625, + "learning_rate": 0.00019084585912942868, + "loss": 0.2553, + "step": 6701 + }, + { + "epoch": 0.89, + "grad_norm": 0.62890625, + "learning_rate": 0.0001908409911891159, + "loss": 0.5927, + "step": 6702 + }, + { + "epoch": 0.89, + "grad_norm": 0.67578125, + "learning_rate": 0.00019083612201694096, + "loss": 0.4683, + "step": 6703 + }, + { + "epoch": 0.89, + "grad_norm": 0.90234375, + "learning_rate": 0.0001908312516129699, + "loss": 0.5087, + "step": 6704 + }, + { + "epoch": 0.89, + "grad_norm": 0.51171875, + "learning_rate": 0.0001908263799772688, + "loss": 0.4014, + "step": 6705 + }, + { + "epoch": 0.89, + "grad_norm": 0.6171875, + "learning_rate": 0.00019082150710990366, + "loss": 0.5066, + "step": 6706 + }, + { + "epoch": 0.89, + "grad_norm": 0.546875, + "learning_rate": 0.00019081663301094063, + "loss": 0.4747, + "step": 6707 + }, + { + "epoch": 0.9, + "grad_norm": 0.69140625, + "learning_rate": 0.00019081175768044574, + "loss": 0.4281, + "step": 6708 + }, + { + "epoch": 0.9, + "grad_norm": 1.015625, + "learning_rate": 0.00019080688111848518, + "loss": 0.4747, + "step": 6709 + }, + { + "epoch": 0.9, + "grad_norm": 0.6796875, + "learning_rate": 0.00019080200332512499, + "loss": 0.3513, + "step": 6710 + }, + { + "epoch": 0.9, + "grad_norm": 0.55859375, + "learning_rate": 0.00019079712430043134, + "loss": 0.6497, + "step": 6711 + }, + { + "epoch": 0.9, + "grad_norm": 0.455078125, + "learning_rate": 0.00019079224404447046, + "loss": 0.3469, + "step": 6712 + }, + { + "epoch": 0.9, + "grad_norm": 0.62109375, + "learning_rate": 0.00019078736255730846, + "loss": 0.5533, + "step": 6713 + }, + { + "epoch": 0.9, + "grad_norm": 0.46484375, + "learning_rate": 0.00019078247983901156, + "loss": 0.4068, + "step": 6714 + }, + { + "epoch": 0.9, + "grad_norm": 0.59375, + "learning_rate": 0.00019077759588964597, + "loss": 0.4647, + "step": 6715 + }, + { + "epoch": 0.9, + "grad_norm": 0.5078125, + "learning_rate": 0.0001907727107092779, + "loss": 0.4555, + "step": 6716 + }, + { + "epoch": 0.9, + "grad_norm": 0.6171875, + "learning_rate": 0.00019076782429797365, + "loss": 0.4569, + "step": 6717 + }, + { + "epoch": 0.9, + "grad_norm": 0.703125, + "learning_rate": 0.00019076293665579942, + "loss": 0.5927, + "step": 6718 + }, + { + "epoch": 0.9, + "grad_norm": 0.69140625, + "learning_rate": 0.00019075804778282156, + "loss": 0.3508, + "step": 6719 + }, + { + "epoch": 0.9, + "grad_norm": 0.59375, + "learning_rate": 0.00019075315767910626, + "loss": 0.5377, + "step": 6720 + }, + { + "epoch": 0.9, + "grad_norm": 0.59765625, + "learning_rate": 0.00019074826634471993, + "loss": 0.6683, + "step": 6721 + }, + { + "epoch": 0.9, + "grad_norm": 0.82421875, + "learning_rate": 0.00019074337377972887, + "loss": 0.4684, + "step": 6722 + }, + { + "epoch": 0.9, + "grad_norm": 0.625, + "learning_rate": 0.00019073847998419942, + "loss": 0.532, + "step": 6723 + }, + { + "epoch": 0.9, + "grad_norm": 0.59765625, + "learning_rate": 0.00019073358495819796, + "loss": 0.4651, + "step": 6724 + }, + { + "epoch": 0.9, + "grad_norm": 0.39453125, + "learning_rate": 0.00019072868870179084, + "loss": 0.3123, + "step": 6725 + }, + { + "epoch": 0.9, + "grad_norm": 0.53515625, + "learning_rate": 0.00019072379121504447, + "loss": 0.5326, + "step": 6726 + }, + { + "epoch": 0.9, + "grad_norm": 0.765625, + "learning_rate": 0.00019071889249802526, + "loss": 0.7932, + "step": 6727 + }, + { + "epoch": 0.9, + "grad_norm": 0.62890625, + "learning_rate": 0.00019071399255079965, + "loss": 0.5385, + "step": 6728 + }, + { + "epoch": 0.9, + "grad_norm": 0.60546875, + "learning_rate": 0.00019070909137343408, + "loss": 0.3224, + "step": 6729 + }, + { + "epoch": 0.9, + "grad_norm": 0.419921875, + "learning_rate": 0.00019070418896599501, + "loss": 0.2796, + "step": 6730 + }, + { + "epoch": 0.9, + "grad_norm": 0.640625, + "learning_rate": 0.00019069928532854894, + "loss": 0.3894, + "step": 6731 + }, + { + "epoch": 0.9, + "grad_norm": 0.84375, + "learning_rate": 0.00019069438046116232, + "loss": 0.4884, + "step": 6732 + }, + { + "epoch": 0.9, + "grad_norm": 0.59765625, + "learning_rate": 0.00019068947436390175, + "loss": 0.2895, + "step": 6733 + }, + { + "epoch": 0.9, + "grad_norm": 0.6171875, + "learning_rate": 0.00019068456703683366, + "loss": 0.1956, + "step": 6734 + }, + { + "epoch": 0.9, + "grad_norm": 0.60546875, + "learning_rate": 0.00019067965848002465, + "loss": 0.5445, + "step": 6735 + }, + { + "epoch": 0.9, + "grad_norm": 0.70703125, + "learning_rate": 0.00019067474869354127, + "loss": 0.3655, + "step": 6736 + }, + { + "epoch": 0.9, + "grad_norm": 0.671875, + "learning_rate": 0.0001906698376774501, + "loss": 0.2035, + "step": 6737 + }, + { + "epoch": 0.9, + "grad_norm": 0.5390625, + "learning_rate": 0.00019066492543181774, + "loss": 0.228, + "step": 6738 + }, + { + "epoch": 0.9, + "grad_norm": 0.52734375, + "learning_rate": 0.00019066001195671082, + "loss": 0.3752, + "step": 6739 + }, + { + "epoch": 0.9, + "grad_norm": 0.640625, + "learning_rate": 0.00019065509725219593, + "loss": 0.2802, + "step": 6740 + }, + { + "epoch": 0.9, + "grad_norm": 0.76171875, + "learning_rate": 0.00019065018131833976, + "loss": 0.488, + "step": 6741 + }, + { + "epoch": 0.9, + "grad_norm": 1.0078125, + "learning_rate": 0.00019064526415520894, + "loss": 0.7094, + "step": 6742 + }, + { + "epoch": 0.9, + "grad_norm": 0.83984375, + "learning_rate": 0.00019064034576287016, + "loss": 0.4863, + "step": 6743 + }, + { + "epoch": 0.9, + "grad_norm": 0.482421875, + "learning_rate": 0.0001906354261413901, + "loss": 0.4608, + "step": 6744 + }, + { + "epoch": 0.9, + "grad_norm": 0.6640625, + "learning_rate": 0.00019063050529083552, + "loss": 0.1601, + "step": 6745 + }, + { + "epoch": 0.9, + "grad_norm": 1.0, + "learning_rate": 0.00019062558321127312, + "loss": 0.3313, + "step": 6746 + }, + { + "epoch": 0.9, + "grad_norm": 1.1640625, + "learning_rate": 0.00019062065990276965, + "loss": 0.3106, + "step": 6747 + }, + { + "epoch": 0.9, + "grad_norm": 0.50390625, + "learning_rate": 0.00019061573536539186, + "loss": 0.509, + "step": 6748 + }, + { + "epoch": 0.9, + "grad_norm": 0.58203125, + "learning_rate": 0.00019061080959920657, + "loss": 0.5358, + "step": 6749 + }, + { + "epoch": 0.9, + "grad_norm": 0.4375, + "learning_rate": 0.0001906058826042805, + "loss": 0.5045, + "step": 6750 + }, + { + "epoch": 0.9, + "grad_norm": 0.71875, + "learning_rate": 0.00019060095438068055, + "loss": 0.5013, + "step": 6751 + }, + { + "epoch": 0.9, + "grad_norm": 0.64453125, + "learning_rate": 0.0001905960249284735, + "loss": 0.6067, + "step": 6752 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 0.0001905910942477262, + "loss": 0.4211, + "step": 6753 + }, + { + "epoch": 0.9, + "grad_norm": 0.55859375, + "learning_rate": 0.00019058616233850553, + "loss": 0.5882, + "step": 6754 + }, + { + "epoch": 0.9, + "grad_norm": 0.8359375, + "learning_rate": 0.00019058122920087838, + "loss": 0.245, + "step": 6755 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 0.00019057629483491158, + "loss": 0.7101, + "step": 6756 + }, + { + "epoch": 0.9, + "grad_norm": 0.65625, + "learning_rate": 0.00019057135924067212, + "loss": 0.5007, + "step": 6757 + }, + { + "epoch": 0.9, + "grad_norm": 0.60546875, + "learning_rate": 0.00019056642241822692, + "loss": 0.5979, + "step": 6758 + }, + { + "epoch": 0.9, + "grad_norm": 0.73828125, + "learning_rate": 0.00019056148436764287, + "loss": 0.6402, + "step": 6759 + }, + { + "epoch": 0.9, + "grad_norm": 0.6015625, + "learning_rate": 0.000190556545088987, + "loss": 0.4894, + "step": 6760 + }, + { + "epoch": 0.9, + "grad_norm": 0.53125, + "learning_rate": 0.00019055160458232625, + "loss": 0.3522, + "step": 6761 + }, + { + "epoch": 0.9, + "grad_norm": 0.486328125, + "learning_rate": 0.00019054666284772762, + "loss": 0.4594, + "step": 6762 + }, + { + "epoch": 0.9, + "grad_norm": 0.65625, + "learning_rate": 0.0001905417198852581, + "loss": 0.3729, + "step": 6763 + }, + { + "epoch": 0.9, + "grad_norm": 0.890625, + "learning_rate": 0.0001905367756949848, + "loss": 0.4557, + "step": 6764 + }, + { + "epoch": 0.9, + "grad_norm": 0.63671875, + "learning_rate": 0.0001905318302769747, + "loss": 0.4849, + "step": 6765 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 0.00019052688363129487, + "loss": 0.4442, + "step": 6766 + }, + { + "epoch": 0.9, + "grad_norm": 0.3515625, + "learning_rate": 0.0001905219357580124, + "loss": 0.2618, + "step": 6767 + }, + { + "epoch": 0.9, + "grad_norm": 0.609375, + "learning_rate": 0.0001905169866571944, + "loss": 0.5353, + "step": 6768 + }, + { + "epoch": 0.9, + "grad_norm": 0.7421875, + "learning_rate": 0.00019051203632890795, + "loss": 0.625, + "step": 6769 + }, + { + "epoch": 0.9, + "grad_norm": 0.63671875, + "learning_rate": 0.00019050708477322018, + "loss": 0.5235, + "step": 6770 + }, + { + "epoch": 0.9, + "grad_norm": 0.50390625, + "learning_rate": 0.00019050213199019828, + "loss": 0.5997, + "step": 6771 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 0.00019049717797990938, + "loss": 0.4716, + "step": 6772 + }, + { + "epoch": 0.9, + "grad_norm": 0.765625, + "learning_rate": 0.00019049222274242068, + "loss": 0.3883, + "step": 6773 + }, + { + "epoch": 0.9, + "grad_norm": 0.6171875, + "learning_rate": 0.00019048726627779932, + "loss": 0.3686, + "step": 6774 + }, + { + "epoch": 0.9, + "grad_norm": 0.66796875, + "learning_rate": 0.0001904823085861126, + "loss": 0.4525, + "step": 6775 + }, + { + "epoch": 0.9, + "grad_norm": 0.55859375, + "learning_rate": 0.0001904773496674277, + "loss": 0.4657, + "step": 6776 + }, + { + "epoch": 0.9, + "grad_norm": 0.56640625, + "learning_rate": 0.00019047238952181185, + "loss": 0.6861, + "step": 6777 + }, + { + "epoch": 0.9, + "grad_norm": 0.72265625, + "learning_rate": 0.00019046742814933233, + "loss": 0.3776, + "step": 6778 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 0.00019046246555005645, + "loss": 0.4084, + "step": 6779 + }, + { + "epoch": 0.9, + "grad_norm": 0.8515625, + "learning_rate": 0.00019045750172405146, + "loss": 0.2746, + "step": 6780 + }, + { + "epoch": 0.9, + "grad_norm": 0.8984375, + "learning_rate": 0.00019045253667138469, + "loss": 0.6018, + "step": 6781 + }, + { + "epoch": 0.9, + "grad_norm": 0.62109375, + "learning_rate": 0.00019044757039212348, + "loss": 0.4588, + "step": 6782 + }, + { + "epoch": 0.91, + "grad_norm": 0.6796875, + "learning_rate": 0.00019044260288633515, + "loss": 0.3136, + "step": 6783 + }, + { + "epoch": 0.91, + "grad_norm": 0.9453125, + "learning_rate": 0.00019043763415408708, + "loss": 0.3555, + "step": 6784 + }, + { + "epoch": 0.91, + "grad_norm": 0.609375, + "learning_rate": 0.00019043266419544667, + "loss": 0.5915, + "step": 6785 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 0.0001904276930104813, + "loss": 0.308, + "step": 6786 + }, + { + "epoch": 0.91, + "grad_norm": 0.6953125, + "learning_rate": 0.00019042272059925836, + "loss": 0.3816, + "step": 6787 + }, + { + "epoch": 0.91, + "grad_norm": 0.69921875, + "learning_rate": 0.00019041774696184529, + "loss": 0.5526, + "step": 6788 + }, + { + "epoch": 0.91, + "grad_norm": 0.640625, + "learning_rate": 0.00019041277209830954, + "loss": 0.3412, + "step": 6789 + }, + { + "epoch": 0.91, + "grad_norm": 0.8046875, + "learning_rate": 0.0001904077960087186, + "loss": 0.3318, + "step": 6790 + }, + { + "epoch": 0.91, + "grad_norm": 0.63671875, + "learning_rate": 0.00019040281869313986, + "loss": 0.3178, + "step": 6791 + }, + { + "epoch": 0.91, + "grad_norm": 0.57421875, + "learning_rate": 0.00019039784015164094, + "loss": 0.3975, + "step": 6792 + }, + { + "epoch": 0.91, + "grad_norm": 0.64453125, + "learning_rate": 0.00019039286038428926, + "loss": 0.5167, + "step": 6793 + }, + { + "epoch": 0.91, + "grad_norm": 1.0703125, + "learning_rate": 0.0001903878793911524, + "loss": 0.3234, + "step": 6794 + }, + { + "epoch": 0.91, + "grad_norm": 0.74609375, + "learning_rate": 0.0001903828971722979, + "loss": 0.4798, + "step": 6795 + }, + { + "epoch": 0.91, + "grad_norm": 0.83203125, + "learning_rate": 0.00019037791372779328, + "loss": 0.3322, + "step": 6796 + }, + { + "epoch": 0.91, + "grad_norm": 0.6171875, + "learning_rate": 0.00019037292905770613, + "loss": 0.3602, + "step": 6797 + }, + { + "epoch": 0.91, + "grad_norm": 0.53125, + "learning_rate": 0.00019036794316210412, + "loss": 0.357, + "step": 6798 + }, + { + "epoch": 0.91, + "grad_norm": 0.484375, + "learning_rate": 0.00019036295604105474, + "loss": 0.3725, + "step": 6799 + }, + { + "epoch": 0.91, + "grad_norm": 0.69140625, + "learning_rate": 0.0001903579676946257, + "loss": 0.3125, + "step": 6800 + }, + { + "epoch": 0.91, + "grad_norm": 0.59375, + "learning_rate": 0.00019035297812288463, + "loss": 0.8761, + "step": 6801 + }, + { + "epoch": 0.91, + "grad_norm": 0.78515625, + "learning_rate": 0.0001903479873258992, + "loss": 0.4895, + "step": 6802 + }, + { + "epoch": 0.91, + "grad_norm": 0.6796875, + "learning_rate": 0.00019034299530373708, + "loss": 0.3815, + "step": 6803 + }, + { + "epoch": 0.91, + "grad_norm": 0.7578125, + "learning_rate": 0.0001903380020564659, + "loss": 0.5594, + "step": 6804 + }, + { + "epoch": 0.91, + "grad_norm": 0.54296875, + "learning_rate": 0.00019033300758415354, + "loss": 0.34, + "step": 6805 + }, + { + "epoch": 0.91, + "grad_norm": 0.53125, + "learning_rate": 0.00019032801188686755, + "loss": 0.3391, + "step": 6806 + }, + { + "epoch": 0.91, + "grad_norm": 0.8828125, + "learning_rate": 0.00019032301496467574, + "loss": 0.8563, + "step": 6807 + }, + { + "epoch": 0.91, + "grad_norm": 0.57421875, + "learning_rate": 0.0001903180168176459, + "loss": 0.5008, + "step": 6808 + }, + { + "epoch": 0.91, + "grad_norm": 0.71484375, + "learning_rate": 0.0001903130174458458, + "loss": 0.9702, + "step": 6809 + }, + { + "epoch": 0.91, + "grad_norm": 0.66015625, + "learning_rate": 0.00019030801684934322, + "loss": 0.4592, + "step": 6810 + }, + { + "epoch": 0.91, + "grad_norm": 0.59375, + "learning_rate": 0.00019030301502820596, + "loss": 0.394, + "step": 6811 + }, + { + "epoch": 0.91, + "grad_norm": 0.44140625, + "learning_rate": 0.00019029801198250184, + "loss": 0.3517, + "step": 6812 + }, + { + "epoch": 0.91, + "grad_norm": 0.58984375, + "learning_rate": 0.00019029300771229875, + "loss": 0.3529, + "step": 6813 + }, + { + "epoch": 0.91, + "grad_norm": 0.5078125, + "learning_rate": 0.00019028800221766452, + "loss": 0.56, + "step": 6814 + }, + { + "epoch": 0.91, + "grad_norm": 0.5703125, + "learning_rate": 0.00019028299549866704, + "loss": 0.7581, + "step": 6815 + }, + { + "epoch": 0.91, + "grad_norm": 0.76953125, + "learning_rate": 0.00019027798755537418, + "loss": 0.675, + "step": 6816 + }, + { + "epoch": 0.91, + "grad_norm": 0.828125, + "learning_rate": 0.0001902729783878539, + "loss": 0.467, + "step": 6817 + }, + { + "epoch": 0.91, + "grad_norm": 0.51953125, + "learning_rate": 0.00019026796799617406, + "loss": 0.43, + "step": 6818 + }, + { + "epoch": 0.91, + "grad_norm": 0.462890625, + "learning_rate": 0.00019026295638040265, + "loss": 0.343, + "step": 6819 + }, + { + "epoch": 0.91, + "grad_norm": 0.65625, + "learning_rate": 0.00019025794354060764, + "loss": 0.5912, + "step": 6820 + }, + { + "epoch": 0.91, + "grad_norm": 0.66796875, + "learning_rate": 0.00019025292947685697, + "loss": 0.4045, + "step": 6821 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 0.00019024791418921865, + "loss": 0.3593, + "step": 6822 + }, + { + "epoch": 0.91, + "grad_norm": 0.404296875, + "learning_rate": 0.0001902428976777607, + "loss": 0.263, + "step": 6823 + }, + { + "epoch": 0.91, + "grad_norm": 0.48046875, + "learning_rate": 0.00019023787994255114, + "loss": 0.4152, + "step": 6824 + }, + { + "epoch": 0.91, + "grad_norm": 0.7421875, + "learning_rate": 0.00019023286098365802, + "loss": 0.37, + "step": 6825 + }, + { + "epoch": 0.91, + "grad_norm": 0.466796875, + "learning_rate": 0.0001902278408011494, + "loss": 0.2927, + "step": 6826 + }, + { + "epoch": 0.91, + "grad_norm": 0.7421875, + "learning_rate": 0.00019022281939509332, + "loss": 0.4287, + "step": 6827 + }, + { + "epoch": 0.91, + "grad_norm": 0.546875, + "learning_rate": 0.00019021779676555792, + "loss": 0.5444, + "step": 6828 + }, + { + "epoch": 0.91, + "grad_norm": 0.828125, + "learning_rate": 0.00019021277291261131, + "loss": 0.6189, + "step": 6829 + }, + { + "epoch": 0.91, + "grad_norm": 0.458984375, + "learning_rate": 0.0001902077478363216, + "loss": 0.3221, + "step": 6830 + }, + { + "epoch": 0.91, + "grad_norm": 0.609375, + "learning_rate": 0.0001902027215367569, + "loss": 0.3883, + "step": 6831 + }, + { + "epoch": 0.91, + "grad_norm": 0.67578125, + "learning_rate": 0.00019019769401398544, + "loss": 0.5874, + "step": 6832 + }, + { + "epoch": 0.91, + "grad_norm": 0.75, + "learning_rate": 0.00019019266526807535, + "loss": 0.482, + "step": 6833 + }, + { + "epoch": 0.91, + "grad_norm": 0.56640625, + "learning_rate": 0.00019018763529909483, + "loss": 0.2914, + "step": 6834 + }, + { + "epoch": 0.91, + "grad_norm": 0.62890625, + "learning_rate": 0.0001901826041071121, + "loss": 0.6056, + "step": 6835 + }, + { + "epoch": 0.91, + "grad_norm": 0.51171875, + "learning_rate": 0.00019017757169219538, + "loss": 0.3566, + "step": 6836 + }, + { + "epoch": 0.91, + "grad_norm": 0.8125, + "learning_rate": 0.0001901725380544129, + "loss": 0.3421, + "step": 6837 + }, + { + "epoch": 0.91, + "grad_norm": 0.609375, + "learning_rate": 0.00019016750319383294, + "loss": 0.4668, + "step": 6838 + }, + { + "epoch": 0.91, + "grad_norm": 0.482421875, + "learning_rate": 0.00019016246711052378, + "loss": 0.576, + "step": 6839 + }, + { + "epoch": 0.91, + "grad_norm": 0.5625, + "learning_rate": 0.0001901574298045537, + "loss": 0.278, + "step": 6840 + }, + { + "epoch": 0.91, + "grad_norm": 0.69921875, + "learning_rate": 0.00019015239127599098, + "loss": 0.3465, + "step": 6841 + }, + { + "epoch": 0.91, + "grad_norm": 0.42578125, + "learning_rate": 0.00019014735152490396, + "loss": 0.3874, + "step": 6842 + }, + { + "epoch": 0.91, + "grad_norm": 0.47265625, + "learning_rate": 0.00019014231055136107, + "loss": 0.2908, + "step": 6843 + }, + { + "epoch": 0.91, + "grad_norm": 0.60546875, + "learning_rate": 0.00019013726835543054, + "loss": 0.4209, + "step": 6844 + }, + { + "epoch": 0.91, + "grad_norm": 0.5859375, + "learning_rate": 0.0001901322249371808, + "loss": 0.3786, + "step": 6845 + }, + { + "epoch": 0.91, + "grad_norm": 0.73828125, + "learning_rate": 0.00019012718029668029, + "loss": 0.3124, + "step": 6846 + }, + { + "epoch": 0.91, + "grad_norm": 0.609375, + "learning_rate": 0.0001901221344339973, + "loss": 0.4508, + "step": 6847 + }, + { + "epoch": 0.91, + "grad_norm": 0.6328125, + "learning_rate": 0.0001901170873492004, + "loss": 0.239, + "step": 6848 + }, + { + "epoch": 0.91, + "grad_norm": 0.5625, + "learning_rate": 0.00019011203904235793, + "loss": 0.4528, + "step": 6849 + }, + { + "epoch": 0.91, + "grad_norm": 0.6875, + "learning_rate": 0.00019010698951353838, + "loss": 0.367, + "step": 6850 + }, + { + "epoch": 0.91, + "grad_norm": 0.46484375, + "learning_rate": 0.0001901019387628102, + "loss": 0.2726, + "step": 6851 + }, + { + "epoch": 0.91, + "grad_norm": 0.51953125, + "learning_rate": 0.0001900968867902419, + "loss": 0.322, + "step": 6852 + }, + { + "epoch": 0.91, + "grad_norm": 0.47265625, + "learning_rate": 0.00019009183359590203, + "loss": 0.4554, + "step": 6853 + }, + { + "epoch": 0.91, + "grad_norm": 0.51171875, + "learning_rate": 0.00019008677917985907, + "loss": 0.5097, + "step": 6854 + }, + { + "epoch": 0.91, + "grad_norm": 0.6171875, + "learning_rate": 0.00019008172354218156, + "loss": 0.1687, + "step": 6855 + }, + { + "epoch": 0.91, + "grad_norm": 0.68359375, + "learning_rate": 0.00019007666668293804, + "loss": 0.4099, + "step": 6856 + }, + { + "epoch": 0.91, + "grad_norm": 0.51171875, + "learning_rate": 0.00019007160860219714, + "loss": 0.4561, + "step": 6857 + }, + { + "epoch": 0.92, + "grad_norm": 0.59765625, + "learning_rate": 0.0001900665493000274, + "loss": 0.8851, + "step": 6858 + }, + { + "epoch": 0.92, + "grad_norm": 0.458984375, + "learning_rate": 0.00019006148877649746, + "loss": 0.1858, + "step": 6859 + }, + { + "epoch": 0.92, + "grad_norm": 0.67578125, + "learning_rate": 0.0001900564270316759, + "loss": 0.4316, + "step": 6860 + }, + { + "epoch": 0.92, + "grad_norm": 0.66015625, + "learning_rate": 0.00019005136406563146, + "loss": 0.6434, + "step": 6861 + }, + { + "epoch": 0.92, + "grad_norm": 0.5, + "learning_rate": 0.00019004629987843266, + "loss": 0.5276, + "step": 6862 + }, + { + "epoch": 0.92, + "grad_norm": 0.7734375, + "learning_rate": 0.00019004123447014827, + "loss": 0.4286, + "step": 6863 + }, + { + "epoch": 0.92, + "grad_norm": 0.82421875, + "learning_rate": 0.00019003616784084694, + "loss": 0.5913, + "step": 6864 + }, + { + "epoch": 0.92, + "grad_norm": 0.6484375, + "learning_rate": 0.0001900310999905974, + "loss": 0.4236, + "step": 6865 + }, + { + "epoch": 0.92, + "grad_norm": 0.5, + "learning_rate": 0.00019002603091946836, + "loss": 0.357, + "step": 6866 + }, + { + "epoch": 0.92, + "grad_norm": 0.828125, + "learning_rate": 0.0001900209606275286, + "loss": 0.388, + "step": 6867 + }, + { + "epoch": 0.92, + "grad_norm": 0.60546875, + "learning_rate": 0.00019001588911484677, + "loss": 0.5156, + "step": 6868 + }, + { + "epoch": 0.92, + "grad_norm": 0.421875, + "learning_rate": 0.00019001081638149176, + "loss": 0.1912, + "step": 6869 + }, + { + "epoch": 0.92, + "grad_norm": 0.46484375, + "learning_rate": 0.00019000574242753227, + "loss": 0.4184, + "step": 6870 + }, + { + "epoch": 0.92, + "grad_norm": 0.68359375, + "learning_rate": 0.00019000066725303718, + "loss": 0.6285, + "step": 6871 + }, + { + "epoch": 0.92, + "grad_norm": 0.859375, + "learning_rate": 0.0001899955908580753, + "loss": 0.6168, + "step": 6872 + }, + { + "epoch": 0.92, + "grad_norm": 0.5078125, + "learning_rate": 0.0001899905132427154, + "loss": 0.7046, + "step": 6873 + }, + { + "epoch": 0.92, + "grad_norm": 0.55078125, + "learning_rate": 0.00018998543440702643, + "loss": 0.5739, + "step": 6874 + }, + { + "epoch": 0.92, + "grad_norm": 0.59765625, + "learning_rate": 0.0001899803543510772, + "loss": 0.246, + "step": 6875 + }, + { + "epoch": 0.92, + "grad_norm": 0.88671875, + "learning_rate": 0.0001899752730749366, + "loss": 0.7111, + "step": 6876 + }, + { + "epoch": 0.92, + "grad_norm": 0.63671875, + "learning_rate": 0.00018997019057867356, + "loss": 0.4791, + "step": 6877 + }, + { + "epoch": 0.92, + "grad_norm": 0.74609375, + "learning_rate": 0.000189965106862357, + "loss": 0.5753, + "step": 6878 + }, + { + "epoch": 0.92, + "grad_norm": 0.65625, + "learning_rate": 0.00018996002192605584, + "loss": 0.2484, + "step": 6879 + }, + { + "epoch": 0.92, + "grad_norm": 0.828125, + "learning_rate": 0.00018995493576983905, + "loss": 0.5287, + "step": 6880 + }, + { + "epoch": 0.92, + "grad_norm": 0.62109375, + "learning_rate": 0.0001899498483937756, + "loss": 0.7374, + "step": 6881 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390625, + "learning_rate": 0.0001899447597979345, + "loss": 0.4986, + "step": 6882 + }, + { + "epoch": 0.92, + "grad_norm": 0.7890625, + "learning_rate": 0.00018993966998238467, + "loss": 0.3121, + "step": 6883 + }, + { + "epoch": 0.92, + "grad_norm": 0.82421875, + "learning_rate": 0.00018993457894719525, + "loss": 0.5098, + "step": 6884 + }, + { + "epoch": 0.92, + "grad_norm": 0.63671875, + "learning_rate": 0.00018992948669243522, + "loss": 0.3905, + "step": 6885 + }, + { + "epoch": 0.92, + "grad_norm": 1.0390625, + "learning_rate": 0.0001899243932181736, + "loss": 0.6012, + "step": 6886 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390625, + "learning_rate": 0.0001899192985244795, + "loss": 0.6145, + "step": 6887 + }, + { + "epoch": 0.92, + "grad_norm": 0.455078125, + "learning_rate": 0.00018991420261142204, + "loss": 0.442, + "step": 6888 + }, + { + "epoch": 0.92, + "grad_norm": 0.68359375, + "learning_rate": 0.00018990910547907026, + "loss": 0.2959, + "step": 6889 + }, + { + "epoch": 0.92, + "grad_norm": 0.6796875, + "learning_rate": 0.0001899040071274933, + "loss": 0.7249, + "step": 6890 + }, + { + "epoch": 0.92, + "grad_norm": 0.55078125, + "learning_rate": 0.00018989890755676032, + "loss": 0.209, + "step": 6891 + }, + { + "epoch": 0.92, + "grad_norm": 0.427734375, + "learning_rate": 0.00018989380676694042, + "loss": 0.4196, + "step": 6892 + }, + { + "epoch": 0.92, + "grad_norm": 0.5234375, + "learning_rate": 0.00018988870475810282, + "loss": 0.397, + "step": 6893 + }, + { + "epoch": 0.92, + "grad_norm": 0.734375, + "learning_rate": 0.00018988360153031673, + "loss": 0.593, + "step": 6894 + }, + { + "epoch": 0.92, + "grad_norm": 0.62109375, + "learning_rate": 0.00018987849708365127, + "loss": 0.2204, + "step": 6895 + }, + { + "epoch": 0.92, + "grad_norm": 0.56640625, + "learning_rate": 0.00018987339141817575, + "loss": 0.3208, + "step": 6896 + }, + { + "epoch": 0.92, + "grad_norm": 0.53515625, + "learning_rate": 0.00018986828453395935, + "loss": 0.4171, + "step": 6897 + }, + { + "epoch": 0.92, + "grad_norm": 0.46875, + "learning_rate": 0.0001898631764310713, + "loss": 0.4393, + "step": 6898 + }, + { + "epoch": 0.92, + "grad_norm": 0.6640625, + "learning_rate": 0.00018985806710958094, + "loss": 0.2953, + "step": 6899 + }, + { + "epoch": 0.92, + "grad_norm": 0.9765625, + "learning_rate": 0.00018985295656955752, + "loss": 0.4575, + "step": 6900 + }, + { + "epoch": 0.92, + "grad_norm": 0.546875, + "learning_rate": 0.00018984784481107032, + "loss": 0.5069, + "step": 6901 + }, + { + "epoch": 0.92, + "grad_norm": 0.68359375, + "learning_rate": 0.0001898427318341887, + "loss": 0.4332, + "step": 6902 + }, + { + "epoch": 0.92, + "grad_norm": 0.625, + "learning_rate": 0.00018983761763898197, + "loss": 0.5902, + "step": 6903 + }, + { + "epoch": 0.92, + "grad_norm": 0.5859375, + "learning_rate": 0.00018983250222551947, + "loss": 0.2583, + "step": 6904 + }, + { + "epoch": 0.92, + "grad_norm": 0.77734375, + "learning_rate": 0.00018982738559387058, + "loss": 0.3694, + "step": 6905 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 0.00018982226774410475, + "loss": 0.353, + "step": 6906 + }, + { + "epoch": 0.92, + "grad_norm": 0.400390625, + "learning_rate": 0.00018981714867629125, + "loss": 0.2299, + "step": 6907 + }, + { + "epoch": 0.92, + "grad_norm": 0.43359375, + "learning_rate": 0.00018981202839049962, + "loss": 0.2851, + "step": 6908 + }, + { + "epoch": 0.92, + "grad_norm": 0.76171875, + "learning_rate": 0.0001898069068867992, + "loss": 0.3042, + "step": 6909 + }, + { + "epoch": 0.92, + "grad_norm": 0.6328125, + "learning_rate": 0.00018980178416525952, + "loss": 0.3908, + "step": 6910 + }, + { + "epoch": 0.92, + "grad_norm": 0.78125, + "learning_rate": 0.00018979666022595, + "loss": 0.9568, + "step": 6911 + }, + { + "epoch": 0.92, + "grad_norm": 0.87890625, + "learning_rate": 0.00018979153506894014, + "loss": 0.3669, + "step": 6912 + }, + { + "epoch": 0.92, + "grad_norm": 0.66796875, + "learning_rate": 0.00018978640869429943, + "loss": 0.4134, + "step": 6913 + }, + { + "epoch": 0.92, + "grad_norm": 0.498046875, + "learning_rate": 0.00018978128110209741, + "loss": 0.2947, + "step": 6914 + }, + { + "epoch": 0.92, + "grad_norm": 0.466796875, + "learning_rate": 0.00018977615229240358, + "loss": 0.5052, + "step": 6915 + }, + { + "epoch": 0.92, + "grad_norm": 0.64453125, + "learning_rate": 0.00018977102226528752, + "loss": 0.2694, + "step": 6916 + }, + { + "epoch": 0.92, + "grad_norm": 0.546875, + "learning_rate": 0.00018976589102081878, + "loss": 0.5122, + "step": 6917 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390625, + "learning_rate": 0.00018976075855906693, + "loss": 0.5442, + "step": 6918 + }, + { + "epoch": 0.92, + "grad_norm": 0.62890625, + "learning_rate": 0.0001897556248801016, + "loss": 0.414, + "step": 6919 + }, + { + "epoch": 0.92, + "grad_norm": 0.6953125, + "learning_rate": 0.0001897504899839924, + "loss": 0.7277, + "step": 6920 + }, + { + "epoch": 0.92, + "grad_norm": 0.484375, + "learning_rate": 0.00018974535387080894, + "loss": 0.5947, + "step": 6921 + }, + { + "epoch": 0.92, + "grad_norm": 0.439453125, + "learning_rate": 0.0001897402165406209, + "loss": 0.4918, + "step": 6922 + }, + { + "epoch": 0.92, + "grad_norm": 0.828125, + "learning_rate": 0.0001897350779934979, + "loss": 0.7959, + "step": 6923 + }, + { + "epoch": 0.92, + "grad_norm": 0.384765625, + "learning_rate": 0.00018972993822950968, + "loss": 0.3038, + "step": 6924 + }, + { + "epoch": 0.92, + "grad_norm": 0.59765625, + "learning_rate": 0.0001897247972487259, + "loss": 0.6279, + "step": 6925 + }, + { + "epoch": 0.92, + "grad_norm": 0.71875, + "learning_rate": 0.00018971965505121626, + "loss": 0.3005, + "step": 6926 + }, + { + "epoch": 0.92, + "grad_norm": 0.78515625, + "learning_rate": 0.00018971451163705055, + "loss": 0.4307, + "step": 6927 + }, + { + "epoch": 0.92, + "grad_norm": 0.4765625, + "learning_rate": 0.00018970936700629846, + "loss": 0.3998, + "step": 6928 + }, + { + "epoch": 0.92, + "grad_norm": 0.62109375, + "learning_rate": 0.0001897042211590298, + "loss": 0.3431, + "step": 6929 + }, + { + "epoch": 0.92, + "grad_norm": 0.80859375, + "learning_rate": 0.00018969907409531432, + "loss": 0.4963, + "step": 6930 + }, + { + "epoch": 0.92, + "grad_norm": 0.482421875, + "learning_rate": 0.0001896939258152218, + "loss": 0.4881, + "step": 6931 + }, + { + "epoch": 0.93, + "grad_norm": 0.7109375, + "learning_rate": 0.0001896887763188221, + "loss": 0.628, + "step": 6932 + }, + { + "epoch": 0.93, + "grad_norm": 1.109375, + "learning_rate": 0.00018968362560618505, + "loss": 0.3776, + "step": 6933 + }, + { + "epoch": 0.93, + "grad_norm": 0.73046875, + "learning_rate": 0.00018967847367738048, + "loss": 0.7184, + "step": 6934 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 0.00018967332053247823, + "loss": 0.4291, + "step": 6935 + }, + { + "epoch": 0.93, + "grad_norm": 0.5390625, + "learning_rate": 0.00018966816617154822, + "loss": 0.5011, + "step": 6936 + }, + { + "epoch": 0.93, + "grad_norm": 0.73046875, + "learning_rate": 0.0001896630105946603, + "loss": 0.762, + "step": 6937 + }, + { + "epoch": 0.93, + "grad_norm": 0.6796875, + "learning_rate": 0.00018965785380188442, + "loss": 0.5845, + "step": 6938 + }, + { + "epoch": 0.93, + "grad_norm": 0.4453125, + "learning_rate": 0.00018965269579329053, + "loss": 0.4715, + "step": 6939 + }, + { + "epoch": 0.93, + "grad_norm": 0.474609375, + "learning_rate": 0.00018964753656894852, + "loss": 0.6555, + "step": 6940 + }, + { + "epoch": 0.93, + "grad_norm": 0.451171875, + "learning_rate": 0.0001896423761289284, + "loss": 0.1755, + "step": 6941 + }, + { + "epoch": 0.93, + "grad_norm": 0.439453125, + "learning_rate": 0.00018963721447330008, + "loss": 0.3837, + "step": 6942 + }, + { + "epoch": 0.93, + "grad_norm": 0.609375, + "learning_rate": 0.00018963205160213366, + "loss": 0.3634, + "step": 6943 + }, + { + "epoch": 0.93, + "grad_norm": 0.486328125, + "learning_rate": 0.00018962688751549906, + "loss": 0.4939, + "step": 6944 + }, + { + "epoch": 0.93, + "grad_norm": 0.345703125, + "learning_rate": 0.00018962172221346634, + "loss": 0.3058, + "step": 6945 + }, + { + "epoch": 0.93, + "grad_norm": 0.443359375, + "learning_rate": 0.00018961655569610557, + "loss": 0.5341, + "step": 6946 + }, + { + "epoch": 0.93, + "grad_norm": 0.462890625, + "learning_rate": 0.0001896113879634868, + "loss": 0.3324, + "step": 6947 + }, + { + "epoch": 0.93, + "grad_norm": 0.62890625, + "learning_rate": 0.00018960621901568006, + "loss": 0.7973, + "step": 6948 + }, + { + "epoch": 0.93, + "grad_norm": 0.498046875, + "learning_rate": 0.0001896010488527555, + "loss": 0.3313, + "step": 6949 + }, + { + "epoch": 0.93, + "grad_norm": 0.8515625, + "learning_rate": 0.0001895958774747832, + "loss": 0.7491, + "step": 6950 + }, + { + "epoch": 0.93, + "grad_norm": 0.66015625, + "learning_rate": 0.00018959070488183333, + "loss": 0.412, + "step": 6951 + }, + { + "epoch": 0.93, + "grad_norm": 0.9140625, + "learning_rate": 0.00018958553107397596, + "loss": 0.5294, + "step": 6952 + }, + { + "epoch": 0.93, + "grad_norm": 0.431640625, + "learning_rate": 0.0001895803560512813, + "loss": 0.2669, + "step": 6953 + }, + { + "epoch": 0.93, + "grad_norm": 0.63671875, + "learning_rate": 0.00018957517981381953, + "loss": 0.2636, + "step": 6954 + }, + { + "epoch": 0.93, + "grad_norm": 0.55859375, + "learning_rate": 0.00018957000236166085, + "loss": 0.346, + "step": 6955 + }, + { + "epoch": 0.93, + "grad_norm": 0.515625, + "learning_rate": 0.00018956482369487544, + "loss": 0.6196, + "step": 6956 + }, + { + "epoch": 0.93, + "grad_norm": 0.4609375, + "learning_rate": 0.00018955964381353355, + "loss": 0.2257, + "step": 6957 + }, + { + "epoch": 0.93, + "grad_norm": 0.578125, + "learning_rate": 0.00018955446271770541, + "loss": 0.5019, + "step": 6958 + }, + { + "epoch": 0.93, + "grad_norm": 0.478515625, + "learning_rate": 0.00018954928040746126, + "loss": 0.3373, + "step": 6959 + }, + { + "epoch": 0.93, + "grad_norm": 0.41796875, + "learning_rate": 0.0001895440968828714, + "loss": 0.29, + "step": 6960 + }, + { + "epoch": 0.93, + "grad_norm": 0.7265625, + "learning_rate": 0.00018953891214400613, + "loss": 0.5985, + "step": 6961 + }, + { + "epoch": 0.93, + "grad_norm": 0.53125, + "learning_rate": 0.00018953372619093573, + "loss": 0.5656, + "step": 6962 + }, + { + "epoch": 0.93, + "grad_norm": 0.57421875, + "learning_rate": 0.00018952853902373052, + "loss": 0.3182, + "step": 6963 + }, + { + "epoch": 0.93, + "grad_norm": 0.5390625, + "learning_rate": 0.00018952335064246093, + "loss": 0.3157, + "step": 6964 + }, + { + "epoch": 0.93, + "grad_norm": 0.50390625, + "learning_rate": 0.00018951816104719718, + "loss": 0.4246, + "step": 6965 + }, + { + "epoch": 0.93, + "grad_norm": 0.6328125, + "learning_rate": 0.00018951297023800975, + "loss": 0.7072, + "step": 6966 + }, + { + "epoch": 0.93, + "grad_norm": 0.51171875, + "learning_rate": 0.000189507778214969, + "loss": 0.5925, + "step": 6967 + }, + { + "epoch": 0.93, + "grad_norm": 0.69921875, + "learning_rate": 0.0001895025849781453, + "loss": 0.393, + "step": 6968 + }, + { + "epoch": 0.93, + "grad_norm": 0.59375, + "learning_rate": 0.00018949739052760914, + "loss": 0.6766, + "step": 6969 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 0.0001894921948634309, + "loss": 0.3361, + "step": 6970 + }, + { + "epoch": 0.93, + "grad_norm": 0.65234375, + "learning_rate": 0.00018948699798568107, + "loss": 0.6352, + "step": 6971 + }, + { + "epoch": 0.93, + "grad_norm": 0.80859375, + "learning_rate": 0.00018948179989443012, + "loss": 0.526, + "step": 6972 + }, + { + "epoch": 0.93, + "grad_norm": 0.5703125, + "learning_rate": 0.00018947660058974853, + "loss": 0.3206, + "step": 6973 + }, + { + "epoch": 0.93, + "grad_norm": 0.67578125, + "learning_rate": 0.0001894714000717068, + "loss": 0.553, + "step": 6974 + }, + { + "epoch": 0.93, + "grad_norm": 0.8046875, + "learning_rate": 0.00018946619834037546, + "loss": 0.3223, + "step": 6975 + }, + { + "epoch": 0.93, + "grad_norm": 0.5859375, + "learning_rate": 0.00018946099539582508, + "loss": 0.4248, + "step": 6976 + }, + { + "epoch": 0.93, + "grad_norm": 0.66015625, + "learning_rate": 0.00018945579123812617, + "loss": 0.5331, + "step": 6977 + }, + { + "epoch": 0.93, + "grad_norm": 0.66796875, + "learning_rate": 0.00018945058586734934, + "loss": 0.5515, + "step": 6978 + }, + { + "epoch": 0.93, + "grad_norm": 0.55859375, + "learning_rate": 0.00018944537928356514, + "loss": 0.7042, + "step": 6979 + }, + { + "epoch": 0.93, + "grad_norm": 0.94921875, + "learning_rate": 0.00018944017148684418, + "loss": 0.3431, + "step": 6980 + }, + { + "epoch": 0.93, + "grad_norm": 0.515625, + "learning_rate": 0.0001894349624772571, + "loss": 0.5747, + "step": 6981 + }, + { + "epoch": 0.93, + "grad_norm": 0.62890625, + "learning_rate": 0.0001894297522548745, + "loss": 0.439, + "step": 6982 + }, + { + "epoch": 0.93, + "grad_norm": 0.498046875, + "learning_rate": 0.0001894245408197671, + "loss": 0.3905, + "step": 6983 + }, + { + "epoch": 0.93, + "grad_norm": 0.671875, + "learning_rate": 0.00018941932817200554, + "loss": 0.4674, + "step": 6984 + }, + { + "epoch": 0.93, + "grad_norm": 0.81640625, + "learning_rate": 0.0001894141143116605, + "loss": 0.5512, + "step": 6985 + }, + { + "epoch": 0.93, + "grad_norm": 0.6484375, + "learning_rate": 0.00018940889923880268, + "loss": 0.547, + "step": 6986 + }, + { + "epoch": 0.93, + "grad_norm": 0.5390625, + "learning_rate": 0.0001894036829535028, + "loss": 0.4849, + "step": 6987 + }, + { + "epoch": 0.93, + "grad_norm": 0.51953125, + "learning_rate": 0.00018939846545583158, + "loss": 0.4346, + "step": 6988 + }, + { + "epoch": 0.93, + "grad_norm": 0.5, + "learning_rate": 0.00018939324674585982, + "loss": 0.5076, + "step": 6989 + }, + { + "epoch": 0.93, + "grad_norm": 0.53515625, + "learning_rate": 0.00018938802682365825, + "loss": 0.356, + "step": 6990 + }, + { + "epoch": 0.93, + "grad_norm": 0.50390625, + "learning_rate": 0.00018938280568929768, + "loss": 0.4899, + "step": 6991 + }, + { + "epoch": 0.93, + "grad_norm": 0.68359375, + "learning_rate": 0.00018937758334284887, + "loss": 0.5147, + "step": 6992 + }, + { + "epoch": 0.93, + "grad_norm": 0.55078125, + "learning_rate": 0.0001893723597843827, + "loss": 0.478, + "step": 6993 + }, + { + "epoch": 0.93, + "grad_norm": 0.84375, + "learning_rate": 0.00018936713501396994, + "loss": 0.3143, + "step": 6994 + }, + { + "epoch": 0.93, + "grad_norm": 0.59375, + "learning_rate": 0.0001893619090316815, + "loss": 0.559, + "step": 6995 + }, + { + "epoch": 0.93, + "grad_norm": 0.423828125, + "learning_rate": 0.00018935668183758822, + "loss": 0.2794, + "step": 6996 + }, + { + "epoch": 0.93, + "grad_norm": 0.70703125, + "learning_rate": 0.000189351453431761, + "loss": 0.3726, + "step": 6997 + }, + { + "epoch": 0.93, + "grad_norm": 0.5703125, + "learning_rate": 0.0001893462238142707, + "loss": 0.7634, + "step": 6998 + }, + { + "epoch": 0.93, + "grad_norm": 0.447265625, + "learning_rate": 0.00018934099298518827, + "loss": 0.4474, + "step": 6999 + }, + { + "epoch": 0.93, + "grad_norm": 0.6171875, + "learning_rate": 0.00018933576094458464, + "loss": 0.5828, + "step": 7000 + }, + { + "epoch": 0.93, + "grad_norm": 0.5625, + "learning_rate": 0.00018933052769253074, + "loss": 0.4707, + "step": 7001 + }, + { + "epoch": 0.93, + "grad_norm": 0.55859375, + "learning_rate": 0.00018932529322909756, + "loss": 0.757, + "step": 7002 + }, + { + "epoch": 0.93, + "grad_norm": 0.64453125, + "learning_rate": 0.0001893200575543561, + "loss": 0.3207, + "step": 7003 + }, + { + "epoch": 0.93, + "grad_norm": 0.52734375, + "learning_rate": 0.0001893148206683773, + "loss": 0.5661, + "step": 7004 + }, + { + "epoch": 0.93, + "grad_norm": 0.486328125, + "learning_rate": 0.00018930958257123225, + "loss": 0.528, + "step": 7005 + }, + { + "epoch": 0.93, + "grad_norm": 0.70703125, + "learning_rate": 0.0001893043432629919, + "loss": 0.6364, + "step": 7006 + }, + { + "epoch": 0.94, + "grad_norm": 0.50390625, + "learning_rate": 0.00018929910274372735, + "loss": 0.4057, + "step": 7007 + }, + { + "epoch": 0.94, + "grad_norm": 0.474609375, + "learning_rate": 0.00018929386101350967, + "loss": 0.3531, + "step": 7008 + }, + { + "epoch": 0.94, + "grad_norm": 0.62890625, + "learning_rate": 0.00018928861807240992, + "loss": 0.3332, + "step": 7009 + }, + { + "epoch": 0.94, + "grad_norm": 0.58203125, + "learning_rate": 0.0001892833739204992, + "loss": 0.5487, + "step": 7010 + }, + { + "epoch": 0.94, + "grad_norm": 0.75390625, + "learning_rate": 0.0001892781285578486, + "loss": 0.4626, + "step": 7011 + }, + { + "epoch": 0.94, + "grad_norm": 0.48828125, + "learning_rate": 0.00018927288198452932, + "loss": 0.2824, + "step": 7012 + }, + { + "epoch": 0.94, + "grad_norm": 0.578125, + "learning_rate": 0.00018926763420061245, + "loss": 0.334, + "step": 7013 + }, + { + "epoch": 0.94, + "grad_norm": 0.74609375, + "learning_rate": 0.0001892623852061692, + "loss": 0.7449, + "step": 7014 + }, + { + "epoch": 0.94, + "grad_norm": 0.36328125, + "learning_rate": 0.00018925713500127068, + "loss": 0.2638, + "step": 7015 + }, + { + "epoch": 0.94, + "grad_norm": 0.435546875, + "learning_rate": 0.00018925188358598813, + "loss": 0.5371, + "step": 7016 + }, + { + "epoch": 0.94, + "grad_norm": 0.52734375, + "learning_rate": 0.00018924663096039278, + "loss": 0.4532, + "step": 7017 + }, + { + "epoch": 0.94, + "grad_norm": 0.7578125, + "learning_rate": 0.0001892413771245558, + "loss": 0.6102, + "step": 7018 + }, + { + "epoch": 0.94, + "grad_norm": 0.63671875, + "learning_rate": 0.00018923612207854848, + "loss": 0.3784, + "step": 7019 + }, + { + "epoch": 0.94, + "grad_norm": 0.47265625, + "learning_rate": 0.0001892308658224421, + "loss": 0.3836, + "step": 7020 + }, + { + "epoch": 0.94, + "grad_norm": 0.69140625, + "learning_rate": 0.00018922560835630788, + "loss": 0.3777, + "step": 7021 + }, + { + "epoch": 0.94, + "grad_norm": 0.87109375, + "learning_rate": 0.0001892203496802172, + "loss": 0.7111, + "step": 7022 + }, + { + "epoch": 0.94, + "grad_norm": 0.82421875, + "learning_rate": 0.00018921508979424125, + "loss": 0.5657, + "step": 7023 + }, + { + "epoch": 0.94, + "grad_norm": 0.53125, + "learning_rate": 0.00018920982869845146, + "loss": 0.5437, + "step": 7024 + }, + { + "epoch": 0.94, + "grad_norm": 0.72265625, + "learning_rate": 0.00018920456639291914, + "loss": 0.4868, + "step": 7025 + }, + { + "epoch": 0.94, + "grad_norm": 0.60546875, + "learning_rate": 0.0001891993028777156, + "loss": 0.4477, + "step": 7026 + }, + { + "epoch": 0.94, + "grad_norm": 0.62890625, + "learning_rate": 0.0001891940381529123, + "loss": 0.5936, + "step": 7027 + }, + { + "epoch": 0.94, + "grad_norm": 0.494140625, + "learning_rate": 0.00018918877221858063, + "loss": 0.4717, + "step": 7028 + }, + { + "epoch": 0.94, + "grad_norm": 0.578125, + "learning_rate": 0.00018918350507479193, + "loss": 0.4352, + "step": 7029 + }, + { + "epoch": 0.94, + "grad_norm": 0.57421875, + "learning_rate": 0.00018917823672161767, + "loss": 0.5256, + "step": 7030 + }, + { + "epoch": 0.94, + "grad_norm": 0.474609375, + "learning_rate": 0.0001891729671591293, + "loss": 0.3318, + "step": 7031 + }, + { + "epoch": 0.94, + "grad_norm": 0.34375, + "learning_rate": 0.00018916769638739825, + "loss": 0.4206, + "step": 7032 + }, + { + "epoch": 0.94, + "grad_norm": 0.58984375, + "learning_rate": 0.00018916242440649602, + "loss": 0.655, + "step": 7033 + }, + { + "epoch": 0.94, + "grad_norm": 0.6171875, + "learning_rate": 0.0001891571512164941, + "loss": 0.4047, + "step": 7034 + }, + { + "epoch": 0.94, + "grad_norm": 0.431640625, + "learning_rate": 0.00018915187681746396, + "loss": 0.5362, + "step": 7035 + }, + { + "epoch": 0.94, + "grad_norm": 0.75390625, + "learning_rate": 0.00018914660120947715, + "loss": 0.3585, + "step": 7036 + }, + { + "epoch": 0.94, + "grad_norm": 0.46875, + "learning_rate": 0.00018914132439260524, + "loss": 0.4664, + "step": 7037 + }, + { + "epoch": 0.94, + "grad_norm": 0.59765625, + "learning_rate": 0.00018913604636691976, + "loss": 0.3149, + "step": 7038 + }, + { + "epoch": 0.94, + "grad_norm": 0.9609375, + "learning_rate": 0.00018913076713249227, + "loss": 0.4713, + "step": 7039 + }, + { + "epoch": 0.94, + "grad_norm": 0.46484375, + "learning_rate": 0.00018912548668939438, + "loss": 0.2088, + "step": 7040 + }, + { + "epoch": 0.94, + "grad_norm": 0.61328125, + "learning_rate": 0.00018912020503769768, + "loss": 0.3568, + "step": 7041 + }, + { + "epoch": 0.94, + "grad_norm": 0.62109375, + "learning_rate": 0.0001891149221774738, + "loss": 0.4394, + "step": 7042 + }, + { + "epoch": 0.94, + "grad_norm": 0.60546875, + "learning_rate": 0.0001891096381087944, + "loss": 0.3228, + "step": 7043 + }, + { + "epoch": 0.94, + "grad_norm": 0.484375, + "learning_rate": 0.00018910435283173111, + "loss": 0.3661, + "step": 7044 + }, + { + "epoch": 0.94, + "grad_norm": 0.8125, + "learning_rate": 0.0001890990663463556, + "loss": 0.4132, + "step": 7045 + }, + { + "epoch": 0.94, + "grad_norm": 0.59765625, + "learning_rate": 0.0001890937786527396, + "loss": 0.6033, + "step": 7046 + }, + { + "epoch": 0.94, + "grad_norm": 0.56640625, + "learning_rate": 0.00018908848975095476, + "loss": 0.6803, + "step": 7047 + }, + { + "epoch": 0.94, + "grad_norm": 0.49609375, + "learning_rate": 0.00018908319964107282, + "loss": 0.4707, + "step": 7048 + }, + { + "epoch": 0.94, + "grad_norm": 0.41015625, + "learning_rate": 0.00018907790832316552, + "loss": 0.426, + "step": 7049 + }, + { + "epoch": 0.94, + "grad_norm": 0.55859375, + "learning_rate": 0.00018907261579730462, + "loss": 0.5367, + "step": 7050 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 0.0001890673220635619, + "loss": 0.6675, + "step": 7051 + }, + { + "epoch": 0.94, + "grad_norm": 0.59375, + "learning_rate": 0.00018906202712200908, + "loss": 0.286, + "step": 7052 + }, + { + "epoch": 0.94, + "grad_norm": 0.75, + "learning_rate": 0.00018905673097271804, + "loss": 0.3252, + "step": 7053 + }, + { + "epoch": 0.94, + "grad_norm": 0.375, + "learning_rate": 0.0001890514336157606, + "loss": 0.2415, + "step": 7054 + }, + { + "epoch": 0.94, + "grad_norm": 0.66796875, + "learning_rate": 0.00018904613505120853, + "loss": 0.396, + "step": 7055 + }, + { + "epoch": 0.94, + "grad_norm": 0.66015625, + "learning_rate": 0.0001890408352791337, + "loss": 0.381, + "step": 7056 + }, + { + "epoch": 0.94, + "grad_norm": 0.765625, + "learning_rate": 0.00018903553429960802, + "loss": 0.4213, + "step": 7057 + }, + { + "epoch": 0.94, + "grad_norm": 0.404296875, + "learning_rate": 0.00018903023211270338, + "loss": 0.3327, + "step": 7058 + }, + { + "epoch": 0.94, + "grad_norm": 0.59765625, + "learning_rate": 0.00018902492871849164, + "loss": 0.6347, + "step": 7059 + }, + { + "epoch": 0.94, + "grad_norm": 0.62109375, + "learning_rate": 0.0001890196241170447, + "loss": 0.3525, + "step": 7060 + }, + { + "epoch": 0.94, + "grad_norm": 0.439453125, + "learning_rate": 0.00018901431830843456, + "loss": 0.5359, + "step": 7061 + }, + { + "epoch": 0.94, + "grad_norm": 0.421875, + "learning_rate": 0.00018900901129273313, + "loss": 0.3798, + "step": 7062 + }, + { + "epoch": 0.94, + "grad_norm": 0.60546875, + "learning_rate": 0.00018900370307001234, + "loss": 0.5462, + "step": 7063 + }, + { + "epoch": 0.94, + "grad_norm": 0.71484375, + "learning_rate": 0.00018899839364034426, + "loss": 0.4436, + "step": 7064 + }, + { + "epoch": 0.94, + "grad_norm": 0.59765625, + "learning_rate": 0.00018899308300380084, + "loss": 0.7697, + "step": 7065 + }, + { + "epoch": 0.94, + "grad_norm": 0.57421875, + "learning_rate": 0.00018898777116045407, + "loss": 0.2699, + "step": 7066 + }, + { + "epoch": 0.94, + "grad_norm": 0.71484375, + "learning_rate": 0.00018898245811037602, + "loss": 0.3738, + "step": 7067 + }, + { + "epoch": 0.94, + "grad_norm": 0.6171875, + "learning_rate": 0.00018897714385363875, + "loss": 0.4736, + "step": 7068 + }, + { + "epoch": 0.94, + "grad_norm": 0.486328125, + "learning_rate": 0.00018897182839031432, + "loss": 0.6618, + "step": 7069 + }, + { + "epoch": 0.94, + "grad_norm": 0.392578125, + "learning_rate": 0.00018896651172047477, + "loss": 0.3083, + "step": 7070 + }, + { + "epoch": 0.94, + "grad_norm": 0.63671875, + "learning_rate": 0.00018896119384419219, + "loss": 0.5906, + "step": 7071 + }, + { + "epoch": 0.94, + "grad_norm": 0.73828125, + "learning_rate": 0.00018895587476153877, + "loss": 0.1874, + "step": 7072 + }, + { + "epoch": 0.94, + "grad_norm": 0.67578125, + "learning_rate": 0.00018895055447258662, + "loss": 0.6444, + "step": 7073 + }, + { + "epoch": 0.94, + "grad_norm": 0.6953125, + "learning_rate": 0.00018894523297740778, + "loss": 0.505, + "step": 7074 + }, + { + "epoch": 0.94, + "grad_norm": 0.4765625, + "learning_rate": 0.00018893991027607457, + "loss": 0.5052, + "step": 7075 + }, + { + "epoch": 0.94, + "grad_norm": 0.458984375, + "learning_rate": 0.00018893458636865906, + "loss": 0.411, + "step": 7076 + }, + { + "epoch": 0.94, + "grad_norm": 0.8359375, + "learning_rate": 0.0001889292612552335, + "loss": 0.2756, + "step": 7077 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 0.00018892393493587006, + "loss": 0.34, + "step": 7078 + }, + { + "epoch": 0.94, + "grad_norm": 0.6328125, + "learning_rate": 0.00018891860741064101, + "loss": 0.3519, + "step": 7079 + }, + { + "epoch": 0.94, + "grad_norm": 0.68359375, + "learning_rate": 0.00018891327867961858, + "loss": 0.6516, + "step": 7080 + }, + { + "epoch": 0.94, + "grad_norm": 0.62109375, + "learning_rate": 0.00018890794874287502, + "loss": 0.6008, + "step": 7081 + }, + { + "epoch": 0.95, + "grad_norm": 0.5703125, + "learning_rate": 0.0001889026176004826, + "loss": 0.3621, + "step": 7082 + }, + { + "epoch": 0.95, + "grad_norm": 0.6015625, + "learning_rate": 0.00018889728525251366, + "loss": 0.3052, + "step": 7083 + }, + { + "epoch": 0.95, + "grad_norm": 0.6953125, + "learning_rate": 0.00018889195169904043, + "loss": 0.6582, + "step": 7084 + }, + { + "epoch": 0.95, + "grad_norm": 0.384765625, + "learning_rate": 0.00018888661694013532, + "loss": 0.2484, + "step": 7085 + }, + { + "epoch": 0.95, + "grad_norm": 0.71484375, + "learning_rate": 0.00018888128097587065, + "loss": 0.4846, + "step": 7086 + }, + { + "epoch": 0.95, + "grad_norm": 0.671875, + "learning_rate": 0.00018887594380631874, + "loss": 0.5543, + "step": 7087 + }, + { + "epoch": 0.95, + "grad_norm": 0.625, + "learning_rate": 0.00018887060543155198, + "loss": 0.4382, + "step": 7088 + }, + { + "epoch": 0.95, + "grad_norm": 0.51171875, + "learning_rate": 0.0001888652658516428, + "loss": 0.4994, + "step": 7089 + }, + { + "epoch": 0.95, + "grad_norm": 0.8125, + "learning_rate": 0.00018885992506666356, + "loss": 0.5478, + "step": 7090 + }, + { + "epoch": 0.95, + "grad_norm": 0.427734375, + "learning_rate": 0.00018885458307668672, + "loss": 0.5356, + "step": 7091 + }, + { + "epoch": 0.95, + "grad_norm": 0.71875, + "learning_rate": 0.0001888492398817847, + "loss": 0.247, + "step": 7092 + }, + { + "epoch": 0.95, + "grad_norm": 0.5859375, + "learning_rate": 0.00018884389548202998, + "loss": 0.261, + "step": 7093 + }, + { + "epoch": 0.95, + "grad_norm": 0.96875, + "learning_rate": 0.00018883854987749503, + "loss": 0.2311, + "step": 7094 + }, + { + "epoch": 0.95, + "grad_norm": 0.7734375, + "learning_rate": 0.0001888332030682523, + "loss": 0.5699, + "step": 7095 + }, + { + "epoch": 0.95, + "grad_norm": 0.734375, + "learning_rate": 0.00018882785505437434, + "loss": 0.4435, + "step": 7096 + }, + { + "epoch": 0.95, + "grad_norm": 0.7734375, + "learning_rate": 0.00018882250583593363, + "loss": 0.6517, + "step": 7097 + }, + { + "epoch": 0.95, + "grad_norm": 0.6875, + "learning_rate": 0.00018881715541300276, + "loss": 0.2866, + "step": 7098 + }, + { + "epoch": 0.95, + "grad_norm": 0.4609375, + "learning_rate": 0.00018881180378565424, + "loss": 0.3272, + "step": 7099 + }, + { + "epoch": 0.95, + "grad_norm": 0.66796875, + "learning_rate": 0.0001888064509539607, + "loss": 0.3997, + "step": 7100 + }, + { + "epoch": 0.95, + "grad_norm": 0.5625, + "learning_rate": 0.00018880109691799462, + "loss": 0.5022, + "step": 7101 + }, + { + "epoch": 0.95, + "grad_norm": 0.578125, + "learning_rate": 0.00018879574167782873, + "loss": 0.2679, + "step": 7102 + }, + { + "epoch": 0.95, + "grad_norm": 0.6640625, + "learning_rate": 0.0001887903852335356, + "loss": 0.407, + "step": 7103 + }, + { + "epoch": 0.95, + "grad_norm": 0.59765625, + "learning_rate": 0.0001887850275851878, + "loss": 0.5239, + "step": 7104 + }, + { + "epoch": 0.95, + "grad_norm": 0.640625, + "learning_rate": 0.0001887796687328581, + "loss": 0.201, + "step": 7105 + }, + { + "epoch": 0.95, + "grad_norm": 0.65234375, + "learning_rate": 0.00018877430867661913, + "loss": 0.3389, + "step": 7106 + }, + { + "epoch": 0.95, + "grad_norm": 0.671875, + "learning_rate": 0.00018876894741654348, + "loss": 0.3981, + "step": 7107 + }, + { + "epoch": 0.95, + "grad_norm": 0.60546875, + "learning_rate": 0.000188763584952704, + "loss": 0.4726, + "step": 7108 + }, + { + "epoch": 0.95, + "grad_norm": 0.474609375, + "learning_rate": 0.00018875822128517334, + "loss": 0.411, + "step": 7109 + }, + { + "epoch": 0.95, + "grad_norm": 0.515625, + "learning_rate": 0.0001887528564140242, + "loss": 0.6368, + "step": 7110 + }, + { + "epoch": 0.95, + "grad_norm": 0.6015625, + "learning_rate": 0.00018874749033932938, + "loss": 0.3277, + "step": 7111 + }, + { + "epoch": 0.95, + "grad_norm": 0.5703125, + "learning_rate": 0.00018874212306116165, + "loss": 0.7108, + "step": 7112 + }, + { + "epoch": 0.95, + "grad_norm": 0.82421875, + "learning_rate": 0.00018873675457959375, + "loss": 0.4514, + "step": 7113 + }, + { + "epoch": 0.95, + "grad_norm": 0.66796875, + "learning_rate": 0.0001887313848946985, + "loss": 0.2506, + "step": 7114 + }, + { + "epoch": 0.95, + "grad_norm": 0.392578125, + "learning_rate": 0.00018872601400654875, + "loss": 0.2097, + "step": 7115 + }, + { + "epoch": 0.95, + "grad_norm": 0.5703125, + "learning_rate": 0.0001887206419152173, + "loss": 0.4608, + "step": 7116 + }, + { + "epoch": 0.95, + "grad_norm": 0.447265625, + "learning_rate": 0.000188715268620777, + "loss": 0.3289, + "step": 7117 + }, + { + "epoch": 0.95, + "grad_norm": 0.56640625, + "learning_rate": 0.00018870989412330072, + "loss": 0.4962, + "step": 7118 + }, + { + "epoch": 0.95, + "grad_norm": 0.54296875, + "learning_rate": 0.00018870451842286135, + "loss": 0.5509, + "step": 7119 + }, + { + "epoch": 0.95, + "grad_norm": 0.431640625, + "learning_rate": 0.00018869914151953176, + "loss": 0.5323, + "step": 7120 + }, + { + "epoch": 0.95, + "grad_norm": 0.6171875, + "learning_rate": 0.00018869376341338492, + "loss": 0.6024, + "step": 7121 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 0.0001886883841044937, + "loss": 0.4828, + "step": 7122 + }, + { + "epoch": 0.95, + "grad_norm": 0.53125, + "learning_rate": 0.00018868300359293106, + "loss": 0.6029, + "step": 7123 + }, + { + "epoch": 0.95, + "grad_norm": 0.73828125, + "learning_rate": 0.00018867762187877, + "loss": 0.3414, + "step": 7124 + }, + { + "epoch": 0.95, + "grad_norm": 0.61328125, + "learning_rate": 0.00018867223896208346, + "loss": 0.5493, + "step": 7125 + }, + { + "epoch": 0.95, + "grad_norm": 0.447265625, + "learning_rate": 0.00018866685484294444, + "loss": 0.3794, + "step": 7126 + }, + { + "epoch": 0.95, + "grad_norm": 0.59765625, + "learning_rate": 0.00018866146952142599, + "loss": 0.5668, + "step": 7127 + }, + { + "epoch": 0.95, + "grad_norm": 0.7265625, + "learning_rate": 0.00018865608299760108, + "loss": 0.3535, + "step": 7128 + }, + { + "epoch": 0.95, + "grad_norm": 0.55859375, + "learning_rate": 0.00018865069527154282, + "loss": 0.177, + "step": 7129 + }, + { + "epoch": 0.95, + "grad_norm": 1.0859375, + "learning_rate": 0.00018864530634332422, + "loss": 0.5511, + "step": 7130 + }, + { + "epoch": 0.95, + "grad_norm": 0.72265625, + "learning_rate": 0.00018863991621301837, + "loss": 0.5284, + "step": 7131 + }, + { + "epoch": 0.95, + "grad_norm": 0.78515625, + "learning_rate": 0.00018863452488069836, + "loss": 0.5547, + "step": 7132 + }, + { + "epoch": 0.95, + "grad_norm": 0.61328125, + "learning_rate": 0.00018862913234643733, + "loss": 0.6224, + "step": 7133 + }, + { + "epoch": 0.95, + "grad_norm": 0.5625, + "learning_rate": 0.00018862373861030837, + "loss": 0.4691, + "step": 7134 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 0.00018861834367238465, + "loss": 0.3884, + "step": 7135 + }, + { + "epoch": 0.95, + "grad_norm": 0.453125, + "learning_rate": 0.0001886129475327393, + "loss": 0.2646, + "step": 7136 + }, + { + "epoch": 0.95, + "grad_norm": 0.66796875, + "learning_rate": 0.00018860755019144552, + "loss": 0.2949, + "step": 7137 + }, + { + "epoch": 0.95, + "grad_norm": 0.494140625, + "learning_rate": 0.0001886021516485765, + "loss": 0.2451, + "step": 7138 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 0.00018859675190420537, + "loss": 0.381, + "step": 7139 + }, + { + "epoch": 0.95, + "grad_norm": 0.419921875, + "learning_rate": 0.0001885913509584055, + "loss": 0.5683, + "step": 7140 + }, + { + "epoch": 0.95, + "grad_norm": 0.59765625, + "learning_rate": 0.00018858594881125, + "loss": 0.5096, + "step": 7141 + }, + { + "epoch": 0.95, + "grad_norm": 0.6484375, + "learning_rate": 0.00018858054546281222, + "loss": 0.3296, + "step": 7142 + }, + { + "epoch": 0.95, + "grad_norm": 0.58203125, + "learning_rate": 0.0001885751409131654, + "loss": 0.3537, + "step": 7143 + }, + { + "epoch": 0.95, + "grad_norm": 0.640625, + "learning_rate": 0.00018856973516238282, + "loss": 0.4003, + "step": 7144 + }, + { + "epoch": 0.95, + "grad_norm": 0.7109375, + "learning_rate": 0.00018856432821053777, + "loss": 0.4672, + "step": 7145 + }, + { + "epoch": 0.95, + "grad_norm": 0.4375, + "learning_rate": 0.0001885589200577036, + "loss": 0.2279, + "step": 7146 + }, + { + "epoch": 0.95, + "grad_norm": 0.5859375, + "learning_rate": 0.00018855351070395362, + "loss": 0.7518, + "step": 7147 + }, + { + "epoch": 0.95, + "grad_norm": 0.68359375, + "learning_rate": 0.0001885481001493612, + "loss": 0.525, + "step": 7148 + }, + { + "epoch": 0.95, + "grad_norm": 0.49609375, + "learning_rate": 0.00018854268839399972, + "loss": 0.4776, + "step": 7149 + }, + { + "epoch": 0.95, + "grad_norm": 0.578125, + "learning_rate": 0.00018853727543794257, + "loss": 0.215, + "step": 7150 + }, + { + "epoch": 0.95, + "grad_norm": 0.46875, + "learning_rate": 0.00018853186128126313, + "loss": 0.5325, + "step": 7151 + }, + { + "epoch": 0.95, + "grad_norm": 0.67578125, + "learning_rate": 0.00018852644592403484, + "loss": 0.5203, + "step": 7152 + }, + { + "epoch": 0.95, + "grad_norm": 0.52734375, + "learning_rate": 0.00018852102936633112, + "loss": 0.2492, + "step": 7153 + }, + { + "epoch": 0.95, + "grad_norm": 0.72265625, + "learning_rate": 0.00018851561160822543, + "loss": 0.8013, + "step": 7154 + }, + { + "epoch": 0.95, + "grad_norm": 0.96484375, + "learning_rate": 0.00018851019264979124, + "loss": 0.791, + "step": 7155 + }, + { + "epoch": 0.95, + "grad_norm": 0.61328125, + "learning_rate": 0.00018850477249110202, + "loss": 0.6446, + "step": 7156 + }, + { + "epoch": 0.96, + "grad_norm": 0.65234375, + "learning_rate": 0.00018849935113223132, + "loss": 0.3491, + "step": 7157 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 0.00018849392857325258, + "loss": 0.4875, + "step": 7158 + }, + { + "epoch": 0.96, + "grad_norm": 0.6328125, + "learning_rate": 0.0001884885048142394, + "loss": 0.311, + "step": 7159 + }, + { + "epoch": 0.96, + "grad_norm": 0.5625, + "learning_rate": 0.0001884830798552653, + "loss": 0.5636, + "step": 7160 + }, + { + "epoch": 0.96, + "grad_norm": 0.515625, + "learning_rate": 0.00018847765369640386, + "loss": 0.3128, + "step": 7161 + }, + { + "epoch": 0.96, + "grad_norm": 0.55859375, + "learning_rate": 0.0001884722263377286, + "loss": 0.2783, + "step": 7162 + }, + { + "epoch": 0.96, + "grad_norm": 0.486328125, + "learning_rate": 0.0001884667977793132, + "loss": 0.411, + "step": 7163 + }, + { + "epoch": 0.96, + "grad_norm": 0.51171875, + "learning_rate": 0.00018846136802123126, + "loss": 0.386, + "step": 7164 + }, + { + "epoch": 0.96, + "grad_norm": 0.57421875, + "learning_rate": 0.00018845593706355639, + "loss": 0.3031, + "step": 7165 + }, + { + "epoch": 0.96, + "grad_norm": 0.54296875, + "learning_rate": 0.00018845050490636224, + "loss": 0.5383, + "step": 7166 + }, + { + "epoch": 0.96, + "grad_norm": 0.5078125, + "learning_rate": 0.00018844507154972248, + "loss": 0.6821, + "step": 7167 + }, + { + "epoch": 0.96, + "grad_norm": 0.60546875, + "learning_rate": 0.00018843963699371075, + "loss": 0.2601, + "step": 7168 + }, + { + "epoch": 0.96, + "grad_norm": 0.46875, + "learning_rate": 0.00018843420123840083, + "loss": 0.3788, + "step": 7169 + }, + { + "epoch": 0.96, + "grad_norm": 0.70703125, + "learning_rate": 0.00018842876428386634, + "loss": 0.4322, + "step": 7170 + }, + { + "epoch": 0.96, + "grad_norm": 0.46484375, + "learning_rate": 0.0001884233261301811, + "loss": 0.2676, + "step": 7171 + }, + { + "epoch": 0.96, + "grad_norm": 0.70703125, + "learning_rate": 0.00018841788677741877, + "loss": 0.2865, + "step": 7172 + }, + { + "epoch": 0.96, + "grad_norm": 0.7421875, + "learning_rate": 0.00018841244622565316, + "loss": 0.4497, + "step": 7173 + }, + { + "epoch": 0.96, + "grad_norm": 0.55078125, + "learning_rate": 0.00018840700447495806, + "loss": 0.4253, + "step": 7174 + }, + { + "epoch": 0.96, + "grad_norm": 0.451171875, + "learning_rate": 0.00018840156152540721, + "loss": 0.2762, + "step": 7175 + }, + { + "epoch": 0.96, + "grad_norm": 0.51171875, + "learning_rate": 0.00018839611737707447, + "loss": 0.5594, + "step": 7176 + }, + { + "epoch": 0.96, + "grad_norm": 0.58203125, + "learning_rate": 0.00018839067203003365, + "loss": 0.4202, + "step": 7177 + }, + { + "epoch": 0.96, + "grad_norm": 0.5625, + "learning_rate": 0.00018838522548435854, + "loss": 0.4349, + "step": 7178 + }, + { + "epoch": 0.96, + "grad_norm": 0.828125, + "learning_rate": 0.0001883797777401231, + "loss": 0.7328, + "step": 7179 + }, + { + "epoch": 0.96, + "grad_norm": 0.52734375, + "learning_rate": 0.00018837432879740114, + "loss": 0.59, + "step": 7180 + }, + { + "epoch": 0.96, + "grad_norm": 0.60546875, + "learning_rate": 0.00018836887865626654, + "loss": 0.5002, + "step": 7181 + }, + { + "epoch": 0.96, + "grad_norm": 0.9453125, + "learning_rate": 0.00018836342731679326, + "loss": 0.7071, + "step": 7182 + }, + { + "epoch": 0.96, + "grad_norm": 0.59375, + "learning_rate": 0.0001883579747790552, + "loss": 0.3489, + "step": 7183 + }, + { + "epoch": 0.96, + "grad_norm": 0.609375, + "learning_rate": 0.0001883525210431263, + "loss": 0.4807, + "step": 7184 + }, + { + "epoch": 0.96, + "grad_norm": 0.55859375, + "learning_rate": 0.00018834706610908045, + "loss": 0.4903, + "step": 7185 + }, + { + "epoch": 0.96, + "grad_norm": 0.45703125, + "learning_rate": 0.00018834160997699173, + "loss": 0.3418, + "step": 7186 + }, + { + "epoch": 0.96, + "grad_norm": 0.62109375, + "learning_rate": 0.00018833615264693407, + "loss": 0.3661, + "step": 7187 + }, + { + "epoch": 0.96, + "grad_norm": 0.421875, + "learning_rate": 0.00018833069411898148, + "loss": 0.3117, + "step": 7188 + }, + { + "epoch": 0.96, + "grad_norm": 0.625, + "learning_rate": 0.000188325234393208, + "loss": 0.3372, + "step": 7189 + }, + { + "epoch": 0.96, + "grad_norm": 0.66015625, + "learning_rate": 0.00018831977346968766, + "loss": 0.2656, + "step": 7190 + }, + { + "epoch": 0.96, + "grad_norm": 0.3671875, + "learning_rate": 0.0001883143113484945, + "loss": 0.1927, + "step": 7191 + }, + { + "epoch": 0.96, + "grad_norm": 0.51171875, + "learning_rate": 0.0001883088480297026, + "loss": 0.3461, + "step": 7192 + }, + { + "epoch": 0.96, + "grad_norm": 0.7109375, + "learning_rate": 0.00018830338351338602, + "loss": 0.4786, + "step": 7193 + }, + { + "epoch": 0.96, + "grad_norm": 0.578125, + "learning_rate": 0.00018829791779961891, + "loss": 0.3783, + "step": 7194 + }, + { + "epoch": 0.96, + "grad_norm": 0.5625, + "learning_rate": 0.00018829245088847538, + "loss": 0.488, + "step": 7195 + }, + { + "epoch": 0.96, + "grad_norm": 0.45703125, + "learning_rate": 0.00018828698278002955, + "loss": 0.5466, + "step": 7196 + }, + { + "epoch": 0.96, + "grad_norm": 0.75390625, + "learning_rate": 0.00018828151347435553, + "loss": 0.4876, + "step": 7197 + }, + { + "epoch": 0.96, + "grad_norm": 0.6328125, + "learning_rate": 0.00018827604297152757, + "loss": 0.5495, + "step": 7198 + }, + { + "epoch": 0.96, + "grad_norm": 0.439453125, + "learning_rate": 0.00018827057127161978, + "loss": 0.3421, + "step": 7199 + }, + { + "epoch": 0.96, + "grad_norm": 0.703125, + "learning_rate": 0.00018826509837470643, + "loss": 0.5264, + "step": 7200 + }, + { + "epoch": 0.96, + "grad_norm": 0.49609375, + "learning_rate": 0.00018825962428086168, + "loss": 0.5172, + "step": 7201 + }, + { + "epoch": 0.96, + "grad_norm": 0.63671875, + "learning_rate": 0.00018825414899015976, + "loss": 0.2813, + "step": 7202 + }, + { + "epoch": 0.96, + "grad_norm": 0.71875, + "learning_rate": 0.00018824867250267496, + "loss": 0.3618, + "step": 7203 + }, + { + "epoch": 0.96, + "grad_norm": 0.69140625, + "learning_rate": 0.00018824319481848152, + "loss": 0.3417, + "step": 7204 + }, + { + "epoch": 0.96, + "grad_norm": 0.47265625, + "learning_rate": 0.00018823771593765373, + "loss": 0.2094, + "step": 7205 + }, + { + "epoch": 0.96, + "grad_norm": 0.69921875, + "learning_rate": 0.00018823223586026587, + "loss": 0.4718, + "step": 7206 + }, + { + "epoch": 0.96, + "grad_norm": 0.40234375, + "learning_rate": 0.00018822675458639228, + "loss": 0.3162, + "step": 7207 + }, + { + "epoch": 0.96, + "grad_norm": 0.94140625, + "learning_rate": 0.00018822127211610726, + "loss": 0.7415, + "step": 7208 + }, + { + "epoch": 0.96, + "grad_norm": 0.52734375, + "learning_rate": 0.00018821578844948516, + "loss": 0.549, + "step": 7209 + }, + { + "epoch": 0.96, + "grad_norm": 0.453125, + "learning_rate": 0.00018821030358660037, + "loss": 0.2602, + "step": 7210 + }, + { + "epoch": 0.96, + "grad_norm": 0.58984375, + "learning_rate": 0.00018820481752752723, + "loss": 0.4101, + "step": 7211 + }, + { + "epoch": 0.96, + "grad_norm": 0.80078125, + "learning_rate": 0.0001881993302723402, + "loss": 0.3037, + "step": 7212 + }, + { + "epoch": 0.96, + "grad_norm": 0.6796875, + "learning_rate": 0.0001881938418211136, + "loss": 0.1531, + "step": 7213 + }, + { + "epoch": 0.96, + "grad_norm": 0.58984375, + "learning_rate": 0.0001881883521739219, + "loss": 0.2461, + "step": 7214 + }, + { + "epoch": 0.96, + "grad_norm": 0.67578125, + "learning_rate": 0.0001881828613308396, + "loss": 0.4483, + "step": 7215 + }, + { + "epoch": 0.96, + "grad_norm": 0.94140625, + "learning_rate": 0.00018817736929194106, + "loss": 0.3266, + "step": 7216 + }, + { + "epoch": 0.96, + "grad_norm": 0.6640625, + "learning_rate": 0.0001881718760573008, + "loss": 0.3022, + "step": 7217 + }, + { + "epoch": 0.96, + "grad_norm": 0.59765625, + "learning_rate": 0.00018816638162699333, + "loss": 0.6526, + "step": 7218 + }, + { + "epoch": 0.96, + "grad_norm": 0.51171875, + "learning_rate": 0.00018816088600109314, + "loss": 0.3115, + "step": 7219 + }, + { + "epoch": 0.96, + "grad_norm": 0.77734375, + "learning_rate": 0.00018815538917967475, + "loss": 0.3902, + "step": 7220 + }, + { + "epoch": 0.96, + "grad_norm": 0.796875, + "learning_rate": 0.0001881498911628127, + "loss": 0.3263, + "step": 7221 + }, + { + "epoch": 0.96, + "grad_norm": 0.5859375, + "learning_rate": 0.00018814439195058154, + "loss": 0.3374, + "step": 7222 + }, + { + "epoch": 0.96, + "grad_norm": 0.56640625, + "learning_rate": 0.00018813889154305587, + "loss": 0.2702, + "step": 7223 + }, + { + "epoch": 0.96, + "grad_norm": 0.51953125, + "learning_rate": 0.00018813338994031023, + "loss": 0.3566, + "step": 7224 + }, + { + "epoch": 0.96, + "grad_norm": 0.44140625, + "learning_rate": 0.00018812788714241932, + "loss": 0.3459, + "step": 7225 + }, + { + "epoch": 0.96, + "grad_norm": 0.5, + "learning_rate": 0.00018812238314945768, + "loss": 0.3671, + "step": 7226 + }, + { + "epoch": 0.96, + "grad_norm": 0.640625, + "learning_rate": 0.00018811687796149994, + "loss": 0.4308, + "step": 7227 + }, + { + "epoch": 0.96, + "grad_norm": 0.62890625, + "learning_rate": 0.00018811137157862082, + "loss": 0.5744, + "step": 7228 + }, + { + "epoch": 0.96, + "grad_norm": 0.447265625, + "learning_rate": 0.0001881058640008949, + "loss": 0.3198, + "step": 7229 + }, + { + "epoch": 0.96, + "grad_norm": 0.890625, + "learning_rate": 0.00018810035522839695, + "loss": 0.4835, + "step": 7230 + }, + { + "epoch": 0.96, + "grad_norm": 0.56640625, + "learning_rate": 0.00018809484526120164, + "loss": 0.4637, + "step": 7231 + }, + { + "epoch": 0.97, + "grad_norm": 0.6484375, + "learning_rate": 0.00018808933409938372, + "loss": 0.3566, + "step": 7232 + }, + { + "epoch": 0.97, + "grad_norm": 0.55859375, + "learning_rate": 0.00018808382174301786, + "loss": 0.5038, + "step": 7233 + }, + { + "epoch": 0.97, + "grad_norm": 0.546875, + "learning_rate": 0.00018807830819217883, + "loss": 0.3362, + "step": 7234 + }, + { + "epoch": 0.97, + "grad_norm": 0.6171875, + "learning_rate": 0.00018807279344694143, + "loss": 0.448, + "step": 7235 + }, + { + "epoch": 0.97, + "grad_norm": 0.484375, + "learning_rate": 0.00018806727750738045, + "loss": 0.4347, + "step": 7236 + }, + { + "epoch": 0.97, + "grad_norm": 0.515625, + "learning_rate": 0.00018806176037357064, + "loss": 0.4196, + "step": 7237 + }, + { + "epoch": 0.97, + "grad_norm": 0.58984375, + "learning_rate": 0.00018805624204558684, + "loss": 0.4663, + "step": 7238 + }, + { + "epoch": 0.97, + "grad_norm": 0.57421875, + "learning_rate": 0.00018805072252350388, + "loss": 0.4265, + "step": 7239 + }, + { + "epoch": 0.97, + "grad_norm": 0.5390625, + "learning_rate": 0.00018804520180739663, + "loss": 0.4115, + "step": 7240 + }, + { + "epoch": 0.97, + "grad_norm": 0.71484375, + "learning_rate": 0.00018803967989733993, + "loss": 0.4679, + "step": 7241 + }, + { + "epoch": 0.97, + "grad_norm": 0.447265625, + "learning_rate": 0.00018803415679340864, + "loss": 0.389, + "step": 7242 + }, + { + "epoch": 0.97, + "grad_norm": 0.58984375, + "learning_rate": 0.00018802863249567774, + "loss": 0.4166, + "step": 7243 + }, + { + "epoch": 0.97, + "grad_norm": 0.58203125, + "learning_rate": 0.00018802310700422205, + "loss": 0.7665, + "step": 7244 + }, + { + "epoch": 0.97, + "grad_norm": 0.73828125, + "learning_rate": 0.00018801758031911656, + "loss": 0.5961, + "step": 7245 + }, + { + "epoch": 0.97, + "grad_norm": 0.78515625, + "learning_rate": 0.00018801205244043615, + "loss": 0.7778, + "step": 7246 + }, + { + "epoch": 0.97, + "grad_norm": 0.67578125, + "learning_rate": 0.00018800652336825585, + "loss": 0.5766, + "step": 7247 + }, + { + "epoch": 0.97, + "grad_norm": 0.6796875, + "learning_rate": 0.00018800099310265061, + "loss": 0.5684, + "step": 7248 + }, + { + "epoch": 0.97, + "grad_norm": 0.52734375, + "learning_rate": 0.00018799546164369543, + "loss": 0.3287, + "step": 7249 + }, + { + "epoch": 0.97, + "grad_norm": 0.58203125, + "learning_rate": 0.00018798992899146533, + "loss": 0.303, + "step": 7250 + }, + { + "epoch": 0.97, + "grad_norm": 0.73046875, + "learning_rate": 0.00018798439514603529, + "loss": 0.6607, + "step": 7251 + }, + { + "epoch": 0.97, + "grad_norm": 0.68359375, + "learning_rate": 0.00018797886010748038, + "loss": 0.4228, + "step": 7252 + }, + { + "epoch": 0.97, + "grad_norm": 0.51171875, + "learning_rate": 0.00018797332387587566, + "loss": 0.4239, + "step": 7253 + }, + { + "epoch": 0.97, + "grad_norm": 0.48046875, + "learning_rate": 0.00018796778645129621, + "loss": 0.5726, + "step": 7254 + }, + { + "epoch": 0.97, + "grad_norm": 0.408203125, + "learning_rate": 0.00018796224783381718, + "loss": 0.2444, + "step": 7255 + }, + { + "epoch": 0.97, + "grad_norm": 1.015625, + "learning_rate": 0.00018795670802351354, + "loss": 0.6147, + "step": 7256 + }, + { + "epoch": 0.97, + "grad_norm": 0.75390625, + "learning_rate": 0.0001879511670204605, + "loss": 0.5141, + "step": 7257 + }, + { + "epoch": 0.97, + "grad_norm": 0.734375, + "learning_rate": 0.00018794562482473323, + "loss": 0.5713, + "step": 7258 + }, + { + "epoch": 0.97, + "grad_norm": 0.53125, + "learning_rate": 0.00018794008143640682, + "loss": 0.3484, + "step": 7259 + }, + { + "epoch": 0.97, + "grad_norm": 0.69921875, + "learning_rate": 0.00018793453685555646, + "loss": 0.6181, + "step": 7260 + }, + { + "epoch": 0.97, + "grad_norm": 0.478515625, + "learning_rate": 0.00018792899108225735, + "loss": 0.6004, + "step": 7261 + }, + { + "epoch": 0.97, + "grad_norm": 0.61328125, + "learning_rate": 0.00018792344411658468, + "loss": 0.3572, + "step": 7262 + }, + { + "epoch": 0.97, + "grad_norm": 0.490234375, + "learning_rate": 0.00018791789595861368, + "loss": 0.5022, + "step": 7263 + }, + { + "epoch": 0.97, + "grad_norm": 0.427734375, + "learning_rate": 0.0001879123466084196, + "loss": 0.3346, + "step": 7264 + }, + { + "epoch": 0.97, + "grad_norm": 0.484375, + "learning_rate": 0.0001879067960660777, + "loss": 0.4377, + "step": 7265 + }, + { + "epoch": 0.97, + "grad_norm": 0.54296875, + "learning_rate": 0.0001879012443316632, + "loss": 0.4363, + "step": 7266 + }, + { + "epoch": 0.97, + "grad_norm": 0.57421875, + "learning_rate": 0.00018789569140525137, + "loss": 0.7483, + "step": 7267 + }, + { + "epoch": 0.97, + "grad_norm": 0.5703125, + "learning_rate": 0.0001878901372869176, + "loss": 0.5279, + "step": 7268 + }, + { + "epoch": 0.97, + "grad_norm": 0.546875, + "learning_rate": 0.00018788458197673715, + "loss": 0.5986, + "step": 7269 + }, + { + "epoch": 0.97, + "grad_norm": 0.60546875, + "learning_rate": 0.0001878790254747854, + "loss": 0.4794, + "step": 7270 + }, + { + "epoch": 0.97, + "grad_norm": 0.55859375, + "learning_rate": 0.0001878734677811376, + "loss": 0.667, + "step": 7271 + }, + { + "epoch": 0.97, + "grad_norm": 0.439453125, + "learning_rate": 0.00018786790889586922, + "loss": 0.358, + "step": 7272 + }, + { + "epoch": 0.97, + "grad_norm": 0.482421875, + "learning_rate": 0.0001878623488190556, + "loss": 0.2322, + "step": 7273 + }, + { + "epoch": 0.97, + "grad_norm": 0.6171875, + "learning_rate": 0.00018785678755077213, + "loss": 0.4227, + "step": 7274 + }, + { + "epoch": 0.97, + "grad_norm": 0.58984375, + "learning_rate": 0.00018785122509109426, + "loss": 0.3853, + "step": 7275 + }, + { + "epoch": 0.97, + "grad_norm": 0.390625, + "learning_rate": 0.00018784566144009737, + "loss": 0.3753, + "step": 7276 + }, + { + "epoch": 0.97, + "grad_norm": 0.37109375, + "learning_rate": 0.00018784009659785693, + "loss": 0.2305, + "step": 7277 + }, + { + "epoch": 0.97, + "grad_norm": 0.51953125, + "learning_rate": 0.0001878345305644484, + "loss": 0.3607, + "step": 7278 + }, + { + "epoch": 0.97, + "grad_norm": 0.421875, + "learning_rate": 0.00018782896333994727, + "loss": 0.4114, + "step": 7279 + }, + { + "epoch": 0.97, + "grad_norm": 0.85546875, + "learning_rate": 0.00018782339492442906, + "loss": 0.3945, + "step": 7280 + }, + { + "epoch": 0.97, + "grad_norm": 0.703125, + "learning_rate": 0.00018781782531796922, + "loss": 0.359, + "step": 7281 + }, + { + "epoch": 0.97, + "grad_norm": 0.6171875, + "learning_rate": 0.00018781225452064331, + "loss": 0.5471, + "step": 7282 + }, + { + "epoch": 0.97, + "grad_norm": 0.5390625, + "learning_rate": 0.00018780668253252686, + "loss": 0.476, + "step": 7283 + }, + { + "epoch": 0.97, + "grad_norm": 0.56640625, + "learning_rate": 0.00018780110935369545, + "loss": 0.375, + "step": 7284 + }, + { + "epoch": 0.97, + "grad_norm": 0.6484375, + "learning_rate": 0.00018779553498422466, + "loss": 0.4361, + "step": 7285 + }, + { + "epoch": 0.97, + "grad_norm": 1.25, + "learning_rate": 0.00018778995942419005, + "loss": 0.7681, + "step": 7286 + }, + { + "epoch": 0.97, + "grad_norm": 0.53515625, + "learning_rate": 0.00018778438267366728, + "loss": 0.3896, + "step": 7287 + }, + { + "epoch": 0.97, + "grad_norm": 0.7265625, + "learning_rate": 0.0001877788047327319, + "loss": 0.2373, + "step": 7288 + }, + { + "epoch": 0.97, + "grad_norm": 0.5234375, + "learning_rate": 0.00018777322560145963, + "loss": 0.5318, + "step": 7289 + }, + { + "epoch": 0.97, + "grad_norm": 0.51171875, + "learning_rate": 0.00018776764527992604, + "loss": 0.4628, + "step": 7290 + }, + { + "epoch": 0.97, + "grad_norm": 0.6171875, + "learning_rate": 0.0001877620637682069, + "loss": 0.4689, + "step": 7291 + }, + { + "epoch": 0.97, + "grad_norm": 0.609375, + "learning_rate": 0.00018775648106637784, + "loss": 0.3934, + "step": 7292 + }, + { + "epoch": 0.97, + "grad_norm": 0.52734375, + "learning_rate": 0.00018775089717451454, + "loss": 0.6408, + "step": 7293 + }, + { + "epoch": 0.97, + "grad_norm": 0.4140625, + "learning_rate": 0.0001877453120926928, + "loss": 0.527, + "step": 7294 + }, + { + "epoch": 0.97, + "grad_norm": 0.51171875, + "learning_rate": 0.0001877397258209883, + "loss": 0.4631, + "step": 7295 + }, + { + "epoch": 0.97, + "grad_norm": 0.421875, + "learning_rate": 0.0001877341383594768, + "loss": 0.5072, + "step": 7296 + }, + { + "epoch": 0.97, + "grad_norm": 0.67578125, + "learning_rate": 0.00018772854970823405, + "loss": 0.6794, + "step": 7297 + }, + { + "epoch": 0.97, + "grad_norm": 0.5, + "learning_rate": 0.0001877229598673359, + "loss": 0.2848, + "step": 7298 + }, + { + "epoch": 0.97, + "grad_norm": 0.5859375, + "learning_rate": 0.00018771736883685812, + "loss": 0.5873, + "step": 7299 + }, + { + "epoch": 0.97, + "grad_norm": 0.671875, + "learning_rate": 0.00018771177661687649, + "loss": 0.252, + "step": 7300 + }, + { + "epoch": 0.97, + "grad_norm": 0.49609375, + "learning_rate": 0.0001877061832074669, + "loss": 0.3304, + "step": 7301 + }, + { + "epoch": 0.97, + "grad_norm": 0.439453125, + "learning_rate": 0.00018770058860870516, + "loss": 0.6047, + "step": 7302 + }, + { + "epoch": 0.97, + "grad_norm": 0.8046875, + "learning_rate": 0.00018769499282066717, + "loss": 0.2613, + "step": 7303 + }, + { + "epoch": 0.97, + "grad_norm": 0.7109375, + "learning_rate": 0.00018768939584342878, + "loss": 0.3817, + "step": 7304 + }, + { + "epoch": 0.97, + "grad_norm": 0.6953125, + "learning_rate": 0.00018768379767706593, + "loss": 0.4477, + "step": 7305 + }, + { + "epoch": 0.97, + "grad_norm": 0.92578125, + "learning_rate": 0.00018767819832165444, + "loss": 0.5112, + "step": 7306 + }, + { + "epoch": 0.98, + "grad_norm": 0.640625, + "learning_rate": 0.00018767259777727037, + "loss": 0.4379, + "step": 7307 + }, + { + "epoch": 0.98, + "grad_norm": 0.578125, + "learning_rate": 0.0001876669960439896, + "loss": 0.5051, + "step": 7308 + }, + { + "epoch": 0.98, + "grad_norm": 0.73828125, + "learning_rate": 0.00018766139312188812, + "loss": 0.2602, + "step": 7309 + }, + { + "epoch": 0.98, + "grad_norm": 0.4296875, + "learning_rate": 0.00018765578901104184, + "loss": 0.3352, + "step": 7310 + }, + { + "epoch": 0.98, + "grad_norm": 0.53515625, + "learning_rate": 0.00018765018371152682, + "loss": 0.6174, + "step": 7311 + }, + { + "epoch": 0.98, + "grad_norm": 0.79296875, + "learning_rate": 0.00018764457722341907, + "loss": 0.6412, + "step": 7312 + }, + { + "epoch": 0.98, + "grad_norm": 0.6640625, + "learning_rate": 0.00018763896954679458, + "loss": 0.5916, + "step": 7313 + }, + { + "epoch": 0.98, + "grad_norm": 0.52734375, + "learning_rate": 0.00018763336068172942, + "loss": 0.4565, + "step": 7314 + }, + { + "epoch": 0.98, + "grad_norm": 0.57421875, + "learning_rate": 0.00018762775062829967, + "loss": 0.421, + "step": 7315 + }, + { + "epoch": 0.98, + "grad_norm": 0.546875, + "learning_rate": 0.00018762213938658138, + "loss": 0.4396, + "step": 7316 + }, + { + "epoch": 0.98, + "grad_norm": 0.69140625, + "learning_rate": 0.00018761652695665064, + "loss": 0.4989, + "step": 7317 + }, + { + "epoch": 0.98, + "grad_norm": 0.498046875, + "learning_rate": 0.00018761091333858353, + "loss": 0.4372, + "step": 7318 + }, + { + "epoch": 0.98, + "grad_norm": 0.65234375, + "learning_rate": 0.00018760529853245625, + "loss": 0.4096, + "step": 7319 + }, + { + "epoch": 0.98, + "grad_norm": 0.6640625, + "learning_rate": 0.00018759968253834484, + "loss": 0.3521, + "step": 7320 + }, + { + "epoch": 0.98, + "grad_norm": 0.75390625, + "learning_rate": 0.00018759406535632557, + "loss": 0.4063, + "step": 7321 + }, + { + "epoch": 0.98, + "grad_norm": 0.43359375, + "learning_rate": 0.00018758844698647456, + "loss": 0.4144, + "step": 7322 + }, + { + "epoch": 0.98, + "grad_norm": 0.41015625, + "learning_rate": 0.00018758282742886796, + "loss": 0.3137, + "step": 7323 + }, + { + "epoch": 0.98, + "grad_norm": 0.546875, + "learning_rate": 0.00018757720668358203, + "loss": 0.46, + "step": 7324 + }, + { + "epoch": 0.98, + "grad_norm": 0.5859375, + "learning_rate": 0.00018757158475069297, + "loss": 0.3171, + "step": 7325 + }, + { + "epoch": 0.98, + "grad_norm": 0.52734375, + "learning_rate": 0.000187565961630277, + "loss": 0.2211, + "step": 7326 + }, + { + "epoch": 0.98, + "grad_norm": 0.466796875, + "learning_rate": 0.0001875603373224104, + "loss": 0.4867, + "step": 7327 + }, + { + "epoch": 0.98, + "grad_norm": 0.466796875, + "learning_rate": 0.00018755471182716943, + "loss": 0.3885, + "step": 7328 + }, + { + "epoch": 0.98, + "grad_norm": 0.69140625, + "learning_rate": 0.00018754908514463038, + "loss": 0.4759, + "step": 7329 + }, + { + "epoch": 0.98, + "grad_norm": 0.63671875, + "learning_rate": 0.00018754345727486956, + "loss": 0.5986, + "step": 7330 + }, + { + "epoch": 0.98, + "grad_norm": 0.56640625, + "learning_rate": 0.00018753782821796325, + "loss": 0.6805, + "step": 7331 + }, + { + "epoch": 0.98, + "grad_norm": 0.48828125, + "learning_rate": 0.0001875321979739878, + "loss": 0.4589, + "step": 7332 + }, + { + "epoch": 0.98, + "grad_norm": 0.5625, + "learning_rate": 0.00018752656654301962, + "loss": 0.4257, + "step": 7333 + }, + { + "epoch": 0.98, + "grad_norm": 0.63671875, + "learning_rate": 0.000187520933925135, + "loss": 0.37, + "step": 7334 + }, + { + "epoch": 0.98, + "grad_norm": 0.63671875, + "learning_rate": 0.00018751530012041032, + "loss": 0.4331, + "step": 7335 + }, + { + "epoch": 0.98, + "grad_norm": 0.7578125, + "learning_rate": 0.000187509665128922, + "loss": 0.2881, + "step": 7336 + }, + { + "epoch": 0.98, + "grad_norm": 0.6171875, + "learning_rate": 0.0001875040289507465, + "loss": 0.5307, + "step": 7337 + }, + { + "epoch": 0.98, + "grad_norm": 0.5703125, + "learning_rate": 0.0001874983915859602, + "loss": 0.446, + "step": 7338 + }, + { + "epoch": 0.98, + "grad_norm": 0.5625, + "learning_rate": 0.00018749275303463951, + "loss": 0.3835, + "step": 7339 + }, + { + "epoch": 0.98, + "grad_norm": 0.6953125, + "learning_rate": 0.00018748711329686098, + "loss": 0.2732, + "step": 7340 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390625, + "learning_rate": 0.00018748147237270102, + "loss": 0.3158, + "step": 7341 + }, + { + "epoch": 0.98, + "grad_norm": 0.58203125, + "learning_rate": 0.00018747583026223615, + "loss": 0.2396, + "step": 7342 + }, + { + "epoch": 0.98, + "grad_norm": 0.55078125, + "learning_rate": 0.0001874701869655429, + "loss": 0.4705, + "step": 7343 + }, + { + "epoch": 0.98, + "grad_norm": 0.4140625, + "learning_rate": 0.00018746454248269777, + "loss": 0.519, + "step": 7344 + }, + { + "epoch": 0.98, + "grad_norm": 0.375, + "learning_rate": 0.00018745889681377729, + "loss": 0.3671, + "step": 7345 + }, + { + "epoch": 0.98, + "grad_norm": 0.71875, + "learning_rate": 0.00018745324995885805, + "loss": 0.4762, + "step": 7346 + }, + { + "epoch": 0.98, + "grad_norm": 0.55859375, + "learning_rate": 0.00018744760191801663, + "loss": 0.8158, + "step": 7347 + }, + { + "epoch": 0.98, + "grad_norm": 0.474609375, + "learning_rate": 0.00018744195269132958, + "loss": 0.5196, + "step": 7348 + }, + { + "epoch": 0.98, + "grad_norm": 0.5625, + "learning_rate": 0.00018743630227887353, + "loss": 0.3234, + "step": 7349 + }, + { + "epoch": 0.98, + "grad_norm": 0.6171875, + "learning_rate": 0.00018743065068072513, + "loss": 0.2686, + "step": 7350 + }, + { + "epoch": 0.98, + "grad_norm": 0.6484375, + "learning_rate": 0.00018742499789696097, + "loss": 0.7015, + "step": 7351 + }, + { + "epoch": 0.98, + "grad_norm": 0.72265625, + "learning_rate": 0.00018741934392765773, + "loss": 0.4769, + "step": 7352 + }, + { + "epoch": 0.98, + "grad_norm": 0.70703125, + "learning_rate": 0.00018741368877289212, + "loss": 0.3893, + "step": 7353 + }, + { + "epoch": 0.98, + "grad_norm": 0.640625, + "learning_rate": 0.00018740803243274073, + "loss": 0.3155, + "step": 7354 + }, + { + "epoch": 0.98, + "grad_norm": 0.5546875, + "learning_rate": 0.00018740237490728036, + "loss": 0.3829, + "step": 7355 + }, + { + "epoch": 0.98, + "grad_norm": 0.60546875, + "learning_rate": 0.0001873967161965877, + "loss": 0.2622, + "step": 7356 + }, + { + "epoch": 0.98, + "grad_norm": 0.73828125, + "learning_rate": 0.00018739105630073946, + "loss": 0.3913, + "step": 7357 + }, + { + "epoch": 0.98, + "grad_norm": 0.65234375, + "learning_rate": 0.00018738539521981243, + "loss": 0.533, + "step": 7358 + }, + { + "epoch": 0.98, + "grad_norm": 0.7109375, + "learning_rate": 0.00018737973295388335, + "loss": 0.5404, + "step": 7359 + }, + { + "epoch": 0.98, + "grad_norm": 0.8203125, + "learning_rate": 0.00018737406950302898, + "loss": 0.2648, + "step": 7360 + }, + { + "epoch": 0.98, + "grad_norm": 0.59765625, + "learning_rate": 0.00018736840486732617, + "loss": 0.4863, + "step": 7361 + }, + { + "epoch": 0.98, + "grad_norm": 0.71875, + "learning_rate": 0.0001873627390468517, + "loss": 0.4283, + "step": 7362 + }, + { + "epoch": 0.98, + "grad_norm": 0.49609375, + "learning_rate": 0.00018735707204168244, + "loss": 0.3772, + "step": 7363 + }, + { + "epoch": 0.98, + "grad_norm": 0.515625, + "learning_rate": 0.00018735140385189522, + "loss": 0.5731, + "step": 7364 + }, + { + "epoch": 0.98, + "grad_norm": 0.640625, + "learning_rate": 0.0001873457344775669, + "loss": 0.4518, + "step": 7365 + }, + { + "epoch": 0.98, + "grad_norm": 0.39453125, + "learning_rate": 0.00018734006391877436, + "loss": 0.4834, + "step": 7366 + }, + { + "epoch": 0.98, + "grad_norm": 0.470703125, + "learning_rate": 0.0001873343921755945, + "loss": 0.3252, + "step": 7367 + }, + { + "epoch": 0.98, + "grad_norm": 0.515625, + "learning_rate": 0.00018732871924810422, + "loss": 0.458, + "step": 7368 + }, + { + "epoch": 0.98, + "grad_norm": 0.51953125, + "learning_rate": 0.00018732304513638048, + "loss": 0.2208, + "step": 7369 + }, + { + "epoch": 0.98, + "grad_norm": 0.63671875, + "learning_rate": 0.00018731736984050018, + "loss": 0.614, + "step": 7370 + }, + { + "epoch": 0.98, + "grad_norm": 0.3984375, + "learning_rate": 0.00018731169336054032, + "loss": 0.2198, + "step": 7371 + }, + { + "epoch": 0.98, + "grad_norm": 0.5078125, + "learning_rate": 0.0001873060156965779, + "loss": 0.4248, + "step": 7372 + }, + { + "epoch": 0.98, + "grad_norm": 0.58203125, + "learning_rate": 0.0001873003368486898, + "loss": 0.4083, + "step": 7373 + }, + { + "epoch": 0.98, + "grad_norm": 0.58984375, + "learning_rate": 0.00018729465681695316, + "loss": 0.539, + "step": 7374 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390625, + "learning_rate": 0.00018728897560144493, + "loss": 0.5833, + "step": 7375 + }, + { + "epoch": 0.98, + "grad_norm": 0.53515625, + "learning_rate": 0.00018728329320224216, + "loss": 0.4494, + "step": 7376 + }, + { + "epoch": 0.98, + "grad_norm": 0.43359375, + "learning_rate": 0.00018727760961942194, + "loss": 0.4591, + "step": 7377 + }, + { + "epoch": 0.98, + "grad_norm": 0.6953125, + "learning_rate": 0.00018727192485306127, + "loss": 0.4659, + "step": 7378 + }, + { + "epoch": 0.98, + "grad_norm": 0.578125, + "learning_rate": 0.00018726623890323731, + "loss": 0.4277, + "step": 7379 + }, + { + "epoch": 0.98, + "grad_norm": 0.5546875, + "learning_rate": 0.00018726055177002718, + "loss": 0.3815, + "step": 7380 + }, + { + "epoch": 0.98, + "grad_norm": 0.91796875, + "learning_rate": 0.00018725486345350795, + "loss": 0.4677, + "step": 7381 + }, + { + "epoch": 0.99, + "grad_norm": 0.59765625, + "learning_rate": 0.00018724917395375674, + "loss": 0.5225, + "step": 7382 + }, + { + "epoch": 0.99, + "grad_norm": 0.490234375, + "learning_rate": 0.00018724348327085077, + "loss": 0.2616, + "step": 7383 + }, + { + "epoch": 0.99, + "grad_norm": 0.984375, + "learning_rate": 0.00018723779140486716, + "loss": 0.6891, + "step": 7384 + }, + { + "epoch": 0.99, + "grad_norm": 0.625, + "learning_rate": 0.0001872320983558831, + "loss": 0.3824, + "step": 7385 + }, + { + "epoch": 0.99, + "grad_norm": 0.796875, + "learning_rate": 0.00018722640412397582, + "loss": 0.4622, + "step": 7386 + }, + { + "epoch": 0.99, + "grad_norm": 0.83203125, + "learning_rate": 0.0001872207087092225, + "loss": 0.4124, + "step": 7387 + }, + { + "epoch": 0.99, + "grad_norm": 0.462890625, + "learning_rate": 0.0001872150121117004, + "loss": 0.2756, + "step": 7388 + }, + { + "epoch": 0.99, + "grad_norm": 0.65625, + "learning_rate": 0.00018720931433148676, + "loss": 0.7755, + "step": 7389 + }, + { + "epoch": 0.99, + "grad_norm": 0.419921875, + "learning_rate": 0.00018720361536865885, + "loss": 0.4652, + "step": 7390 + }, + { + "epoch": 0.99, + "grad_norm": 0.625, + "learning_rate": 0.00018719791522329394, + "loss": 0.5347, + "step": 7391 + }, + { + "epoch": 0.99, + "grad_norm": 0.49609375, + "learning_rate": 0.00018719221389546935, + "loss": 0.3991, + "step": 7392 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 0.00018718651138526236, + "loss": 0.4291, + "step": 7393 + }, + { + "epoch": 0.99, + "grad_norm": 0.61328125, + "learning_rate": 0.00018718080769275034, + "loss": 0.3342, + "step": 7394 + }, + { + "epoch": 0.99, + "grad_norm": 0.6015625, + "learning_rate": 0.0001871751028180106, + "loss": 0.6498, + "step": 7395 + }, + { + "epoch": 0.99, + "grad_norm": 0.62890625, + "learning_rate": 0.00018716939676112056, + "loss": 0.4196, + "step": 7396 + }, + { + "epoch": 0.99, + "grad_norm": 0.45703125, + "learning_rate": 0.00018716368952215748, + "loss": 0.2885, + "step": 7397 + }, + { + "epoch": 0.99, + "grad_norm": 0.60546875, + "learning_rate": 0.0001871579811011989, + "loss": 0.4524, + "step": 7398 + }, + { + "epoch": 0.99, + "grad_norm": 0.60546875, + "learning_rate": 0.00018715227149832214, + "loss": 0.5106, + "step": 7399 + }, + { + "epoch": 0.99, + "grad_norm": 0.609375, + "learning_rate": 0.0001871465607136046, + "loss": 0.3362, + "step": 7400 + }, + { + "epoch": 0.99, + "grad_norm": 0.6796875, + "learning_rate": 0.0001871408487471238, + "loss": 0.4431, + "step": 7401 + }, + { + "epoch": 0.99, + "grad_norm": 0.52734375, + "learning_rate": 0.0001871351355989572, + "loss": 0.2589, + "step": 7402 + }, + { + "epoch": 0.99, + "grad_norm": 0.86328125, + "learning_rate": 0.00018712942126918218, + "loss": 0.4128, + "step": 7403 + }, + { + "epoch": 0.99, + "grad_norm": 0.69921875, + "learning_rate": 0.0001871237057578763, + "loss": 0.5186, + "step": 7404 + }, + { + "epoch": 0.99, + "grad_norm": 0.55078125, + "learning_rate": 0.00018711798906511707, + "loss": 0.5269, + "step": 7405 + }, + { + "epoch": 0.99, + "grad_norm": 0.8125, + "learning_rate": 0.000187112271190982, + "loss": 0.3392, + "step": 7406 + }, + { + "epoch": 0.99, + "grad_norm": 0.609375, + "learning_rate": 0.0001871065521355486, + "loss": 0.257, + "step": 7407 + }, + { + "epoch": 0.99, + "grad_norm": 0.7734375, + "learning_rate": 0.0001871008318988945, + "loss": 0.4404, + "step": 7408 + }, + { + "epoch": 0.99, + "grad_norm": 0.484375, + "learning_rate": 0.00018709511048109717, + "loss": 0.4989, + "step": 7409 + }, + { + "epoch": 0.99, + "grad_norm": 0.796875, + "learning_rate": 0.00018708938788223424, + "loss": 0.6208, + "step": 7410 + }, + { + "epoch": 0.99, + "grad_norm": 0.46484375, + "learning_rate": 0.00018708366410238333, + "loss": 0.5008, + "step": 7411 + }, + { + "epoch": 0.99, + "grad_norm": 0.419921875, + "learning_rate": 0.00018707793914162207, + "loss": 0.2347, + "step": 7412 + }, + { + "epoch": 0.99, + "grad_norm": 0.46484375, + "learning_rate": 0.00018707221300002803, + "loss": 0.4855, + "step": 7413 + }, + { + "epoch": 0.99, + "grad_norm": 0.6875, + "learning_rate": 0.00018706648567767893, + "loss": 0.7403, + "step": 7414 + }, + { + "epoch": 0.99, + "grad_norm": 0.3984375, + "learning_rate": 0.0001870607571746524, + "loss": 0.2405, + "step": 7415 + }, + { + "epoch": 0.99, + "grad_norm": 0.50390625, + "learning_rate": 0.0001870550274910261, + "loss": 0.3768, + "step": 7416 + }, + { + "epoch": 0.99, + "grad_norm": 0.5390625, + "learning_rate": 0.00018704929662687777, + "loss": 0.5925, + "step": 7417 + }, + { + "epoch": 0.99, + "grad_norm": 0.984375, + "learning_rate": 0.00018704356458228513, + "loss": 0.72, + "step": 7418 + }, + { + "epoch": 0.99, + "grad_norm": 0.61328125, + "learning_rate": 0.00018703783135732587, + "loss": 0.824, + "step": 7419 + }, + { + "epoch": 0.99, + "grad_norm": 0.5546875, + "learning_rate": 0.00018703209695207775, + "loss": 0.6954, + "step": 7420 + }, + { + "epoch": 0.99, + "grad_norm": 0.703125, + "learning_rate": 0.00018702636136661856, + "loss": 0.4701, + "step": 7421 + }, + { + "epoch": 0.99, + "grad_norm": 0.52734375, + "learning_rate": 0.00018702062460102605, + "loss": 0.6303, + "step": 7422 + }, + { + "epoch": 0.99, + "grad_norm": 0.404296875, + "learning_rate": 0.00018701488665537802, + "loss": 0.1961, + "step": 7423 + }, + { + "epoch": 0.99, + "grad_norm": 0.61328125, + "learning_rate": 0.00018700914752975227, + "loss": 0.4018, + "step": 7424 + }, + { + "epoch": 0.99, + "grad_norm": 0.57421875, + "learning_rate": 0.00018700340722422666, + "loss": 0.5255, + "step": 7425 + }, + { + "epoch": 0.99, + "grad_norm": 0.5, + "learning_rate": 0.000186997665738879, + "loss": 0.3699, + "step": 7426 + }, + { + "epoch": 0.99, + "grad_norm": 0.609375, + "learning_rate": 0.00018699192307378715, + "loss": 0.2782, + "step": 7427 + }, + { + "epoch": 0.99, + "grad_norm": 0.65625, + "learning_rate": 0.00018698617922902899, + "loss": 0.4247, + "step": 7428 + }, + { + "epoch": 0.99, + "grad_norm": 0.62890625, + "learning_rate": 0.0001869804342046824, + "loss": 0.5504, + "step": 7429 + }, + { + "epoch": 0.99, + "grad_norm": 0.50390625, + "learning_rate": 0.00018697468800082533, + "loss": 0.4795, + "step": 7430 + }, + { + "epoch": 0.99, + "grad_norm": 0.5859375, + "learning_rate": 0.00018696894061753566, + "loss": 0.5472, + "step": 7431 + }, + { + "epoch": 0.99, + "grad_norm": 0.62890625, + "learning_rate": 0.00018696319205489134, + "loss": 0.3345, + "step": 7432 + }, + { + "epoch": 0.99, + "grad_norm": 0.82421875, + "learning_rate": 0.00018695744231297031, + "loss": 0.2179, + "step": 7433 + }, + { + "epoch": 0.99, + "grad_norm": 0.8203125, + "learning_rate": 0.00018695169139185057, + "loss": 0.2496, + "step": 7434 + }, + { + "epoch": 0.99, + "grad_norm": 0.494140625, + "learning_rate": 0.0001869459392916101, + "loss": 0.8094, + "step": 7435 + }, + { + "epoch": 0.99, + "grad_norm": 0.5859375, + "learning_rate": 0.00018694018601232685, + "loss": 0.3617, + "step": 7436 + }, + { + "epoch": 0.99, + "grad_norm": 0.484375, + "learning_rate": 0.0001869344315540789, + "loss": 0.4318, + "step": 7437 + }, + { + "epoch": 0.99, + "grad_norm": 0.69921875, + "learning_rate": 0.00018692867591694426, + "loss": 0.4664, + "step": 7438 + }, + { + "epoch": 0.99, + "grad_norm": 0.68359375, + "learning_rate": 0.00018692291910100102, + "loss": 0.4406, + "step": 7439 + }, + { + "epoch": 0.99, + "grad_norm": 0.404296875, + "learning_rate": 0.00018691716110632719, + "loss": 0.3443, + "step": 7440 + }, + { + "epoch": 0.99, + "grad_norm": 0.5234375, + "learning_rate": 0.00018691140193300085, + "loss": 0.5079, + "step": 7441 + }, + { + "epoch": 0.99, + "grad_norm": 0.69921875, + "learning_rate": 0.00018690564158110014, + "loss": 0.3403, + "step": 7442 + }, + { + "epoch": 0.99, + "grad_norm": 0.671875, + "learning_rate": 0.00018689988005070313, + "loss": 0.2742, + "step": 7443 + }, + { + "epoch": 0.99, + "grad_norm": 0.427734375, + "learning_rate": 0.00018689411734188803, + "loss": 0.2113, + "step": 7444 + }, + { + "epoch": 0.99, + "grad_norm": 0.53515625, + "learning_rate": 0.00018688835345473287, + "loss": 0.448, + "step": 7445 + }, + { + "epoch": 0.99, + "grad_norm": 0.4453125, + "learning_rate": 0.0001868825883893159, + "loss": 0.3631, + "step": 7446 + }, + { + "epoch": 0.99, + "grad_norm": 0.72265625, + "learning_rate": 0.00018687682214571527, + "loss": 0.6224, + "step": 7447 + }, + { + "epoch": 0.99, + "grad_norm": 0.423828125, + "learning_rate": 0.00018687105472400917, + "loss": 0.2439, + "step": 7448 + }, + { + "epoch": 0.99, + "grad_norm": 0.58203125, + "learning_rate": 0.0001868652861242758, + "loss": 0.5854, + "step": 7449 + }, + { + "epoch": 0.99, + "grad_norm": 0.5, + "learning_rate": 0.00018685951634659343, + "loss": 0.5485, + "step": 7450 + }, + { + "epoch": 0.99, + "grad_norm": 0.435546875, + "learning_rate": 0.00018685374539104027, + "loss": 0.4229, + "step": 7451 + }, + { + "epoch": 0.99, + "grad_norm": 0.3984375, + "learning_rate": 0.00018684797325769457, + "loss": 0.3344, + "step": 7452 + }, + { + "epoch": 0.99, + "grad_norm": 0.68359375, + "learning_rate": 0.0001868421999466346, + "loss": 0.1509, + "step": 7453 + }, + { + "epoch": 0.99, + "grad_norm": 0.69140625, + "learning_rate": 0.0001868364254579387, + "loss": 0.5479, + "step": 7454 + }, + { + "epoch": 0.99, + "grad_norm": 0.380859375, + "learning_rate": 0.00018683064979168508, + "loss": 0.339, + "step": 7455 + }, + { + "epoch": 0.99, + "grad_norm": 0.58984375, + "learning_rate": 0.0001868248729479522, + "loss": 0.5419, + "step": 7456 + }, + { + "epoch": 1.0, + "grad_norm": 0.4296875, + "learning_rate": 0.00018681909492681827, + "loss": 0.2651, + "step": 7457 + }, + { + "epoch": 1.0, + "grad_norm": 0.78125, + "learning_rate": 0.0001868133157283617, + "loss": 0.4268, + "step": 7458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7265625, + "learning_rate": 0.00018680753535266085, + "loss": 0.2025, + "step": 7459 + }, + { + "epoch": 1.0, + "grad_norm": 0.80078125, + "learning_rate": 0.0001868017537997941, + "loss": 0.3048, + "step": 7460 + }, + { + "epoch": 1.0, + "grad_norm": 0.447265625, + "learning_rate": 0.00018679597106983987, + "loss": 0.5556, + "step": 7461 + }, + { + "epoch": 1.0, + "grad_norm": 0.423828125, + "learning_rate": 0.00018679018716287658, + "loss": 0.3617, + "step": 7462 + }, + { + "epoch": 1.0, + "grad_norm": 0.48828125, + "learning_rate": 0.00018678440207898264, + "loss": 0.3008, + "step": 7463 + }, + { + "epoch": 1.0, + "grad_norm": 0.43359375, + "learning_rate": 0.00018677861581823649, + "loss": 0.365, + "step": 7464 + }, + { + "epoch": 1.0, + "grad_norm": 0.55859375, + "learning_rate": 0.00018677282838071662, + "loss": 0.6067, + "step": 7465 + }, + { + "epoch": 1.0, + "grad_norm": 0.4921875, + "learning_rate": 0.00018676703976650154, + "loss": 0.3691, + "step": 7466 + }, + { + "epoch": 1.0, + "grad_norm": 0.67578125, + "learning_rate": 0.0001867612499756697, + "loss": 0.2305, + "step": 7467 + }, + { + "epoch": 1.0, + "grad_norm": 0.89453125, + "learning_rate": 0.00018675545900829961, + "loss": 0.4651, + "step": 7468 + }, + { + "epoch": 1.0, + "grad_norm": 0.54296875, + "learning_rate": 0.00018674966686446984, + "loss": 0.2886, + "step": 7469 + }, + { + "epoch": 1.0, + "grad_norm": 0.72265625, + "learning_rate": 0.00018674387354425888, + "loss": 0.4259, + "step": 7470 + }, + { + "epoch": 1.0, + "grad_norm": 0.423828125, + "learning_rate": 0.00018673807904774536, + "loss": 0.3509, + "step": 7471 + }, + { + "epoch": 1.0, + "grad_norm": 0.43359375, + "learning_rate": 0.0001867322833750078, + "loss": 0.2906, + "step": 7472 + }, + { + "epoch": 1.0, + "grad_norm": 0.5859375, + "learning_rate": 0.00018672648652612482, + "loss": 0.2943, + "step": 7473 + }, + { + "epoch": 1.0, + "grad_norm": 0.54296875, + "learning_rate": 0.00018672068850117502, + "loss": 0.2387, + "step": 7474 + }, + { + "epoch": 1.0, + "grad_norm": 0.427734375, + "learning_rate": 0.000186714889300237, + "loss": 0.4802, + "step": 7475 + }, + { + "epoch": 1.0, + "grad_norm": 0.48828125, + "learning_rate": 0.00018670908892338945, + "loss": 0.4584, + "step": 7476 + }, + { + "epoch": 1.0, + "grad_norm": 0.6640625, + "learning_rate": 0.00018670328737071103, + "loss": 0.5914, + "step": 7477 + }, + { + "epoch": 1.0, + "grad_norm": 0.58984375, + "learning_rate": 0.00018669748464228035, + "loss": 0.332, + "step": 7478 + }, + { + "epoch": 1.0, + "grad_norm": 0.578125, + "learning_rate": 0.00018669168073817615, + "loss": 0.3338, + "step": 7479 + }, + { + "epoch": 1.0, + "grad_norm": 0.5703125, + "learning_rate": 0.00018668587565847712, + "loss": 0.3781, + "step": 7480 + }, + { + "epoch": 1.0, + "grad_norm": 0.51171875, + "learning_rate": 0.000186680069403262, + "loss": 0.342, + "step": 7481 + }, + { + "epoch": 1.0, + "grad_norm": 0.5859375, + "learning_rate": 0.0001866742619726095, + "loss": 0.3038, + "step": 7482 + }, + { + "epoch": 1.0, + "grad_norm": 0.5234375, + "learning_rate": 0.00018666845336659835, + "loss": 0.3005, + "step": 7483 + }, + { + "epoch": 1.0, + "grad_norm": 0.73046875, + "learning_rate": 0.00018666264358530737, + "loss": 0.5642, + "step": 7484 + }, + { + "epoch": 1.0, + "grad_norm": 0.51953125, + "learning_rate": 0.0001866568326288153, + "loss": 0.6502, + "step": 7485 + }, + { + "epoch": 1.0, + "grad_norm": 0.55078125, + "learning_rate": 0.000186651020497201, + "loss": 0.4891, + "step": 7486 + }, + { + "epoch": 1.0, + "grad_norm": 0.5390625, + "learning_rate": 0.00018664520719054325, + "loss": 0.3032, + "step": 7487 + }, + { + "epoch": 1.0, + "grad_norm": 0.70703125, + "learning_rate": 0.00018663939270892086, + "loss": 0.537, + "step": 7488 + }, + { + "epoch": 1.0, + "grad_norm": 0.58984375, + "learning_rate": 0.0001866335770524127, + "loss": 0.4228, + "step": 7489 + }, + { + "epoch": 1.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00018662776022109764, + "loss": 0.3332, + "step": 7490 + }, + { + "epoch": 1.0, + "grad_norm": 0.62109375, + "learning_rate": 0.00018662194221505458, + "loss": 0.3391, + "step": 7491 + }, + { + "epoch": 1.0, + "grad_norm": 0.5625, + "learning_rate": 0.00018661612303436236, + "loss": 0.4321, + "step": 7492 + }, + { + "epoch": 1.0, + "grad_norm": 0.53515625, + "learning_rate": 0.00018661030267909991, + "loss": 0.2155, + "step": 7493 + }, + { + "epoch": 1.0, + "grad_norm": 0.6484375, + "learning_rate": 0.00018660448114934619, + "loss": 0.7903, + "step": 7494 + }, + { + "epoch": 1.0, + "grad_norm": 0.53125, + "learning_rate": 0.00018659865844518014, + "loss": 0.3399, + "step": 7495 + }, + { + "epoch": 1.0, + "grad_norm": 0.46875, + "learning_rate": 0.00018659283456668067, + "loss": 0.4113, + "step": 7496 + }, + { + "epoch": 1.0, + "grad_norm": 0.4765625, + "learning_rate": 0.00018658700951392683, + "loss": 0.5027, + "step": 7497 + }, + { + "epoch": 1.0, + "grad_norm": 0.50390625, + "learning_rate": 0.0001865811832869975, + "loss": 0.4188, + "step": 7498 + }, + { + "epoch": 1.0, + "grad_norm": 0.8984375, + "learning_rate": 0.00018657535588597184, + "loss": 0.4796, + "step": 7499 + }, + { + "epoch": 1.0, + "grad_norm": 0.79296875, + "learning_rate": 0.00018656952731092874, + "loss": 0.3748, + "step": 7500 + }, + { + "epoch": 1.0, + "grad_norm": 0.5078125, + "learning_rate": 0.0001865636975619473, + "loss": 0.3733, + "step": 7501 + }, + { + "epoch": 1.0, + "grad_norm": 0.412109375, + "learning_rate": 0.00018655786663910657, + "loss": 0.4825, + "step": 7502 + }, + { + "epoch": 1.0, + "grad_norm": 0.51171875, + "learning_rate": 0.0001865520345424856, + "loss": 0.3763, + "step": 7503 + }, + { + "epoch": 1.0, + "grad_norm": 0.470703125, + "learning_rate": 0.00018654620127216349, + "loss": 0.4636, + "step": 7504 + }, + { + "epoch": 1.0, + "grad_norm": 0.369140625, + "learning_rate": 0.00018654036682821937, + "loss": 0.1685, + "step": 7505 + }, + { + "epoch": 1.0, + "grad_norm": 0.62890625, + "learning_rate": 0.00018653453121073233, + "loss": 0.2849, + "step": 7506 + }, + { + "epoch": 1.0, + "grad_norm": 0.5859375, + "learning_rate": 0.00018652869441978148, + "loss": 0.3196, + "step": 7507 + }, + { + "epoch": 1.0, + "grad_norm": 0.8984375, + "learning_rate": 0.00018652285645544603, + "loss": 0.5028, + "step": 7508 + }, + { + "epoch": 1.0, + "grad_norm": 0.447265625, + "learning_rate": 0.0001865170173178051, + "loss": 0.2979, + "step": 7509 + }, + { + "epoch": 1.0, + "grad_norm": 0.65234375, + "learning_rate": 0.00018651117700693793, + "loss": 0.5039, + "step": 7510 + }, + { + "epoch": 1.0, + "grad_norm": 0.52734375, + "learning_rate": 0.00018650533552292363, + "loss": 0.5834, + "step": 7511 + }, + { + "epoch": 1.0, + "grad_norm": 1.4296875, + "learning_rate": 0.0001864994928658415, + "loss": 0.2841, + "step": 7512 + }, + { + "epoch": 1.0, + "grad_norm": 0.65234375, + "learning_rate": 0.0001864936490357707, + "loss": 0.3389, + "step": 7513 + }, + { + "epoch": 1.0, + "grad_norm": 0.486328125, + "learning_rate": 0.0001864878040327905, + "loss": 0.4301, + "step": 7514 + }, + { + "epoch": 1.0, + "grad_norm": 0.5546875, + "learning_rate": 0.00018648195785698018, + "loss": 0.3547, + "step": 7515 + }, + { + "epoch": 1.0, + "grad_norm": 0.5078125, + "learning_rate": 0.00018647611050841902, + "loss": 0.3918, + "step": 7516 + }, + { + "epoch": 1.0, + "grad_norm": 0.5859375, + "learning_rate": 0.0001864702619871863, + "loss": 0.3893, + "step": 7517 + }, + { + "epoch": 1.0, + "grad_norm": 0.58984375, + "learning_rate": 0.00018646441229336133, + "loss": 0.5488, + "step": 7518 + }, + { + "epoch": 1.0, + "grad_norm": 0.64453125, + "learning_rate": 0.00018645856142702344, + "loss": 0.7597, + "step": 7519 + }, + { + "epoch": 1.0, + "grad_norm": 0.73046875, + "learning_rate": 0.00018645270938825195, + "loss": 0.411, + "step": 7520 + }, + { + "epoch": 1.0, + "grad_norm": 0.54296875, + "learning_rate": 0.00018644685617712626, + "loss": 0.3844, + "step": 7521 + }, + { + "epoch": 1.0, + "grad_norm": 0.59765625, + "learning_rate": 0.0001864410017937257, + "loss": 0.7152, + "step": 7522 + }, + { + "epoch": 1.0, + "grad_norm": 0.6171875, + "learning_rate": 0.0001864351462381297, + "loss": 0.4579, + "step": 7523 + }, + { + "epoch": 1.0, + "grad_norm": 0.5, + "learning_rate": 0.0001864292895104176, + "loss": 0.3257, + "step": 7524 + }, + { + "epoch": 1.0, + "grad_norm": 0.5703125, + "learning_rate": 0.0001864234316106689, + "loss": 0.4964, + "step": 7525 + }, + { + "epoch": 1.0, + "grad_norm": 0.6015625, + "learning_rate": 0.00018641757253896302, + "loss": 0.6084, + "step": 7526 + }, + { + "epoch": 1.0, + "grad_norm": 0.73046875, + "learning_rate": 0.00018641171229537935, + "loss": 0.3672, + "step": 7527 + }, + { + "epoch": 1.0, + "grad_norm": 0.7421875, + "learning_rate": 0.00018640585087999743, + "loss": 0.3132, + "step": 7528 + }, + { + "epoch": 1.0, + "grad_norm": 0.7578125, + "learning_rate": 0.0001863999882928967, + "loss": 0.2417, + "step": 7529 + }, + { + "epoch": 1.0, + "grad_norm": 0.51953125, + "learning_rate": 0.0001863941245341567, + "loss": 0.5158, + "step": 7530 + }, + { + "epoch": 1.0, + "grad_norm": 0.69140625, + "learning_rate": 0.00018638825960385692, + "loss": 0.506, + "step": 7531 + }, + { + "epoch": 1.01, + "grad_norm": 0.63671875, + "learning_rate": 0.00018638239350207684, + "loss": 0.6136, + "step": 7532 + }, + { + "epoch": 1.01, + "grad_norm": 0.6796875, + "learning_rate": 0.00018637652622889613, + "loss": 0.6055, + "step": 7533 + }, + { + "epoch": 1.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00018637065778439427, + "loss": 0.4576, + "step": 7534 + }, + { + "epoch": 1.01, + "grad_norm": 0.51171875, + "learning_rate": 0.00018636478816865085, + "loss": 0.4535, + "step": 7535 + }, + { + "epoch": 1.01, + "grad_norm": 0.625, + "learning_rate": 0.00018635891738174549, + "loss": 0.3641, + "step": 7536 + }, + { + "epoch": 1.01, + "grad_norm": 0.6875, + "learning_rate": 0.00018635304542375777, + "loss": 0.5978, + "step": 7537 + }, + { + "epoch": 1.01, + "grad_norm": 0.46875, + "learning_rate": 0.00018634717229476736, + "loss": 0.3581, + "step": 7538 + }, + { + "epoch": 1.01, + "grad_norm": 0.48046875, + "learning_rate": 0.00018634129799485383, + "loss": 0.2136, + "step": 7539 + }, + { + "epoch": 1.01, + "grad_norm": 0.5546875, + "learning_rate": 0.0001863354225240969, + "loss": 0.6943, + "step": 7540 + }, + { + "epoch": 1.01, + "grad_norm": 0.39453125, + "learning_rate": 0.00018632954588257625, + "loss": 0.3508, + "step": 7541 + }, + { + "epoch": 1.01, + "grad_norm": 0.474609375, + "learning_rate": 0.00018632366807037155, + "loss": 0.3684, + "step": 7542 + }, + { + "epoch": 1.01, + "grad_norm": 0.59765625, + "learning_rate": 0.0001863177890875625, + "loss": 0.4681, + "step": 7543 + }, + { + "epoch": 1.01, + "grad_norm": 0.4140625, + "learning_rate": 0.00018631190893422884, + "loss": 0.2718, + "step": 7544 + }, + { + "epoch": 1.01, + "grad_norm": 0.69921875, + "learning_rate": 0.0001863060276104503, + "loss": 0.523, + "step": 7545 + }, + { + "epoch": 1.01, + "grad_norm": 0.5625, + "learning_rate": 0.00018630014511630661, + "loss": 0.5498, + "step": 7546 + }, + { + "epoch": 1.01, + "grad_norm": 0.46875, + "learning_rate": 0.00018629426145187758, + "loss": 0.6187, + "step": 7547 + }, + { + "epoch": 1.01, + "grad_norm": 0.73046875, + "learning_rate": 0.00018628837661724298, + "loss": 0.4351, + "step": 7548 + }, + { + "epoch": 1.01, + "grad_norm": 0.56640625, + "learning_rate": 0.00018628249061248262, + "loss": 0.5633, + "step": 7549 + }, + { + "epoch": 1.01, + "grad_norm": 0.5625, + "learning_rate": 0.00018627660343767632, + "loss": 0.5233, + "step": 7550 + }, + { + "epoch": 1.01, + "grad_norm": 0.62890625, + "learning_rate": 0.0001862707150929039, + "loss": 0.188, + "step": 7551 + }, + { + "epoch": 1.01, + "grad_norm": 0.55078125, + "learning_rate": 0.0001862648255782452, + "loss": 0.4642, + "step": 7552 + }, + { + "epoch": 1.01, + "grad_norm": 0.359375, + "learning_rate": 0.0001862589348937801, + "loss": 0.299, + "step": 7553 + }, + { + "epoch": 1.01, + "grad_norm": 0.359375, + "learning_rate": 0.00018625304303958852, + "loss": 0.183, + "step": 7554 + }, + { + "epoch": 1.01, + "grad_norm": 0.5703125, + "learning_rate": 0.0001862471500157503, + "loss": 0.5628, + "step": 7555 + }, + { + "epoch": 1.01, + "grad_norm": 0.53515625, + "learning_rate": 0.00018624125582234537, + "loss": 0.4162, + "step": 7556 + }, + { + "epoch": 1.01, + "grad_norm": 0.4921875, + "learning_rate": 0.00018623536045945364, + "loss": 0.2456, + "step": 7557 + }, + { + "epoch": 1.01, + "grad_norm": 0.65625, + "learning_rate": 0.0001862294639271551, + "loss": 0.2288, + "step": 7558 + }, + { + "epoch": 1.01, + "grad_norm": 0.5078125, + "learning_rate": 0.0001862235662255297, + "loss": 0.334, + "step": 7559 + }, + { + "epoch": 1.01, + "grad_norm": 0.53515625, + "learning_rate": 0.00018621766735465734, + "loss": 0.5197, + "step": 7560 + }, + { + "epoch": 1.01, + "grad_norm": 0.484375, + "learning_rate": 0.00018621176731461816, + "loss": 0.3767, + "step": 7561 + }, + { + "epoch": 1.01, + "grad_norm": 0.76953125, + "learning_rate": 0.00018620586610549204, + "loss": 0.3599, + "step": 7562 + }, + { + "epoch": 1.01, + "grad_norm": 1.0234375, + "learning_rate": 0.00018619996372735905, + "loss": 0.5453, + "step": 7563 + }, + { + "epoch": 1.01, + "grad_norm": 0.46484375, + "learning_rate": 0.00018619406018029923, + "loss": 0.4253, + "step": 7564 + }, + { + "epoch": 1.01, + "grad_norm": 0.4921875, + "learning_rate": 0.00018618815546439266, + "loss": 0.3579, + "step": 7565 + }, + { + "epoch": 1.01, + "grad_norm": 0.99609375, + "learning_rate": 0.00018618224957971937, + "loss": 0.5049, + "step": 7566 + }, + { + "epoch": 1.01, + "grad_norm": 0.6015625, + "learning_rate": 0.00018617634252635944, + "loss": 0.652, + "step": 7567 + }, + { + "epoch": 1.01, + "grad_norm": 0.5234375, + "learning_rate": 0.00018617043430439303, + "loss": 0.3832, + "step": 7568 + }, + { + "epoch": 1.01, + "grad_norm": 0.51171875, + "learning_rate": 0.00018616452491390022, + "loss": 0.5944, + "step": 7569 + }, + { + "epoch": 1.01, + "grad_norm": 0.59765625, + "learning_rate": 0.00018615861435496113, + "loss": 0.4721, + "step": 7570 + }, + { + "epoch": 1.01, + "grad_norm": 0.578125, + "learning_rate": 0.00018615270262765598, + "loss": 0.5546, + "step": 7571 + }, + { + "epoch": 1.01, + "grad_norm": 0.494140625, + "learning_rate": 0.00018614678973206486, + "loss": 0.349, + "step": 7572 + }, + { + "epoch": 1.01, + "grad_norm": 0.6015625, + "learning_rate": 0.00018614087566826796, + "loss": 0.4398, + "step": 7573 + }, + { + "epoch": 1.01, + "grad_norm": 0.734375, + "learning_rate": 0.00018613496043634554, + "loss": 0.306, + "step": 7574 + }, + { + "epoch": 1.01, + "grad_norm": 0.62890625, + "learning_rate": 0.00018612904403637775, + "loss": 0.5878, + "step": 7575 + }, + { + "epoch": 1.01, + "grad_norm": 0.62890625, + "learning_rate": 0.00018612312646844488, + "loss": 0.3611, + "step": 7576 + }, + { + "epoch": 1.01, + "grad_norm": 0.66796875, + "learning_rate": 0.0001861172077326271, + "loss": 0.3608, + "step": 7577 + }, + { + "epoch": 1.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00018611128782900472, + "loss": 0.6459, + "step": 7578 + }, + { + "epoch": 1.01, + "grad_norm": 0.5703125, + "learning_rate": 0.00018610536675765804, + "loss": 0.3444, + "step": 7579 + }, + { + "epoch": 1.01, + "grad_norm": 0.57421875, + "learning_rate": 0.00018609944451866733, + "loss": 0.2649, + "step": 7580 + }, + { + "epoch": 1.01, + "grad_norm": 0.52734375, + "learning_rate": 0.00018609352111211287, + "loss": 0.5939, + "step": 7581 + }, + { + "epoch": 1.01, + "grad_norm": 0.50390625, + "learning_rate": 0.000186087596538075, + "loss": 0.533, + "step": 7582 + }, + { + "epoch": 1.01, + "grad_norm": 0.5234375, + "learning_rate": 0.00018608167079663408, + "loss": 0.2532, + "step": 7583 + }, + { + "epoch": 1.01, + "grad_norm": 0.4921875, + "learning_rate": 0.00018607574388787045, + "loss": 0.2388, + "step": 7584 + }, + { + "epoch": 1.01, + "grad_norm": 0.6171875, + "learning_rate": 0.0001860698158118645, + "loss": 0.3241, + "step": 7585 + }, + { + "epoch": 1.01, + "grad_norm": 1.1796875, + "learning_rate": 0.00018606388656869661, + "loss": 0.2292, + "step": 7586 + }, + { + "epoch": 1.01, + "grad_norm": 0.494140625, + "learning_rate": 0.00018605795615844718, + "loss": 0.4453, + "step": 7587 + }, + { + "epoch": 1.01, + "grad_norm": 0.671875, + "learning_rate": 0.00018605202458119662, + "loss": 0.4118, + "step": 7588 + }, + { + "epoch": 1.01, + "grad_norm": 0.65234375, + "learning_rate": 0.00018604609183702538, + "loss": 0.5536, + "step": 7589 + }, + { + "epoch": 1.01, + "grad_norm": 0.478515625, + "learning_rate": 0.00018604015792601396, + "loss": 0.2748, + "step": 7590 + }, + { + "epoch": 1.01, + "grad_norm": 0.58203125, + "learning_rate": 0.00018603422284824272, + "loss": 0.3318, + "step": 7591 + }, + { + "epoch": 1.01, + "grad_norm": 0.6953125, + "learning_rate": 0.00018602828660379224, + "loss": 0.3863, + "step": 7592 + }, + { + "epoch": 1.01, + "grad_norm": 0.62890625, + "learning_rate": 0.00018602234919274296, + "loss": 0.4866, + "step": 7593 + }, + { + "epoch": 1.01, + "grad_norm": 0.60546875, + "learning_rate": 0.0001860164106151754, + "loss": 0.491, + "step": 7594 + }, + { + "epoch": 1.01, + "grad_norm": 0.90234375, + "learning_rate": 0.00018601047087117014, + "loss": 0.4393, + "step": 7595 + }, + { + "epoch": 1.01, + "grad_norm": 0.65625, + "learning_rate": 0.00018600452996080766, + "loss": 0.4855, + "step": 7596 + }, + { + "epoch": 1.01, + "grad_norm": 0.490234375, + "learning_rate": 0.0001859985878841686, + "loss": 0.535, + "step": 7597 + }, + { + "epoch": 1.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00018599264464133345, + "loss": 0.4045, + "step": 7598 + }, + { + "epoch": 1.01, + "grad_norm": 0.50390625, + "learning_rate": 0.00018598670023238286, + "loss": 0.285, + "step": 7599 + }, + { + "epoch": 1.01, + "grad_norm": 0.859375, + "learning_rate": 0.00018598075465739745, + "loss": 0.479, + "step": 7600 + }, + { + "epoch": 1.01, + "grad_norm": 0.80859375, + "learning_rate": 0.00018597480791645782, + "loss": 0.4698, + "step": 7601 + }, + { + "epoch": 1.01, + "grad_norm": 0.54296875, + "learning_rate": 0.0001859688600096446, + "loss": 0.3369, + "step": 7602 + }, + { + "epoch": 1.01, + "grad_norm": 0.515625, + "learning_rate": 0.0001859629109370385, + "loss": 0.4675, + "step": 7603 + }, + { + "epoch": 1.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00018595696069872013, + "loss": 0.5905, + "step": 7604 + }, + { + "epoch": 1.01, + "grad_norm": 0.462890625, + "learning_rate": 0.0001859510092947702, + "loss": 0.4076, + "step": 7605 + }, + { + "epoch": 1.01, + "grad_norm": 0.38671875, + "learning_rate": 0.00018594505672526942, + "loss": 0.3354, + "step": 7606 + }, + { + "epoch": 1.02, + "grad_norm": 0.5078125, + "learning_rate": 0.00018593910299029854, + "loss": 0.4482, + "step": 7607 + }, + { + "epoch": 1.02, + "grad_norm": 0.53515625, + "learning_rate": 0.00018593314808993826, + "loss": 0.3779, + "step": 7608 + }, + { + "epoch": 1.02, + "grad_norm": 0.474609375, + "learning_rate": 0.00018592719202426933, + "loss": 0.4237, + "step": 7609 + }, + { + "epoch": 1.02, + "grad_norm": 0.6875, + "learning_rate": 0.00018592123479337249, + "loss": 0.6018, + "step": 7610 + }, + { + "epoch": 1.02, + "grad_norm": 0.703125, + "learning_rate": 0.00018591527639732862, + "loss": 0.6764, + "step": 7611 + }, + { + "epoch": 1.02, + "grad_norm": 0.474609375, + "learning_rate": 0.00018590931683621845, + "loss": 0.6366, + "step": 7612 + }, + { + "epoch": 1.02, + "grad_norm": 0.609375, + "learning_rate": 0.0001859033561101228, + "loss": 0.3612, + "step": 7613 + }, + { + "epoch": 1.02, + "grad_norm": 0.671875, + "learning_rate": 0.0001858973942191225, + "loss": 0.3907, + "step": 7614 + }, + { + "epoch": 1.02, + "grad_norm": 0.71484375, + "learning_rate": 0.00018589143116329838, + "loss": 0.4481, + "step": 7615 + }, + { + "epoch": 1.02, + "grad_norm": 0.6953125, + "learning_rate": 0.0001858854669427314, + "loss": 0.4154, + "step": 7616 + }, + { + "epoch": 1.02, + "grad_norm": 0.396484375, + "learning_rate": 0.00018587950155750233, + "loss": 0.457, + "step": 7617 + }, + { + "epoch": 1.02, + "grad_norm": 0.8359375, + "learning_rate": 0.0001858735350076921, + "loss": 0.4733, + "step": 7618 + }, + { + "epoch": 1.02, + "grad_norm": 0.41015625, + "learning_rate": 0.0001858675672933816, + "loss": 0.4583, + "step": 7619 + }, + { + "epoch": 1.02, + "grad_norm": 0.3359375, + "learning_rate": 0.00018586159841465182, + "loss": 0.3339, + "step": 7620 + }, + { + "epoch": 1.02, + "grad_norm": 0.54296875, + "learning_rate": 0.00018585562837158362, + "loss": 0.4563, + "step": 7621 + }, + { + "epoch": 1.02, + "grad_norm": 0.7265625, + "learning_rate": 0.00018584965716425802, + "loss": 0.5827, + "step": 7622 + }, + { + "epoch": 1.02, + "grad_norm": 0.52734375, + "learning_rate": 0.000185843684792756, + "loss": 0.4956, + "step": 7623 + }, + { + "epoch": 1.02, + "grad_norm": 0.55078125, + "learning_rate": 0.00018583771125715848, + "loss": 0.1651, + "step": 7624 + }, + { + "epoch": 1.02, + "grad_norm": 0.44140625, + "learning_rate": 0.0001858317365575465, + "loss": 0.4137, + "step": 7625 + }, + { + "epoch": 1.02, + "grad_norm": 0.73828125, + "learning_rate": 0.00018582576069400112, + "loss": 0.5781, + "step": 7626 + }, + { + "epoch": 1.02, + "grad_norm": 0.56640625, + "learning_rate": 0.0001858197836666033, + "loss": 0.2626, + "step": 7627 + }, + { + "epoch": 1.02, + "grad_norm": 0.625, + "learning_rate": 0.00018581380547543416, + "loss": 0.6601, + "step": 7628 + }, + { + "epoch": 1.02, + "grad_norm": 0.57421875, + "learning_rate": 0.00018580782612057474, + "loss": 0.5088, + "step": 7629 + }, + { + "epoch": 1.02, + "grad_norm": 0.9140625, + "learning_rate": 0.00018580184560210614, + "loss": 0.4193, + "step": 7630 + }, + { + "epoch": 1.02, + "grad_norm": 0.6171875, + "learning_rate": 0.00018579586392010943, + "loss": 0.4077, + "step": 7631 + }, + { + "epoch": 1.02, + "grad_norm": 0.423828125, + "learning_rate": 0.00018578988107466576, + "loss": 0.5436, + "step": 7632 + }, + { + "epoch": 1.02, + "grad_norm": 0.7734375, + "learning_rate": 0.00018578389706585622, + "loss": 0.4797, + "step": 7633 + }, + { + "epoch": 1.02, + "grad_norm": 0.63671875, + "learning_rate": 0.00018577791189376198, + "loss": 0.6903, + "step": 7634 + }, + { + "epoch": 1.02, + "grad_norm": 0.63671875, + "learning_rate": 0.00018577192555846423, + "loss": 0.5487, + "step": 7635 + }, + { + "epoch": 1.02, + "grad_norm": 0.72265625, + "learning_rate": 0.0001857659380600441, + "loss": 0.5407, + "step": 7636 + }, + { + "epoch": 1.02, + "grad_norm": 0.78125, + "learning_rate": 0.00018575994939858278, + "loss": 0.3925, + "step": 7637 + }, + { + "epoch": 1.02, + "grad_norm": 0.62109375, + "learning_rate": 0.00018575395957416154, + "loss": 0.3433, + "step": 7638 + }, + { + "epoch": 1.02, + "grad_norm": 0.59375, + "learning_rate": 0.0001857479685868616, + "loss": 0.5057, + "step": 7639 + }, + { + "epoch": 1.02, + "grad_norm": 0.6171875, + "learning_rate": 0.0001857419764367641, + "loss": 0.4774, + "step": 7640 + }, + { + "epoch": 1.02, + "grad_norm": 0.7265625, + "learning_rate": 0.0001857359831239504, + "loss": 0.4176, + "step": 7641 + }, + { + "epoch": 1.02, + "grad_norm": 0.63671875, + "learning_rate": 0.00018572998864850178, + "loss": 0.4268, + "step": 7642 + }, + { + "epoch": 1.02, + "grad_norm": 0.53515625, + "learning_rate": 0.00018572399301049948, + "loss": 0.2668, + "step": 7643 + }, + { + "epoch": 1.02, + "grad_norm": 0.63671875, + "learning_rate": 0.0001857179962100248, + "loss": 0.3212, + "step": 7644 + }, + { + "epoch": 1.02, + "grad_norm": 0.75390625, + "learning_rate": 0.00018571199824715909, + "loss": 0.4853, + "step": 7645 + }, + { + "epoch": 1.02, + "grad_norm": 0.58203125, + "learning_rate": 0.00018570599912198365, + "loss": 0.5602, + "step": 7646 + }, + { + "epoch": 1.02, + "grad_norm": 0.66015625, + "learning_rate": 0.00018569999883457988, + "loss": 0.4896, + "step": 7647 + }, + { + "epoch": 1.02, + "grad_norm": 0.48828125, + "learning_rate": 0.0001856939973850291, + "loss": 0.2171, + "step": 7648 + }, + { + "epoch": 1.02, + "grad_norm": 0.54296875, + "learning_rate": 0.00018568799477341273, + "loss": 0.4644, + "step": 7649 + }, + { + "epoch": 1.02, + "grad_norm": 0.7421875, + "learning_rate": 0.00018568199099981217, + "loss": 0.6769, + "step": 7650 + }, + { + "epoch": 1.02, + "grad_norm": 0.478515625, + "learning_rate": 0.00018567598606430882, + "loss": 0.3123, + "step": 7651 + }, + { + "epoch": 1.02, + "grad_norm": 0.62890625, + "learning_rate": 0.0001856699799669841, + "loss": 0.6035, + "step": 7652 + }, + { + "epoch": 1.02, + "grad_norm": 0.57421875, + "learning_rate": 0.00018566397270791947, + "loss": 0.5847, + "step": 7653 + }, + { + "epoch": 1.02, + "grad_norm": 0.578125, + "learning_rate": 0.00018565796428719639, + "loss": 0.32, + "step": 7654 + }, + { + "epoch": 1.02, + "grad_norm": 0.470703125, + "learning_rate": 0.00018565195470489636, + "loss": 0.6049, + "step": 7655 + }, + { + "epoch": 1.02, + "grad_norm": 0.494140625, + "learning_rate": 0.00018564594396110082, + "loss": 0.3215, + "step": 7656 + }, + { + "epoch": 1.02, + "grad_norm": 0.6328125, + "learning_rate": 0.00018563993205589135, + "loss": 0.3812, + "step": 7657 + }, + { + "epoch": 1.02, + "grad_norm": 0.5078125, + "learning_rate": 0.0001856339189893494, + "loss": 0.5625, + "step": 7658 + }, + { + "epoch": 1.02, + "grad_norm": 0.77734375, + "learning_rate": 0.00018562790476155659, + "loss": 0.4022, + "step": 7659 + }, + { + "epoch": 1.02, + "grad_norm": 0.5234375, + "learning_rate": 0.00018562188937259441, + "loss": 0.6985, + "step": 7660 + }, + { + "epoch": 1.02, + "grad_norm": 0.494140625, + "learning_rate": 0.00018561587282254444, + "loss": 0.4368, + "step": 7661 + }, + { + "epoch": 1.02, + "grad_norm": 0.6015625, + "learning_rate": 0.0001856098551114883, + "loss": 0.4068, + "step": 7662 + }, + { + "epoch": 1.02, + "grad_norm": 0.5546875, + "learning_rate": 0.0001856038362395076, + "loss": 0.4828, + "step": 7663 + }, + { + "epoch": 1.02, + "grad_norm": 0.47265625, + "learning_rate": 0.00018559781620668394, + "loss": 0.2962, + "step": 7664 + }, + { + "epoch": 1.02, + "grad_norm": 0.5, + "learning_rate": 0.00018559179501309892, + "loss": 0.3439, + "step": 7665 + }, + { + "epoch": 1.02, + "grad_norm": 0.6328125, + "learning_rate": 0.00018558577265883429, + "loss": 0.3181, + "step": 7666 + }, + { + "epoch": 1.02, + "grad_norm": 0.416015625, + "learning_rate": 0.0001855797491439716, + "loss": 0.3315, + "step": 7667 + }, + { + "epoch": 1.02, + "grad_norm": 0.5859375, + "learning_rate": 0.0001855737244685926, + "loss": 0.5858, + "step": 7668 + }, + { + "epoch": 1.02, + "grad_norm": 0.671875, + "learning_rate": 0.00018556769863277897, + "loss": 0.3178, + "step": 7669 + }, + { + "epoch": 1.02, + "grad_norm": 0.474609375, + "learning_rate": 0.00018556167163661245, + "loss": 0.3179, + "step": 7670 + }, + { + "epoch": 1.02, + "grad_norm": 0.55078125, + "learning_rate": 0.00018555564348017472, + "loss": 0.4667, + "step": 7671 + }, + { + "epoch": 1.02, + "grad_norm": 0.458984375, + "learning_rate": 0.0001855496141635476, + "loss": 0.3689, + "step": 7672 + }, + { + "epoch": 1.02, + "grad_norm": 0.5546875, + "learning_rate": 0.00018554358368681276, + "loss": 0.2951, + "step": 7673 + }, + { + "epoch": 1.02, + "grad_norm": 0.60546875, + "learning_rate": 0.00018553755205005204, + "loss": 0.4843, + "step": 7674 + }, + { + "epoch": 1.02, + "grad_norm": 0.53515625, + "learning_rate": 0.00018553151925334725, + "loss": 0.714, + "step": 7675 + }, + { + "epoch": 1.02, + "grad_norm": 0.60546875, + "learning_rate": 0.0001855254852967801, + "loss": 0.2958, + "step": 7676 + }, + { + "epoch": 1.02, + "grad_norm": 0.5, + "learning_rate": 0.00018551945018043252, + "loss": 0.4468, + "step": 7677 + }, + { + "epoch": 1.02, + "grad_norm": 0.466796875, + "learning_rate": 0.00018551341390438632, + "loss": 0.4979, + "step": 7678 + }, + { + "epoch": 1.02, + "grad_norm": 0.60546875, + "learning_rate": 0.0001855073764687233, + "loss": 0.2566, + "step": 7679 + }, + { + "epoch": 1.02, + "grad_norm": 0.47265625, + "learning_rate": 0.00018550133787352543, + "loss": 0.3835, + "step": 7680 + }, + { + "epoch": 1.02, + "grad_norm": 0.46875, + "learning_rate": 0.00018549529811887452, + "loss": 0.3263, + "step": 7681 + }, + { + "epoch": 1.03, + "grad_norm": 0.6640625, + "learning_rate": 0.0001854892572048525, + "loss": 0.2851, + "step": 7682 + }, + { + "epoch": 1.03, + "grad_norm": 0.53515625, + "learning_rate": 0.00018548321513154128, + "loss": 0.3476, + "step": 7683 + }, + { + "epoch": 1.03, + "grad_norm": 0.73828125, + "learning_rate": 0.0001854771718990228, + "loss": 0.4622, + "step": 7684 + }, + { + "epoch": 1.03, + "grad_norm": 0.55078125, + "learning_rate": 0.00018547112750737902, + "loss": 0.2769, + "step": 7685 + }, + { + "epoch": 1.03, + "grad_norm": 0.69921875, + "learning_rate": 0.00018546508195669187, + "loss": 0.7881, + "step": 7686 + }, + { + "epoch": 1.03, + "grad_norm": 0.6015625, + "learning_rate": 0.0001854590352470434, + "loss": 0.4381, + "step": 7687 + }, + { + "epoch": 1.03, + "grad_norm": 0.546875, + "learning_rate": 0.00018545298737851552, + "loss": 0.4703, + "step": 7688 + }, + { + "epoch": 1.03, + "grad_norm": 0.447265625, + "learning_rate": 0.00018544693835119031, + "loss": 0.2798, + "step": 7689 + }, + { + "epoch": 1.03, + "grad_norm": 0.482421875, + "learning_rate": 0.00018544088816514978, + "loss": 0.3321, + "step": 7690 + }, + { + "epoch": 1.03, + "grad_norm": 0.44921875, + "learning_rate": 0.00018543483682047598, + "loss": 0.212, + "step": 7691 + }, + { + "epoch": 1.03, + "grad_norm": 0.59765625, + "learning_rate": 0.00018542878431725094, + "loss": 0.4079, + "step": 7692 + }, + { + "epoch": 1.03, + "grad_norm": 0.78515625, + "learning_rate": 0.00018542273065555676, + "loss": 0.3237, + "step": 7693 + }, + { + "epoch": 1.03, + "grad_norm": 0.42578125, + "learning_rate": 0.00018541667583547552, + "loss": 0.4088, + "step": 7694 + }, + { + "epoch": 1.03, + "grad_norm": 0.392578125, + "learning_rate": 0.00018541061985708935, + "loss": 0.4051, + "step": 7695 + }, + { + "epoch": 1.03, + "grad_norm": 0.4921875, + "learning_rate": 0.00018540456272048036, + "loss": 0.3328, + "step": 7696 + }, + { + "epoch": 1.03, + "grad_norm": 0.5390625, + "learning_rate": 0.0001853985044257307, + "loss": 0.3, + "step": 7697 + }, + { + "epoch": 1.03, + "grad_norm": 0.447265625, + "learning_rate": 0.00018539244497292248, + "loss": 0.3584, + "step": 7698 + }, + { + "epoch": 1.03, + "grad_norm": 0.6796875, + "learning_rate": 0.00018538638436213793, + "loss": 0.4288, + "step": 7699 + }, + { + "epoch": 1.03, + "grad_norm": 0.416015625, + "learning_rate": 0.0001853803225934592, + "loss": 0.219, + "step": 7700 + }, + { + "epoch": 1.03, + "grad_norm": 0.56640625, + "learning_rate": 0.0001853742596669685, + "loss": 0.3834, + "step": 7701 + }, + { + "epoch": 1.03, + "grad_norm": 0.484375, + "learning_rate": 0.00018536819558274804, + "loss": 0.3031, + "step": 7702 + }, + { + "epoch": 1.03, + "grad_norm": 0.69140625, + "learning_rate": 0.00018536213034088005, + "loss": 0.3787, + "step": 7703 + }, + { + "epoch": 1.03, + "grad_norm": 0.875, + "learning_rate": 0.0001853560639414468, + "loss": 0.55, + "step": 7704 + }, + { + "epoch": 1.03, + "grad_norm": 0.5234375, + "learning_rate": 0.0001853499963845305, + "loss": 0.1941, + "step": 7705 + }, + { + "epoch": 1.03, + "grad_norm": 0.5859375, + "learning_rate": 0.00018534392767021355, + "loss": 0.6048, + "step": 7706 + }, + { + "epoch": 1.03, + "grad_norm": 0.416015625, + "learning_rate": 0.00018533785779857812, + "loss": 0.3533, + "step": 7707 + }, + { + "epoch": 1.03, + "grad_norm": 0.58984375, + "learning_rate": 0.00018533178676970657, + "loss": 0.5037, + "step": 7708 + }, + { + "epoch": 1.03, + "grad_norm": 0.73828125, + "learning_rate": 0.00018532571458368123, + "loss": 0.5021, + "step": 7709 + }, + { + "epoch": 1.03, + "grad_norm": 0.59375, + "learning_rate": 0.00018531964124058443, + "loss": 0.3423, + "step": 7710 + }, + { + "epoch": 1.03, + "grad_norm": 0.6015625, + "learning_rate": 0.00018531356674049854, + "loss": 0.5042, + "step": 7711 + }, + { + "epoch": 1.03, + "grad_norm": 0.5390625, + "learning_rate": 0.00018530749108350595, + "loss": 0.5532, + "step": 7712 + }, + { + "epoch": 1.03, + "grad_norm": 0.62890625, + "learning_rate": 0.00018530141426968902, + "loss": 0.3321, + "step": 7713 + }, + { + "epoch": 1.03, + "grad_norm": 0.640625, + "learning_rate": 0.00018529533629913018, + "loss": 0.7085, + "step": 7714 + }, + { + "epoch": 1.03, + "grad_norm": 0.53515625, + "learning_rate": 0.00018528925717191183, + "loss": 0.6168, + "step": 7715 + }, + { + "epoch": 1.03, + "grad_norm": 0.486328125, + "learning_rate": 0.00018528317688811639, + "loss": 0.4431, + "step": 7716 + }, + { + "epoch": 1.03, + "grad_norm": 0.4609375, + "learning_rate": 0.00018527709544782637, + "loss": 0.3948, + "step": 7717 + }, + { + "epoch": 1.03, + "grad_norm": 0.3515625, + "learning_rate": 0.00018527101285112417, + "loss": 0.3968, + "step": 7718 + }, + { + "epoch": 1.03, + "grad_norm": 0.4453125, + "learning_rate": 0.00018526492909809233, + "loss": 0.3152, + "step": 7719 + }, + { + "epoch": 1.03, + "grad_norm": 0.474609375, + "learning_rate": 0.00018525884418881333, + "loss": 0.485, + "step": 7720 + }, + { + "epoch": 1.03, + "grad_norm": 0.482421875, + "learning_rate": 0.0001852527581233697, + "loss": 0.2577, + "step": 7721 + }, + { + "epoch": 1.03, + "grad_norm": 0.5859375, + "learning_rate": 0.00018524667090184392, + "loss": 0.4091, + "step": 7722 + }, + { + "epoch": 1.03, + "grad_norm": 0.6484375, + "learning_rate": 0.00018524058252431859, + "loss": 0.5764, + "step": 7723 + }, + { + "epoch": 1.03, + "grad_norm": 0.52734375, + "learning_rate": 0.00018523449299087625, + "loss": 0.5734, + "step": 7724 + }, + { + "epoch": 1.03, + "grad_norm": 0.443359375, + "learning_rate": 0.00018522840230159951, + "loss": 0.2905, + "step": 7725 + }, + { + "epoch": 1.03, + "grad_norm": 0.56640625, + "learning_rate": 0.0001852223104565709, + "loss": 0.4156, + "step": 7726 + }, + { + "epoch": 1.03, + "grad_norm": 0.59765625, + "learning_rate": 0.0001852162174558731, + "loss": 0.3562, + "step": 7727 + }, + { + "epoch": 1.03, + "grad_norm": 0.69921875, + "learning_rate": 0.00018521012329958864, + "loss": 0.3681, + "step": 7728 + }, + { + "epoch": 1.03, + "grad_norm": 0.61328125, + "learning_rate": 0.00018520402798780027, + "loss": 0.7199, + "step": 7729 + }, + { + "epoch": 1.03, + "grad_norm": 0.53125, + "learning_rate": 0.00018519793152059057, + "loss": 0.2946, + "step": 7730 + }, + { + "epoch": 1.03, + "grad_norm": 0.59375, + "learning_rate": 0.00018519183389804225, + "loss": 0.7615, + "step": 7731 + }, + { + "epoch": 1.03, + "grad_norm": 0.419921875, + "learning_rate": 0.000185185735120238, + "loss": 0.2084, + "step": 7732 + }, + { + "epoch": 1.03, + "grad_norm": 0.478515625, + "learning_rate": 0.00018517963518726048, + "loss": 0.4205, + "step": 7733 + }, + { + "epoch": 1.03, + "grad_norm": 0.65234375, + "learning_rate": 0.00018517353409919248, + "loss": 0.5933, + "step": 7734 + }, + { + "epoch": 1.03, + "grad_norm": 0.73828125, + "learning_rate": 0.00018516743185611665, + "loss": 0.2788, + "step": 7735 + }, + { + "epoch": 1.03, + "grad_norm": 0.490234375, + "learning_rate": 0.00018516132845811583, + "loss": 0.4646, + "step": 7736 + }, + { + "epoch": 1.03, + "grad_norm": 0.63671875, + "learning_rate": 0.00018515522390527268, + "loss": 0.3645, + "step": 7737 + }, + { + "epoch": 1.03, + "grad_norm": 0.94140625, + "learning_rate": 0.00018514911819767012, + "loss": 0.3962, + "step": 7738 + }, + { + "epoch": 1.03, + "grad_norm": 0.46484375, + "learning_rate": 0.0001851430113353908, + "loss": 0.2916, + "step": 7739 + }, + { + "epoch": 1.03, + "grad_norm": 0.5390625, + "learning_rate": 0.00018513690331851764, + "loss": 0.5039, + "step": 7740 + }, + { + "epoch": 1.03, + "grad_norm": 0.359375, + "learning_rate": 0.0001851307941471334, + "loss": 0.2822, + "step": 7741 + }, + { + "epoch": 1.03, + "grad_norm": 0.67578125, + "learning_rate": 0.00018512468382132096, + "loss": 0.6129, + "step": 7742 + }, + { + "epoch": 1.03, + "grad_norm": 0.408203125, + "learning_rate": 0.00018511857234116322, + "loss": 0.2304, + "step": 7743 + }, + { + "epoch": 1.03, + "grad_norm": 0.515625, + "learning_rate": 0.00018511245970674297, + "loss": 0.28, + "step": 7744 + }, + { + "epoch": 1.03, + "grad_norm": 0.62109375, + "learning_rate": 0.00018510634591814316, + "loss": 0.3416, + "step": 7745 + }, + { + "epoch": 1.03, + "grad_norm": 0.69140625, + "learning_rate": 0.00018510023097544664, + "loss": 0.5802, + "step": 7746 + }, + { + "epoch": 1.03, + "grad_norm": 0.60546875, + "learning_rate": 0.0001850941148787364, + "loss": 0.3113, + "step": 7747 + }, + { + "epoch": 1.03, + "grad_norm": 0.484375, + "learning_rate": 0.00018508799762809535, + "loss": 0.2239, + "step": 7748 + }, + { + "epoch": 1.03, + "grad_norm": 0.5859375, + "learning_rate": 0.00018508187922360644, + "loss": 0.2927, + "step": 7749 + }, + { + "epoch": 1.03, + "grad_norm": 0.455078125, + "learning_rate": 0.00018507575966535264, + "loss": 0.2425, + "step": 7750 + }, + { + "epoch": 1.03, + "grad_norm": 0.50390625, + "learning_rate": 0.00018506963895341695, + "loss": 0.3043, + "step": 7751 + }, + { + "epoch": 1.03, + "grad_norm": 0.8046875, + "learning_rate": 0.00018506351708788235, + "loss": 0.5467, + "step": 7752 + }, + { + "epoch": 1.03, + "grad_norm": 0.37109375, + "learning_rate": 0.00018505739406883187, + "loss": 0.238, + "step": 7753 + }, + { + "epoch": 1.03, + "grad_norm": 0.443359375, + "learning_rate": 0.0001850512698963485, + "loss": 0.2695, + "step": 7754 + }, + { + "epoch": 1.03, + "grad_norm": 0.74609375, + "learning_rate": 0.0001850451445705154, + "loss": 0.4692, + "step": 7755 + }, + { + "epoch": 1.03, + "grad_norm": 0.59765625, + "learning_rate": 0.00018503901809141548, + "loss": 0.5186, + "step": 7756 + }, + { + "epoch": 1.04, + "grad_norm": 0.404296875, + "learning_rate": 0.00018503289045913195, + "loss": 0.3726, + "step": 7757 + }, + { + "epoch": 1.04, + "grad_norm": 0.48828125, + "learning_rate": 0.0001850267616737478, + "loss": 0.3164, + "step": 7758 + }, + { + "epoch": 1.04, + "grad_norm": 0.515625, + "learning_rate": 0.00018502063173534626, + "loss": 0.2955, + "step": 7759 + }, + { + "epoch": 1.04, + "grad_norm": 0.6015625, + "learning_rate": 0.00018501450064401033, + "loss": 0.5493, + "step": 7760 + }, + { + "epoch": 1.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00018500836839982325, + "loss": 0.5316, + "step": 7761 + }, + { + "epoch": 1.04, + "grad_norm": 0.54296875, + "learning_rate": 0.00018500223500286813, + "loss": 0.2859, + "step": 7762 + }, + { + "epoch": 1.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00018499610045322814, + "loss": 0.5785, + "step": 7763 + }, + { + "epoch": 1.04, + "grad_norm": 0.3984375, + "learning_rate": 0.00018498996475098647, + "loss": 0.4049, + "step": 7764 + }, + { + "epoch": 1.04, + "grad_norm": 0.458984375, + "learning_rate": 0.00018498382789622632, + "loss": 0.2243, + "step": 7765 + }, + { + "epoch": 1.04, + "grad_norm": 0.578125, + "learning_rate": 0.00018497768988903093, + "loss": 0.57, + "step": 7766 + }, + { + "epoch": 1.04, + "grad_norm": 0.5234375, + "learning_rate": 0.00018497155072948353, + "loss": 0.4612, + "step": 7767 + }, + { + "epoch": 1.04, + "grad_norm": 0.41796875, + "learning_rate": 0.00018496541041766737, + "loss": 0.2567, + "step": 7768 + }, + { + "epoch": 1.04, + "grad_norm": 0.57421875, + "learning_rate": 0.00018495926895366574, + "loss": 0.3972, + "step": 7769 + }, + { + "epoch": 1.04, + "grad_norm": 0.423828125, + "learning_rate": 0.00018495312633756185, + "loss": 0.3274, + "step": 7770 + }, + { + "epoch": 1.04, + "grad_norm": 0.40625, + "learning_rate": 0.00018494698256943908, + "loss": 0.456, + "step": 7771 + }, + { + "epoch": 1.04, + "grad_norm": 0.86328125, + "learning_rate": 0.0001849408376493807, + "loss": 0.678, + "step": 7772 + }, + { + "epoch": 1.04, + "grad_norm": 0.640625, + "learning_rate": 0.00018493469157747003, + "loss": 0.4592, + "step": 7773 + }, + { + "epoch": 1.04, + "grad_norm": 0.6640625, + "learning_rate": 0.00018492854435379045, + "loss": 0.412, + "step": 7774 + }, + { + "epoch": 1.04, + "grad_norm": 0.51953125, + "learning_rate": 0.00018492239597842528, + "loss": 0.3781, + "step": 7775 + }, + { + "epoch": 1.04, + "grad_norm": 0.7578125, + "learning_rate": 0.00018491624645145796, + "loss": 0.3315, + "step": 7776 + }, + { + "epoch": 1.04, + "grad_norm": 0.609375, + "learning_rate": 0.0001849100957729718, + "loss": 0.3049, + "step": 7777 + }, + { + "epoch": 1.04, + "grad_norm": 0.51953125, + "learning_rate": 0.00018490394394305026, + "loss": 0.5955, + "step": 7778 + }, + { + "epoch": 1.04, + "grad_norm": 0.5390625, + "learning_rate": 0.00018489779096177676, + "loss": 0.4773, + "step": 7779 + }, + { + "epoch": 1.04, + "grad_norm": 0.61328125, + "learning_rate": 0.0001848916368292347, + "loss": 0.3704, + "step": 7780 + }, + { + "epoch": 1.04, + "grad_norm": 0.546875, + "learning_rate": 0.00018488548154550757, + "loss": 0.2402, + "step": 7781 + }, + { + "epoch": 1.04, + "grad_norm": 0.5625, + "learning_rate": 0.00018487932511067885, + "loss": 0.4773, + "step": 7782 + }, + { + "epoch": 1.04, + "grad_norm": 0.5078125, + "learning_rate": 0.00018487316752483198, + "loss": 0.3728, + "step": 7783 + }, + { + "epoch": 1.04, + "grad_norm": 0.55078125, + "learning_rate": 0.00018486700878805052, + "loss": 0.279, + "step": 7784 + }, + { + "epoch": 1.04, + "grad_norm": 0.5078125, + "learning_rate": 0.00018486084890041794, + "loss": 0.6951, + "step": 7785 + }, + { + "epoch": 1.04, + "grad_norm": 0.54296875, + "learning_rate": 0.00018485468786201778, + "loss": 0.4176, + "step": 7786 + }, + { + "epoch": 1.04, + "grad_norm": 0.72265625, + "learning_rate": 0.00018484852567293358, + "loss": 0.5247, + "step": 7787 + }, + { + "epoch": 1.04, + "grad_norm": 0.54296875, + "learning_rate": 0.00018484236233324894, + "loss": 0.3997, + "step": 7788 + }, + { + "epoch": 1.04, + "grad_norm": 0.59765625, + "learning_rate": 0.0001848361978430474, + "loss": 0.2124, + "step": 7789 + }, + { + "epoch": 1.04, + "grad_norm": 0.494140625, + "learning_rate": 0.00018483003220241256, + "loss": 0.6231, + "step": 7790 + }, + { + "epoch": 1.04, + "grad_norm": 0.58203125, + "learning_rate": 0.0001848238654114281, + "loss": 0.2671, + "step": 7791 + }, + { + "epoch": 1.04, + "grad_norm": 0.54296875, + "learning_rate": 0.00018481769747017752, + "loss": 0.3717, + "step": 7792 + }, + { + "epoch": 1.04, + "grad_norm": 0.5703125, + "learning_rate": 0.00018481152837874457, + "loss": 0.4225, + "step": 7793 + }, + { + "epoch": 1.04, + "grad_norm": 0.6875, + "learning_rate": 0.0001848053581372128, + "loss": 0.3543, + "step": 7794 + }, + { + "epoch": 1.04, + "grad_norm": 0.490234375, + "learning_rate": 0.000184799186745666, + "loss": 0.4478, + "step": 7795 + }, + { + "epoch": 1.04, + "grad_norm": 0.451171875, + "learning_rate": 0.00018479301420418778, + "loss": 0.4061, + "step": 7796 + }, + { + "epoch": 1.04, + "grad_norm": 0.5625, + "learning_rate": 0.0001847868405128619, + "loss": 0.305, + "step": 7797 + }, + { + "epoch": 1.04, + "grad_norm": 0.74609375, + "learning_rate": 0.00018478066567177202, + "loss": 0.314, + "step": 7798 + }, + { + "epoch": 1.04, + "grad_norm": 0.48828125, + "learning_rate": 0.0001847744896810019, + "loss": 0.5602, + "step": 7799 + }, + { + "epoch": 1.04, + "grad_norm": 0.59765625, + "learning_rate": 0.0001847683125406353, + "loss": 0.3604, + "step": 7800 + }, + { + "epoch": 1.04, + "grad_norm": 0.8203125, + "learning_rate": 0.00018476213425075598, + "loss": 0.8302, + "step": 7801 + }, + { + "epoch": 1.04, + "grad_norm": 0.6171875, + "learning_rate": 0.00018475595481144772, + "loss": 0.3082, + "step": 7802 + }, + { + "epoch": 1.04, + "grad_norm": 0.5703125, + "learning_rate": 0.0001847497742227943, + "loss": 0.3213, + "step": 7803 + }, + { + "epoch": 1.04, + "grad_norm": 0.734375, + "learning_rate": 0.0001847435924848796, + "loss": 0.6542, + "step": 7804 + }, + { + "epoch": 1.04, + "grad_norm": 0.6171875, + "learning_rate": 0.00018473740959778736, + "loss": 0.3164, + "step": 7805 + }, + { + "epoch": 1.04, + "grad_norm": 0.478515625, + "learning_rate": 0.00018473122556160142, + "loss": 0.3069, + "step": 7806 + }, + { + "epoch": 1.04, + "grad_norm": 0.86328125, + "learning_rate": 0.00018472504037640575, + "loss": 0.3977, + "step": 7807 + }, + { + "epoch": 1.04, + "grad_norm": 0.6015625, + "learning_rate": 0.00018471885404228413, + "loss": 0.6801, + "step": 7808 + }, + { + "epoch": 1.04, + "grad_norm": 0.58984375, + "learning_rate": 0.00018471266655932048, + "loss": 0.3137, + "step": 7809 + }, + { + "epoch": 1.04, + "grad_norm": 0.484375, + "learning_rate": 0.0001847064779275987, + "loss": 0.4076, + "step": 7810 + }, + { + "epoch": 1.04, + "grad_norm": 0.412109375, + "learning_rate": 0.0001847002881472027, + "loss": 0.1393, + "step": 7811 + }, + { + "epoch": 1.04, + "grad_norm": 0.578125, + "learning_rate": 0.00018469409721821647, + "loss": 0.4736, + "step": 7812 + }, + { + "epoch": 1.04, + "grad_norm": 0.6171875, + "learning_rate": 0.0001846879051407239, + "loss": 0.3646, + "step": 7813 + }, + { + "epoch": 1.04, + "grad_norm": 0.71875, + "learning_rate": 0.000184681711914809, + "loss": 0.1725, + "step": 7814 + }, + { + "epoch": 1.04, + "grad_norm": 0.48828125, + "learning_rate": 0.0001846755175405557, + "loss": 0.5291, + "step": 7815 + }, + { + "epoch": 1.04, + "grad_norm": 0.64453125, + "learning_rate": 0.00018466932201804807, + "loss": 0.5356, + "step": 7816 + }, + { + "epoch": 1.04, + "grad_norm": 0.52734375, + "learning_rate": 0.00018466312534737007, + "loss": 0.3716, + "step": 7817 + }, + { + "epoch": 1.04, + "grad_norm": 0.58984375, + "learning_rate": 0.00018465692752860576, + "loss": 0.4738, + "step": 7818 + }, + { + "epoch": 1.04, + "grad_norm": 0.66015625, + "learning_rate": 0.0001846507285618392, + "loss": 0.3693, + "step": 7819 + }, + { + "epoch": 1.04, + "grad_norm": 0.625, + "learning_rate": 0.00018464452844715442, + "loss": 0.4668, + "step": 7820 + }, + { + "epoch": 1.04, + "grad_norm": 0.6015625, + "learning_rate": 0.00018463832718463552, + "loss": 0.4956, + "step": 7821 + }, + { + "epoch": 1.04, + "grad_norm": 0.5703125, + "learning_rate": 0.00018463212477436657, + "loss": 0.2592, + "step": 7822 + }, + { + "epoch": 1.04, + "grad_norm": 0.435546875, + "learning_rate": 0.00018462592121643173, + "loss": 0.4304, + "step": 7823 + }, + { + "epoch": 1.04, + "grad_norm": 0.6875, + "learning_rate": 0.00018461971651091504, + "loss": 0.2897, + "step": 7824 + }, + { + "epoch": 1.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00018461351065790068, + "loss": 0.6444, + "step": 7825 + }, + { + "epoch": 1.04, + "grad_norm": 0.400390625, + "learning_rate": 0.00018460730365747283, + "loss": 0.166, + "step": 7826 + }, + { + "epoch": 1.04, + "grad_norm": 0.5390625, + "learning_rate": 0.00018460109550971565, + "loss": 0.4535, + "step": 7827 + }, + { + "epoch": 1.04, + "grad_norm": 0.66015625, + "learning_rate": 0.0001845948862147133, + "loss": 0.4018, + "step": 7828 + }, + { + "epoch": 1.04, + "grad_norm": 0.48046875, + "learning_rate": 0.00018458867577255002, + "loss": 0.3524, + "step": 7829 + }, + { + "epoch": 1.04, + "grad_norm": 0.37109375, + "learning_rate": 0.00018458246418331, + "loss": 0.2273, + "step": 7830 + }, + { + "epoch": 1.04, + "grad_norm": 0.447265625, + "learning_rate": 0.00018457625144707748, + "loss": 0.4705, + "step": 7831 + }, + { + "epoch": 1.05, + "grad_norm": 0.515625, + "learning_rate": 0.0001845700375639367, + "loss": 0.3449, + "step": 7832 + }, + { + "epoch": 1.05, + "grad_norm": 0.43359375, + "learning_rate": 0.00018456382253397195, + "loss": 0.3084, + "step": 7833 + }, + { + "epoch": 1.05, + "grad_norm": 0.53515625, + "learning_rate": 0.00018455760635726747, + "loss": 0.4128, + "step": 7834 + }, + { + "epoch": 1.05, + "grad_norm": 0.58203125, + "learning_rate": 0.00018455138903390758, + "loss": 0.2494, + "step": 7835 + }, + { + "epoch": 1.05, + "grad_norm": 0.458984375, + "learning_rate": 0.00018454517056397661, + "loss": 0.2037, + "step": 7836 + }, + { + "epoch": 1.05, + "grad_norm": 0.734375, + "learning_rate": 0.00018453895094755888, + "loss": 0.346, + "step": 7837 + }, + { + "epoch": 1.05, + "grad_norm": 0.625, + "learning_rate": 0.00018453273018473868, + "loss": 0.3645, + "step": 7838 + }, + { + "epoch": 1.05, + "grad_norm": 0.625, + "learning_rate": 0.00018452650827560044, + "loss": 0.5882, + "step": 7839 + }, + { + "epoch": 1.05, + "grad_norm": 0.68359375, + "learning_rate": 0.0001845202852202285, + "loss": 0.2442, + "step": 7840 + }, + { + "epoch": 1.05, + "grad_norm": 0.484375, + "learning_rate": 0.00018451406101870722, + "loss": 0.312, + "step": 7841 + }, + { + "epoch": 1.05, + "grad_norm": 0.66796875, + "learning_rate": 0.00018450783567112105, + "loss": 0.4841, + "step": 7842 + }, + { + "epoch": 1.05, + "grad_norm": 0.6484375, + "learning_rate": 0.00018450160917755438, + "loss": 0.4695, + "step": 7843 + }, + { + "epoch": 1.05, + "grad_norm": 0.58984375, + "learning_rate": 0.00018449538153809166, + "loss": 0.5617, + "step": 7844 + }, + { + "epoch": 1.05, + "grad_norm": 0.6484375, + "learning_rate": 0.00018448915275281734, + "loss": 0.5075, + "step": 7845 + }, + { + "epoch": 1.05, + "grad_norm": 0.67578125, + "learning_rate": 0.0001844829228218159, + "loss": 0.2894, + "step": 7846 + }, + { + "epoch": 1.05, + "grad_norm": 0.6171875, + "learning_rate": 0.00018447669174517176, + "loss": 0.506, + "step": 7847 + }, + { + "epoch": 1.05, + "grad_norm": 0.68359375, + "learning_rate": 0.0001844704595229695, + "loss": 0.5659, + "step": 7848 + }, + { + "epoch": 1.05, + "grad_norm": 0.42578125, + "learning_rate": 0.00018446422615529356, + "loss": 0.2765, + "step": 7849 + }, + { + "epoch": 1.05, + "grad_norm": 0.71875, + "learning_rate": 0.00018445799164222858, + "loss": 0.307, + "step": 7850 + }, + { + "epoch": 1.05, + "grad_norm": 0.6796875, + "learning_rate": 0.00018445175598385895, + "loss": 0.3964, + "step": 7851 + }, + { + "epoch": 1.05, + "grad_norm": 0.56640625, + "learning_rate": 0.00018444551918026935, + "loss": 0.6283, + "step": 7852 + }, + { + "epoch": 1.05, + "grad_norm": 0.6875, + "learning_rate": 0.0001844392812315443, + "loss": 0.3332, + "step": 7853 + }, + { + "epoch": 1.05, + "grad_norm": 0.640625, + "learning_rate": 0.00018443304213776841, + "loss": 0.397, + "step": 7854 + }, + { + "epoch": 1.05, + "grad_norm": 0.60546875, + "learning_rate": 0.00018442680189902628, + "loss": 0.4573, + "step": 7855 + }, + { + "epoch": 1.05, + "grad_norm": 0.5546875, + "learning_rate": 0.00018442056051540254, + "loss": 0.3351, + "step": 7856 + }, + { + "epoch": 1.05, + "grad_norm": 0.51171875, + "learning_rate": 0.0001844143179869818, + "loss": 0.3322, + "step": 7857 + }, + { + "epoch": 1.05, + "grad_norm": 0.609375, + "learning_rate": 0.00018440807431384872, + "loss": 0.6325, + "step": 7858 + }, + { + "epoch": 1.05, + "grad_norm": 0.6796875, + "learning_rate": 0.000184401829496088, + "loss": 0.3642, + "step": 7859 + }, + { + "epoch": 1.05, + "grad_norm": 0.6875, + "learning_rate": 0.00018439558353378432, + "loss": 0.3577, + "step": 7860 + }, + { + "epoch": 1.05, + "grad_norm": 0.55859375, + "learning_rate": 0.00018438933642702233, + "loss": 0.7111, + "step": 7861 + }, + { + "epoch": 1.05, + "grad_norm": 0.484375, + "learning_rate": 0.0001843830881758868, + "loss": 0.369, + "step": 7862 + }, + { + "epoch": 1.05, + "grad_norm": 0.73828125, + "learning_rate": 0.00018437683878046244, + "loss": 0.3012, + "step": 7863 + }, + { + "epoch": 1.05, + "grad_norm": 0.66015625, + "learning_rate": 0.00018437058824083397, + "loss": 0.2647, + "step": 7864 + }, + { + "epoch": 1.05, + "grad_norm": 0.44140625, + "learning_rate": 0.0001843643365570862, + "loss": 0.2459, + "step": 7865 + }, + { + "epoch": 1.05, + "grad_norm": 0.58203125, + "learning_rate": 0.00018435808372930384, + "loss": 0.3556, + "step": 7866 + }, + { + "epoch": 1.05, + "grad_norm": 0.62890625, + "learning_rate": 0.00018435182975757176, + "loss": 0.5027, + "step": 7867 + }, + { + "epoch": 1.05, + "grad_norm": 0.458984375, + "learning_rate": 0.00018434557464197472, + "loss": 0.4511, + "step": 7868 + }, + { + "epoch": 1.05, + "grad_norm": 0.6171875, + "learning_rate": 0.00018433931838259757, + "loss": 0.4227, + "step": 7869 + }, + { + "epoch": 1.05, + "grad_norm": 0.578125, + "learning_rate": 0.00018433306097952513, + "loss": 0.3548, + "step": 7870 + }, + { + "epoch": 1.05, + "grad_norm": 0.54296875, + "learning_rate": 0.00018432680243284227, + "loss": 0.2577, + "step": 7871 + }, + { + "epoch": 1.05, + "grad_norm": 0.51953125, + "learning_rate": 0.00018432054274263382, + "loss": 0.6762, + "step": 7872 + }, + { + "epoch": 1.05, + "grad_norm": 0.361328125, + "learning_rate": 0.0001843142819089847, + "loss": 0.1502, + "step": 7873 + }, + { + "epoch": 1.05, + "grad_norm": 0.59765625, + "learning_rate": 0.00018430801993197978, + "loss": 0.3922, + "step": 7874 + }, + { + "epoch": 1.05, + "grad_norm": 0.5078125, + "learning_rate": 0.00018430175681170404, + "loss": 0.4358, + "step": 7875 + }, + { + "epoch": 1.05, + "grad_norm": 0.53125, + "learning_rate": 0.00018429549254824238, + "loss": 0.2873, + "step": 7876 + }, + { + "epoch": 1.05, + "grad_norm": 0.419921875, + "learning_rate": 0.0001842892271416797, + "loss": 0.3969, + "step": 7877 + }, + { + "epoch": 1.05, + "grad_norm": 0.4921875, + "learning_rate": 0.00018428296059210103, + "loss": 0.1799, + "step": 7878 + }, + { + "epoch": 1.05, + "grad_norm": 0.578125, + "learning_rate": 0.00018427669289959134, + "loss": 0.3972, + "step": 7879 + }, + { + "epoch": 1.05, + "grad_norm": 0.44921875, + "learning_rate": 0.00018427042406423557, + "loss": 0.3287, + "step": 7880 + }, + { + "epoch": 1.05, + "grad_norm": 0.75, + "learning_rate": 0.0001842641540861188, + "loss": 0.3959, + "step": 7881 + }, + { + "epoch": 1.05, + "grad_norm": 0.66796875, + "learning_rate": 0.000184257882965326, + "loss": 0.3853, + "step": 7882 + }, + { + "epoch": 1.05, + "grad_norm": 0.50390625, + "learning_rate": 0.00018425161070194222, + "loss": 0.4, + "step": 7883 + }, + { + "epoch": 1.05, + "grad_norm": 0.7578125, + "learning_rate": 0.00018424533729605256, + "loss": 0.3956, + "step": 7884 + }, + { + "epoch": 1.05, + "grad_norm": 0.5546875, + "learning_rate": 0.00018423906274774203, + "loss": 0.3415, + "step": 7885 + }, + { + "epoch": 1.05, + "grad_norm": 0.81640625, + "learning_rate": 0.00018423278705709573, + "loss": 0.4426, + "step": 7886 + }, + { + "epoch": 1.05, + "grad_norm": 0.57421875, + "learning_rate": 0.0001842265102241988, + "loss": 0.6233, + "step": 7887 + }, + { + "epoch": 1.05, + "grad_norm": 0.5234375, + "learning_rate": 0.00018422023224913634, + "loss": 0.5878, + "step": 7888 + }, + { + "epoch": 1.05, + "grad_norm": 0.578125, + "learning_rate": 0.0001842139531319935, + "loss": 0.3902, + "step": 7889 + }, + { + "epoch": 1.05, + "grad_norm": 0.63671875, + "learning_rate": 0.00018420767287285537, + "loss": 0.3594, + "step": 7890 + }, + { + "epoch": 1.05, + "grad_norm": 0.435546875, + "learning_rate": 0.00018420139147180714, + "loss": 0.4336, + "step": 7891 + }, + { + "epoch": 1.05, + "grad_norm": 0.734375, + "learning_rate": 0.00018419510892893402, + "loss": 0.3319, + "step": 7892 + }, + { + "epoch": 1.05, + "grad_norm": 0.6875, + "learning_rate": 0.0001841888252443212, + "loss": 0.9386, + "step": 7893 + }, + { + "epoch": 1.05, + "grad_norm": 0.5, + "learning_rate": 0.00018418254041805386, + "loss": 0.3201, + "step": 7894 + }, + { + "epoch": 1.05, + "grad_norm": 0.4921875, + "learning_rate": 0.00018417625445021725, + "loss": 0.251, + "step": 7895 + }, + { + "epoch": 1.05, + "grad_norm": 0.58203125, + "learning_rate": 0.00018416996734089663, + "loss": 0.329, + "step": 7896 + }, + { + "epoch": 1.05, + "grad_norm": 0.8359375, + "learning_rate": 0.0001841636790901772, + "loss": 0.498, + "step": 7897 + }, + { + "epoch": 1.05, + "grad_norm": 0.640625, + "learning_rate": 0.00018415738969814426, + "loss": 0.6792, + "step": 7898 + }, + { + "epoch": 1.05, + "grad_norm": 0.72265625, + "learning_rate": 0.0001841510991648831, + "loss": 0.5394, + "step": 7899 + }, + { + "epoch": 1.05, + "grad_norm": 0.54296875, + "learning_rate": 0.00018414480749047907, + "loss": 0.3698, + "step": 7900 + }, + { + "epoch": 1.05, + "grad_norm": 0.5546875, + "learning_rate": 0.00018413851467501743, + "loss": 0.4526, + "step": 7901 + }, + { + "epoch": 1.05, + "grad_norm": 0.65625, + "learning_rate": 0.0001841322207185835, + "loss": 0.3512, + "step": 7902 + }, + { + "epoch": 1.05, + "grad_norm": 0.6484375, + "learning_rate": 0.0001841259256212627, + "loss": 0.4512, + "step": 7903 + }, + { + "epoch": 1.05, + "grad_norm": 0.54296875, + "learning_rate": 0.00018411962938314033, + "loss": 0.3274, + "step": 7904 + }, + { + "epoch": 1.05, + "grad_norm": 0.57421875, + "learning_rate": 0.0001841133320043018, + "loss": 0.4658, + "step": 7905 + }, + { + "epoch": 1.05, + "grad_norm": 0.51171875, + "learning_rate": 0.00018410703348483253, + "loss": 0.4253, + "step": 7906 + }, + { + "epoch": 1.06, + "grad_norm": 0.65625, + "learning_rate": 0.0001841007338248179, + "loss": 0.457, + "step": 7907 + }, + { + "epoch": 1.06, + "grad_norm": 0.46484375, + "learning_rate": 0.00018409443302434334, + "loss": 0.3812, + "step": 7908 + }, + { + "epoch": 1.06, + "grad_norm": 0.51171875, + "learning_rate": 0.00018408813108349427, + "loss": 0.2403, + "step": 7909 + }, + { + "epoch": 1.06, + "grad_norm": 0.494140625, + "learning_rate": 0.00018408182800235622, + "loss": 0.4316, + "step": 7910 + }, + { + "epoch": 1.06, + "grad_norm": 0.484375, + "learning_rate": 0.00018407552378101462, + "loss": 0.4842, + "step": 7911 + }, + { + "epoch": 1.06, + "grad_norm": 0.62890625, + "learning_rate": 0.00018406921841955488, + "loss": 0.3218, + "step": 7912 + }, + { + "epoch": 1.06, + "grad_norm": 0.4765625, + "learning_rate": 0.00018406291191806266, + "loss": 0.296, + "step": 7913 + }, + { + "epoch": 1.06, + "grad_norm": 0.43359375, + "learning_rate": 0.00018405660427662338, + "loss": 0.268, + "step": 7914 + }, + { + "epoch": 1.06, + "grad_norm": 0.60546875, + "learning_rate": 0.0001840502954953226, + "loss": 0.4803, + "step": 7915 + }, + { + "epoch": 1.06, + "grad_norm": 0.421875, + "learning_rate": 0.00018404398557424586, + "loss": 0.2081, + "step": 7916 + }, + { + "epoch": 1.06, + "grad_norm": 0.515625, + "learning_rate": 0.00018403767451347878, + "loss": 0.358, + "step": 7917 + }, + { + "epoch": 1.06, + "grad_norm": 0.55078125, + "learning_rate": 0.00018403136231310684, + "loss": 0.4715, + "step": 7918 + }, + { + "epoch": 1.06, + "grad_norm": 0.5625, + "learning_rate": 0.00018402504897321574, + "loss": 0.4704, + "step": 7919 + }, + { + "epoch": 1.06, + "grad_norm": 0.578125, + "learning_rate": 0.00018401873449389101, + "loss": 0.2351, + "step": 7920 + }, + { + "epoch": 1.06, + "grad_norm": 0.58203125, + "learning_rate": 0.00018401241887521834, + "loss": 0.5136, + "step": 7921 + }, + { + "epoch": 1.06, + "grad_norm": 0.4765625, + "learning_rate": 0.00018400610211728337, + "loss": 0.2775, + "step": 7922 + }, + { + "epoch": 1.06, + "grad_norm": 0.490234375, + "learning_rate": 0.0001839997842201717, + "loss": 0.2194, + "step": 7923 + }, + { + "epoch": 1.06, + "grad_norm": 0.6328125, + "learning_rate": 0.00018399346518396906, + "loss": 0.5167, + "step": 7924 + }, + { + "epoch": 1.06, + "grad_norm": 0.87109375, + "learning_rate": 0.00018398714500876114, + "loss": 0.546, + "step": 7925 + }, + { + "epoch": 1.06, + "grad_norm": 0.6171875, + "learning_rate": 0.00018398082369463362, + "loss": 0.4849, + "step": 7926 + }, + { + "epoch": 1.06, + "grad_norm": 0.75, + "learning_rate": 0.00018397450124167222, + "loss": 0.4601, + "step": 7927 + }, + { + "epoch": 1.06, + "grad_norm": 0.435546875, + "learning_rate": 0.0001839681776499627, + "loss": 0.2688, + "step": 7928 + }, + { + "epoch": 1.06, + "grad_norm": 0.54296875, + "learning_rate": 0.00018396185291959084, + "loss": 0.4609, + "step": 7929 + }, + { + "epoch": 1.06, + "grad_norm": 0.59765625, + "learning_rate": 0.0001839555270506423, + "loss": 0.6896, + "step": 7930 + }, + { + "epoch": 1.06, + "grad_norm": 0.453125, + "learning_rate": 0.00018394920004320296, + "loss": 0.3903, + "step": 7931 + }, + { + "epoch": 1.06, + "grad_norm": 0.478515625, + "learning_rate": 0.0001839428718973586, + "loss": 0.4778, + "step": 7932 + }, + { + "epoch": 1.06, + "grad_norm": 0.54296875, + "learning_rate": 0.000183936542613195, + "loss": 0.4359, + "step": 7933 + }, + { + "epoch": 1.06, + "grad_norm": 0.5625, + "learning_rate": 0.00018393021219079804, + "loss": 0.5093, + "step": 7934 + }, + { + "epoch": 1.06, + "grad_norm": 0.474609375, + "learning_rate": 0.00018392388063025351, + "loss": 0.3897, + "step": 7935 + }, + { + "epoch": 1.06, + "grad_norm": 0.498046875, + "learning_rate": 0.00018391754793164731, + "loss": 0.4333, + "step": 7936 + }, + { + "epoch": 1.06, + "grad_norm": 0.44140625, + "learning_rate": 0.0001839112140950653, + "loss": 0.3375, + "step": 7937 + }, + { + "epoch": 1.06, + "grad_norm": 0.443359375, + "learning_rate": 0.00018390487912059339, + "loss": 0.2369, + "step": 7938 + }, + { + "epoch": 1.06, + "grad_norm": 0.71484375, + "learning_rate": 0.00018389854300831745, + "loss": 0.3183, + "step": 7939 + }, + { + "epoch": 1.06, + "grad_norm": 0.51953125, + "learning_rate": 0.0001838922057583234, + "loss": 0.3791, + "step": 7940 + }, + { + "epoch": 1.06, + "grad_norm": 0.72265625, + "learning_rate": 0.00018388586737069722, + "loss": 0.3563, + "step": 7941 + }, + { + "epoch": 1.06, + "grad_norm": 0.55078125, + "learning_rate": 0.00018387952784552485, + "loss": 0.5316, + "step": 7942 + }, + { + "epoch": 1.06, + "grad_norm": 0.66796875, + "learning_rate": 0.00018387318718289227, + "loss": 0.5338, + "step": 7943 + }, + { + "epoch": 1.06, + "grad_norm": 0.6875, + "learning_rate": 0.0001838668453828854, + "loss": 0.3475, + "step": 7944 + }, + { + "epoch": 1.06, + "grad_norm": 0.6796875, + "learning_rate": 0.0001838605024455903, + "loss": 0.3112, + "step": 7945 + }, + { + "epoch": 1.06, + "grad_norm": 0.75, + "learning_rate": 0.00018385415837109296, + "loss": 0.4452, + "step": 7946 + }, + { + "epoch": 1.06, + "grad_norm": 0.609375, + "learning_rate": 0.0001838478131594794, + "loss": 0.4887, + "step": 7947 + }, + { + "epoch": 1.06, + "grad_norm": 0.4921875, + "learning_rate": 0.0001838414668108357, + "loss": 0.4539, + "step": 7948 + }, + { + "epoch": 1.06, + "grad_norm": 0.71484375, + "learning_rate": 0.0001838351193252479, + "loss": 0.3613, + "step": 7949 + }, + { + "epoch": 1.06, + "grad_norm": 0.51953125, + "learning_rate": 0.0001838287707028021, + "loss": 0.4281, + "step": 7950 + }, + { + "epoch": 1.06, + "grad_norm": 0.56640625, + "learning_rate": 0.00018382242094358434, + "loss": 0.3417, + "step": 7951 + }, + { + "epoch": 1.06, + "grad_norm": 0.62890625, + "learning_rate": 0.00018381607004768077, + "loss": 0.4077, + "step": 7952 + }, + { + "epoch": 1.06, + "grad_norm": 0.68359375, + "learning_rate": 0.00018380971801517745, + "loss": 0.535, + "step": 7953 + }, + { + "epoch": 1.06, + "grad_norm": 0.48046875, + "learning_rate": 0.0001838033648461606, + "loss": 0.4453, + "step": 7954 + }, + { + "epoch": 1.06, + "grad_norm": 0.609375, + "learning_rate": 0.00018379701054071633, + "loss": 0.28, + "step": 7955 + }, + { + "epoch": 1.06, + "grad_norm": 0.5390625, + "learning_rate": 0.00018379065509893083, + "loss": 0.5324, + "step": 7956 + }, + { + "epoch": 1.06, + "grad_norm": 0.6953125, + "learning_rate": 0.00018378429852089029, + "loss": 0.4999, + "step": 7957 + }, + { + "epoch": 1.06, + "grad_norm": 0.5390625, + "learning_rate": 0.00018377794080668086, + "loss": 0.5, + "step": 7958 + }, + { + "epoch": 1.06, + "grad_norm": 0.69140625, + "learning_rate": 0.00018377158195638876, + "loss": 0.4146, + "step": 7959 + }, + { + "epoch": 1.06, + "grad_norm": 0.73828125, + "learning_rate": 0.0001837652219701003, + "loss": 0.4898, + "step": 7960 + }, + { + "epoch": 1.06, + "grad_norm": 0.447265625, + "learning_rate": 0.00018375886084790163, + "loss": 0.2888, + "step": 7961 + }, + { + "epoch": 1.06, + "grad_norm": 0.400390625, + "learning_rate": 0.0001837524985898791, + "loss": 0.497, + "step": 7962 + }, + { + "epoch": 1.06, + "grad_norm": 0.55859375, + "learning_rate": 0.00018374613519611888, + "loss": 0.6487, + "step": 7963 + }, + { + "epoch": 1.06, + "grad_norm": 0.47265625, + "learning_rate": 0.00018373977066670733, + "loss": 0.413, + "step": 7964 + }, + { + "epoch": 1.06, + "grad_norm": 0.64453125, + "learning_rate": 0.00018373340500173074, + "loss": 0.591, + "step": 7965 + }, + { + "epoch": 1.06, + "grad_norm": 0.65625, + "learning_rate": 0.0001837270382012755, + "loss": 0.3769, + "step": 7966 + }, + { + "epoch": 1.06, + "grad_norm": 0.52734375, + "learning_rate": 0.0001837206702654278, + "loss": 0.1998, + "step": 7967 + }, + { + "epoch": 1.06, + "grad_norm": 0.71484375, + "learning_rate": 0.00018371430119427413, + "loss": 0.4397, + "step": 7968 + }, + { + "epoch": 1.06, + "grad_norm": 0.54296875, + "learning_rate": 0.0001837079309879008, + "loss": 0.4309, + "step": 7969 + }, + { + "epoch": 1.06, + "grad_norm": 0.55859375, + "learning_rate": 0.0001837015596463942, + "loss": 0.4524, + "step": 7970 + }, + { + "epoch": 1.06, + "grad_norm": 0.65625, + "learning_rate": 0.00018369518716984074, + "loss": 0.4458, + "step": 7971 + }, + { + "epoch": 1.06, + "grad_norm": 0.62109375, + "learning_rate": 0.00018368881355832682, + "loss": 0.5472, + "step": 7972 + }, + { + "epoch": 1.06, + "grad_norm": 0.59375, + "learning_rate": 0.00018368243881193886, + "loss": 0.3698, + "step": 7973 + }, + { + "epoch": 1.06, + "grad_norm": 0.671875, + "learning_rate": 0.00018367606293076335, + "loss": 0.3177, + "step": 7974 + }, + { + "epoch": 1.06, + "grad_norm": 0.75390625, + "learning_rate": 0.00018366968591488673, + "loss": 0.2581, + "step": 7975 + }, + { + "epoch": 1.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00018366330776439544, + "loss": 0.5432, + "step": 7976 + }, + { + "epoch": 1.06, + "grad_norm": 0.62109375, + "learning_rate": 0.00018365692847937602, + "loss": 0.357, + "step": 7977 + }, + { + "epoch": 1.06, + "grad_norm": 0.875, + "learning_rate": 0.00018365054805991496, + "loss": 0.4451, + "step": 7978 + }, + { + "epoch": 1.06, + "grad_norm": 0.74609375, + "learning_rate": 0.0001836441665060988, + "loss": 0.3831, + "step": 7979 + }, + { + "epoch": 1.06, + "grad_norm": 0.59765625, + "learning_rate": 0.00018363778381801402, + "loss": 0.4788, + "step": 7980 + }, + { + "epoch": 1.06, + "grad_norm": 0.6484375, + "learning_rate": 0.00018363139999574724, + "loss": 0.35, + "step": 7981 + }, + { + "epoch": 1.07, + "grad_norm": 0.66015625, + "learning_rate": 0.00018362501503938497, + "loss": 0.3455, + "step": 7982 + }, + { + "epoch": 1.07, + "grad_norm": 0.6796875, + "learning_rate": 0.00018361862894901386, + "loss": 0.3988, + "step": 7983 + }, + { + "epoch": 1.07, + "grad_norm": 0.82421875, + "learning_rate": 0.00018361224172472044, + "loss": 0.6209, + "step": 7984 + }, + { + "epoch": 1.07, + "grad_norm": 0.45703125, + "learning_rate": 0.0001836058533665914, + "loss": 0.2173, + "step": 7985 + }, + { + "epoch": 1.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00018359946387471333, + "loss": 0.4144, + "step": 7986 + }, + { + "epoch": 1.07, + "grad_norm": 0.380859375, + "learning_rate": 0.00018359307324917286, + "loss": 0.1859, + "step": 7987 + }, + { + "epoch": 1.07, + "grad_norm": 0.79296875, + "learning_rate": 0.00018358668149005665, + "loss": 0.3117, + "step": 7988 + }, + { + "epoch": 1.07, + "grad_norm": 0.5234375, + "learning_rate": 0.0001835802885974514, + "loss": 0.4269, + "step": 7989 + }, + { + "epoch": 1.07, + "grad_norm": 0.451171875, + "learning_rate": 0.00018357389457144383, + "loss": 0.2884, + "step": 7990 + }, + { + "epoch": 1.07, + "grad_norm": 0.4140625, + "learning_rate": 0.00018356749941212058, + "loss": 0.1858, + "step": 7991 + }, + { + "epoch": 1.07, + "grad_norm": 0.51953125, + "learning_rate": 0.0001835611031195684, + "loss": 0.6726, + "step": 7992 + }, + { + "epoch": 1.07, + "grad_norm": 0.6171875, + "learning_rate": 0.00018355470569387405, + "loss": 0.3577, + "step": 7993 + }, + { + "epoch": 1.07, + "grad_norm": 0.75, + "learning_rate": 0.00018354830713512428, + "loss": 0.6256, + "step": 7994 + }, + { + "epoch": 1.07, + "grad_norm": 0.53125, + "learning_rate": 0.0001835419074434058, + "loss": 0.3846, + "step": 7995 + }, + { + "epoch": 1.07, + "grad_norm": 0.498046875, + "learning_rate": 0.00018353550661880547, + "loss": 0.6718, + "step": 7996 + }, + { + "epoch": 1.07, + "grad_norm": 0.58203125, + "learning_rate": 0.00018352910466141006, + "loss": 0.2426, + "step": 7997 + }, + { + "epoch": 1.07, + "grad_norm": 0.64453125, + "learning_rate": 0.00018352270157130635, + "loss": 0.3943, + "step": 7998 + }, + { + "epoch": 1.07, + "grad_norm": 0.54296875, + "learning_rate": 0.00018351629734858125, + "loss": 0.3956, + "step": 7999 + }, + { + "epoch": 1.07, + "grad_norm": 0.8515625, + "learning_rate": 0.00018350989199332154, + "loss": 0.4058, + "step": 8000 + }, + { + "epoch": 1.07, + "grad_norm": 0.6015625, + "learning_rate": 0.0001835034855056141, + "loss": 0.5586, + "step": 8001 + }, + { + "epoch": 1.07, + "grad_norm": 0.53125, + "learning_rate": 0.00018349707788554577, + "loss": 0.2602, + "step": 8002 + }, + { + "epoch": 1.07, + "grad_norm": 0.60546875, + "learning_rate": 0.00018349066913320348, + "loss": 0.5706, + "step": 8003 + }, + { + "epoch": 1.07, + "grad_norm": 0.427734375, + "learning_rate": 0.00018348425924867416, + "loss": 0.3486, + "step": 8004 + }, + { + "epoch": 1.07, + "grad_norm": 0.404296875, + "learning_rate": 0.00018347784823204472, + "loss": 0.4521, + "step": 8005 + }, + { + "epoch": 1.07, + "grad_norm": 0.77734375, + "learning_rate": 0.00018347143608340205, + "loss": 0.6115, + "step": 8006 + }, + { + "epoch": 1.07, + "grad_norm": 0.74609375, + "learning_rate": 0.00018346502280283314, + "loss": 0.32, + "step": 8007 + }, + { + "epoch": 1.07, + "grad_norm": 0.54296875, + "learning_rate": 0.00018345860839042496, + "loss": 0.3962, + "step": 8008 + }, + { + "epoch": 1.07, + "grad_norm": 0.58984375, + "learning_rate": 0.0001834521928462645, + "loss": 0.363, + "step": 8009 + }, + { + "epoch": 1.07, + "grad_norm": 0.51171875, + "learning_rate": 0.00018344577617043873, + "loss": 0.3872, + "step": 8010 + }, + { + "epoch": 1.07, + "grad_norm": 0.66015625, + "learning_rate": 0.00018343935836303467, + "loss": 0.3519, + "step": 8011 + }, + { + "epoch": 1.07, + "grad_norm": 0.484375, + "learning_rate": 0.00018343293942413936, + "loss": 0.2739, + "step": 8012 + }, + { + "epoch": 1.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00018342651935383987, + "loss": 0.411, + "step": 8013 + }, + { + "epoch": 1.07, + "grad_norm": 0.609375, + "learning_rate": 0.00018342009815222322, + "loss": 0.4446, + "step": 8014 + }, + { + "epoch": 1.07, + "grad_norm": 0.703125, + "learning_rate": 0.0001834136758193765, + "loss": 0.5691, + "step": 8015 + }, + { + "epoch": 1.07, + "grad_norm": 0.5703125, + "learning_rate": 0.00018340725235538682, + "loss": 0.6306, + "step": 8016 + }, + { + "epoch": 1.07, + "grad_norm": 0.703125, + "learning_rate": 0.00018340082776034124, + "loss": 0.5642, + "step": 8017 + }, + { + "epoch": 1.07, + "grad_norm": 0.59765625, + "learning_rate": 0.00018339440203432692, + "loss": 0.4435, + "step": 8018 + }, + { + "epoch": 1.07, + "grad_norm": 0.39453125, + "learning_rate": 0.000183387975177431, + "loss": 0.4889, + "step": 8019 + }, + { + "epoch": 1.07, + "grad_norm": 0.53515625, + "learning_rate": 0.00018338154718974058, + "loss": 0.5141, + "step": 8020 + }, + { + "epoch": 1.07, + "grad_norm": 0.515625, + "learning_rate": 0.0001833751180713429, + "loss": 0.5162, + "step": 8021 + }, + { + "epoch": 1.07, + "grad_norm": 0.50390625, + "learning_rate": 0.0001833686878223251, + "loss": 0.2564, + "step": 8022 + }, + { + "epoch": 1.07, + "grad_norm": 0.62890625, + "learning_rate": 0.0001833622564427744, + "loss": 0.2705, + "step": 8023 + }, + { + "epoch": 1.07, + "grad_norm": 0.79296875, + "learning_rate": 0.000183355823932778, + "loss": 0.1869, + "step": 8024 + }, + { + "epoch": 1.07, + "grad_norm": 0.5546875, + "learning_rate": 0.00018334939029242308, + "loss": 0.3673, + "step": 8025 + }, + { + "epoch": 1.07, + "grad_norm": 0.392578125, + "learning_rate": 0.00018334295552179696, + "loss": 0.5648, + "step": 8026 + }, + { + "epoch": 1.07, + "grad_norm": 0.6171875, + "learning_rate": 0.0001833365196209869, + "loss": 0.1967, + "step": 8027 + }, + { + "epoch": 1.07, + "grad_norm": 1.078125, + "learning_rate": 0.00018333008259008014, + "loss": 0.2248, + "step": 8028 + }, + { + "epoch": 1.07, + "grad_norm": 0.54296875, + "learning_rate": 0.000183323644429164, + "loss": 0.3075, + "step": 8029 + }, + { + "epoch": 1.07, + "grad_norm": 0.43359375, + "learning_rate": 0.0001833172051383257, + "loss": 0.4033, + "step": 8030 + }, + { + "epoch": 1.07, + "grad_norm": 0.4453125, + "learning_rate": 0.00018331076471765268, + "loss": 0.3328, + "step": 8031 + }, + { + "epoch": 1.07, + "grad_norm": 0.5703125, + "learning_rate": 0.0001833043231672322, + "loss": 0.3327, + "step": 8032 + }, + { + "epoch": 1.07, + "grad_norm": 0.734375, + "learning_rate": 0.00018329788048715163, + "loss": 0.3756, + "step": 8033 + }, + { + "epoch": 1.07, + "grad_norm": 0.458984375, + "learning_rate": 0.00018329143667749836, + "loss": 0.2946, + "step": 8034 + }, + { + "epoch": 1.07, + "grad_norm": 0.5, + "learning_rate": 0.00018328499173835976, + "loss": 0.4591, + "step": 8035 + }, + { + "epoch": 1.07, + "grad_norm": 0.72265625, + "learning_rate": 0.0001832785456698232, + "loss": 0.5158, + "step": 8036 + }, + { + "epoch": 1.07, + "grad_norm": 0.59375, + "learning_rate": 0.0001832720984719761, + "loss": 0.3837, + "step": 8037 + }, + { + "epoch": 1.07, + "grad_norm": 0.42578125, + "learning_rate": 0.00018326565014490593, + "loss": 0.4252, + "step": 8038 + }, + { + "epoch": 1.07, + "grad_norm": 0.76171875, + "learning_rate": 0.00018325920068870008, + "loss": 0.5704, + "step": 8039 + }, + { + "epoch": 1.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00018325275010344606, + "loss": 0.5081, + "step": 8040 + }, + { + "epoch": 1.07, + "grad_norm": 0.64453125, + "learning_rate": 0.00018324629838923132, + "loss": 0.5834, + "step": 8041 + }, + { + "epoch": 1.07, + "grad_norm": 0.455078125, + "learning_rate": 0.0001832398455461433, + "loss": 0.3704, + "step": 8042 + }, + { + "epoch": 1.07, + "grad_norm": 0.6015625, + "learning_rate": 0.00018323339157426958, + "loss": 0.4749, + "step": 8043 + }, + { + "epoch": 1.07, + "grad_norm": 0.357421875, + "learning_rate": 0.00018322693647369767, + "loss": 0.1329, + "step": 8044 + }, + { + "epoch": 1.07, + "grad_norm": 0.63671875, + "learning_rate": 0.00018322048024451505, + "loss": 0.3923, + "step": 8045 + }, + { + "epoch": 1.07, + "grad_norm": 0.609375, + "learning_rate": 0.00018321402288680932, + "loss": 0.4507, + "step": 8046 + }, + { + "epoch": 1.07, + "grad_norm": 0.6328125, + "learning_rate": 0.00018320756440066806, + "loss": 0.4237, + "step": 8047 + }, + { + "epoch": 1.07, + "grad_norm": 0.427734375, + "learning_rate": 0.0001832011047861788, + "loss": 0.3606, + "step": 8048 + }, + { + "epoch": 1.07, + "grad_norm": 0.80078125, + "learning_rate": 0.00018319464404342915, + "loss": 0.3526, + "step": 8049 + }, + { + "epoch": 1.07, + "grad_norm": 0.578125, + "learning_rate": 0.00018318818217250677, + "loss": 0.4443, + "step": 8050 + }, + { + "epoch": 1.07, + "grad_norm": 0.60546875, + "learning_rate": 0.00018318171917349922, + "loss": 0.4863, + "step": 8051 + }, + { + "epoch": 1.07, + "grad_norm": 0.423828125, + "learning_rate": 0.00018317525504649417, + "loss": 0.4761, + "step": 8052 + }, + { + "epoch": 1.07, + "grad_norm": 0.5625, + "learning_rate": 0.0001831687897915793, + "loss": 0.481, + "step": 8053 + }, + { + "epoch": 1.07, + "grad_norm": 1.0546875, + "learning_rate": 0.00018316232340884225, + "loss": 0.5517, + "step": 8054 + }, + { + "epoch": 1.07, + "grad_norm": 0.490234375, + "learning_rate": 0.0001831558558983707, + "loss": 0.3912, + "step": 8055 + }, + { + "epoch": 1.07, + "grad_norm": 0.412109375, + "learning_rate": 0.00018314938726025242, + "loss": 0.3235, + "step": 8056 + }, + { + "epoch": 1.08, + "grad_norm": 0.52734375, + "learning_rate": 0.0001831429174945751, + "loss": 0.528, + "step": 8057 + }, + { + "epoch": 1.08, + "grad_norm": 0.36328125, + "learning_rate": 0.0001831364466014264, + "loss": 0.3433, + "step": 8058 + }, + { + "epoch": 1.08, + "grad_norm": 0.55078125, + "learning_rate": 0.00018312997458089412, + "loss": 0.2226, + "step": 8059 + }, + { + "epoch": 1.08, + "grad_norm": 0.59375, + "learning_rate": 0.00018312350143306606, + "loss": 0.2696, + "step": 8060 + }, + { + "epoch": 1.08, + "grad_norm": 1.078125, + "learning_rate": 0.00018311702715802998, + "loss": 0.4112, + "step": 8061 + }, + { + "epoch": 1.08, + "grad_norm": 0.6796875, + "learning_rate": 0.00018311055175587368, + "loss": 0.3605, + "step": 8062 + }, + { + "epoch": 1.08, + "grad_norm": 0.5078125, + "learning_rate": 0.00018310407522668492, + "loss": 0.4212, + "step": 8063 + }, + { + "epoch": 1.08, + "grad_norm": 0.5078125, + "learning_rate": 0.0001830975975705516, + "loss": 0.3668, + "step": 8064 + }, + { + "epoch": 1.08, + "grad_norm": 0.5390625, + "learning_rate": 0.00018309111878756152, + "loss": 0.3474, + "step": 8065 + }, + { + "epoch": 1.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00018308463887780252, + "loss": 0.4109, + "step": 8066 + }, + { + "epoch": 1.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0001830781578413625, + "loss": 0.212, + "step": 8067 + }, + { + "epoch": 1.08, + "grad_norm": 0.62890625, + "learning_rate": 0.00018307167567832932, + "loss": 0.5518, + "step": 8068 + }, + { + "epoch": 1.08, + "grad_norm": 0.6484375, + "learning_rate": 0.00018306519238879093, + "loss": 0.4505, + "step": 8069 + }, + { + "epoch": 1.08, + "grad_norm": 0.890625, + "learning_rate": 0.00018305870797283524, + "loss": 0.368, + "step": 8070 + }, + { + "epoch": 1.08, + "grad_norm": 0.77734375, + "learning_rate": 0.00018305222243055014, + "loss": 0.574, + "step": 8071 + }, + { + "epoch": 1.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0001830457357620236, + "loss": 0.5006, + "step": 8072 + }, + { + "epoch": 1.08, + "grad_norm": 0.62109375, + "learning_rate": 0.0001830392479673436, + "loss": 0.7607, + "step": 8073 + }, + { + "epoch": 1.08, + "grad_norm": 0.45703125, + "learning_rate": 0.00018303275904659806, + "loss": 0.4073, + "step": 8074 + }, + { + "epoch": 1.08, + "grad_norm": 0.70703125, + "learning_rate": 0.00018302626899987506, + "loss": 0.5798, + "step": 8075 + }, + { + "epoch": 1.08, + "grad_norm": 0.42578125, + "learning_rate": 0.00018301977782726256, + "loss": 0.251, + "step": 8076 + }, + { + "epoch": 1.08, + "grad_norm": 0.75390625, + "learning_rate": 0.00018301328552884858, + "loss": 0.4269, + "step": 8077 + }, + { + "epoch": 1.08, + "grad_norm": 0.58984375, + "learning_rate": 0.00018300679210472116, + "loss": 0.2883, + "step": 8078 + }, + { + "epoch": 1.08, + "grad_norm": 0.6640625, + "learning_rate": 0.00018300029755496838, + "loss": 0.3114, + "step": 8079 + }, + { + "epoch": 1.08, + "grad_norm": 0.78515625, + "learning_rate": 0.00018299380187967833, + "loss": 0.6837, + "step": 8080 + }, + { + "epoch": 1.08, + "grad_norm": 0.416015625, + "learning_rate": 0.00018298730507893901, + "loss": 0.4341, + "step": 8081 + }, + { + "epoch": 1.08, + "grad_norm": 0.62890625, + "learning_rate": 0.00018298080715283858, + "loss": 0.3732, + "step": 8082 + }, + { + "epoch": 1.08, + "grad_norm": 0.4375, + "learning_rate": 0.00018297430810146516, + "loss": 0.4771, + "step": 8083 + }, + { + "epoch": 1.08, + "grad_norm": 0.51953125, + "learning_rate": 0.00018296780792490685, + "loss": 0.3087, + "step": 8084 + }, + { + "epoch": 1.08, + "grad_norm": 0.45703125, + "learning_rate": 0.00018296130662325187, + "loss": 0.1937, + "step": 8085 + }, + { + "epoch": 1.08, + "grad_norm": 0.5546875, + "learning_rate": 0.0001829548041965883, + "loss": 0.5373, + "step": 8086 + }, + { + "epoch": 1.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00018294830064500433, + "loss": 0.3989, + "step": 8087 + }, + { + "epoch": 1.08, + "grad_norm": 0.7265625, + "learning_rate": 0.00018294179596858816, + "loss": 0.3443, + "step": 8088 + }, + { + "epoch": 1.08, + "grad_norm": 0.765625, + "learning_rate": 0.00018293529016742802, + "loss": 0.5822, + "step": 8089 + }, + { + "epoch": 1.08, + "grad_norm": 0.87890625, + "learning_rate": 0.00018292878324161215, + "loss": 0.5198, + "step": 8090 + }, + { + "epoch": 1.08, + "grad_norm": 0.486328125, + "learning_rate": 0.00018292227519122869, + "loss": 0.3317, + "step": 8091 + }, + { + "epoch": 1.08, + "grad_norm": 0.50390625, + "learning_rate": 0.00018291576601636597, + "loss": 0.5296, + "step": 8092 + }, + { + "epoch": 1.08, + "grad_norm": 0.56640625, + "learning_rate": 0.0001829092557171123, + "loss": 0.348, + "step": 8093 + }, + { + "epoch": 1.08, + "grad_norm": 0.50390625, + "learning_rate": 0.00018290274429355588, + "loss": 0.3376, + "step": 8094 + }, + { + "epoch": 1.08, + "grad_norm": 0.5078125, + "learning_rate": 0.00018289623174578504, + "loss": 0.3408, + "step": 8095 + }, + { + "epoch": 1.08, + "grad_norm": 0.55078125, + "learning_rate": 0.0001828897180738881, + "loss": 0.1948, + "step": 8096 + }, + { + "epoch": 1.08, + "grad_norm": 0.7890625, + "learning_rate": 0.0001828832032779534, + "loss": 0.6047, + "step": 8097 + }, + { + "epoch": 1.08, + "grad_norm": 0.5390625, + "learning_rate": 0.00018287668735806923, + "loss": 0.4146, + "step": 8098 + }, + { + "epoch": 1.08, + "grad_norm": 0.421875, + "learning_rate": 0.00018287017031432402, + "loss": 0.4125, + "step": 8099 + }, + { + "epoch": 1.08, + "grad_norm": 0.69140625, + "learning_rate": 0.0001828636521468061, + "loss": 0.4792, + "step": 8100 + }, + { + "epoch": 1.08, + "grad_norm": 0.55859375, + "learning_rate": 0.00018285713285560385, + "loss": 0.6892, + "step": 8101 + }, + { + "epoch": 1.08, + "grad_norm": 0.52734375, + "learning_rate": 0.00018285061244080578, + "loss": 0.3609, + "step": 8102 + }, + { + "epoch": 1.08, + "grad_norm": 0.4765625, + "learning_rate": 0.00018284409090250017, + "loss": 0.4799, + "step": 8103 + }, + { + "epoch": 1.08, + "grad_norm": 0.7890625, + "learning_rate": 0.00018283756824077554, + "loss": 0.4771, + "step": 8104 + }, + { + "epoch": 1.08, + "grad_norm": 0.6171875, + "learning_rate": 0.00018283104445572033, + "loss": 0.2897, + "step": 8105 + }, + { + "epoch": 1.08, + "grad_norm": 0.43359375, + "learning_rate": 0.00018282451954742296, + "loss": 0.1413, + "step": 8106 + }, + { + "epoch": 1.08, + "grad_norm": 0.462890625, + "learning_rate": 0.00018281799351597196, + "loss": 0.1981, + "step": 8107 + }, + { + "epoch": 1.08, + "grad_norm": 0.64453125, + "learning_rate": 0.00018281146636145584, + "loss": 0.437, + "step": 8108 + }, + { + "epoch": 1.08, + "grad_norm": 0.59375, + "learning_rate": 0.00018280493808396305, + "loss": 0.5675, + "step": 8109 + }, + { + "epoch": 1.08, + "grad_norm": 0.54296875, + "learning_rate": 0.0001827984086835822, + "loss": 0.3377, + "step": 8110 + }, + { + "epoch": 1.08, + "grad_norm": 0.5078125, + "learning_rate": 0.00018279187816040176, + "loss": 0.2952, + "step": 8111 + }, + { + "epoch": 1.08, + "grad_norm": 0.546875, + "learning_rate": 0.00018278534651451032, + "loss": 0.393, + "step": 8112 + }, + { + "epoch": 1.08, + "grad_norm": 0.625, + "learning_rate": 0.00018277881374599643, + "loss": 0.3935, + "step": 8113 + }, + { + "epoch": 1.08, + "grad_norm": 0.4609375, + "learning_rate": 0.00018277227985494875, + "loss": 0.3036, + "step": 8114 + }, + { + "epoch": 1.08, + "grad_norm": 0.84375, + "learning_rate": 0.00018276574484145578, + "loss": 0.5514, + "step": 8115 + }, + { + "epoch": 1.08, + "grad_norm": 0.455078125, + "learning_rate": 0.00018275920870560624, + "loss": 0.4222, + "step": 8116 + }, + { + "epoch": 1.08, + "grad_norm": 0.76953125, + "learning_rate": 0.00018275267144748868, + "loss": 0.1972, + "step": 8117 + }, + { + "epoch": 1.08, + "grad_norm": 0.8828125, + "learning_rate": 0.0001827461330671918, + "loss": 0.2439, + "step": 8118 + }, + { + "epoch": 1.08, + "grad_norm": 0.48046875, + "learning_rate": 0.00018273959356480423, + "loss": 0.405, + "step": 8119 + }, + { + "epoch": 1.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0001827330529404147, + "loss": 0.4728, + "step": 8120 + }, + { + "epoch": 1.08, + "grad_norm": 0.57421875, + "learning_rate": 0.00018272651119411186, + "loss": 0.5431, + "step": 8121 + }, + { + "epoch": 1.08, + "grad_norm": 0.515625, + "learning_rate": 0.00018271996832598442, + "loss": 0.5013, + "step": 8122 + }, + { + "epoch": 1.08, + "grad_norm": 0.703125, + "learning_rate": 0.00018271342433612113, + "loss": 0.5136, + "step": 8123 + }, + { + "epoch": 1.08, + "grad_norm": 0.59375, + "learning_rate": 0.00018270687922461074, + "loss": 0.5265, + "step": 8124 + }, + { + "epoch": 1.08, + "grad_norm": 0.640625, + "learning_rate": 0.00018270033299154197, + "loss": 0.6353, + "step": 8125 + }, + { + "epoch": 1.08, + "grad_norm": 0.5, + "learning_rate": 0.00018269378563700358, + "loss": 0.4427, + "step": 8126 + }, + { + "epoch": 1.08, + "grad_norm": 0.63671875, + "learning_rate": 0.00018268723716108445, + "loss": 0.2653, + "step": 8127 + }, + { + "epoch": 1.08, + "grad_norm": 0.6328125, + "learning_rate": 0.00018268068756387327, + "loss": 0.6365, + "step": 8128 + }, + { + "epoch": 1.08, + "grad_norm": 0.61328125, + "learning_rate": 0.00018267413684545892, + "loss": 0.6174, + "step": 8129 + }, + { + "epoch": 1.08, + "grad_norm": 0.73046875, + "learning_rate": 0.0001826675850059302, + "loss": 0.6274, + "step": 8130 + }, + { + "epoch": 1.09, + "grad_norm": 0.55078125, + "learning_rate": 0.00018266103204537596, + "loss": 0.463, + "step": 8131 + }, + { + "epoch": 1.09, + "grad_norm": 0.5703125, + "learning_rate": 0.0001826544779638851, + "loss": 0.3502, + "step": 8132 + }, + { + "epoch": 1.09, + "grad_norm": 0.6171875, + "learning_rate": 0.00018264792276154648, + "loss": 0.354, + "step": 8133 + }, + { + "epoch": 1.09, + "grad_norm": 0.6875, + "learning_rate": 0.00018264136643844896, + "loss": 0.3003, + "step": 8134 + }, + { + "epoch": 1.09, + "grad_norm": 0.439453125, + "learning_rate": 0.00018263480899468148, + "loss": 0.3251, + "step": 8135 + }, + { + "epoch": 1.09, + "grad_norm": 0.63671875, + "learning_rate": 0.00018262825043033297, + "loss": 0.5203, + "step": 8136 + }, + { + "epoch": 1.09, + "grad_norm": 0.4140625, + "learning_rate": 0.00018262169074549232, + "loss": 0.3457, + "step": 8137 + }, + { + "epoch": 1.09, + "grad_norm": 0.7578125, + "learning_rate": 0.00018261512994024854, + "loss": 0.4327, + "step": 8138 + }, + { + "epoch": 1.09, + "grad_norm": 0.83203125, + "learning_rate": 0.00018260856801469058, + "loss": 0.3499, + "step": 8139 + }, + { + "epoch": 1.09, + "grad_norm": 0.41015625, + "learning_rate": 0.0001826020049689074, + "loss": 0.3067, + "step": 8140 + }, + { + "epoch": 1.09, + "grad_norm": 0.640625, + "learning_rate": 0.00018259544080298802, + "loss": 0.3765, + "step": 8141 + }, + { + "epoch": 1.09, + "grad_norm": 0.6171875, + "learning_rate": 0.00018258887551702148, + "loss": 0.1948, + "step": 8142 + }, + { + "epoch": 1.09, + "grad_norm": 0.47265625, + "learning_rate": 0.00018258230911109675, + "loss": 0.3393, + "step": 8143 + }, + { + "epoch": 1.09, + "grad_norm": 0.87890625, + "learning_rate": 0.00018257574158530294, + "loss": 0.2718, + "step": 8144 + }, + { + "epoch": 1.09, + "grad_norm": 0.43359375, + "learning_rate": 0.00018256917293972907, + "loss": 0.3871, + "step": 8145 + }, + { + "epoch": 1.09, + "grad_norm": 0.61328125, + "learning_rate": 0.0001825626031744642, + "loss": 0.2862, + "step": 8146 + }, + { + "epoch": 1.09, + "grad_norm": 0.6796875, + "learning_rate": 0.0001825560322895975, + "loss": 0.7018, + "step": 8147 + }, + { + "epoch": 1.09, + "grad_norm": 0.51171875, + "learning_rate": 0.00018254946028521793, + "loss": 0.3523, + "step": 8148 + }, + { + "epoch": 1.09, + "grad_norm": 0.51171875, + "learning_rate": 0.00018254288716141475, + "loss": 0.5171, + "step": 8149 + }, + { + "epoch": 1.09, + "grad_norm": 0.54296875, + "learning_rate": 0.00018253631291827705, + "loss": 0.5338, + "step": 8150 + }, + { + "epoch": 1.09, + "grad_norm": 0.640625, + "learning_rate": 0.00018252973755589395, + "loss": 0.2878, + "step": 8151 + }, + { + "epoch": 1.09, + "grad_norm": 0.578125, + "learning_rate": 0.00018252316107435465, + "loss": 0.405, + "step": 8152 + }, + { + "epoch": 1.09, + "grad_norm": 0.56640625, + "learning_rate": 0.00018251658347374835, + "loss": 0.4405, + "step": 8153 + }, + { + "epoch": 1.09, + "grad_norm": 0.5234375, + "learning_rate": 0.00018251000475416417, + "loss": 0.3014, + "step": 8154 + }, + { + "epoch": 1.09, + "grad_norm": 0.55859375, + "learning_rate": 0.0001825034249156914, + "loss": 0.4565, + "step": 8155 + }, + { + "epoch": 1.09, + "grad_norm": 0.458984375, + "learning_rate": 0.00018249684395841925, + "loss": 0.5791, + "step": 8156 + }, + { + "epoch": 1.09, + "grad_norm": 0.7578125, + "learning_rate": 0.00018249026188243693, + "loss": 0.5046, + "step": 8157 + }, + { + "epoch": 1.09, + "grad_norm": 0.53125, + "learning_rate": 0.0001824836786878337, + "loss": 0.2281, + "step": 8158 + }, + { + "epoch": 1.09, + "grad_norm": 0.72265625, + "learning_rate": 0.0001824770943746989, + "loss": 0.4855, + "step": 8159 + }, + { + "epoch": 1.09, + "grad_norm": 0.515625, + "learning_rate": 0.00018247050894312176, + "loss": 0.2733, + "step": 8160 + }, + { + "epoch": 1.09, + "grad_norm": 0.5078125, + "learning_rate": 0.00018246392239319154, + "loss": 0.3138, + "step": 8161 + }, + { + "epoch": 1.09, + "grad_norm": 0.478515625, + "learning_rate": 0.00018245733472499763, + "loss": 0.3415, + "step": 8162 + }, + { + "epoch": 1.09, + "grad_norm": 0.498046875, + "learning_rate": 0.00018245074593862938, + "loss": 0.3933, + "step": 8163 + }, + { + "epoch": 1.09, + "grad_norm": 0.76953125, + "learning_rate": 0.00018244415603417603, + "loss": 0.578, + "step": 8164 + }, + { + "epoch": 1.09, + "grad_norm": 0.76953125, + "learning_rate": 0.00018243756501172705, + "loss": 0.4116, + "step": 8165 + }, + { + "epoch": 1.09, + "grad_norm": 0.408203125, + "learning_rate": 0.0001824309728713718, + "loss": 0.3295, + "step": 8166 + }, + { + "epoch": 1.09, + "grad_norm": 0.439453125, + "learning_rate": 0.0001824243796131996, + "loss": 0.3305, + "step": 8167 + }, + { + "epoch": 1.09, + "grad_norm": 0.7265625, + "learning_rate": 0.00018241778523729995, + "loss": 0.3961, + "step": 8168 + }, + { + "epoch": 1.09, + "grad_norm": 0.55078125, + "learning_rate": 0.00018241118974376223, + "loss": 0.3671, + "step": 8169 + }, + { + "epoch": 1.09, + "grad_norm": 0.59375, + "learning_rate": 0.0001824045931326759, + "loss": 0.5359, + "step": 8170 + }, + { + "epoch": 1.09, + "grad_norm": 0.67578125, + "learning_rate": 0.00018239799540413037, + "loss": 0.4836, + "step": 8171 + }, + { + "epoch": 1.09, + "grad_norm": 0.65234375, + "learning_rate": 0.00018239139655821518, + "loss": 0.6506, + "step": 8172 + }, + { + "epoch": 1.09, + "grad_norm": 0.392578125, + "learning_rate": 0.00018238479659501975, + "loss": 0.197, + "step": 8173 + }, + { + "epoch": 1.09, + "grad_norm": 0.578125, + "learning_rate": 0.0001823781955146336, + "loss": 0.3666, + "step": 8174 + }, + { + "epoch": 1.09, + "grad_norm": 0.72265625, + "learning_rate": 0.00018237159331714625, + "loss": 0.515, + "step": 8175 + }, + { + "epoch": 1.09, + "grad_norm": 0.58203125, + "learning_rate": 0.0001823649900026472, + "loss": 0.5371, + "step": 8176 + }, + { + "epoch": 1.09, + "grad_norm": 0.39453125, + "learning_rate": 0.00018235838557122607, + "loss": 0.2384, + "step": 8177 + }, + { + "epoch": 1.09, + "grad_norm": 0.74609375, + "learning_rate": 0.00018235178002297238, + "loss": 0.317, + "step": 8178 + }, + { + "epoch": 1.09, + "grad_norm": 0.6015625, + "learning_rate": 0.00018234517335797567, + "loss": 0.438, + "step": 8179 + }, + { + "epoch": 1.09, + "grad_norm": 0.55859375, + "learning_rate": 0.0001823385655763256, + "loss": 0.2845, + "step": 8180 + }, + { + "epoch": 1.09, + "grad_norm": 0.515625, + "learning_rate": 0.0001823319566781117, + "loss": 0.7679, + "step": 8181 + }, + { + "epoch": 1.09, + "grad_norm": 0.66796875, + "learning_rate": 0.00018232534666342363, + "loss": 0.3263, + "step": 8182 + }, + { + "epoch": 1.09, + "grad_norm": 0.498046875, + "learning_rate": 0.000182318735532351, + "loss": 0.4006, + "step": 8183 + }, + { + "epoch": 1.09, + "grad_norm": 0.4609375, + "learning_rate": 0.00018231212328498355, + "loss": 0.2686, + "step": 8184 + }, + { + "epoch": 1.09, + "grad_norm": 0.51953125, + "learning_rate": 0.00018230550992141082, + "loss": 0.4691, + "step": 8185 + }, + { + "epoch": 1.09, + "grad_norm": 0.69140625, + "learning_rate": 0.0001822988954417226, + "loss": 0.5268, + "step": 8186 + }, + { + "epoch": 1.09, + "grad_norm": 0.578125, + "learning_rate": 0.0001822922798460085, + "loss": 0.4642, + "step": 8187 + }, + { + "epoch": 1.09, + "grad_norm": 0.57421875, + "learning_rate": 0.00018228566313435826, + "loss": 0.258, + "step": 8188 + }, + { + "epoch": 1.09, + "grad_norm": 0.53515625, + "learning_rate": 0.00018227904530686165, + "loss": 0.2934, + "step": 8189 + }, + { + "epoch": 1.09, + "grad_norm": 0.5625, + "learning_rate": 0.00018227242636360838, + "loss": 0.3686, + "step": 8190 + }, + { + "epoch": 1.09, + "grad_norm": 0.48046875, + "learning_rate": 0.00018226580630468822, + "loss": 0.4022, + "step": 8191 + }, + { + "epoch": 1.09, + "grad_norm": 0.70703125, + "learning_rate": 0.00018225918513019087, + "loss": 0.5111, + "step": 8192 + }, + { + "epoch": 1.09, + "grad_norm": 0.52734375, + "learning_rate": 0.0001822525628402062, + "loss": 0.3398, + "step": 8193 + }, + { + "epoch": 1.09, + "grad_norm": 0.4921875, + "learning_rate": 0.00018224593943482403, + "loss": 0.2107, + "step": 8194 + }, + { + "epoch": 1.09, + "grad_norm": 0.6875, + "learning_rate": 0.0001822393149141341, + "loss": 0.6162, + "step": 8195 + }, + { + "epoch": 1.09, + "grad_norm": 0.58984375, + "learning_rate": 0.00018223268927822626, + "loss": 0.701, + "step": 8196 + }, + { + "epoch": 1.09, + "grad_norm": 0.486328125, + "learning_rate": 0.00018222606252719044, + "loss": 0.2936, + "step": 8197 + }, + { + "epoch": 1.09, + "grad_norm": 0.578125, + "learning_rate": 0.00018221943466111638, + "loss": 0.1986, + "step": 8198 + }, + { + "epoch": 1.09, + "grad_norm": 0.62109375, + "learning_rate": 0.00018221280568009405, + "loss": 0.7957, + "step": 8199 + }, + { + "epoch": 1.09, + "grad_norm": 0.65234375, + "learning_rate": 0.00018220617558421332, + "loss": 0.472, + "step": 8200 + }, + { + "epoch": 1.09, + "grad_norm": 1.140625, + "learning_rate": 0.00018219954437356407, + "loss": 0.2322, + "step": 8201 + }, + { + "epoch": 1.09, + "grad_norm": 0.50390625, + "learning_rate": 0.00018219291204823624, + "loss": 0.3788, + "step": 8202 + }, + { + "epoch": 1.09, + "grad_norm": 0.5390625, + "learning_rate": 0.00018218627860831977, + "loss": 0.2745, + "step": 8203 + }, + { + "epoch": 1.09, + "grad_norm": 0.5625, + "learning_rate": 0.00018217964405390461, + "loss": 0.4143, + "step": 8204 + }, + { + "epoch": 1.09, + "grad_norm": 0.59765625, + "learning_rate": 0.00018217300838508073, + "loss": 0.2116, + "step": 8205 + }, + { + "epoch": 1.1, + "grad_norm": 0.6015625, + "learning_rate": 0.00018216637160193818, + "loss": 0.4468, + "step": 8206 + }, + { + "epoch": 1.1, + "grad_norm": 0.60546875, + "learning_rate": 0.00018215973370456682, + "loss": 0.1839, + "step": 8207 + }, + { + "epoch": 1.1, + "grad_norm": 0.443359375, + "learning_rate": 0.00018215309469305676, + "loss": 0.5371, + "step": 8208 + }, + { + "epoch": 1.1, + "grad_norm": 1.0859375, + "learning_rate": 0.00018214645456749804, + "loss": 0.3067, + "step": 8209 + }, + { + "epoch": 1.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00018213981332798067, + "loss": 0.4951, + "step": 8210 + }, + { + "epoch": 1.1, + "grad_norm": 0.72265625, + "learning_rate": 0.00018213317097459468, + "loss": 0.3147, + "step": 8211 + }, + { + "epoch": 1.1, + "grad_norm": 0.48828125, + "learning_rate": 0.0001821265275074302, + "loss": 0.4449, + "step": 8212 + }, + { + "epoch": 1.1, + "grad_norm": 1.0078125, + "learning_rate": 0.0001821198829265773, + "loss": 0.387, + "step": 8213 + }, + { + "epoch": 1.1, + "grad_norm": 0.55859375, + "learning_rate": 0.00018211323723212608, + "loss": 0.2864, + "step": 8214 + }, + { + "epoch": 1.1, + "grad_norm": 0.671875, + "learning_rate": 0.00018210659042416666, + "loss": 0.303, + "step": 8215 + }, + { + "epoch": 1.1, + "grad_norm": 0.498046875, + "learning_rate": 0.00018209994250278919, + "loss": 0.345, + "step": 8216 + }, + { + "epoch": 1.1, + "grad_norm": 0.44921875, + "learning_rate": 0.0001820932934680838, + "loss": 0.2183, + "step": 8217 + }, + { + "epoch": 1.1, + "grad_norm": 0.515625, + "learning_rate": 0.00018208664332014068, + "loss": 0.2647, + "step": 8218 + }, + { + "epoch": 1.1, + "grad_norm": 0.63671875, + "learning_rate": 0.00018207999205904996, + "loss": 0.5212, + "step": 8219 + }, + { + "epoch": 1.1, + "grad_norm": 0.46875, + "learning_rate": 0.0001820733396849019, + "loss": 0.2362, + "step": 8220 + }, + { + "epoch": 1.1, + "grad_norm": 0.69921875, + "learning_rate": 0.00018206668619778665, + "loss": 0.4779, + "step": 8221 + }, + { + "epoch": 1.1, + "grad_norm": 0.55078125, + "learning_rate": 0.0001820600315977945, + "loss": 0.4202, + "step": 8222 + }, + { + "epoch": 1.1, + "grad_norm": 0.55078125, + "learning_rate": 0.00018205337588501563, + "loss": 0.4468, + "step": 8223 + }, + { + "epoch": 1.1, + "grad_norm": 0.546875, + "learning_rate": 0.00018204671905954034, + "loss": 0.2369, + "step": 8224 + }, + { + "epoch": 1.1, + "grad_norm": 0.6171875, + "learning_rate": 0.00018204006112145887, + "loss": 0.3059, + "step": 8225 + }, + { + "epoch": 1.1, + "grad_norm": 0.48828125, + "learning_rate": 0.0001820334020708615, + "loss": 0.5737, + "step": 8226 + }, + { + "epoch": 1.1, + "grad_norm": 0.54296875, + "learning_rate": 0.00018202674190783859, + "loss": 0.2377, + "step": 8227 + }, + { + "epoch": 1.1, + "grad_norm": 0.82421875, + "learning_rate": 0.00018202008063248037, + "loss": 0.4432, + "step": 8228 + }, + { + "epoch": 1.1, + "grad_norm": 0.5234375, + "learning_rate": 0.00018201341824487726, + "loss": 0.4217, + "step": 8229 + }, + { + "epoch": 1.1, + "grad_norm": 0.625, + "learning_rate": 0.00018200675474511956, + "loss": 0.35, + "step": 8230 + }, + { + "epoch": 1.1, + "grad_norm": 0.67578125, + "learning_rate": 0.0001820000901332976, + "loss": 0.4999, + "step": 8231 + }, + { + "epoch": 1.1, + "grad_norm": 0.6484375, + "learning_rate": 0.0001819934244095018, + "loss": 0.5446, + "step": 8232 + }, + { + "epoch": 1.1, + "grad_norm": 0.5, + "learning_rate": 0.00018198675757382255, + "loss": 0.5124, + "step": 8233 + }, + { + "epoch": 1.1, + "grad_norm": 0.80078125, + "learning_rate": 0.00018198008962635025, + "loss": 0.4537, + "step": 8234 + }, + { + "epoch": 1.1, + "grad_norm": 0.412109375, + "learning_rate": 0.0001819734205671753, + "loss": 0.2089, + "step": 8235 + }, + { + "epoch": 1.1, + "grad_norm": 0.6171875, + "learning_rate": 0.0001819667503963882, + "loss": 0.3954, + "step": 8236 + }, + { + "epoch": 1.1, + "grad_norm": 0.56640625, + "learning_rate": 0.00018196007911407933, + "loss": 0.2833, + "step": 8237 + }, + { + "epoch": 1.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00018195340672033918, + "loss": 0.4105, + "step": 8238 + }, + { + "epoch": 1.1, + "grad_norm": 0.65625, + "learning_rate": 0.00018194673321525825, + "loss": 0.3755, + "step": 8239 + }, + { + "epoch": 1.1, + "grad_norm": 0.56640625, + "learning_rate": 0.00018194005859892702, + "loss": 0.4611, + "step": 8240 + }, + { + "epoch": 1.1, + "grad_norm": 0.75, + "learning_rate": 0.000181933382871436, + "loss": 0.5476, + "step": 8241 + }, + { + "epoch": 1.1, + "grad_norm": 0.6640625, + "learning_rate": 0.00018192670603287575, + "loss": 0.5871, + "step": 8242 + }, + { + "epoch": 1.1, + "grad_norm": 0.72265625, + "learning_rate": 0.00018192002808333676, + "loss": 0.6142, + "step": 8243 + }, + { + "epoch": 1.1, + "grad_norm": 0.4140625, + "learning_rate": 0.00018191334902290963, + "loss": 0.1707, + "step": 8244 + }, + { + "epoch": 1.1, + "grad_norm": 0.609375, + "learning_rate": 0.0001819066688516849, + "loss": 0.5211, + "step": 8245 + }, + { + "epoch": 1.1, + "grad_norm": 0.6328125, + "learning_rate": 0.00018189998756975318, + "loss": 0.4682, + "step": 8246 + }, + { + "epoch": 1.1, + "grad_norm": 0.546875, + "learning_rate": 0.00018189330517720507, + "loss": 0.2699, + "step": 8247 + }, + { + "epoch": 1.1, + "grad_norm": 0.421875, + "learning_rate": 0.0001818866216741312, + "loss": 0.415, + "step": 8248 + }, + { + "epoch": 1.1, + "grad_norm": 0.4140625, + "learning_rate": 0.00018187993706062216, + "loss": 0.1851, + "step": 8249 + }, + { + "epoch": 1.1, + "grad_norm": 0.45703125, + "learning_rate": 0.0001818732513367686, + "loss": 0.3611, + "step": 8250 + }, + { + "epoch": 1.1, + "grad_norm": 0.62890625, + "learning_rate": 0.00018186656450266123, + "loss": 0.6802, + "step": 8251 + }, + { + "epoch": 1.1, + "grad_norm": 0.56640625, + "learning_rate": 0.0001818598765583907, + "loss": 0.5015, + "step": 8252 + }, + { + "epoch": 1.1, + "grad_norm": 0.490234375, + "learning_rate": 0.00018185318750404774, + "loss": 0.3016, + "step": 8253 + }, + { + "epoch": 1.1, + "grad_norm": 0.5546875, + "learning_rate": 0.000181846497339723, + "loss": 0.3116, + "step": 8254 + }, + { + "epoch": 1.1, + "grad_norm": 0.640625, + "learning_rate": 0.00018183980606550725, + "loss": 0.4398, + "step": 8255 + }, + { + "epoch": 1.1, + "grad_norm": 0.6015625, + "learning_rate": 0.00018183311368149117, + "loss": 0.6001, + "step": 8256 + }, + { + "epoch": 1.1, + "grad_norm": 0.54296875, + "learning_rate": 0.00018182642018776555, + "loss": 0.4231, + "step": 8257 + }, + { + "epoch": 1.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00018181972558442116, + "loss": 0.4266, + "step": 8258 + }, + { + "epoch": 1.1, + "grad_norm": 0.39453125, + "learning_rate": 0.0001818130298715488, + "loss": 0.1564, + "step": 8259 + }, + { + "epoch": 1.1, + "grad_norm": 0.515625, + "learning_rate": 0.00018180633304923922, + "loss": 0.5032, + "step": 8260 + }, + { + "epoch": 1.1, + "grad_norm": 0.494140625, + "learning_rate": 0.00018179963511758327, + "loss": 0.4709, + "step": 8261 + }, + { + "epoch": 1.1, + "grad_norm": 0.5546875, + "learning_rate": 0.00018179293607667178, + "loss": 0.2897, + "step": 8262 + }, + { + "epoch": 1.1, + "grad_norm": 0.53515625, + "learning_rate": 0.00018178623592659556, + "loss": 0.4022, + "step": 8263 + }, + { + "epoch": 1.1, + "grad_norm": 0.447265625, + "learning_rate": 0.00018177953466744554, + "loss": 0.2546, + "step": 8264 + }, + { + "epoch": 1.1, + "grad_norm": 0.796875, + "learning_rate": 0.0001817728322993125, + "loss": 0.2949, + "step": 8265 + }, + { + "epoch": 1.1, + "grad_norm": 0.66796875, + "learning_rate": 0.00018176612882228737, + "loss": 0.3594, + "step": 8266 + }, + { + "epoch": 1.1, + "grad_norm": 0.6171875, + "learning_rate": 0.0001817594242364611, + "loss": 0.4681, + "step": 8267 + }, + { + "epoch": 1.1, + "grad_norm": 0.5234375, + "learning_rate": 0.0001817527185419245, + "loss": 0.4655, + "step": 8268 + }, + { + "epoch": 1.1, + "grad_norm": 0.421875, + "learning_rate": 0.00018174601173876859, + "loss": 0.3263, + "step": 8269 + }, + { + "epoch": 1.1, + "grad_norm": 0.6875, + "learning_rate": 0.00018173930382708433, + "loss": 0.187, + "step": 8270 + }, + { + "epoch": 1.1, + "grad_norm": 0.408203125, + "learning_rate": 0.00018173259480696262, + "loss": 0.2135, + "step": 8271 + }, + { + "epoch": 1.1, + "grad_norm": 0.65234375, + "learning_rate": 0.00018172588467849448, + "loss": 0.611, + "step": 8272 + }, + { + "epoch": 1.1, + "grad_norm": 0.5703125, + "learning_rate": 0.0001817191734417709, + "loss": 0.3038, + "step": 8273 + }, + { + "epoch": 1.1, + "grad_norm": 0.5703125, + "learning_rate": 0.00018171246109688287, + "loss": 0.7112, + "step": 8274 + }, + { + "epoch": 1.1, + "grad_norm": 0.50390625, + "learning_rate": 0.0001817057476439214, + "loss": 0.3391, + "step": 8275 + }, + { + "epoch": 1.1, + "grad_norm": 0.51171875, + "learning_rate": 0.0001816990330829776, + "loss": 0.528, + "step": 8276 + }, + { + "epoch": 1.1, + "grad_norm": 0.5390625, + "learning_rate": 0.00018169231741414245, + "loss": 0.3145, + "step": 8277 + }, + { + "epoch": 1.1, + "grad_norm": 0.408203125, + "learning_rate": 0.00018168560063750705, + "loss": 0.4222, + "step": 8278 + }, + { + "epoch": 1.1, + "grad_norm": 0.4140625, + "learning_rate": 0.00018167888275316248, + "loss": 0.1942, + "step": 8279 + }, + { + "epoch": 1.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00018167216376119988, + "loss": 0.2468, + "step": 8280 + }, + { + "epoch": 1.11, + "grad_norm": 0.671875, + "learning_rate": 0.00018166544366171025, + "loss": 0.5547, + "step": 8281 + }, + { + "epoch": 1.11, + "grad_norm": 0.62109375, + "learning_rate": 0.00018165872245478484, + "loss": 0.3242, + "step": 8282 + }, + { + "epoch": 1.11, + "grad_norm": 0.67578125, + "learning_rate": 0.00018165200014051474, + "loss": 0.3736, + "step": 8283 + }, + { + "epoch": 1.11, + "grad_norm": 0.546875, + "learning_rate": 0.00018164527671899106, + "loss": 0.4122, + "step": 8284 + }, + { + "epoch": 1.11, + "grad_norm": 0.482421875, + "learning_rate": 0.0001816385521903051, + "loss": 0.3753, + "step": 8285 + }, + { + "epoch": 1.11, + "grad_norm": 0.453125, + "learning_rate": 0.00018163182655454795, + "loss": 0.6255, + "step": 8286 + }, + { + "epoch": 1.11, + "grad_norm": 0.5859375, + "learning_rate": 0.00018162509981181084, + "loss": 0.4162, + "step": 8287 + }, + { + "epoch": 1.11, + "grad_norm": 0.57421875, + "learning_rate": 0.000181618371962185, + "loss": 0.6257, + "step": 8288 + }, + { + "epoch": 1.11, + "grad_norm": 0.73046875, + "learning_rate": 0.00018161164300576163, + "loss": 0.5313, + "step": 8289 + }, + { + "epoch": 1.11, + "grad_norm": 0.609375, + "learning_rate": 0.00018160491294263203, + "loss": 0.1915, + "step": 8290 + }, + { + "epoch": 1.11, + "grad_norm": 0.7265625, + "learning_rate": 0.00018159818177288742, + "loss": 0.3948, + "step": 8291 + }, + { + "epoch": 1.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00018159144949661912, + "loss": 0.3957, + "step": 8292 + }, + { + "epoch": 1.11, + "grad_norm": 0.396484375, + "learning_rate": 0.00018158471611391837, + "loss": 0.1655, + "step": 8293 + }, + { + "epoch": 1.11, + "grad_norm": 0.6015625, + "learning_rate": 0.0001815779816248765, + "loss": 0.3383, + "step": 8294 + }, + { + "epoch": 1.11, + "grad_norm": 0.46484375, + "learning_rate": 0.00018157124602958486, + "loss": 0.1985, + "step": 8295 + }, + { + "epoch": 1.11, + "grad_norm": 0.66796875, + "learning_rate": 0.0001815645093281348, + "loss": 0.5273, + "step": 8296 + }, + { + "epoch": 1.11, + "grad_norm": 0.419921875, + "learning_rate": 0.0001815577715206176, + "loss": 0.2137, + "step": 8297 + }, + { + "epoch": 1.11, + "grad_norm": 0.55078125, + "learning_rate": 0.0001815510326071247, + "loss": 0.5285, + "step": 8298 + }, + { + "epoch": 1.11, + "grad_norm": 0.69921875, + "learning_rate": 0.00018154429258774744, + "loss": 0.4563, + "step": 8299 + }, + { + "epoch": 1.11, + "grad_norm": 0.51953125, + "learning_rate": 0.00018153755146257727, + "loss": 0.4038, + "step": 8300 + }, + { + "epoch": 1.11, + "grad_norm": 0.6171875, + "learning_rate": 0.00018153080923170552, + "loss": 0.4603, + "step": 8301 + }, + { + "epoch": 1.11, + "grad_norm": 0.5546875, + "learning_rate": 0.0001815240658952237, + "loss": 0.5984, + "step": 8302 + }, + { + "epoch": 1.11, + "grad_norm": 0.62109375, + "learning_rate": 0.00018151732145322324, + "loss": 0.6502, + "step": 8303 + }, + { + "epoch": 1.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00018151057590579556, + "loss": 0.3876, + "step": 8304 + }, + { + "epoch": 1.11, + "grad_norm": 0.51953125, + "learning_rate": 0.00018150382925303217, + "loss": 0.1916, + "step": 8305 + }, + { + "epoch": 1.11, + "grad_norm": 0.578125, + "learning_rate": 0.00018149708149502457, + "loss": 0.3787, + "step": 8306 + }, + { + "epoch": 1.11, + "grad_norm": 0.51953125, + "learning_rate": 0.00018149033263186422, + "loss": 0.3556, + "step": 8307 + }, + { + "epoch": 1.11, + "grad_norm": 0.4375, + "learning_rate": 0.00018148358266364265, + "loss": 0.4128, + "step": 8308 + }, + { + "epoch": 1.11, + "grad_norm": 0.61328125, + "learning_rate": 0.00018147683159045142, + "loss": 0.1681, + "step": 8309 + }, + { + "epoch": 1.11, + "grad_norm": 0.6953125, + "learning_rate": 0.00018147007941238203, + "loss": 0.5295, + "step": 8310 + }, + { + "epoch": 1.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001814633261295261, + "loss": 0.4311, + "step": 8311 + }, + { + "epoch": 1.11, + "grad_norm": 0.671875, + "learning_rate": 0.00018145657174197517, + "loss": 0.4822, + "step": 8312 + }, + { + "epoch": 1.11, + "grad_norm": 0.73828125, + "learning_rate": 0.00018144981624982086, + "loss": 0.596, + "step": 8313 + }, + { + "epoch": 1.11, + "grad_norm": 0.6640625, + "learning_rate": 0.00018144305965315478, + "loss": 0.5173, + "step": 8314 + }, + { + "epoch": 1.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00018143630195206854, + "loss": 0.349, + "step": 8315 + }, + { + "epoch": 1.11, + "grad_norm": 0.609375, + "learning_rate": 0.00018142954314665378, + "loss": 0.4045, + "step": 8316 + }, + { + "epoch": 1.11, + "grad_norm": 0.54296875, + "learning_rate": 0.00018142278323700212, + "loss": 0.5816, + "step": 8317 + }, + { + "epoch": 1.11, + "grad_norm": 0.435546875, + "learning_rate": 0.0001814160222232053, + "loss": 0.4933, + "step": 8318 + }, + { + "epoch": 1.11, + "grad_norm": 0.62109375, + "learning_rate": 0.00018140926010535492, + "loss": 0.7075, + "step": 8319 + }, + { + "epoch": 1.11, + "grad_norm": 0.48828125, + "learning_rate": 0.00018140249688354277, + "loss": 0.3664, + "step": 8320 + }, + { + "epoch": 1.11, + "grad_norm": 0.55859375, + "learning_rate": 0.0001813957325578605, + "loss": 0.2639, + "step": 8321 + }, + { + "epoch": 1.11, + "grad_norm": 0.51953125, + "learning_rate": 0.00018138896712839987, + "loss": 0.4479, + "step": 8322 + }, + { + "epoch": 1.11, + "grad_norm": 0.609375, + "learning_rate": 0.0001813822005952526, + "loss": 0.506, + "step": 8323 + }, + { + "epoch": 1.11, + "grad_norm": 0.4453125, + "learning_rate": 0.00018137543295851044, + "loss": 0.2486, + "step": 8324 + }, + { + "epoch": 1.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00018136866421826522, + "loss": 0.4026, + "step": 8325 + }, + { + "epoch": 1.11, + "grad_norm": 0.447265625, + "learning_rate": 0.00018136189437460866, + "loss": 0.2158, + "step": 8326 + }, + { + "epoch": 1.11, + "grad_norm": 0.490234375, + "learning_rate": 0.00018135512342763263, + "loss": 0.6149, + "step": 8327 + }, + { + "epoch": 1.11, + "grad_norm": 0.5859375, + "learning_rate": 0.0001813483513774289, + "loss": 0.3657, + "step": 8328 + }, + { + "epoch": 1.11, + "grad_norm": 0.5859375, + "learning_rate": 0.0001813415782240893, + "loss": 0.6404, + "step": 8329 + }, + { + "epoch": 1.11, + "grad_norm": 0.59375, + "learning_rate": 0.00018133480396770573, + "loss": 0.5486, + "step": 8330 + }, + { + "epoch": 1.11, + "grad_norm": 0.7109375, + "learning_rate": 0.00018132802860836998, + "loss": 0.4341, + "step": 8331 + }, + { + "epoch": 1.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00018132125214617397, + "loss": 0.4483, + "step": 8332 + }, + { + "epoch": 1.11, + "grad_norm": 0.6640625, + "learning_rate": 0.0001813144745812096, + "loss": 0.2336, + "step": 8333 + }, + { + "epoch": 1.11, + "grad_norm": 0.6640625, + "learning_rate": 0.00018130769591356876, + "loss": 0.219, + "step": 8334 + }, + { + "epoch": 1.11, + "grad_norm": 0.5625, + "learning_rate": 0.0001813009161433434, + "loss": 0.3278, + "step": 8335 + }, + { + "epoch": 1.11, + "grad_norm": 0.474609375, + "learning_rate": 0.0001812941352706254, + "loss": 0.2956, + "step": 8336 + }, + { + "epoch": 1.11, + "grad_norm": 0.68359375, + "learning_rate": 0.00018128735329550678, + "loss": 0.579, + "step": 8337 + }, + { + "epoch": 1.11, + "grad_norm": 0.484375, + "learning_rate": 0.00018128057021807946, + "loss": 0.4927, + "step": 8338 + }, + { + "epoch": 1.11, + "grad_norm": 0.25390625, + "learning_rate": 0.00018127378603843548, + "loss": 0.1799, + "step": 8339 + }, + { + "epoch": 1.11, + "grad_norm": 0.482421875, + "learning_rate": 0.00018126700075666677, + "loss": 0.2178, + "step": 8340 + }, + { + "epoch": 1.11, + "grad_norm": 0.439453125, + "learning_rate": 0.0001812602143728654, + "loss": 0.2048, + "step": 8341 + }, + { + "epoch": 1.11, + "grad_norm": 0.51953125, + "learning_rate": 0.00018125342688712331, + "loss": 0.4985, + "step": 8342 + }, + { + "epoch": 1.11, + "grad_norm": 0.447265625, + "learning_rate": 0.00018124663829953265, + "loss": 0.6613, + "step": 8343 + }, + { + "epoch": 1.11, + "grad_norm": 0.671875, + "learning_rate": 0.00018123984861018544, + "loss": 0.2866, + "step": 8344 + }, + { + "epoch": 1.11, + "grad_norm": 1.0546875, + "learning_rate": 0.00018123305781917373, + "loss": 0.6407, + "step": 8345 + }, + { + "epoch": 1.11, + "grad_norm": 0.53125, + "learning_rate": 0.00018122626592658963, + "loss": 0.1548, + "step": 8346 + }, + { + "epoch": 1.11, + "grad_norm": 0.671875, + "learning_rate": 0.00018121947293252521, + "loss": 0.2649, + "step": 8347 + }, + { + "epoch": 1.11, + "grad_norm": 0.55859375, + "learning_rate": 0.0001812126788370726, + "loss": 0.3556, + "step": 8348 + }, + { + "epoch": 1.11, + "grad_norm": 0.61328125, + "learning_rate": 0.00018120588364032398, + "loss": 0.3202, + "step": 8349 + }, + { + "epoch": 1.11, + "grad_norm": 0.6171875, + "learning_rate": 0.00018119908734237145, + "loss": 0.3431, + "step": 8350 + }, + { + "epoch": 1.11, + "grad_norm": 0.482421875, + "learning_rate": 0.00018119228994330718, + "loss": 0.3446, + "step": 8351 + }, + { + "epoch": 1.11, + "grad_norm": 0.49609375, + "learning_rate": 0.00018118549144322335, + "loss": 0.3458, + "step": 8352 + }, + { + "epoch": 1.11, + "grad_norm": 0.7265625, + "learning_rate": 0.00018117869184221215, + "loss": 0.4268, + "step": 8353 + }, + { + "epoch": 1.11, + "grad_norm": 0.6015625, + "learning_rate": 0.00018117189114036578, + "loss": 0.3983, + "step": 8354 + }, + { + "epoch": 1.11, + "grad_norm": 0.6484375, + "learning_rate": 0.0001811650893377765, + "loss": 0.2393, + "step": 8355 + }, + { + "epoch": 1.12, + "grad_norm": 0.6875, + "learning_rate": 0.00018115828643453647, + "loss": 0.4487, + "step": 8356 + }, + { + "epoch": 1.12, + "grad_norm": 0.578125, + "learning_rate": 0.00018115148243073802, + "loss": 0.374, + "step": 8357 + }, + { + "epoch": 1.12, + "grad_norm": 0.7109375, + "learning_rate": 0.0001811446773264734, + "loss": 0.6983, + "step": 8358 + }, + { + "epoch": 1.12, + "grad_norm": 0.43359375, + "learning_rate": 0.00018113787112183485, + "loss": 0.3536, + "step": 8359 + }, + { + "epoch": 1.12, + "grad_norm": 0.62109375, + "learning_rate": 0.0001811310638169147, + "loss": 0.484, + "step": 8360 + }, + { + "epoch": 1.12, + "grad_norm": 0.56640625, + "learning_rate": 0.00018112425541180524, + "loss": 0.4365, + "step": 8361 + }, + { + "epoch": 1.12, + "grad_norm": 0.453125, + "learning_rate": 0.00018111744590659882, + "loss": 0.4031, + "step": 8362 + }, + { + "epoch": 1.12, + "grad_norm": 0.6171875, + "learning_rate": 0.0001811106353013878, + "loss": 0.693, + "step": 8363 + }, + { + "epoch": 1.12, + "grad_norm": 0.4765625, + "learning_rate": 0.0001811038235962645, + "loss": 0.5465, + "step": 8364 + }, + { + "epoch": 1.12, + "grad_norm": 0.55078125, + "learning_rate": 0.00018109701079132128, + "loss": 0.2243, + "step": 8365 + }, + { + "epoch": 1.12, + "grad_norm": 0.61328125, + "learning_rate": 0.00018109019688665055, + "loss": 0.3387, + "step": 8366 + }, + { + "epoch": 1.12, + "grad_norm": 0.431640625, + "learning_rate": 0.00018108338188234472, + "loss": 0.3814, + "step": 8367 + }, + { + "epoch": 1.12, + "grad_norm": 0.47265625, + "learning_rate": 0.00018107656577849617, + "loss": 0.3948, + "step": 8368 + }, + { + "epoch": 1.12, + "grad_norm": 0.6640625, + "learning_rate": 0.00018106974857519736, + "loss": 0.7064, + "step": 8369 + }, + { + "epoch": 1.12, + "grad_norm": 0.94140625, + "learning_rate": 0.00018106293027254076, + "loss": 0.4942, + "step": 8370 + }, + { + "epoch": 1.12, + "grad_norm": 0.55859375, + "learning_rate": 0.00018105611087061878, + "loss": 0.5056, + "step": 8371 + }, + { + "epoch": 1.12, + "grad_norm": 0.82421875, + "learning_rate": 0.0001810492903695239, + "loss": 0.4892, + "step": 8372 + }, + { + "epoch": 1.12, + "grad_norm": 0.60546875, + "learning_rate": 0.00018104246876934866, + "loss": 0.3944, + "step": 8373 + }, + { + "epoch": 1.12, + "grad_norm": 0.703125, + "learning_rate": 0.00018103564607018551, + "loss": 0.3101, + "step": 8374 + }, + { + "epoch": 1.12, + "grad_norm": 0.466796875, + "learning_rate": 0.00018102882227212698, + "loss": 0.2452, + "step": 8375 + }, + { + "epoch": 1.12, + "grad_norm": 0.55078125, + "learning_rate": 0.00018102199737526562, + "loss": 0.4829, + "step": 8376 + }, + { + "epoch": 1.12, + "grad_norm": 0.5546875, + "learning_rate": 0.00018101517137969397, + "loss": 0.3095, + "step": 8377 + }, + { + "epoch": 1.12, + "grad_norm": 0.5234375, + "learning_rate": 0.00018100834428550462, + "loss": 0.4199, + "step": 8378 + }, + { + "epoch": 1.12, + "grad_norm": 0.56640625, + "learning_rate": 0.00018100151609279014, + "loss": 0.272, + "step": 8379 + }, + { + "epoch": 1.12, + "grad_norm": 0.70703125, + "learning_rate": 0.00018099468680164309, + "loss": 0.2825, + "step": 8380 + }, + { + "epoch": 1.12, + "grad_norm": 0.578125, + "learning_rate": 0.0001809878564121561, + "loss": 0.2771, + "step": 8381 + }, + { + "epoch": 1.12, + "grad_norm": 0.51953125, + "learning_rate": 0.00018098102492442183, + "loss": 0.4685, + "step": 8382 + }, + { + "epoch": 1.12, + "grad_norm": 0.81640625, + "learning_rate": 0.00018097419233853286, + "loss": 0.4821, + "step": 8383 + }, + { + "epoch": 1.12, + "grad_norm": 0.73828125, + "learning_rate": 0.00018096735865458187, + "loss": 0.4609, + "step": 8384 + }, + { + "epoch": 1.12, + "grad_norm": 0.5, + "learning_rate": 0.0001809605238726616, + "loss": 0.4345, + "step": 8385 + }, + { + "epoch": 1.12, + "grad_norm": 0.5078125, + "learning_rate": 0.00018095368799286456, + "loss": 0.4377, + "step": 8386 + }, + { + "epoch": 1.12, + "grad_norm": 0.46484375, + "learning_rate": 0.00018094685101528363, + "loss": 0.4671, + "step": 8387 + }, + { + "epoch": 1.12, + "grad_norm": 0.59375, + "learning_rate": 0.00018094001294001143, + "loss": 0.4717, + "step": 8388 + }, + { + "epoch": 1.12, + "grad_norm": 0.59765625, + "learning_rate": 0.0001809331737671407, + "loss": 0.4903, + "step": 8389 + }, + { + "epoch": 1.12, + "grad_norm": 0.65234375, + "learning_rate": 0.00018092633349676418, + "loss": 0.1739, + "step": 8390 + }, + { + "epoch": 1.12, + "grad_norm": 0.5546875, + "learning_rate": 0.00018091949212897465, + "loss": 0.5306, + "step": 8391 + }, + { + "epoch": 1.12, + "grad_norm": 0.5078125, + "learning_rate": 0.0001809126496638649, + "loss": 0.3202, + "step": 8392 + }, + { + "epoch": 1.12, + "grad_norm": 0.59765625, + "learning_rate": 0.00018090580610152768, + "loss": 0.3162, + "step": 8393 + }, + { + "epoch": 1.12, + "grad_norm": 0.66015625, + "learning_rate": 0.00018089896144205577, + "loss": 0.5207, + "step": 8394 + }, + { + "epoch": 1.12, + "grad_norm": 0.5078125, + "learning_rate": 0.00018089211568554205, + "loss": 0.6295, + "step": 8395 + }, + { + "epoch": 1.12, + "grad_norm": 0.53125, + "learning_rate": 0.00018088526883207933, + "loss": 0.3667, + "step": 8396 + }, + { + "epoch": 1.12, + "grad_norm": 0.427734375, + "learning_rate": 0.00018087842088176045, + "loss": 0.1802, + "step": 8397 + }, + { + "epoch": 1.12, + "grad_norm": 0.55078125, + "learning_rate": 0.00018087157183467825, + "loss": 0.4362, + "step": 8398 + }, + { + "epoch": 1.12, + "grad_norm": 0.4921875, + "learning_rate": 0.00018086472169092564, + "loss": 0.3612, + "step": 8399 + }, + { + "epoch": 1.12, + "grad_norm": 0.65625, + "learning_rate": 0.00018085787045059553, + "loss": 0.5852, + "step": 8400 + }, + { + "epoch": 1.12, + "grad_norm": 0.58984375, + "learning_rate": 0.00018085101811378081, + "loss": 0.4465, + "step": 8401 + }, + { + "epoch": 1.12, + "grad_norm": 0.58203125, + "learning_rate": 0.00018084416468057434, + "loss": 0.3354, + "step": 8402 + }, + { + "epoch": 1.12, + "grad_norm": 0.458984375, + "learning_rate": 0.00018083731015106918, + "loss": 0.3653, + "step": 8403 + }, + { + "epoch": 1.12, + "grad_norm": 0.45703125, + "learning_rate": 0.00018083045452535815, + "loss": 0.2498, + "step": 8404 + }, + { + "epoch": 1.12, + "grad_norm": 0.58984375, + "learning_rate": 0.00018082359780353434, + "loss": 0.6004, + "step": 8405 + }, + { + "epoch": 1.12, + "grad_norm": 0.8359375, + "learning_rate": 0.00018081673998569065, + "loss": 0.4378, + "step": 8406 + }, + { + "epoch": 1.12, + "grad_norm": 0.6171875, + "learning_rate": 0.00018080988107192005, + "loss": 0.3132, + "step": 8407 + }, + { + "epoch": 1.12, + "grad_norm": 0.8515625, + "learning_rate": 0.00018080302106231563, + "loss": 0.4441, + "step": 8408 + }, + { + "epoch": 1.12, + "grad_norm": 0.5234375, + "learning_rate": 0.00018079615995697043, + "loss": 0.6212, + "step": 8409 + }, + { + "epoch": 1.12, + "grad_norm": 0.62109375, + "learning_rate": 0.0001807892977559774, + "loss": 0.4613, + "step": 8410 + }, + { + "epoch": 1.12, + "grad_norm": 0.46875, + "learning_rate": 0.00018078243445942963, + "loss": 0.3819, + "step": 8411 + }, + { + "epoch": 1.12, + "grad_norm": 0.47265625, + "learning_rate": 0.00018077557006742026, + "loss": 0.2745, + "step": 8412 + }, + { + "epoch": 1.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00018076870458004227, + "loss": 0.531, + "step": 8413 + }, + { + "epoch": 1.12, + "grad_norm": 0.56640625, + "learning_rate": 0.0001807618379973888, + "loss": 0.568, + "step": 8414 + }, + { + "epoch": 1.12, + "grad_norm": 0.390625, + "learning_rate": 0.00018075497031955302, + "loss": 0.3046, + "step": 8415 + }, + { + "epoch": 1.12, + "grad_norm": 0.41015625, + "learning_rate": 0.00018074810154662798, + "loss": 0.2856, + "step": 8416 + }, + { + "epoch": 1.12, + "grad_norm": 0.63671875, + "learning_rate": 0.00018074123167870687, + "loss": 0.2375, + "step": 8417 + }, + { + "epoch": 1.12, + "grad_norm": 0.51171875, + "learning_rate": 0.00018073436071588286, + "loss": 0.3085, + "step": 8418 + }, + { + "epoch": 1.12, + "grad_norm": 0.63671875, + "learning_rate": 0.00018072748865824906, + "loss": 0.5303, + "step": 8419 + }, + { + "epoch": 1.12, + "grad_norm": 0.60546875, + "learning_rate": 0.00018072061550589874, + "loss": 0.5896, + "step": 8420 + }, + { + "epoch": 1.12, + "grad_norm": 0.63671875, + "learning_rate": 0.00018071374125892505, + "loss": 0.3861, + "step": 8421 + }, + { + "epoch": 1.12, + "grad_norm": 0.625, + "learning_rate": 0.00018070686591742122, + "loss": 0.2179, + "step": 8422 + }, + { + "epoch": 1.12, + "grad_norm": 0.50390625, + "learning_rate": 0.00018069998948148051, + "loss": 0.2937, + "step": 8423 + }, + { + "epoch": 1.12, + "grad_norm": 0.6796875, + "learning_rate": 0.00018069311195119614, + "loss": 0.3293, + "step": 8424 + }, + { + "epoch": 1.12, + "grad_norm": 0.578125, + "learning_rate": 0.00018068623332666137, + "loss": 0.4402, + "step": 8425 + }, + { + "epoch": 1.12, + "grad_norm": 0.427734375, + "learning_rate": 0.00018067935360796953, + "loss": 0.161, + "step": 8426 + }, + { + "epoch": 1.12, + "grad_norm": 0.71875, + "learning_rate": 0.00018067247279521384, + "loss": 0.3917, + "step": 8427 + }, + { + "epoch": 1.12, + "grad_norm": 0.625, + "learning_rate": 0.00018066559088848765, + "loss": 0.2436, + "step": 8428 + }, + { + "epoch": 1.12, + "grad_norm": 0.58984375, + "learning_rate": 0.00018065870788788431, + "loss": 0.3343, + "step": 8429 + }, + { + "epoch": 1.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00018065182379349708, + "loss": 0.4488, + "step": 8430 + }, + { + "epoch": 1.13, + "grad_norm": 0.46484375, + "learning_rate": 0.00018064493860541937, + "loss": 0.4315, + "step": 8431 + }, + { + "epoch": 1.13, + "grad_norm": 0.66015625, + "learning_rate": 0.00018063805232374454, + "loss": 0.6684, + "step": 8432 + }, + { + "epoch": 1.13, + "grad_norm": 0.68359375, + "learning_rate": 0.00018063116494856595, + "loss": 0.442, + "step": 8433 + }, + { + "epoch": 1.13, + "grad_norm": 0.7421875, + "learning_rate": 0.00018062427647997704, + "loss": 0.4132, + "step": 8434 + }, + { + "epoch": 1.13, + "grad_norm": 0.5234375, + "learning_rate": 0.0001806173869180712, + "loss": 0.3898, + "step": 8435 + }, + { + "epoch": 1.13, + "grad_norm": 0.486328125, + "learning_rate": 0.00018061049626294184, + "loss": 0.2971, + "step": 8436 + }, + { + "epoch": 1.13, + "grad_norm": 0.5390625, + "learning_rate": 0.00018060360451468242, + "loss": 0.1919, + "step": 8437 + }, + { + "epoch": 1.13, + "grad_norm": 0.56640625, + "learning_rate": 0.0001805967116733864, + "loss": 0.62, + "step": 8438 + }, + { + "epoch": 1.13, + "grad_norm": 0.453125, + "learning_rate": 0.00018058981773914722, + "loss": 0.2306, + "step": 8439 + }, + { + "epoch": 1.13, + "grad_norm": 0.421875, + "learning_rate": 0.00018058292271205838, + "loss": 0.3062, + "step": 8440 + }, + { + "epoch": 1.13, + "grad_norm": 0.578125, + "learning_rate": 0.00018057602659221343, + "loss": 0.4203, + "step": 8441 + }, + { + "epoch": 1.13, + "grad_norm": 0.64453125, + "learning_rate": 0.0001805691293797058, + "loss": 0.2539, + "step": 8442 + }, + { + "epoch": 1.13, + "grad_norm": 0.52734375, + "learning_rate": 0.0001805622310746291, + "loss": 0.3867, + "step": 8443 + }, + { + "epoch": 1.13, + "grad_norm": 0.7890625, + "learning_rate": 0.00018055533167707686, + "loss": 0.4202, + "step": 8444 + }, + { + "epoch": 1.13, + "grad_norm": 0.5390625, + "learning_rate": 0.0001805484311871426, + "loss": 0.2888, + "step": 8445 + }, + { + "epoch": 1.13, + "grad_norm": 0.421875, + "learning_rate": 0.00018054152960491991, + "loss": 0.4031, + "step": 8446 + }, + { + "epoch": 1.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00018053462693050237, + "loss": 0.3057, + "step": 8447 + }, + { + "epoch": 1.13, + "grad_norm": 0.50390625, + "learning_rate": 0.00018052772316398365, + "loss": 0.2301, + "step": 8448 + }, + { + "epoch": 1.13, + "grad_norm": 0.61328125, + "learning_rate": 0.0001805208183054573, + "loss": 0.3121, + "step": 8449 + }, + { + "epoch": 1.13, + "grad_norm": 0.484375, + "learning_rate": 0.00018051391235501696, + "loss": 0.1962, + "step": 8450 + }, + { + "epoch": 1.13, + "grad_norm": 0.66796875, + "learning_rate": 0.0001805070053127563, + "loss": 0.276, + "step": 8451 + }, + { + "epoch": 1.13, + "grad_norm": 0.5703125, + "learning_rate": 0.000180500097178769, + "loss": 0.3644, + "step": 8452 + }, + { + "epoch": 1.13, + "grad_norm": 0.486328125, + "learning_rate": 0.0001804931879531487, + "loss": 0.4294, + "step": 8453 + }, + { + "epoch": 1.13, + "grad_norm": 0.71875, + "learning_rate": 0.00018048627763598912, + "loss": 0.2897, + "step": 8454 + }, + { + "epoch": 1.13, + "grad_norm": 0.486328125, + "learning_rate": 0.00018047936622738394, + "loss": 0.3499, + "step": 8455 + }, + { + "epoch": 1.13, + "grad_norm": 0.46484375, + "learning_rate": 0.0001804724537274269, + "loss": 0.5195, + "step": 8456 + }, + { + "epoch": 1.13, + "grad_norm": 0.486328125, + "learning_rate": 0.0001804655401362118, + "loss": 0.3136, + "step": 8457 + }, + { + "epoch": 1.13, + "grad_norm": 0.5546875, + "learning_rate": 0.00018045862545383227, + "loss": 0.5773, + "step": 8458 + }, + { + "epoch": 1.13, + "grad_norm": 0.423828125, + "learning_rate": 0.00018045170968038215, + "loss": 0.3392, + "step": 8459 + }, + { + "epoch": 1.13, + "grad_norm": 0.5703125, + "learning_rate": 0.00018044479281595523, + "loss": 0.3753, + "step": 8460 + }, + { + "epoch": 1.13, + "grad_norm": 0.470703125, + "learning_rate": 0.0001804378748606453, + "loss": 0.492, + "step": 8461 + }, + { + "epoch": 1.13, + "grad_norm": 0.6171875, + "learning_rate": 0.00018043095581454614, + "loss": 0.5894, + "step": 8462 + }, + { + "epoch": 1.13, + "grad_norm": 0.427734375, + "learning_rate": 0.00018042403567775163, + "loss": 0.2914, + "step": 8463 + }, + { + "epoch": 1.13, + "grad_norm": 0.57421875, + "learning_rate": 0.00018041711445035551, + "loss": 0.3519, + "step": 8464 + }, + { + "epoch": 1.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00018041019213245178, + "loss": 0.3076, + "step": 8465 + }, + { + "epoch": 1.13, + "grad_norm": 0.65625, + "learning_rate": 0.0001804032687241342, + "loss": 0.5086, + "step": 8466 + }, + { + "epoch": 1.13, + "grad_norm": 0.48828125, + "learning_rate": 0.00018039634422549672, + "loss": 0.3606, + "step": 8467 + }, + { + "epoch": 1.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00018038941863663318, + "loss": 0.4062, + "step": 8468 + }, + { + "epoch": 1.13, + "grad_norm": 0.57421875, + "learning_rate": 0.00018038249195763756, + "loss": 0.4178, + "step": 8469 + }, + { + "epoch": 1.13, + "grad_norm": 0.66796875, + "learning_rate": 0.00018037556418860375, + "loss": 0.523, + "step": 8470 + }, + { + "epoch": 1.13, + "grad_norm": 0.486328125, + "learning_rate": 0.00018036863532962572, + "loss": 0.4018, + "step": 8471 + }, + { + "epoch": 1.13, + "grad_norm": 0.7265625, + "learning_rate": 0.00018036170538079737, + "loss": 0.3209, + "step": 8472 + }, + { + "epoch": 1.13, + "grad_norm": 0.5546875, + "learning_rate": 0.00018035477434221275, + "loss": 0.4295, + "step": 8473 + }, + { + "epoch": 1.13, + "grad_norm": 0.5859375, + "learning_rate": 0.0001803478422139658, + "loss": 0.2652, + "step": 8474 + }, + { + "epoch": 1.13, + "grad_norm": 0.4140625, + "learning_rate": 0.00018034090899615057, + "loss": 0.3941, + "step": 8475 + }, + { + "epoch": 1.13, + "grad_norm": 0.67578125, + "learning_rate": 0.00018033397468886103, + "loss": 0.6283, + "step": 8476 + }, + { + "epoch": 1.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00018032703929219128, + "loss": 0.488, + "step": 8477 + }, + { + "epoch": 1.13, + "grad_norm": 0.63671875, + "learning_rate": 0.00018032010280623527, + "loss": 0.3255, + "step": 8478 + }, + { + "epoch": 1.13, + "grad_norm": 0.85546875, + "learning_rate": 0.00018031316523108715, + "loss": 0.4386, + "step": 8479 + }, + { + "epoch": 1.13, + "grad_norm": 0.59765625, + "learning_rate": 0.00018030622656684092, + "loss": 0.3279, + "step": 8480 + }, + { + "epoch": 1.13, + "grad_norm": 0.55859375, + "learning_rate": 0.00018029928681359078, + "loss": 0.4595, + "step": 8481 + }, + { + "epoch": 1.13, + "grad_norm": 0.6171875, + "learning_rate": 0.00018029234597143073, + "loss": 0.4984, + "step": 8482 + }, + { + "epoch": 1.13, + "grad_norm": 0.6640625, + "learning_rate": 0.00018028540404045496, + "loss": 0.4384, + "step": 8483 + }, + { + "epoch": 1.13, + "grad_norm": 1.3203125, + "learning_rate": 0.00018027846102075756, + "loss": 0.4193, + "step": 8484 + }, + { + "epoch": 1.13, + "grad_norm": 0.486328125, + "learning_rate": 0.00018027151691243275, + "loss": 0.4769, + "step": 8485 + }, + { + "epoch": 1.13, + "grad_norm": 0.5546875, + "learning_rate": 0.00018026457171557462, + "loss": 0.5023, + "step": 8486 + }, + { + "epoch": 1.13, + "grad_norm": 0.8125, + "learning_rate": 0.00018025762543027738, + "loss": 0.3212, + "step": 8487 + }, + { + "epoch": 1.13, + "grad_norm": 0.5, + "learning_rate": 0.00018025067805663522, + "loss": 0.3994, + "step": 8488 + }, + { + "epoch": 1.13, + "grad_norm": 0.56640625, + "learning_rate": 0.00018024372959474238, + "loss": 0.5339, + "step": 8489 + }, + { + "epoch": 1.13, + "grad_norm": 0.55078125, + "learning_rate": 0.00018023678004469308, + "loss": 0.4875, + "step": 8490 + }, + { + "epoch": 1.13, + "grad_norm": 0.53125, + "learning_rate": 0.00018022982940658154, + "loss": 0.3406, + "step": 8491 + }, + { + "epoch": 1.13, + "grad_norm": 0.51953125, + "learning_rate": 0.00018022287768050202, + "loss": 0.3873, + "step": 8492 + }, + { + "epoch": 1.13, + "grad_norm": 0.609375, + "learning_rate": 0.00018021592486654878, + "loss": 0.386, + "step": 8493 + }, + { + "epoch": 1.13, + "grad_norm": 0.38671875, + "learning_rate": 0.00018020897096481612, + "loss": 0.2477, + "step": 8494 + }, + { + "epoch": 1.13, + "grad_norm": 0.474609375, + "learning_rate": 0.00018020201597539835, + "loss": 0.4406, + "step": 8495 + }, + { + "epoch": 1.13, + "grad_norm": 0.5703125, + "learning_rate": 0.00018019505989838973, + "loss": 0.2785, + "step": 8496 + }, + { + "epoch": 1.13, + "grad_norm": 0.408203125, + "learning_rate": 0.00018018810273388468, + "loss": 0.4429, + "step": 8497 + }, + { + "epoch": 1.13, + "grad_norm": 0.470703125, + "learning_rate": 0.00018018114448197748, + "loss": 0.2905, + "step": 8498 + }, + { + "epoch": 1.13, + "grad_norm": 0.79296875, + "learning_rate": 0.00018017418514276246, + "loss": 0.3666, + "step": 8499 + }, + { + "epoch": 1.13, + "grad_norm": 0.8828125, + "learning_rate": 0.0001801672247163341, + "loss": 0.4298, + "step": 8500 + }, + { + "epoch": 1.13, + "grad_norm": 0.6796875, + "learning_rate": 0.0001801602632027867, + "loss": 0.4759, + "step": 8501 + }, + { + "epoch": 1.13, + "grad_norm": 0.400390625, + "learning_rate": 0.00018015330060221465, + "loss": 0.3815, + "step": 8502 + }, + { + "epoch": 1.13, + "grad_norm": 0.43359375, + "learning_rate": 0.00018014633691471242, + "loss": 0.2933, + "step": 8503 + }, + { + "epoch": 1.13, + "grad_norm": 0.4453125, + "learning_rate": 0.00018013937214037443, + "loss": 0.355, + "step": 8504 + }, + { + "epoch": 1.13, + "grad_norm": 0.427734375, + "learning_rate": 0.0001801324062792951, + "loss": 0.3765, + "step": 8505 + }, + { + "epoch": 1.14, + "grad_norm": 0.578125, + "learning_rate": 0.00018012543933156896, + "loss": 0.4282, + "step": 8506 + }, + { + "epoch": 1.14, + "grad_norm": 0.515625, + "learning_rate": 0.0001801184712972904, + "loss": 0.4762, + "step": 8507 + }, + { + "epoch": 1.14, + "grad_norm": 0.45703125, + "learning_rate": 0.00018011150217655395, + "loss": 0.3375, + "step": 8508 + }, + { + "epoch": 1.14, + "grad_norm": 0.55859375, + "learning_rate": 0.00018010453196945413, + "loss": 0.4295, + "step": 8509 + }, + { + "epoch": 1.14, + "grad_norm": 0.703125, + "learning_rate": 0.00018009756067608544, + "loss": 0.9118, + "step": 8510 + }, + { + "epoch": 1.14, + "grad_norm": 0.6015625, + "learning_rate": 0.0001800905882965424, + "loss": 0.3586, + "step": 8511 + }, + { + "epoch": 1.14, + "grad_norm": 0.56640625, + "learning_rate": 0.00018008361483091965, + "loss": 0.2591, + "step": 8512 + }, + { + "epoch": 1.14, + "grad_norm": 0.48828125, + "learning_rate": 0.00018007664027931163, + "loss": 0.4705, + "step": 8513 + }, + { + "epoch": 1.14, + "grad_norm": 0.6171875, + "learning_rate": 0.00018006966464181295, + "loss": 0.8189, + "step": 8514 + }, + { + "epoch": 1.14, + "grad_norm": 0.62890625, + "learning_rate": 0.00018006268791851828, + "loss": 0.3937, + "step": 8515 + }, + { + "epoch": 1.14, + "grad_norm": 1.1015625, + "learning_rate": 0.00018005571010952217, + "loss": 0.4953, + "step": 8516 + }, + { + "epoch": 1.14, + "grad_norm": 0.546875, + "learning_rate": 0.00018004873121491924, + "loss": 0.3849, + "step": 8517 + }, + { + "epoch": 1.14, + "grad_norm": 0.408203125, + "learning_rate": 0.00018004175123480413, + "loss": 0.2858, + "step": 8518 + }, + { + "epoch": 1.14, + "grad_norm": 0.52734375, + "learning_rate": 0.00018003477016927152, + "loss": 0.5371, + "step": 8519 + }, + { + "epoch": 1.14, + "grad_norm": 0.54296875, + "learning_rate": 0.00018002778801841604, + "loss": 0.4244, + "step": 8520 + }, + { + "epoch": 1.14, + "grad_norm": 0.6640625, + "learning_rate": 0.0001800208047823324, + "loss": 0.4026, + "step": 8521 + }, + { + "epoch": 1.14, + "grad_norm": 0.404296875, + "learning_rate": 0.0001800138204611153, + "loss": 0.2184, + "step": 8522 + }, + { + "epoch": 1.14, + "grad_norm": 0.404296875, + "learning_rate": 0.00018000683505485946, + "loss": 0.214, + "step": 8523 + }, + { + "epoch": 1.14, + "grad_norm": 0.4296875, + "learning_rate": 0.00017999984856365957, + "loss": 0.4872, + "step": 8524 + }, + { + "epoch": 1.14, + "grad_norm": 0.4765625, + "learning_rate": 0.00017999286098761038, + "loss": 0.6329, + "step": 8525 + }, + { + "epoch": 1.14, + "grad_norm": 0.48046875, + "learning_rate": 0.00017998587232680667, + "loss": 0.3631, + "step": 8526 + }, + { + "epoch": 1.14, + "grad_norm": 0.5859375, + "learning_rate": 0.00017997888258134318, + "loss": 0.3594, + "step": 8527 + }, + { + "epoch": 1.14, + "grad_norm": 0.57421875, + "learning_rate": 0.00017997189175131473, + "loss": 0.3704, + "step": 8528 + }, + { + "epoch": 1.14, + "grad_norm": 0.54296875, + "learning_rate": 0.0001799648998368161, + "loss": 0.3833, + "step": 8529 + }, + { + "epoch": 1.14, + "grad_norm": 0.55078125, + "learning_rate": 0.0001799579068379421, + "loss": 0.4926, + "step": 8530 + }, + { + "epoch": 1.14, + "grad_norm": 0.55078125, + "learning_rate": 0.00017995091275478756, + "loss": 0.3376, + "step": 8531 + }, + { + "epoch": 1.14, + "grad_norm": 0.396484375, + "learning_rate": 0.00017994391758744733, + "loss": 0.3358, + "step": 8532 + }, + { + "epoch": 1.14, + "grad_norm": 0.55859375, + "learning_rate": 0.0001799369213360163, + "loss": 0.4551, + "step": 8533 + }, + { + "epoch": 1.14, + "grad_norm": 0.76953125, + "learning_rate": 0.0001799299240005893, + "loss": 0.4775, + "step": 8534 + }, + { + "epoch": 1.14, + "grad_norm": 0.734375, + "learning_rate": 0.00017992292558126121, + "loss": 0.3916, + "step": 8535 + }, + { + "epoch": 1.14, + "grad_norm": 0.474609375, + "learning_rate": 0.00017991592607812697, + "loss": 0.425, + "step": 8536 + }, + { + "epoch": 1.14, + "grad_norm": 0.46875, + "learning_rate": 0.00017990892549128148, + "loss": 0.3854, + "step": 8537 + }, + { + "epoch": 1.14, + "grad_norm": 0.7421875, + "learning_rate": 0.00017990192382081967, + "loss": 0.4641, + "step": 8538 + }, + { + "epoch": 1.14, + "grad_norm": 0.42578125, + "learning_rate": 0.0001798949210668365, + "loss": 0.2358, + "step": 8539 + }, + { + "epoch": 1.14, + "grad_norm": 0.5390625, + "learning_rate": 0.00017988791722942695, + "loss": 0.5721, + "step": 8540 + }, + { + "epoch": 1.14, + "grad_norm": 0.470703125, + "learning_rate": 0.00017988091230868595, + "loss": 0.6225, + "step": 8541 + }, + { + "epoch": 1.14, + "grad_norm": 0.53515625, + "learning_rate": 0.0001798739063047085, + "loss": 0.4687, + "step": 8542 + }, + { + "epoch": 1.14, + "grad_norm": 0.5859375, + "learning_rate": 0.00017986689921758962, + "loss": 0.4216, + "step": 8543 + }, + { + "epoch": 1.14, + "grad_norm": 0.6796875, + "learning_rate": 0.00017985989104742434, + "loss": 0.4053, + "step": 8544 + }, + { + "epoch": 1.14, + "grad_norm": 0.65234375, + "learning_rate": 0.00017985288179430767, + "loss": 0.6287, + "step": 8545 + }, + { + "epoch": 1.14, + "grad_norm": 0.53125, + "learning_rate": 0.00017984587145833468, + "loss": 0.4125, + "step": 8546 + }, + { + "epoch": 1.14, + "grad_norm": 0.58203125, + "learning_rate": 0.00017983886003960042, + "loss": 0.929, + "step": 8547 + }, + { + "epoch": 1.14, + "grad_norm": 0.66015625, + "learning_rate": 0.00017983184753819997, + "loss": 0.4895, + "step": 8548 + }, + { + "epoch": 1.14, + "grad_norm": 0.4296875, + "learning_rate": 0.00017982483395422846, + "loss": 0.3439, + "step": 8549 + }, + { + "epoch": 1.14, + "grad_norm": 0.68359375, + "learning_rate": 0.00017981781928778096, + "loss": 0.2291, + "step": 8550 + }, + { + "epoch": 1.14, + "grad_norm": 0.6953125, + "learning_rate": 0.0001798108035389526, + "loss": 0.2566, + "step": 8551 + }, + { + "epoch": 1.14, + "grad_norm": 0.51171875, + "learning_rate": 0.00017980378670783852, + "loss": 0.1774, + "step": 8552 + }, + { + "epoch": 1.14, + "grad_norm": 0.474609375, + "learning_rate": 0.0001797967687945339, + "loss": 0.5213, + "step": 8553 + }, + { + "epoch": 1.14, + "grad_norm": 0.671875, + "learning_rate": 0.00017978974979913383, + "loss": 0.29, + "step": 8554 + }, + { + "epoch": 1.14, + "grad_norm": 0.47265625, + "learning_rate": 0.0001797827297217336, + "loss": 0.3207, + "step": 8555 + }, + { + "epoch": 1.14, + "grad_norm": 0.50390625, + "learning_rate": 0.00017977570856242831, + "loss": 0.2568, + "step": 8556 + }, + { + "epoch": 1.14, + "grad_norm": 0.60546875, + "learning_rate": 0.00017976868632131322, + "loss": 0.5467, + "step": 8557 + }, + { + "epoch": 1.14, + "grad_norm": 0.5, + "learning_rate": 0.00017976166299848355, + "loss": 0.2285, + "step": 8558 + }, + { + "epoch": 1.14, + "grad_norm": 0.5859375, + "learning_rate": 0.00017975463859403457, + "loss": 0.1717, + "step": 8559 + }, + { + "epoch": 1.14, + "grad_norm": 0.6796875, + "learning_rate": 0.00017974761310806146, + "loss": 0.3988, + "step": 8560 + }, + { + "epoch": 1.14, + "grad_norm": 0.5234375, + "learning_rate": 0.00017974058654065959, + "loss": 0.3212, + "step": 8561 + }, + { + "epoch": 1.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00017973355889192412, + "loss": 0.4024, + "step": 8562 + }, + { + "epoch": 1.14, + "grad_norm": 0.44921875, + "learning_rate": 0.00017972653016195045, + "loss": 0.3561, + "step": 8563 + }, + { + "epoch": 1.14, + "grad_norm": 0.484375, + "learning_rate": 0.00017971950035083388, + "loss": 0.4624, + "step": 8564 + }, + { + "epoch": 1.14, + "grad_norm": 0.671875, + "learning_rate": 0.0001797124694586697, + "loss": 0.3486, + "step": 8565 + }, + { + "epoch": 1.14, + "grad_norm": 0.412109375, + "learning_rate": 0.00017970543748555327, + "loss": 0.3018, + "step": 8566 + }, + { + "epoch": 1.14, + "grad_norm": 0.39453125, + "learning_rate": 0.00017969840443157997, + "loss": 0.4521, + "step": 8567 + }, + { + "epoch": 1.14, + "grad_norm": 0.5, + "learning_rate": 0.00017969137029684516, + "loss": 0.4586, + "step": 8568 + }, + { + "epoch": 1.14, + "grad_norm": 0.56640625, + "learning_rate": 0.0001796843350814442, + "loss": 0.5729, + "step": 8569 + }, + { + "epoch": 1.14, + "grad_norm": 0.60546875, + "learning_rate": 0.0001796772987854725, + "loss": 0.4371, + "step": 8570 + }, + { + "epoch": 1.14, + "grad_norm": 0.345703125, + "learning_rate": 0.00017967026140902555, + "loss": 0.1475, + "step": 8571 + }, + { + "epoch": 1.14, + "grad_norm": 0.5, + "learning_rate": 0.00017966322295219869, + "loss": 0.3768, + "step": 8572 + }, + { + "epoch": 1.14, + "grad_norm": 0.546875, + "learning_rate": 0.00017965618341508741, + "loss": 0.3522, + "step": 8573 + }, + { + "epoch": 1.14, + "grad_norm": 0.6015625, + "learning_rate": 0.00017964914279778715, + "loss": 0.7695, + "step": 8574 + }, + { + "epoch": 1.14, + "grad_norm": 0.82421875, + "learning_rate": 0.0001796421011003934, + "loss": 0.4722, + "step": 8575 + }, + { + "epoch": 1.14, + "grad_norm": 0.61328125, + "learning_rate": 0.00017963505832300163, + "loss": 0.3746, + "step": 8576 + }, + { + "epoch": 1.14, + "grad_norm": 0.490234375, + "learning_rate": 0.0001796280144657074, + "loss": 0.3008, + "step": 8577 + }, + { + "epoch": 1.14, + "grad_norm": 0.51953125, + "learning_rate": 0.00017962096952860615, + "loss": 0.4404, + "step": 8578 + }, + { + "epoch": 1.14, + "grad_norm": 0.765625, + "learning_rate": 0.00017961392351179348, + "loss": 0.628, + "step": 8579 + }, + { + "epoch": 1.14, + "grad_norm": 0.90234375, + "learning_rate": 0.00017960687641536486, + "loss": 0.4947, + "step": 8580 + }, + { + "epoch": 1.15, + "grad_norm": 0.6640625, + "learning_rate": 0.00017959982823941595, + "loss": 0.5549, + "step": 8581 + }, + { + "epoch": 1.15, + "grad_norm": 0.5546875, + "learning_rate": 0.00017959277898404227, + "loss": 0.2428, + "step": 8582 + }, + { + "epoch": 1.15, + "grad_norm": 0.431640625, + "learning_rate": 0.00017958572864933942, + "loss": 0.4256, + "step": 8583 + }, + { + "epoch": 1.15, + "grad_norm": 0.474609375, + "learning_rate": 0.00017957867723540302, + "loss": 0.3757, + "step": 8584 + }, + { + "epoch": 1.15, + "grad_norm": 0.65625, + "learning_rate": 0.0001795716247423287, + "loss": 0.5733, + "step": 8585 + }, + { + "epoch": 1.15, + "grad_norm": 0.53125, + "learning_rate": 0.00017956457117021204, + "loss": 0.2631, + "step": 8586 + }, + { + "epoch": 1.15, + "grad_norm": 0.47265625, + "learning_rate": 0.00017955751651914877, + "loss": 0.3055, + "step": 8587 + }, + { + "epoch": 1.15, + "grad_norm": 0.4765625, + "learning_rate": 0.0001795504607892345, + "loss": 0.3972, + "step": 8588 + }, + { + "epoch": 1.15, + "grad_norm": 0.671875, + "learning_rate": 0.00017954340398056493, + "loss": 0.8209, + "step": 8589 + }, + { + "epoch": 1.15, + "grad_norm": 0.52734375, + "learning_rate": 0.00017953634609323572, + "loss": 0.2671, + "step": 8590 + }, + { + "epoch": 1.15, + "grad_norm": 0.53125, + "learning_rate": 0.00017952928712734268, + "loss": 0.232, + "step": 8591 + }, + { + "epoch": 1.15, + "grad_norm": 0.80078125, + "learning_rate": 0.0001795222270829814, + "loss": 0.8626, + "step": 8592 + }, + { + "epoch": 1.15, + "grad_norm": 0.5625, + "learning_rate": 0.00017951516596024768, + "loss": 0.3314, + "step": 8593 + }, + { + "epoch": 1.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00017950810375923733, + "loss": 0.3346, + "step": 8594 + }, + { + "epoch": 1.15, + "grad_norm": 0.3984375, + "learning_rate": 0.00017950104048004603, + "loss": 0.2396, + "step": 8595 + }, + { + "epoch": 1.15, + "grad_norm": 0.9140625, + "learning_rate": 0.0001794939761227696, + "loss": 0.3174, + "step": 8596 + }, + { + "epoch": 1.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00017948691068750384, + "loss": 0.3153, + "step": 8597 + }, + { + "epoch": 1.15, + "grad_norm": 0.5625, + "learning_rate": 0.00017947984417434457, + "loss": 0.5567, + "step": 8598 + }, + { + "epoch": 1.15, + "grad_norm": 0.57421875, + "learning_rate": 0.0001794727765833876, + "loss": 0.4582, + "step": 8599 + }, + { + "epoch": 1.15, + "grad_norm": 0.90234375, + "learning_rate": 0.00017946570791472875, + "loss": 0.6695, + "step": 8600 + }, + { + "epoch": 1.15, + "grad_norm": 0.5625, + "learning_rate": 0.00017945863816846394, + "loss": 0.3844, + "step": 8601 + }, + { + "epoch": 1.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00017945156734468896, + "loss": 0.3884, + "step": 8602 + }, + { + "epoch": 1.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00017944449544349974, + "loss": 0.3785, + "step": 8603 + }, + { + "epoch": 1.15, + "grad_norm": 0.94140625, + "learning_rate": 0.0001794374224649922, + "loss": 0.4545, + "step": 8604 + }, + { + "epoch": 1.15, + "grad_norm": 1.0859375, + "learning_rate": 0.0001794303484092622, + "loss": 0.1949, + "step": 8605 + }, + { + "epoch": 1.15, + "grad_norm": 0.55859375, + "learning_rate": 0.00017942327327640572, + "loss": 0.1955, + "step": 8606 + }, + { + "epoch": 1.15, + "grad_norm": 0.58984375, + "learning_rate": 0.00017941619706651868, + "loss": 0.436, + "step": 8607 + }, + { + "epoch": 1.15, + "grad_norm": 0.494140625, + "learning_rate": 0.00017940911977969704, + "loss": 0.2919, + "step": 8608 + }, + { + "epoch": 1.15, + "grad_norm": 0.357421875, + "learning_rate": 0.00017940204141603675, + "loss": 0.256, + "step": 8609 + }, + { + "epoch": 1.15, + "grad_norm": 0.5078125, + "learning_rate": 0.00017939496197563385, + "loss": 0.5021, + "step": 8610 + }, + { + "epoch": 1.15, + "grad_norm": 0.494140625, + "learning_rate": 0.00017938788145858427, + "loss": 0.4326, + "step": 8611 + }, + { + "epoch": 1.15, + "grad_norm": 0.43359375, + "learning_rate": 0.0001793807998649841, + "loss": 0.3699, + "step": 8612 + }, + { + "epoch": 1.15, + "grad_norm": 0.58203125, + "learning_rate": 0.00017937371719492932, + "loss": 0.3724, + "step": 8613 + }, + { + "epoch": 1.15, + "grad_norm": 0.625, + "learning_rate": 0.000179366633448516, + "loss": 0.3819, + "step": 8614 + }, + { + "epoch": 1.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00017935954862584018, + "loss": 0.2501, + "step": 8615 + }, + { + "epoch": 1.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00017935246272699797, + "loss": 0.6512, + "step": 8616 + }, + { + "epoch": 1.15, + "grad_norm": 0.5078125, + "learning_rate": 0.0001793453757520854, + "loss": 0.3111, + "step": 8617 + }, + { + "epoch": 1.15, + "grad_norm": 0.486328125, + "learning_rate": 0.00017933828770119864, + "loss": 0.3418, + "step": 8618 + }, + { + "epoch": 1.15, + "grad_norm": 0.68359375, + "learning_rate": 0.0001793311985744338, + "loss": 0.5272, + "step": 8619 + }, + { + "epoch": 1.15, + "grad_norm": 0.416015625, + "learning_rate": 0.00017932410837188698, + "loss": 0.2813, + "step": 8620 + }, + { + "epoch": 1.15, + "grad_norm": 0.53125, + "learning_rate": 0.0001793170170936543, + "loss": 0.4172, + "step": 8621 + }, + { + "epoch": 1.15, + "grad_norm": 0.640625, + "learning_rate": 0.00017930992473983205, + "loss": 0.709, + "step": 8622 + }, + { + "epoch": 1.15, + "grad_norm": 0.59375, + "learning_rate": 0.00017930283131051625, + "loss": 0.5497, + "step": 8623 + }, + { + "epoch": 1.15, + "grad_norm": 0.6640625, + "learning_rate": 0.00017929573680580316, + "loss": 0.5792, + "step": 8624 + }, + { + "epoch": 1.15, + "grad_norm": 0.66015625, + "learning_rate": 0.000179288641225789, + "loss": 0.6382, + "step": 8625 + }, + { + "epoch": 1.15, + "grad_norm": 0.80859375, + "learning_rate": 0.00017928154457056997, + "loss": 0.3527, + "step": 8626 + }, + { + "epoch": 1.15, + "grad_norm": 0.69140625, + "learning_rate": 0.00017927444684024232, + "loss": 0.3541, + "step": 8627 + }, + { + "epoch": 1.15, + "grad_norm": 0.4921875, + "learning_rate": 0.0001792673480349023, + "loss": 0.297, + "step": 8628 + }, + { + "epoch": 1.15, + "grad_norm": 0.439453125, + "learning_rate": 0.00017926024815464618, + "loss": 0.2671, + "step": 8629 + }, + { + "epoch": 1.15, + "grad_norm": 0.65625, + "learning_rate": 0.0001792531471995702, + "loss": 0.4209, + "step": 8630 + }, + { + "epoch": 1.15, + "grad_norm": 0.46875, + "learning_rate": 0.0001792460451697707, + "loss": 0.4035, + "step": 8631 + }, + { + "epoch": 1.15, + "grad_norm": 0.6171875, + "learning_rate": 0.00017923894206534397, + "loss": 0.3991, + "step": 8632 + }, + { + "epoch": 1.15, + "grad_norm": 0.62109375, + "learning_rate": 0.0001792318378863863, + "loss": 0.452, + "step": 8633 + }, + { + "epoch": 1.15, + "grad_norm": 0.4453125, + "learning_rate": 0.00017922473263299406, + "loss": 0.3607, + "step": 8634 + }, + { + "epoch": 1.15, + "grad_norm": 0.515625, + "learning_rate": 0.0001792176263052636, + "loss": 0.2562, + "step": 8635 + }, + { + "epoch": 1.15, + "grad_norm": 0.38671875, + "learning_rate": 0.00017921051890329133, + "loss": 0.2326, + "step": 8636 + }, + { + "epoch": 1.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00017920341042717357, + "loss": 0.3832, + "step": 8637 + }, + { + "epoch": 1.15, + "grad_norm": 0.4921875, + "learning_rate": 0.00017919630087700672, + "loss": 0.2598, + "step": 8638 + }, + { + "epoch": 1.15, + "grad_norm": 0.55859375, + "learning_rate": 0.00017918919025288718, + "loss": 0.2403, + "step": 8639 + }, + { + "epoch": 1.15, + "grad_norm": 0.5390625, + "learning_rate": 0.00017918207855491142, + "loss": 0.5518, + "step": 8640 + }, + { + "epoch": 1.15, + "grad_norm": 0.41015625, + "learning_rate": 0.00017917496578317586, + "loss": 0.1762, + "step": 8641 + }, + { + "epoch": 1.15, + "grad_norm": 0.482421875, + "learning_rate": 0.00017916785193777693, + "loss": 0.3576, + "step": 8642 + }, + { + "epoch": 1.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00017916073701881115, + "loss": 0.4573, + "step": 8643 + }, + { + "epoch": 1.15, + "grad_norm": 0.462890625, + "learning_rate": 0.00017915362102637498, + "loss": 0.6358, + "step": 8644 + }, + { + "epoch": 1.15, + "grad_norm": 0.4765625, + "learning_rate": 0.00017914650396056488, + "loss": 0.3546, + "step": 8645 + }, + { + "epoch": 1.15, + "grad_norm": 0.66015625, + "learning_rate": 0.00017913938582147737, + "loss": 0.3463, + "step": 8646 + }, + { + "epoch": 1.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00017913226660920903, + "loss": 0.4432, + "step": 8647 + }, + { + "epoch": 1.15, + "grad_norm": 0.859375, + "learning_rate": 0.00017912514632385636, + "loss": 0.3808, + "step": 8648 + }, + { + "epoch": 1.15, + "grad_norm": 0.474609375, + "learning_rate": 0.0001791180249655159, + "loss": 0.3588, + "step": 8649 + }, + { + "epoch": 1.15, + "grad_norm": 0.859375, + "learning_rate": 0.00017911090253428427, + "loss": 0.3742, + "step": 8650 + }, + { + "epoch": 1.15, + "grad_norm": 0.455078125, + "learning_rate": 0.00017910377903025804, + "loss": 0.2289, + "step": 8651 + }, + { + "epoch": 1.15, + "grad_norm": 0.58984375, + "learning_rate": 0.00017909665445353377, + "loss": 0.2641, + "step": 8652 + }, + { + "epoch": 1.15, + "grad_norm": 0.404296875, + "learning_rate": 0.00017908952880420814, + "loss": 0.3093, + "step": 8653 + }, + { + "epoch": 1.15, + "grad_norm": 0.703125, + "learning_rate": 0.00017908240208237768, + "loss": 0.4376, + "step": 8654 + }, + { + "epoch": 1.15, + "grad_norm": 0.3125, + "learning_rate": 0.00017907527428813916, + "loss": 0.1887, + "step": 8655 + }, + { + "epoch": 1.16, + "grad_norm": 0.6796875, + "learning_rate": 0.0001790681454215891, + "loss": 0.1907, + "step": 8656 + }, + { + "epoch": 1.16, + "grad_norm": 0.5078125, + "learning_rate": 0.0001790610154828243, + "loss": 0.2768, + "step": 8657 + }, + { + "epoch": 1.16, + "grad_norm": 0.380859375, + "learning_rate": 0.0001790538844719414, + "loss": 0.344, + "step": 8658 + }, + { + "epoch": 1.16, + "grad_norm": 0.4453125, + "learning_rate": 0.00017904675238903704, + "loss": 0.3362, + "step": 8659 + }, + { + "epoch": 1.16, + "grad_norm": 0.51953125, + "learning_rate": 0.000179039619234208, + "loss": 0.2999, + "step": 8660 + }, + { + "epoch": 1.16, + "grad_norm": 0.51171875, + "learning_rate": 0.000179032485007551, + "loss": 0.4112, + "step": 8661 + }, + { + "epoch": 1.16, + "grad_norm": 0.68359375, + "learning_rate": 0.00017902534970916278, + "loss": 0.4022, + "step": 8662 + }, + { + "epoch": 1.16, + "grad_norm": 0.56640625, + "learning_rate": 0.00017901821333914008, + "loss": 0.454, + "step": 8663 + }, + { + "epoch": 1.16, + "grad_norm": 0.5859375, + "learning_rate": 0.00017901107589757972, + "loss": 0.3967, + "step": 8664 + }, + { + "epoch": 1.16, + "grad_norm": 0.51953125, + "learning_rate": 0.00017900393738457847, + "loss": 0.3752, + "step": 8665 + }, + { + "epoch": 1.16, + "grad_norm": 0.5859375, + "learning_rate": 0.0001789967978002331, + "loss": 0.4524, + "step": 8666 + }, + { + "epoch": 1.16, + "grad_norm": 0.455078125, + "learning_rate": 0.00017898965714464046, + "loss": 0.4855, + "step": 8667 + }, + { + "epoch": 1.16, + "grad_norm": 0.494140625, + "learning_rate": 0.00017898251541789736, + "loss": 0.3118, + "step": 8668 + }, + { + "epoch": 1.16, + "grad_norm": 1.203125, + "learning_rate": 0.0001789753726201007, + "loss": 0.4161, + "step": 8669 + }, + { + "epoch": 1.16, + "grad_norm": 0.40625, + "learning_rate": 0.00017896822875134726, + "loss": 0.3477, + "step": 8670 + }, + { + "epoch": 1.16, + "grad_norm": 0.55859375, + "learning_rate": 0.000178961083811734, + "loss": 0.4236, + "step": 8671 + }, + { + "epoch": 1.16, + "grad_norm": 0.54296875, + "learning_rate": 0.00017895393780135773, + "loss": 0.4314, + "step": 8672 + }, + { + "epoch": 1.16, + "grad_norm": 0.34375, + "learning_rate": 0.00017894679072031543, + "loss": 0.1885, + "step": 8673 + }, + { + "epoch": 1.16, + "grad_norm": 0.578125, + "learning_rate": 0.00017893964256870392, + "loss": 0.3839, + "step": 8674 + }, + { + "epoch": 1.16, + "grad_norm": 0.48046875, + "learning_rate": 0.00017893249334662023, + "loss": 0.2675, + "step": 8675 + }, + { + "epoch": 1.16, + "grad_norm": 0.765625, + "learning_rate": 0.00017892534305416128, + "loss": 0.5877, + "step": 8676 + }, + { + "epoch": 1.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00017891819169142402, + "loss": 0.2354, + "step": 8677 + }, + { + "epoch": 1.16, + "grad_norm": 0.51171875, + "learning_rate": 0.0001789110392585054, + "loss": 0.5976, + "step": 8678 + }, + { + "epoch": 1.16, + "grad_norm": 0.474609375, + "learning_rate": 0.0001789038857555025, + "loss": 0.4356, + "step": 8679 + }, + { + "epoch": 1.16, + "grad_norm": 0.640625, + "learning_rate": 0.00017889673118251222, + "loss": 0.2458, + "step": 8680 + }, + { + "epoch": 1.16, + "grad_norm": 0.58203125, + "learning_rate": 0.00017888957553963163, + "loss": 0.528, + "step": 8681 + }, + { + "epoch": 1.16, + "grad_norm": 0.4921875, + "learning_rate": 0.0001788824188269578, + "loss": 0.375, + "step": 8682 + }, + { + "epoch": 1.16, + "grad_norm": 0.484375, + "learning_rate": 0.00017887526104458772, + "loss": 0.2574, + "step": 8683 + }, + { + "epoch": 1.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00017886810219261846, + "loss": 0.4413, + "step": 8684 + }, + { + "epoch": 1.16, + "grad_norm": 0.478515625, + "learning_rate": 0.00017886094227114715, + "loss": 0.528, + "step": 8685 + }, + { + "epoch": 1.16, + "grad_norm": 0.60546875, + "learning_rate": 0.00017885378128027087, + "loss": 0.4211, + "step": 8686 + }, + { + "epoch": 1.16, + "grad_norm": 0.453125, + "learning_rate": 0.00017884661922008667, + "loss": 0.3545, + "step": 8687 + }, + { + "epoch": 1.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00017883945609069172, + "loss": 0.4066, + "step": 8688 + }, + { + "epoch": 1.16, + "grad_norm": 0.75, + "learning_rate": 0.00017883229189218314, + "loss": 0.4496, + "step": 8689 + }, + { + "epoch": 1.16, + "grad_norm": 0.5234375, + "learning_rate": 0.00017882512662465808, + "loss": 0.4347, + "step": 8690 + }, + { + "epoch": 1.16, + "grad_norm": 0.8125, + "learning_rate": 0.00017881796028821373, + "loss": 0.4353, + "step": 8691 + }, + { + "epoch": 1.16, + "grad_norm": 0.451171875, + "learning_rate": 0.00017881079288294724, + "loss": 0.3052, + "step": 8692 + }, + { + "epoch": 1.16, + "grad_norm": 0.75390625, + "learning_rate": 0.00017880362440895583, + "loss": 0.61, + "step": 8693 + }, + { + "epoch": 1.16, + "grad_norm": 0.62890625, + "learning_rate": 0.00017879645486633668, + "loss": 0.534, + "step": 8694 + }, + { + "epoch": 1.16, + "grad_norm": 0.58203125, + "learning_rate": 0.00017878928425518707, + "loss": 0.2946, + "step": 8695 + }, + { + "epoch": 1.16, + "grad_norm": 0.5, + "learning_rate": 0.0001787821125756042, + "loss": 0.4586, + "step": 8696 + }, + { + "epoch": 1.16, + "grad_norm": 0.66015625, + "learning_rate": 0.00017877493982768527, + "loss": 0.3283, + "step": 8697 + }, + { + "epoch": 1.16, + "grad_norm": 0.458984375, + "learning_rate": 0.0001787677660115276, + "loss": 0.2877, + "step": 8698 + }, + { + "epoch": 1.16, + "grad_norm": 0.546875, + "learning_rate": 0.0001787605911272285, + "loss": 0.4738, + "step": 8699 + }, + { + "epoch": 1.16, + "grad_norm": 0.515625, + "learning_rate": 0.00017875341517488525, + "loss": 0.3679, + "step": 8700 + }, + { + "epoch": 1.16, + "grad_norm": 0.9296875, + "learning_rate": 0.0001787462381545951, + "loss": 0.4839, + "step": 8701 + }, + { + "epoch": 1.16, + "grad_norm": 0.6484375, + "learning_rate": 0.00017873906006645544, + "loss": 0.396, + "step": 8702 + }, + { + "epoch": 1.16, + "grad_norm": 0.484375, + "learning_rate": 0.00017873188091056363, + "loss": 0.3273, + "step": 8703 + }, + { + "epoch": 1.16, + "grad_norm": 0.4453125, + "learning_rate": 0.00017872470068701693, + "loss": 0.2229, + "step": 8704 + }, + { + "epoch": 1.16, + "grad_norm": 0.458984375, + "learning_rate": 0.00017871751939591278, + "loss": 0.3896, + "step": 8705 + }, + { + "epoch": 1.16, + "grad_norm": 0.57421875, + "learning_rate": 0.00017871033703734858, + "loss": 0.545, + "step": 8706 + }, + { + "epoch": 1.16, + "grad_norm": 0.51953125, + "learning_rate": 0.00017870315361142165, + "loss": 0.3008, + "step": 8707 + }, + { + "epoch": 1.16, + "grad_norm": 0.55859375, + "learning_rate": 0.00017869596911822947, + "loss": 0.6085, + "step": 8708 + }, + { + "epoch": 1.16, + "grad_norm": 0.609375, + "learning_rate": 0.00017868878355786945, + "loss": 0.3849, + "step": 8709 + }, + { + "epoch": 1.16, + "grad_norm": 0.52734375, + "learning_rate": 0.000178681596930439, + "loss": 0.2696, + "step": 8710 + }, + { + "epoch": 1.16, + "grad_norm": 0.55859375, + "learning_rate": 0.0001786744092360356, + "loss": 0.5575, + "step": 8711 + }, + { + "epoch": 1.16, + "grad_norm": 0.66796875, + "learning_rate": 0.00017866722047475673, + "loss": 0.6459, + "step": 8712 + }, + { + "epoch": 1.16, + "grad_norm": 0.71484375, + "learning_rate": 0.00017866003064669986, + "loss": 0.2688, + "step": 8713 + }, + { + "epoch": 1.16, + "grad_norm": 0.4453125, + "learning_rate": 0.0001786528397519625, + "loss": 0.4201, + "step": 8714 + }, + { + "epoch": 1.16, + "grad_norm": 0.490234375, + "learning_rate": 0.00017864564779064213, + "loss": 0.4389, + "step": 8715 + }, + { + "epoch": 1.16, + "grad_norm": 0.419921875, + "learning_rate": 0.0001786384547628363, + "loss": 0.3379, + "step": 8716 + }, + { + "epoch": 1.16, + "grad_norm": 0.78515625, + "learning_rate": 0.00017863126066864257, + "loss": 0.2292, + "step": 8717 + }, + { + "epoch": 1.16, + "grad_norm": 0.50390625, + "learning_rate": 0.0001786240655081585, + "loss": 0.3311, + "step": 8718 + }, + { + "epoch": 1.16, + "grad_norm": 0.640625, + "learning_rate": 0.0001786168692814816, + "loss": 0.2219, + "step": 8719 + }, + { + "epoch": 1.16, + "grad_norm": 0.58203125, + "learning_rate": 0.0001786096719887095, + "loss": 0.4217, + "step": 8720 + }, + { + "epoch": 1.16, + "grad_norm": 0.5, + "learning_rate": 0.00017860247362993983, + "loss": 0.2857, + "step": 8721 + }, + { + "epoch": 1.16, + "grad_norm": 1.0234375, + "learning_rate": 0.00017859527420527015, + "loss": 0.4551, + "step": 8722 + }, + { + "epoch": 1.16, + "grad_norm": 0.62109375, + "learning_rate": 0.0001785880737147981, + "loss": 0.4654, + "step": 8723 + }, + { + "epoch": 1.16, + "grad_norm": 0.62890625, + "learning_rate": 0.00017858087215862133, + "loss": 0.4297, + "step": 8724 + }, + { + "epoch": 1.16, + "grad_norm": 0.609375, + "learning_rate": 0.00017857366953683755, + "loss": 0.4366, + "step": 8725 + }, + { + "epoch": 1.16, + "grad_norm": 0.58203125, + "learning_rate": 0.00017856646584954434, + "loss": 0.5307, + "step": 8726 + }, + { + "epoch": 1.16, + "grad_norm": 0.63671875, + "learning_rate": 0.00017855926109683944, + "loss": 0.4037, + "step": 8727 + }, + { + "epoch": 1.16, + "grad_norm": 0.546875, + "learning_rate": 0.00017855205527882056, + "loss": 0.5762, + "step": 8728 + }, + { + "epoch": 1.16, + "grad_norm": 0.64453125, + "learning_rate": 0.00017854484839558533, + "loss": 0.2931, + "step": 8729 + }, + { + "epoch": 1.16, + "grad_norm": 0.46484375, + "learning_rate": 0.0001785376404472316, + "loss": 0.2917, + "step": 8730 + }, + { + "epoch": 1.17, + "grad_norm": 0.75, + "learning_rate": 0.00017853043143385705, + "loss": 0.5286, + "step": 8731 + }, + { + "epoch": 1.17, + "grad_norm": 0.625, + "learning_rate": 0.00017852322135555946, + "loss": 0.5843, + "step": 8732 + }, + { + "epoch": 1.17, + "grad_norm": 0.70703125, + "learning_rate": 0.0001785160102124366, + "loss": 0.3589, + "step": 8733 + }, + { + "epoch": 1.17, + "grad_norm": 0.55078125, + "learning_rate": 0.00017850879800458623, + "loss": 0.6263, + "step": 8734 + }, + { + "epoch": 1.17, + "grad_norm": 0.609375, + "learning_rate": 0.00017850158473210618, + "loss": 0.2554, + "step": 8735 + }, + { + "epoch": 1.17, + "grad_norm": 0.62890625, + "learning_rate": 0.00017849437039509422, + "loss": 0.2788, + "step": 8736 + }, + { + "epoch": 1.17, + "grad_norm": 0.4453125, + "learning_rate": 0.00017848715499364827, + "loss": 0.3961, + "step": 8737 + }, + { + "epoch": 1.17, + "grad_norm": 0.58984375, + "learning_rate": 0.0001784799385278661, + "loss": 0.3755, + "step": 8738 + }, + { + "epoch": 1.17, + "grad_norm": 0.53125, + "learning_rate": 0.00017847272099784562, + "loss": 0.6439, + "step": 8739 + }, + { + "epoch": 1.17, + "grad_norm": 0.57421875, + "learning_rate": 0.00017846550240368467, + "loss": 0.4859, + "step": 8740 + }, + { + "epoch": 1.17, + "grad_norm": 0.498046875, + "learning_rate": 0.00017845828274548113, + "loss": 0.4176, + "step": 8741 + }, + { + "epoch": 1.17, + "grad_norm": 0.55078125, + "learning_rate": 0.00017845106202333294, + "loss": 0.4986, + "step": 8742 + }, + { + "epoch": 1.17, + "grad_norm": 0.5078125, + "learning_rate": 0.00017844384023733798, + "loss": 0.5523, + "step": 8743 + }, + { + "epoch": 1.17, + "grad_norm": 0.56640625, + "learning_rate": 0.00017843661738759422, + "loss": 0.1982, + "step": 8744 + }, + { + "epoch": 1.17, + "grad_norm": 0.46484375, + "learning_rate": 0.0001784293934741996, + "loss": 0.3803, + "step": 8745 + }, + { + "epoch": 1.17, + "grad_norm": 0.58203125, + "learning_rate": 0.00017842216849725203, + "loss": 0.4664, + "step": 8746 + }, + { + "epoch": 1.17, + "grad_norm": 0.44921875, + "learning_rate": 0.00017841494245684954, + "loss": 0.3752, + "step": 8747 + }, + { + "epoch": 1.17, + "grad_norm": 0.478515625, + "learning_rate": 0.00017840771535309013, + "loss": 0.4487, + "step": 8748 + }, + { + "epoch": 1.17, + "grad_norm": 0.56640625, + "learning_rate": 0.00017840048718607176, + "loss": 0.3126, + "step": 8749 + }, + { + "epoch": 1.17, + "grad_norm": 0.5390625, + "learning_rate": 0.00017839325795589246, + "loss": 0.4927, + "step": 8750 + }, + { + "epoch": 1.17, + "grad_norm": 0.53515625, + "learning_rate": 0.0001783860276626503, + "loss": 0.3866, + "step": 8751 + }, + { + "epoch": 1.17, + "grad_norm": 0.3828125, + "learning_rate": 0.00017837879630644326, + "loss": 0.2869, + "step": 8752 + }, + { + "epoch": 1.17, + "grad_norm": 0.5078125, + "learning_rate": 0.00017837156388736945, + "loss": 0.3092, + "step": 8753 + }, + { + "epoch": 1.17, + "grad_norm": 0.47265625, + "learning_rate": 0.00017836433040552694, + "loss": 0.2181, + "step": 8754 + }, + { + "epoch": 1.17, + "grad_norm": 0.470703125, + "learning_rate": 0.00017835709586101382, + "loss": 0.3283, + "step": 8755 + }, + { + "epoch": 1.17, + "grad_norm": 0.59375, + "learning_rate": 0.00017834986025392816, + "loss": 0.3426, + "step": 8756 + }, + { + "epoch": 1.17, + "grad_norm": 0.6484375, + "learning_rate": 0.00017834262358436815, + "loss": 0.4986, + "step": 8757 + }, + { + "epoch": 1.17, + "grad_norm": 0.59765625, + "learning_rate": 0.00017833538585243185, + "loss": 0.2967, + "step": 8758 + }, + { + "epoch": 1.17, + "grad_norm": 0.54296875, + "learning_rate": 0.00017832814705821747, + "loss": 0.2386, + "step": 8759 + }, + { + "epoch": 1.17, + "grad_norm": 0.6015625, + "learning_rate": 0.00017832090720182312, + "loss": 0.6361, + "step": 8760 + }, + { + "epoch": 1.17, + "grad_norm": 0.8359375, + "learning_rate": 0.00017831366628334703, + "loss": 0.6191, + "step": 8761 + }, + { + "epoch": 1.17, + "grad_norm": 0.60546875, + "learning_rate": 0.00017830642430288735, + "loss": 0.3741, + "step": 8762 + }, + { + "epoch": 1.17, + "grad_norm": 0.494140625, + "learning_rate": 0.0001782991812605423, + "loss": 0.4075, + "step": 8763 + }, + { + "epoch": 1.17, + "grad_norm": 0.8828125, + "learning_rate": 0.0001782919371564101, + "loss": 0.2759, + "step": 8764 + }, + { + "epoch": 1.17, + "grad_norm": 0.5625, + "learning_rate": 0.000178284691990589, + "loss": 0.3583, + "step": 8765 + }, + { + "epoch": 1.17, + "grad_norm": 0.51953125, + "learning_rate": 0.0001782774457631772, + "loss": 0.349, + "step": 8766 + }, + { + "epoch": 1.17, + "grad_norm": 0.70703125, + "learning_rate": 0.00017827019847427305, + "loss": 0.4079, + "step": 8767 + }, + { + "epoch": 1.17, + "grad_norm": 0.57421875, + "learning_rate": 0.00017826295012397472, + "loss": 0.5887, + "step": 8768 + }, + { + "epoch": 1.17, + "grad_norm": 0.48046875, + "learning_rate": 0.00017825570071238058, + "loss": 0.4687, + "step": 8769 + }, + { + "epoch": 1.17, + "grad_norm": 0.5859375, + "learning_rate": 0.0001782484502395889, + "loss": 0.4509, + "step": 8770 + }, + { + "epoch": 1.17, + "grad_norm": 0.4765625, + "learning_rate": 0.00017824119870569804, + "loss": 0.3705, + "step": 8771 + }, + { + "epoch": 1.17, + "grad_norm": 0.373046875, + "learning_rate": 0.00017823394611080632, + "loss": 0.2217, + "step": 8772 + }, + { + "epoch": 1.17, + "grad_norm": 0.5546875, + "learning_rate": 0.00017822669245501206, + "loss": 0.3083, + "step": 8773 + }, + { + "epoch": 1.17, + "grad_norm": 0.462890625, + "learning_rate": 0.00017821943773841365, + "loss": 0.1527, + "step": 8774 + }, + { + "epoch": 1.17, + "grad_norm": 0.52734375, + "learning_rate": 0.00017821218196110945, + "loss": 0.3696, + "step": 8775 + }, + { + "epoch": 1.17, + "grad_norm": 0.6484375, + "learning_rate": 0.00017820492512319786, + "loss": 0.3764, + "step": 8776 + }, + { + "epoch": 1.17, + "grad_norm": 0.7421875, + "learning_rate": 0.00017819766722477733, + "loss": 0.6043, + "step": 8777 + }, + { + "epoch": 1.17, + "grad_norm": 0.87109375, + "learning_rate": 0.00017819040826594622, + "loss": 0.5885, + "step": 8778 + }, + { + "epoch": 1.17, + "grad_norm": 0.41796875, + "learning_rate": 0.000178183148246803, + "loss": 0.2246, + "step": 8779 + }, + { + "epoch": 1.17, + "grad_norm": 0.466796875, + "learning_rate": 0.00017817588716744612, + "loss": 0.2325, + "step": 8780 + }, + { + "epoch": 1.17, + "grad_norm": 0.5703125, + "learning_rate": 0.000178168625027974, + "loss": 0.5517, + "step": 8781 + }, + { + "epoch": 1.17, + "grad_norm": 0.46484375, + "learning_rate": 0.0001781613618284852, + "loss": 0.2296, + "step": 8782 + }, + { + "epoch": 1.17, + "grad_norm": 0.55859375, + "learning_rate": 0.00017815409756907812, + "loss": 0.3312, + "step": 8783 + }, + { + "epoch": 1.17, + "grad_norm": 0.68359375, + "learning_rate": 0.00017814683224985135, + "loss": 0.2212, + "step": 8784 + }, + { + "epoch": 1.17, + "grad_norm": 0.43359375, + "learning_rate": 0.0001781395658709034, + "loss": 0.3182, + "step": 8785 + }, + { + "epoch": 1.17, + "grad_norm": 0.609375, + "learning_rate": 0.00017813229843233275, + "loss": 0.539, + "step": 8786 + }, + { + "epoch": 1.17, + "grad_norm": 0.87109375, + "learning_rate": 0.00017812502993423804, + "loss": 0.5368, + "step": 8787 + }, + { + "epoch": 1.17, + "grad_norm": 0.57421875, + "learning_rate": 0.00017811776037671771, + "loss": 0.2809, + "step": 8788 + }, + { + "epoch": 1.17, + "grad_norm": 0.466796875, + "learning_rate": 0.00017811048975987048, + "loss": 0.3238, + "step": 8789 + }, + { + "epoch": 1.17, + "grad_norm": 0.57421875, + "learning_rate": 0.00017810321808379485, + "loss": 0.4896, + "step": 8790 + }, + { + "epoch": 1.17, + "grad_norm": 0.69921875, + "learning_rate": 0.00017809594534858944, + "loss": 0.3302, + "step": 8791 + }, + { + "epoch": 1.17, + "grad_norm": 0.498046875, + "learning_rate": 0.00017808867155435292, + "loss": 0.3855, + "step": 8792 + }, + { + "epoch": 1.17, + "grad_norm": 0.546875, + "learning_rate": 0.00017808139670118388, + "loss": 0.3949, + "step": 8793 + }, + { + "epoch": 1.17, + "grad_norm": 0.43359375, + "learning_rate": 0.000178074120789181, + "loss": 0.3353, + "step": 8794 + }, + { + "epoch": 1.17, + "grad_norm": 0.5546875, + "learning_rate": 0.0001780668438184429, + "loss": 0.5651, + "step": 8795 + }, + { + "epoch": 1.17, + "grad_norm": 0.578125, + "learning_rate": 0.00017805956578906832, + "loss": 0.2812, + "step": 8796 + }, + { + "epoch": 1.17, + "grad_norm": 0.46484375, + "learning_rate": 0.00017805228670115595, + "loss": 0.4536, + "step": 8797 + }, + { + "epoch": 1.17, + "grad_norm": 0.7734375, + "learning_rate": 0.00017804500655480446, + "loss": 0.7842, + "step": 8798 + }, + { + "epoch": 1.17, + "grad_norm": 0.6796875, + "learning_rate": 0.0001780377253501126, + "loss": 0.4711, + "step": 8799 + }, + { + "epoch": 1.17, + "grad_norm": 0.498046875, + "learning_rate": 0.0001780304430871791, + "loss": 0.3824, + "step": 8800 + }, + { + "epoch": 1.17, + "grad_norm": 0.57421875, + "learning_rate": 0.00017802315976610272, + "loss": 0.5843, + "step": 8801 + }, + { + "epoch": 1.17, + "grad_norm": 0.62109375, + "learning_rate": 0.00017801587538698218, + "loss": 0.3759, + "step": 8802 + }, + { + "epoch": 1.17, + "grad_norm": 0.484375, + "learning_rate": 0.00017800858994991632, + "loss": 0.4222, + "step": 8803 + }, + { + "epoch": 1.17, + "grad_norm": 0.4453125, + "learning_rate": 0.00017800130345500391, + "loss": 0.1988, + "step": 8804 + }, + { + "epoch": 1.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00017799401590234376, + "loss": 0.4242, + "step": 8805 + }, + { + "epoch": 1.18, + "grad_norm": 0.5078125, + "learning_rate": 0.00017798672729203472, + "loss": 0.3762, + "step": 8806 + }, + { + "epoch": 1.18, + "grad_norm": 0.41015625, + "learning_rate": 0.0001779794376241756, + "loss": 0.2364, + "step": 8807 + }, + { + "epoch": 1.18, + "grad_norm": 0.4453125, + "learning_rate": 0.00017797214689886526, + "loss": 0.3294, + "step": 8808 + }, + { + "epoch": 1.18, + "grad_norm": 0.73046875, + "learning_rate": 0.00017796485511620256, + "loss": 0.3707, + "step": 8809 + }, + { + "epoch": 1.18, + "grad_norm": 0.462890625, + "learning_rate": 0.0001779575622762864, + "loss": 0.5237, + "step": 8810 + }, + { + "epoch": 1.18, + "grad_norm": 0.5390625, + "learning_rate": 0.00017795026837921562, + "loss": 0.3028, + "step": 8811 + }, + { + "epoch": 1.18, + "grad_norm": 0.57421875, + "learning_rate": 0.0001779429734250892, + "loss": 0.289, + "step": 8812 + }, + { + "epoch": 1.18, + "grad_norm": 0.72265625, + "learning_rate": 0.00017793567741400604, + "loss": 0.5017, + "step": 8813 + }, + { + "epoch": 1.18, + "grad_norm": 0.63671875, + "learning_rate": 0.0001779283803460651, + "loss": 0.562, + "step": 8814 + }, + { + "epoch": 1.18, + "grad_norm": 0.890625, + "learning_rate": 0.0001779210822213653, + "loss": 0.649, + "step": 8815 + }, + { + "epoch": 1.18, + "grad_norm": 0.5390625, + "learning_rate": 0.0001779137830400056, + "loss": 0.3549, + "step": 8816 + }, + { + "epoch": 1.18, + "grad_norm": 0.58984375, + "learning_rate": 0.00017790648280208496, + "loss": 0.6369, + "step": 8817 + }, + { + "epoch": 1.18, + "grad_norm": 0.83984375, + "learning_rate": 0.00017789918150770245, + "loss": 0.3676, + "step": 8818 + }, + { + "epoch": 1.18, + "grad_norm": 0.5859375, + "learning_rate": 0.00017789187915695704, + "loss": 0.2986, + "step": 8819 + }, + { + "epoch": 1.18, + "grad_norm": 0.6015625, + "learning_rate": 0.00017788457574994778, + "loss": 0.6417, + "step": 8820 + }, + { + "epoch": 1.18, + "grad_norm": 0.88671875, + "learning_rate": 0.00017787727128677365, + "loss": 0.4639, + "step": 8821 + }, + { + "epoch": 1.18, + "grad_norm": 0.48828125, + "learning_rate": 0.00017786996576753378, + "loss": 0.5021, + "step": 8822 + }, + { + "epoch": 1.18, + "grad_norm": 0.7578125, + "learning_rate": 0.00017786265919232717, + "loss": 0.2814, + "step": 8823 + }, + { + "epoch": 1.18, + "grad_norm": 0.53515625, + "learning_rate": 0.0001778553515612529, + "loss": 0.4234, + "step": 8824 + }, + { + "epoch": 1.18, + "grad_norm": 0.609375, + "learning_rate": 0.00017784804287441017, + "loss": 0.4874, + "step": 8825 + }, + { + "epoch": 1.18, + "grad_norm": 0.55078125, + "learning_rate": 0.00017784073313189795, + "loss": 0.2956, + "step": 8826 + }, + { + "epoch": 1.18, + "grad_norm": 0.60546875, + "learning_rate": 0.00017783342233381548, + "loss": 0.2272, + "step": 8827 + }, + { + "epoch": 1.18, + "grad_norm": 0.5390625, + "learning_rate": 0.00017782611048026184, + "loss": 0.4076, + "step": 8828 + }, + { + "epoch": 1.18, + "grad_norm": 0.439453125, + "learning_rate": 0.00017781879757133618, + "loss": 0.1932, + "step": 8829 + }, + { + "epoch": 1.18, + "grad_norm": 0.65625, + "learning_rate": 0.00017781148360713768, + "loss": 0.5322, + "step": 8830 + }, + { + "epoch": 1.18, + "grad_norm": 0.466796875, + "learning_rate": 0.0001778041685877655, + "loss": 0.5078, + "step": 8831 + }, + { + "epoch": 1.18, + "grad_norm": 0.56640625, + "learning_rate": 0.0001777968525133189, + "loss": 0.2748, + "step": 8832 + }, + { + "epoch": 1.18, + "grad_norm": 0.47265625, + "learning_rate": 0.00017778953538389702, + "loss": 0.3113, + "step": 8833 + }, + { + "epoch": 1.18, + "grad_norm": 0.65625, + "learning_rate": 0.00017778221719959913, + "loss": 0.2255, + "step": 8834 + }, + { + "epoch": 1.18, + "grad_norm": 0.6640625, + "learning_rate": 0.00017777489796052447, + "loss": 0.1904, + "step": 8835 + }, + { + "epoch": 1.18, + "grad_norm": 0.609375, + "learning_rate": 0.00017776757766677225, + "loss": 0.2278, + "step": 8836 + }, + { + "epoch": 1.18, + "grad_norm": 0.53515625, + "learning_rate": 0.00017776025631844176, + "loss": 0.4427, + "step": 8837 + }, + { + "epoch": 1.18, + "grad_norm": 0.49609375, + "learning_rate": 0.00017775293391563234, + "loss": 0.2891, + "step": 8838 + }, + { + "epoch": 1.18, + "grad_norm": 0.55859375, + "learning_rate": 0.00017774561045844317, + "loss": 0.2968, + "step": 8839 + }, + { + "epoch": 1.18, + "grad_norm": 0.578125, + "learning_rate": 0.00017773828594697368, + "loss": 0.3353, + "step": 8840 + }, + { + "epoch": 1.18, + "grad_norm": 0.55859375, + "learning_rate": 0.00017773096038132313, + "loss": 0.1937, + "step": 8841 + }, + { + "epoch": 1.18, + "grad_norm": 0.64453125, + "learning_rate": 0.00017772363376159083, + "loss": 0.291, + "step": 8842 + }, + { + "epoch": 1.18, + "grad_norm": 0.7109375, + "learning_rate": 0.00017771630608787623, + "loss": 0.2867, + "step": 8843 + }, + { + "epoch": 1.18, + "grad_norm": 0.478515625, + "learning_rate": 0.0001777089773602786, + "loss": 0.3129, + "step": 8844 + }, + { + "epoch": 1.18, + "grad_norm": 0.515625, + "learning_rate": 0.00017770164757889738, + "loss": 0.3834, + "step": 8845 + }, + { + "epoch": 1.18, + "grad_norm": 0.431640625, + "learning_rate": 0.00017769431674383195, + "loss": 0.2744, + "step": 8846 + }, + { + "epoch": 1.18, + "grad_norm": 0.703125, + "learning_rate": 0.0001776869848551817, + "loss": 0.2644, + "step": 8847 + }, + { + "epoch": 1.18, + "grad_norm": 0.7265625, + "learning_rate": 0.0001776796519130461, + "loss": 0.6002, + "step": 8848 + }, + { + "epoch": 1.18, + "grad_norm": 0.66015625, + "learning_rate": 0.00017767231791752456, + "loss": 0.3395, + "step": 8849 + }, + { + "epoch": 1.18, + "grad_norm": 0.69140625, + "learning_rate": 0.00017766498286871655, + "loss": 0.2431, + "step": 8850 + }, + { + "epoch": 1.18, + "grad_norm": 0.4765625, + "learning_rate": 0.00017765764676672152, + "loss": 0.2036, + "step": 8851 + }, + { + "epoch": 1.18, + "grad_norm": 0.57421875, + "learning_rate": 0.00017765030961163896, + "loss": 0.3075, + "step": 8852 + }, + { + "epoch": 1.18, + "grad_norm": 0.431640625, + "learning_rate": 0.00017764297140356833, + "loss": 0.4762, + "step": 8853 + }, + { + "epoch": 1.18, + "grad_norm": 0.56640625, + "learning_rate": 0.00017763563214260924, + "loss": 0.4137, + "step": 8854 + }, + { + "epoch": 1.18, + "grad_norm": 0.515625, + "learning_rate": 0.0001776282918288611, + "loss": 0.2788, + "step": 8855 + }, + { + "epoch": 1.18, + "grad_norm": 0.48828125, + "learning_rate": 0.0001776209504624235, + "loss": 0.2216, + "step": 8856 + }, + { + "epoch": 1.18, + "grad_norm": 0.369140625, + "learning_rate": 0.00017761360804339604, + "loss": 0.1956, + "step": 8857 + }, + { + "epoch": 1.18, + "grad_norm": 0.474609375, + "learning_rate": 0.0001776062645718782, + "loss": 0.463, + "step": 8858 + }, + { + "epoch": 1.18, + "grad_norm": 0.57421875, + "learning_rate": 0.00017759892004796964, + "loss": 0.446, + "step": 8859 + }, + { + "epoch": 1.18, + "grad_norm": 0.59375, + "learning_rate": 0.0001775915744717699, + "loss": 0.2658, + "step": 8860 + }, + { + "epoch": 1.18, + "grad_norm": 0.7421875, + "learning_rate": 0.00017758422784337863, + "loss": 0.4404, + "step": 8861 + }, + { + "epoch": 1.18, + "grad_norm": 0.48046875, + "learning_rate": 0.00017757688016289543, + "loss": 0.2523, + "step": 8862 + }, + { + "epoch": 1.18, + "grad_norm": 0.484375, + "learning_rate": 0.00017756953143041994, + "loss": 0.2943, + "step": 8863 + }, + { + "epoch": 1.18, + "grad_norm": 0.427734375, + "learning_rate": 0.0001775621816460518, + "loss": 0.3426, + "step": 8864 + }, + { + "epoch": 1.18, + "grad_norm": 0.408203125, + "learning_rate": 0.00017755483080989076, + "loss": 0.2647, + "step": 8865 + }, + { + "epoch": 1.18, + "grad_norm": 0.396484375, + "learning_rate": 0.00017754747892203637, + "loss": 0.2006, + "step": 8866 + }, + { + "epoch": 1.18, + "grad_norm": 0.61328125, + "learning_rate": 0.0001775401259825884, + "loss": 0.1781, + "step": 8867 + }, + { + "epoch": 1.18, + "grad_norm": 0.6796875, + "learning_rate": 0.0001775327719916466, + "loss": 0.4515, + "step": 8868 + }, + { + "epoch": 1.18, + "grad_norm": 0.5703125, + "learning_rate": 0.00017752541694931065, + "loss": 0.5047, + "step": 8869 + }, + { + "epoch": 1.18, + "grad_norm": 0.482421875, + "learning_rate": 0.00017751806085568027, + "loss": 0.24, + "step": 8870 + }, + { + "epoch": 1.18, + "grad_norm": 0.67578125, + "learning_rate": 0.00017751070371085524, + "loss": 0.2674, + "step": 8871 + }, + { + "epoch": 1.18, + "grad_norm": 0.48046875, + "learning_rate": 0.0001775033455149353, + "loss": 0.4176, + "step": 8872 + }, + { + "epoch": 1.18, + "grad_norm": 0.53515625, + "learning_rate": 0.00017749598626802028, + "loss": 0.2857, + "step": 8873 + }, + { + "epoch": 1.18, + "grad_norm": 0.494140625, + "learning_rate": 0.00017748862597020994, + "loss": 0.4196, + "step": 8874 + }, + { + "epoch": 1.18, + "grad_norm": 0.5546875, + "learning_rate": 0.0001774812646216041, + "loss": 0.338, + "step": 8875 + }, + { + "epoch": 1.18, + "grad_norm": 0.326171875, + "learning_rate": 0.0001774739022223026, + "loss": 0.151, + "step": 8876 + }, + { + "epoch": 1.18, + "grad_norm": 0.84375, + "learning_rate": 0.00017746653877240525, + "loss": 0.3927, + "step": 8877 + }, + { + "epoch": 1.18, + "grad_norm": 0.59375, + "learning_rate": 0.00017745917427201192, + "loss": 0.565, + "step": 8878 + }, + { + "epoch": 1.18, + "grad_norm": 0.609375, + "learning_rate": 0.00017745180872122247, + "loss": 0.2231, + "step": 8879 + }, + { + "epoch": 1.18, + "grad_norm": 0.73828125, + "learning_rate": 0.0001774444421201368, + "loss": 0.6848, + "step": 8880 + }, + { + "epoch": 1.19, + "grad_norm": 0.62890625, + "learning_rate": 0.00017743707446885476, + "loss": 0.4844, + "step": 8881 + }, + { + "epoch": 1.19, + "grad_norm": 0.408203125, + "learning_rate": 0.00017742970576747633, + "loss": 0.2994, + "step": 8882 + }, + { + "epoch": 1.19, + "grad_norm": 0.59765625, + "learning_rate": 0.00017742233601610136, + "loss": 0.6208, + "step": 8883 + }, + { + "epoch": 1.19, + "grad_norm": 0.55859375, + "learning_rate": 0.00017741496521482986, + "loss": 0.1947, + "step": 8884 + }, + { + "epoch": 1.19, + "grad_norm": 0.609375, + "learning_rate": 0.0001774075933637617, + "loss": 0.5318, + "step": 8885 + }, + { + "epoch": 1.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00017740022046299693, + "loss": 0.494, + "step": 8886 + }, + { + "epoch": 1.19, + "grad_norm": 0.79296875, + "learning_rate": 0.0001773928465126355, + "loss": 0.4567, + "step": 8887 + }, + { + "epoch": 1.19, + "grad_norm": 0.47265625, + "learning_rate": 0.00017738547151277737, + "loss": 0.5144, + "step": 8888 + }, + { + "epoch": 1.19, + "grad_norm": 0.625, + "learning_rate": 0.0001773780954635226, + "loss": 0.4484, + "step": 8889 + }, + { + "epoch": 1.19, + "grad_norm": 0.5625, + "learning_rate": 0.00017737071836497118, + "loss": 0.556, + "step": 8890 + }, + { + "epoch": 1.19, + "grad_norm": 0.431640625, + "learning_rate": 0.00017736334021722317, + "loss": 0.488, + "step": 8891 + }, + { + "epoch": 1.19, + "grad_norm": 0.4921875, + "learning_rate": 0.00017735596102037864, + "loss": 0.3518, + "step": 8892 + }, + { + "epoch": 1.19, + "grad_norm": 0.60546875, + "learning_rate": 0.00017734858077453762, + "loss": 0.2314, + "step": 8893 + }, + { + "epoch": 1.19, + "grad_norm": 0.4375, + "learning_rate": 0.00017734119947980019, + "loss": 0.2359, + "step": 8894 + }, + { + "epoch": 1.19, + "grad_norm": 0.75, + "learning_rate": 0.00017733381713626648, + "loss": 0.3234, + "step": 8895 + }, + { + "epoch": 1.19, + "grad_norm": 0.53125, + "learning_rate": 0.00017732643374403654, + "loss": 0.5443, + "step": 8896 + }, + { + "epoch": 1.19, + "grad_norm": 0.625, + "learning_rate": 0.00017731904930321055, + "loss": 0.3945, + "step": 8897 + }, + { + "epoch": 1.19, + "grad_norm": 0.83203125, + "learning_rate": 0.00017731166381388863, + "loss": 0.4239, + "step": 8898 + }, + { + "epoch": 1.19, + "grad_norm": 0.55859375, + "learning_rate": 0.00017730427727617094, + "loss": 0.4106, + "step": 8899 + }, + { + "epoch": 1.19, + "grad_norm": 0.6484375, + "learning_rate": 0.00017729688969015764, + "loss": 0.3007, + "step": 8900 + }, + { + "epoch": 1.19, + "grad_norm": 0.84765625, + "learning_rate": 0.0001772895010559489, + "loss": 0.3101, + "step": 8901 + }, + { + "epoch": 1.19, + "grad_norm": 0.68359375, + "learning_rate": 0.00017728211137364489, + "loss": 0.384, + "step": 8902 + }, + { + "epoch": 1.19, + "grad_norm": 0.64453125, + "learning_rate": 0.00017727472064334588, + "loss": 0.587, + "step": 8903 + }, + { + "epoch": 1.19, + "grad_norm": 0.79296875, + "learning_rate": 0.00017726732886515207, + "loss": 0.5624, + "step": 8904 + }, + { + "epoch": 1.19, + "grad_norm": 0.63671875, + "learning_rate": 0.00017725993603916366, + "loss": 0.5132, + "step": 8905 + }, + { + "epoch": 1.19, + "grad_norm": 0.375, + "learning_rate": 0.00017725254216548097, + "loss": 0.4342, + "step": 8906 + }, + { + "epoch": 1.19, + "grad_norm": 0.4453125, + "learning_rate": 0.0001772451472442042, + "loss": 0.2382, + "step": 8907 + }, + { + "epoch": 1.19, + "grad_norm": 0.56640625, + "learning_rate": 0.00017723775127543367, + "loss": 0.3345, + "step": 8908 + }, + { + "epoch": 1.19, + "grad_norm": 0.458984375, + "learning_rate": 0.00017723035425926966, + "loss": 0.497, + "step": 8909 + }, + { + "epoch": 1.19, + "grad_norm": 0.44921875, + "learning_rate": 0.0001772229561958125, + "loss": 0.3035, + "step": 8910 + }, + { + "epoch": 1.19, + "grad_norm": 0.419921875, + "learning_rate": 0.00017721555708516244, + "loss": 0.5077, + "step": 8911 + }, + { + "epoch": 1.19, + "grad_norm": 0.52734375, + "learning_rate": 0.0001772081569274199, + "loss": 0.4879, + "step": 8912 + }, + { + "epoch": 1.19, + "grad_norm": 0.51171875, + "learning_rate": 0.00017720075572268522, + "loss": 0.2747, + "step": 8913 + }, + { + "epoch": 1.19, + "grad_norm": 0.56640625, + "learning_rate": 0.00017719335347105873, + "loss": 0.5897, + "step": 8914 + }, + { + "epoch": 1.19, + "grad_norm": 0.466796875, + "learning_rate": 0.00017718595017264082, + "loss": 0.301, + "step": 8915 + }, + { + "epoch": 1.19, + "grad_norm": 0.453125, + "learning_rate": 0.00017717854582753186, + "loss": 0.2679, + "step": 8916 + }, + { + "epoch": 1.19, + "grad_norm": 0.4375, + "learning_rate": 0.00017717114043583232, + "loss": 0.2737, + "step": 8917 + }, + { + "epoch": 1.19, + "grad_norm": 0.54296875, + "learning_rate": 0.0001771637339976426, + "loss": 0.3851, + "step": 8918 + }, + { + "epoch": 1.19, + "grad_norm": 0.56640625, + "learning_rate": 0.0001771563265130631, + "loss": 0.3205, + "step": 8919 + }, + { + "epoch": 1.19, + "grad_norm": 0.5, + "learning_rate": 0.0001771489179821943, + "loss": 0.5473, + "step": 8920 + }, + { + "epoch": 1.19, + "grad_norm": 0.6015625, + "learning_rate": 0.00017714150840513666, + "loss": 0.3749, + "step": 8921 + }, + { + "epoch": 1.19, + "grad_norm": 0.58203125, + "learning_rate": 0.00017713409778199066, + "loss": 0.2911, + "step": 8922 + }, + { + "epoch": 1.19, + "grad_norm": 0.51171875, + "learning_rate": 0.0001771266861128568, + "loss": 0.3082, + "step": 8923 + }, + { + "epoch": 1.19, + "grad_norm": 0.53515625, + "learning_rate": 0.00017711927339783556, + "loss": 0.6932, + "step": 8924 + }, + { + "epoch": 1.19, + "grad_norm": 0.44140625, + "learning_rate": 0.00017711185963702745, + "loss": 0.443, + "step": 8925 + }, + { + "epoch": 1.19, + "grad_norm": 0.55859375, + "learning_rate": 0.0001771044448305331, + "loss": 0.2874, + "step": 8926 + }, + { + "epoch": 1.19, + "grad_norm": 0.6796875, + "learning_rate": 0.00017709702897845294, + "loss": 0.2535, + "step": 8927 + }, + { + "epoch": 1.19, + "grad_norm": 0.58984375, + "learning_rate": 0.00017708961208088759, + "loss": 0.2502, + "step": 8928 + }, + { + "epoch": 1.19, + "grad_norm": 0.60546875, + "learning_rate": 0.00017708219413793764, + "loss": 0.6512, + "step": 8929 + }, + { + "epoch": 1.19, + "grad_norm": 0.78125, + "learning_rate": 0.00017707477514970367, + "loss": 0.5746, + "step": 8930 + }, + { + "epoch": 1.19, + "grad_norm": 0.546875, + "learning_rate": 0.0001770673551162863, + "loss": 0.2997, + "step": 8931 + }, + { + "epoch": 1.19, + "grad_norm": 0.5078125, + "learning_rate": 0.00017705993403778608, + "loss": 0.3263, + "step": 8932 + }, + { + "epoch": 1.19, + "grad_norm": 0.40234375, + "learning_rate": 0.00017705251191430372, + "loss": 0.3127, + "step": 8933 + }, + { + "epoch": 1.19, + "grad_norm": 0.66015625, + "learning_rate": 0.00017704508874593987, + "loss": 0.2298, + "step": 8934 + }, + { + "epoch": 1.19, + "grad_norm": 0.73046875, + "learning_rate": 0.00017703766453279514, + "loss": 0.2511, + "step": 8935 + }, + { + "epoch": 1.19, + "grad_norm": 0.546875, + "learning_rate": 0.00017703023927497026, + "loss": 0.4468, + "step": 8936 + }, + { + "epoch": 1.19, + "grad_norm": 0.921875, + "learning_rate": 0.00017702281297256588, + "loss": 0.4636, + "step": 8937 + }, + { + "epoch": 1.19, + "grad_norm": 0.48046875, + "learning_rate": 0.00017701538562568274, + "loss": 0.4018, + "step": 8938 + }, + { + "epoch": 1.19, + "grad_norm": 0.64453125, + "learning_rate": 0.00017700795723442152, + "loss": 0.9462, + "step": 8939 + }, + { + "epoch": 1.19, + "grad_norm": 0.40625, + "learning_rate": 0.00017700052779888301, + "loss": 0.2414, + "step": 8940 + }, + { + "epoch": 1.19, + "grad_norm": 0.79296875, + "learning_rate": 0.00017699309731916788, + "loss": 0.3629, + "step": 8941 + }, + { + "epoch": 1.19, + "grad_norm": 0.5859375, + "learning_rate": 0.00017698566579537697, + "loss": 0.6401, + "step": 8942 + }, + { + "epoch": 1.19, + "grad_norm": 0.6015625, + "learning_rate": 0.000176978233227611, + "loss": 0.6144, + "step": 8943 + }, + { + "epoch": 1.19, + "grad_norm": 0.56640625, + "learning_rate": 0.0001769707996159708, + "loss": 0.3107, + "step": 8944 + }, + { + "epoch": 1.19, + "grad_norm": 0.50390625, + "learning_rate": 0.00017696336496055717, + "loss": 0.5715, + "step": 8945 + }, + { + "epoch": 1.19, + "grad_norm": 0.61328125, + "learning_rate": 0.0001769559292614709, + "loss": 0.2587, + "step": 8946 + }, + { + "epoch": 1.19, + "grad_norm": 0.6015625, + "learning_rate": 0.00017694849251881283, + "loss": 0.4872, + "step": 8947 + }, + { + "epoch": 1.19, + "grad_norm": 0.357421875, + "learning_rate": 0.00017694105473268384, + "loss": 0.2185, + "step": 8948 + }, + { + "epoch": 1.19, + "grad_norm": 0.60546875, + "learning_rate": 0.00017693361590318473, + "loss": 0.2564, + "step": 8949 + }, + { + "epoch": 1.19, + "grad_norm": 0.578125, + "learning_rate": 0.00017692617603041645, + "loss": 0.3255, + "step": 8950 + }, + { + "epoch": 1.19, + "grad_norm": 0.55078125, + "learning_rate": 0.00017691873511447982, + "loss": 0.4388, + "step": 8951 + }, + { + "epoch": 1.19, + "grad_norm": 0.6328125, + "learning_rate": 0.0001769112931554758, + "loss": 0.3276, + "step": 8952 + }, + { + "epoch": 1.19, + "grad_norm": 0.48046875, + "learning_rate": 0.00017690385015350528, + "loss": 0.256, + "step": 8953 + }, + { + "epoch": 1.19, + "grad_norm": 0.50390625, + "learning_rate": 0.0001768964061086692, + "loss": 0.4189, + "step": 8954 + }, + { + "epoch": 1.19, + "grad_norm": 0.546875, + "learning_rate": 0.0001768889610210685, + "loss": 0.4828, + "step": 8955 + }, + { + "epoch": 1.2, + "grad_norm": 0.46875, + "learning_rate": 0.00017688151489080414, + "loss": 0.4389, + "step": 8956 + }, + { + "epoch": 1.2, + "grad_norm": 0.55859375, + "learning_rate": 0.00017687406771797707, + "loss": 0.4624, + "step": 8957 + }, + { + "epoch": 1.2, + "grad_norm": 0.41015625, + "learning_rate": 0.00017686661950268835, + "loss": 0.2666, + "step": 8958 + }, + { + "epoch": 1.2, + "grad_norm": 0.427734375, + "learning_rate": 0.0001768591702450389, + "loss": 0.264, + "step": 8959 + }, + { + "epoch": 1.2, + "grad_norm": 0.55078125, + "learning_rate": 0.0001768517199451298, + "loss": 0.2076, + "step": 8960 + }, + { + "epoch": 1.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00017684426860306202, + "loss": 0.2483, + "step": 8961 + }, + { + "epoch": 1.2, + "grad_norm": 0.75390625, + "learning_rate": 0.00017683681621893666, + "loss": 0.3519, + "step": 8962 + }, + { + "epoch": 1.2, + "grad_norm": 0.439453125, + "learning_rate": 0.00017682936279285477, + "loss": 0.3042, + "step": 8963 + }, + { + "epoch": 1.2, + "grad_norm": 0.60546875, + "learning_rate": 0.00017682190832491742, + "loss": 0.3887, + "step": 8964 + }, + { + "epoch": 1.2, + "grad_norm": 0.578125, + "learning_rate": 0.00017681445281522566, + "loss": 0.4291, + "step": 8965 + }, + { + "epoch": 1.2, + "grad_norm": 0.54296875, + "learning_rate": 0.0001768069962638806, + "loss": 0.3974, + "step": 8966 + }, + { + "epoch": 1.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00017679953867098344, + "loss": 0.4241, + "step": 8967 + }, + { + "epoch": 1.2, + "grad_norm": 0.416015625, + "learning_rate": 0.00017679208003663522, + "loss": 0.3123, + "step": 8968 + }, + { + "epoch": 1.2, + "grad_norm": 0.46875, + "learning_rate": 0.00017678462036093708, + "loss": 0.2708, + "step": 8969 + }, + { + "epoch": 1.2, + "grad_norm": 0.462890625, + "learning_rate": 0.00017677715964399023, + "loss": 0.3142, + "step": 8970 + }, + { + "epoch": 1.2, + "grad_norm": 0.4921875, + "learning_rate": 0.00017676969788589584, + "loss": 0.4499, + "step": 8971 + }, + { + "epoch": 1.2, + "grad_norm": 0.671875, + "learning_rate": 0.00017676223508675503, + "loss": 0.639, + "step": 8972 + }, + { + "epoch": 1.2, + "grad_norm": 0.6015625, + "learning_rate": 0.0001767547712466691, + "loss": 0.456, + "step": 8973 + }, + { + "epoch": 1.2, + "grad_norm": 0.435546875, + "learning_rate": 0.00017674730636573917, + "loss": 0.2208, + "step": 8974 + }, + { + "epoch": 1.2, + "grad_norm": 0.4296875, + "learning_rate": 0.0001767398404440665, + "loss": 0.2037, + "step": 8975 + }, + { + "epoch": 1.2, + "grad_norm": 0.431640625, + "learning_rate": 0.0001767323734817524, + "loss": 0.2531, + "step": 8976 + }, + { + "epoch": 1.2, + "grad_norm": 0.59375, + "learning_rate": 0.000176724905478898, + "loss": 0.2747, + "step": 8977 + }, + { + "epoch": 1.2, + "grad_norm": 0.419921875, + "learning_rate": 0.0001767174364356047, + "loss": 0.3444, + "step": 8978 + }, + { + "epoch": 1.2, + "grad_norm": 0.462890625, + "learning_rate": 0.0001767099663519737, + "loss": 0.2924, + "step": 8979 + }, + { + "epoch": 1.2, + "grad_norm": 0.515625, + "learning_rate": 0.00017670249522810632, + "loss": 0.3489, + "step": 8980 + }, + { + "epoch": 1.2, + "grad_norm": 0.37109375, + "learning_rate": 0.0001766950230641039, + "loss": 0.3021, + "step": 8981 + }, + { + "epoch": 1.2, + "grad_norm": 0.9765625, + "learning_rate": 0.0001766875498600677, + "loss": 0.4735, + "step": 8982 + }, + { + "epoch": 1.2, + "grad_norm": 0.4921875, + "learning_rate": 0.00017668007561609913, + "loss": 0.2229, + "step": 8983 + }, + { + "epoch": 1.2, + "grad_norm": 0.53515625, + "learning_rate": 0.00017667260033229953, + "loss": 0.2614, + "step": 8984 + }, + { + "epoch": 1.2, + "grad_norm": 0.8828125, + "learning_rate": 0.00017666512400877027, + "loss": 0.3066, + "step": 8985 + }, + { + "epoch": 1.2, + "grad_norm": 0.419921875, + "learning_rate": 0.0001766576466456127, + "loss": 0.3181, + "step": 8986 + }, + { + "epoch": 1.2, + "grad_norm": 0.58984375, + "learning_rate": 0.00017665016824292825, + "loss": 0.3461, + "step": 8987 + }, + { + "epoch": 1.2, + "grad_norm": 0.47265625, + "learning_rate": 0.00017664268880081832, + "loss": 0.4834, + "step": 8988 + }, + { + "epoch": 1.2, + "grad_norm": 0.486328125, + "learning_rate": 0.00017663520831938436, + "loss": 0.3753, + "step": 8989 + }, + { + "epoch": 1.2, + "grad_norm": 0.6953125, + "learning_rate": 0.00017662772679872778, + "loss": 0.3604, + "step": 8990 + }, + { + "epoch": 1.2, + "grad_norm": 0.65625, + "learning_rate": 0.00017662024423895005, + "loss": 0.6017, + "step": 8991 + }, + { + "epoch": 1.2, + "grad_norm": 0.447265625, + "learning_rate": 0.00017661276064015265, + "loss": 0.3431, + "step": 8992 + }, + { + "epoch": 1.2, + "grad_norm": 0.66796875, + "learning_rate": 0.000176605276002437, + "loss": 0.5965, + "step": 8993 + }, + { + "epoch": 1.2, + "grad_norm": 0.6328125, + "learning_rate": 0.00017659779032590467, + "loss": 0.276, + "step": 8994 + }, + { + "epoch": 1.2, + "grad_norm": 0.4609375, + "learning_rate": 0.00017659030361065714, + "loss": 0.4139, + "step": 8995 + }, + { + "epoch": 1.2, + "grad_norm": 0.80859375, + "learning_rate": 0.00017658281585679594, + "loss": 0.5213, + "step": 8996 + }, + { + "epoch": 1.2, + "grad_norm": 0.53125, + "learning_rate": 0.0001765753270644226, + "loss": 0.4688, + "step": 8997 + }, + { + "epoch": 1.2, + "grad_norm": 0.5546875, + "learning_rate": 0.0001765678372336387, + "loss": 0.4773, + "step": 8998 + }, + { + "epoch": 1.2, + "grad_norm": 0.33984375, + "learning_rate": 0.00017656034636454577, + "loss": 0.3205, + "step": 8999 + }, + { + "epoch": 1.2, + "grad_norm": 0.51953125, + "learning_rate": 0.0001765528544572454, + "loss": 0.4397, + "step": 9000 + }, + { + "epoch": 1.2, + "grad_norm": 0.73828125, + "learning_rate": 0.0001765453615118392, + "loss": 0.4822, + "step": 9001 + }, + { + "epoch": 1.2, + "grad_norm": 0.62890625, + "learning_rate": 0.00017653786752842878, + "loss": 0.3463, + "step": 9002 + }, + { + "epoch": 1.2, + "grad_norm": 0.52734375, + "learning_rate": 0.00017653037250711572, + "loss": 0.6692, + "step": 9003 + }, + { + "epoch": 1.2, + "grad_norm": 0.90234375, + "learning_rate": 0.00017652287644800174, + "loss": 0.3383, + "step": 9004 + }, + { + "epoch": 1.2, + "grad_norm": 0.609375, + "learning_rate": 0.0001765153793511884, + "loss": 0.6482, + "step": 9005 + }, + { + "epoch": 1.2, + "grad_norm": 0.87890625, + "learning_rate": 0.00017650788121677743, + "loss": 0.4992, + "step": 9006 + }, + { + "epoch": 1.2, + "grad_norm": 0.67578125, + "learning_rate": 0.00017650038204487052, + "loss": 0.3935, + "step": 9007 + }, + { + "epoch": 1.2, + "grad_norm": 0.8828125, + "learning_rate": 0.0001764928818355693, + "loss": 0.3533, + "step": 9008 + }, + { + "epoch": 1.2, + "grad_norm": 0.5078125, + "learning_rate": 0.0001764853805889755, + "loss": 0.3398, + "step": 9009 + }, + { + "epoch": 1.2, + "grad_norm": 0.6015625, + "learning_rate": 0.00017647787830519084, + "loss": 0.3054, + "step": 9010 + }, + { + "epoch": 1.2, + "grad_norm": 0.42578125, + "learning_rate": 0.0001764703749843171, + "loss": 0.3987, + "step": 9011 + }, + { + "epoch": 1.2, + "grad_norm": 0.55078125, + "learning_rate": 0.000176462870626456, + "loss": 0.2896, + "step": 9012 + }, + { + "epoch": 1.2, + "grad_norm": 0.54296875, + "learning_rate": 0.00017645536523170927, + "loss": 0.4714, + "step": 9013 + }, + { + "epoch": 1.2, + "grad_norm": 0.48828125, + "learning_rate": 0.00017644785880017874, + "loss": 0.2369, + "step": 9014 + }, + { + "epoch": 1.2, + "grad_norm": 1.140625, + "learning_rate": 0.00017644035133196616, + "loss": 0.4177, + "step": 9015 + }, + { + "epoch": 1.2, + "grad_norm": 0.7109375, + "learning_rate": 0.0001764328428271734, + "loss": 0.2653, + "step": 9016 + }, + { + "epoch": 1.2, + "grad_norm": 0.48046875, + "learning_rate": 0.00017642533328590218, + "loss": 0.4014, + "step": 9017 + }, + { + "epoch": 1.2, + "grad_norm": 0.73828125, + "learning_rate": 0.00017641782270825442, + "loss": 0.4435, + "step": 9018 + }, + { + "epoch": 1.2, + "grad_norm": 0.494140625, + "learning_rate": 0.00017641031109433194, + "loss": 0.4314, + "step": 9019 + }, + { + "epoch": 1.2, + "grad_norm": 0.5546875, + "learning_rate": 0.00017640279844423663, + "loss": 0.2917, + "step": 9020 + }, + { + "epoch": 1.2, + "grad_norm": 0.5390625, + "learning_rate": 0.0001763952847580703, + "loss": 0.2824, + "step": 9021 + }, + { + "epoch": 1.2, + "grad_norm": 0.48046875, + "learning_rate": 0.00017638777003593488, + "loss": 0.3998, + "step": 9022 + }, + { + "epoch": 1.2, + "grad_norm": 0.6015625, + "learning_rate": 0.00017638025427793228, + "loss": 0.5324, + "step": 9023 + }, + { + "epoch": 1.2, + "grad_norm": 0.65234375, + "learning_rate": 0.00017637273748416442, + "loss": 0.2286, + "step": 9024 + }, + { + "epoch": 1.2, + "grad_norm": 0.73046875, + "learning_rate": 0.00017636521965473323, + "loss": 0.2763, + "step": 9025 + }, + { + "epoch": 1.2, + "grad_norm": 0.55078125, + "learning_rate": 0.00017635770078974061, + "loss": 0.5376, + "step": 9026 + }, + { + "epoch": 1.2, + "grad_norm": 0.5390625, + "learning_rate": 0.00017635018088928858, + "loss": 0.5051, + "step": 9027 + }, + { + "epoch": 1.2, + "grad_norm": 0.73046875, + "learning_rate": 0.0001763426599534791, + "loss": 0.235, + "step": 9028 + }, + { + "epoch": 1.2, + "grad_norm": 0.8515625, + "learning_rate": 0.0001763351379824142, + "loss": 0.3353, + "step": 9029 + }, + { + "epoch": 1.2, + "grad_norm": 0.53515625, + "learning_rate": 0.00017632761497619578, + "loss": 0.3675, + "step": 9030 + }, + { + "epoch": 1.21, + "grad_norm": 0.482421875, + "learning_rate": 0.00017632009093492595, + "loss": 0.2545, + "step": 9031 + }, + { + "epoch": 1.21, + "grad_norm": 0.50390625, + "learning_rate": 0.00017631256585870665, + "loss": 0.2642, + "step": 9032 + }, + { + "epoch": 1.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00017630503974764002, + "loss": 0.2574, + "step": 9033 + }, + { + "epoch": 1.21, + "grad_norm": 0.62890625, + "learning_rate": 0.00017629751260182807, + "loss": 0.3108, + "step": 9034 + }, + { + "epoch": 1.21, + "grad_norm": 0.56640625, + "learning_rate": 0.0001762899844213729, + "loss": 0.3443, + "step": 9035 + }, + { + "epoch": 1.21, + "grad_norm": 0.51171875, + "learning_rate": 0.00017628245520637657, + "loss": 0.4472, + "step": 9036 + }, + { + "epoch": 1.21, + "grad_norm": 0.62109375, + "learning_rate": 0.00017627492495694117, + "loss": 0.3306, + "step": 9037 + }, + { + "epoch": 1.21, + "grad_norm": 0.44921875, + "learning_rate": 0.00017626739367316888, + "loss": 0.2326, + "step": 9038 + }, + { + "epoch": 1.21, + "grad_norm": 0.5234375, + "learning_rate": 0.00017625986135516174, + "loss": 0.2766, + "step": 9039 + }, + { + "epoch": 1.21, + "grad_norm": 0.5546875, + "learning_rate": 0.00017625232800302194, + "loss": 0.633, + "step": 9040 + }, + { + "epoch": 1.21, + "grad_norm": 0.57421875, + "learning_rate": 0.00017624479361685164, + "loss": 0.3347, + "step": 9041 + }, + { + "epoch": 1.21, + "grad_norm": 0.75390625, + "learning_rate": 0.000176237258196753, + "loss": 0.4069, + "step": 9042 + }, + { + "epoch": 1.21, + "grad_norm": 0.515625, + "learning_rate": 0.00017622972174282823, + "loss": 0.4633, + "step": 9043 + }, + { + "epoch": 1.21, + "grad_norm": 0.466796875, + "learning_rate": 0.0001762221842551795, + "loss": 0.5216, + "step": 9044 + }, + { + "epoch": 1.21, + "grad_norm": 0.74609375, + "learning_rate": 0.00017621464573390903, + "loss": 0.3298, + "step": 9045 + }, + { + "epoch": 1.21, + "grad_norm": 0.73046875, + "learning_rate": 0.00017620710617911905, + "loss": 0.5861, + "step": 9046 + }, + { + "epoch": 1.21, + "grad_norm": 0.45703125, + "learning_rate": 0.0001761995655909118, + "loss": 0.4518, + "step": 9047 + }, + { + "epoch": 1.21, + "grad_norm": 0.578125, + "learning_rate": 0.00017619202396938955, + "loss": 0.2967, + "step": 9048 + }, + { + "epoch": 1.21, + "grad_norm": 0.470703125, + "learning_rate": 0.00017618448131465452, + "loss": 0.309, + "step": 9049 + }, + { + "epoch": 1.21, + "grad_norm": 0.83203125, + "learning_rate": 0.00017617693762680907, + "loss": 0.4834, + "step": 9050 + }, + { + "epoch": 1.21, + "grad_norm": 0.45703125, + "learning_rate": 0.0001761693929059554, + "loss": 0.4069, + "step": 9051 + }, + { + "epoch": 1.21, + "grad_norm": 0.9453125, + "learning_rate": 0.00017616184715219592, + "loss": 0.41, + "step": 9052 + }, + { + "epoch": 1.21, + "grad_norm": 0.6171875, + "learning_rate": 0.00017615430036563293, + "loss": 0.7389, + "step": 9053 + }, + { + "epoch": 1.21, + "grad_norm": 0.57421875, + "learning_rate": 0.00017614675254636872, + "loss": 0.2397, + "step": 9054 + }, + { + "epoch": 1.21, + "grad_norm": 0.5859375, + "learning_rate": 0.00017613920369450567, + "loss": 0.2785, + "step": 9055 + }, + { + "epoch": 1.21, + "grad_norm": 0.65625, + "learning_rate": 0.00017613165381014616, + "loss": 0.2543, + "step": 9056 + }, + { + "epoch": 1.21, + "grad_norm": 0.5234375, + "learning_rate": 0.00017612410289339256, + "loss": 0.4786, + "step": 9057 + }, + { + "epoch": 1.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00017611655094434728, + "loss": 0.3683, + "step": 9058 + }, + { + "epoch": 1.21, + "grad_norm": 0.51171875, + "learning_rate": 0.00017610899796311274, + "loss": 0.3372, + "step": 9059 + }, + { + "epoch": 1.21, + "grad_norm": 0.369140625, + "learning_rate": 0.0001761014439497913, + "loss": 0.2382, + "step": 9060 + }, + { + "epoch": 1.21, + "grad_norm": 0.48828125, + "learning_rate": 0.00017609388890448547, + "loss": 0.5211, + "step": 9061 + }, + { + "epoch": 1.21, + "grad_norm": 0.609375, + "learning_rate": 0.00017608633282729762, + "loss": 0.2581, + "step": 9062 + }, + { + "epoch": 1.21, + "grad_norm": 0.453125, + "learning_rate": 0.0001760787757183303, + "loss": 0.4292, + "step": 9063 + }, + { + "epoch": 1.21, + "grad_norm": 0.5859375, + "learning_rate": 0.00017607121757768596, + "loss": 0.4439, + "step": 9064 + }, + { + "epoch": 1.21, + "grad_norm": 0.6015625, + "learning_rate": 0.00017606365840546707, + "loss": 0.345, + "step": 9065 + }, + { + "epoch": 1.21, + "grad_norm": 0.490234375, + "learning_rate": 0.00017605609820177617, + "loss": 0.4079, + "step": 9066 + }, + { + "epoch": 1.21, + "grad_norm": 0.57421875, + "learning_rate": 0.00017604853696671577, + "loss": 0.5515, + "step": 9067 + }, + { + "epoch": 1.21, + "grad_norm": 0.44921875, + "learning_rate": 0.00017604097470038838, + "loss": 0.2576, + "step": 9068 + }, + { + "epoch": 1.21, + "grad_norm": 0.52734375, + "learning_rate": 0.00017603341140289659, + "loss": 0.3256, + "step": 9069 + }, + { + "epoch": 1.21, + "grad_norm": 0.52734375, + "learning_rate": 0.00017602584707434294, + "loss": 0.3884, + "step": 9070 + }, + { + "epoch": 1.21, + "grad_norm": 1.125, + "learning_rate": 0.00017601828171483002, + "loss": 0.655, + "step": 9071 + }, + { + "epoch": 1.21, + "grad_norm": 0.94921875, + "learning_rate": 0.00017601071532446038, + "loss": 0.3123, + "step": 9072 + }, + { + "epoch": 1.21, + "grad_norm": 0.5625, + "learning_rate": 0.00017600314790333667, + "loss": 0.3506, + "step": 9073 + }, + { + "epoch": 1.21, + "grad_norm": 0.71875, + "learning_rate": 0.00017599557945156152, + "loss": 0.477, + "step": 9074 + }, + { + "epoch": 1.21, + "grad_norm": 0.66796875, + "learning_rate": 0.0001759880099692375, + "loss": 0.3442, + "step": 9075 + }, + { + "epoch": 1.21, + "grad_norm": 0.7734375, + "learning_rate": 0.00017598043945646732, + "loss": 0.5045, + "step": 9076 + }, + { + "epoch": 1.21, + "grad_norm": 0.373046875, + "learning_rate": 0.00017597286791335361, + "loss": 0.3737, + "step": 9077 + }, + { + "epoch": 1.21, + "grad_norm": 0.376953125, + "learning_rate": 0.00017596529533999905, + "loss": 0.2128, + "step": 9078 + }, + { + "epoch": 1.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00017595772173650636, + "loss": 0.2551, + "step": 9079 + }, + { + "epoch": 1.21, + "grad_norm": 0.5703125, + "learning_rate": 0.00017595014710297818, + "loss": 0.8407, + "step": 9080 + }, + { + "epoch": 1.21, + "grad_norm": 0.64453125, + "learning_rate": 0.00017594257143951726, + "loss": 0.3647, + "step": 9081 + }, + { + "epoch": 1.21, + "grad_norm": 0.6484375, + "learning_rate": 0.00017593499474622635, + "loss": 0.5818, + "step": 9082 + }, + { + "epoch": 1.21, + "grad_norm": 0.5390625, + "learning_rate": 0.00017592741702320816, + "loss": 0.3555, + "step": 9083 + }, + { + "epoch": 1.21, + "grad_norm": 0.6484375, + "learning_rate": 0.00017591983827056548, + "loss": 0.5048, + "step": 9084 + }, + { + "epoch": 1.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00017591225848840106, + "loss": 0.3148, + "step": 9085 + }, + { + "epoch": 1.21, + "grad_norm": 0.46484375, + "learning_rate": 0.0001759046776768177, + "loss": 0.2972, + "step": 9086 + }, + { + "epoch": 1.21, + "grad_norm": 0.609375, + "learning_rate": 0.00017589709583591817, + "loss": 0.4297, + "step": 9087 + }, + { + "epoch": 1.21, + "grad_norm": 0.51953125, + "learning_rate": 0.00017588951296580537, + "loss": 0.2308, + "step": 9088 + }, + { + "epoch": 1.21, + "grad_norm": 0.59375, + "learning_rate": 0.00017588192906658201, + "loss": 0.5428, + "step": 9089 + }, + { + "epoch": 1.21, + "grad_norm": 0.47265625, + "learning_rate": 0.00017587434413835103, + "loss": 0.356, + "step": 9090 + }, + { + "epoch": 1.21, + "grad_norm": 0.4296875, + "learning_rate": 0.0001758667581812152, + "loss": 0.2222, + "step": 9091 + }, + { + "epoch": 1.21, + "grad_norm": 1.2265625, + "learning_rate": 0.0001758591711952775, + "loss": 0.2263, + "step": 9092 + }, + { + "epoch": 1.21, + "grad_norm": 0.51171875, + "learning_rate": 0.0001758515831806407, + "loss": 0.4304, + "step": 9093 + }, + { + "epoch": 1.21, + "grad_norm": 0.51171875, + "learning_rate": 0.00017584399413740775, + "loss": 0.3814, + "step": 9094 + }, + { + "epoch": 1.21, + "grad_norm": 0.73046875, + "learning_rate": 0.0001758364040656816, + "loss": 0.4693, + "step": 9095 + }, + { + "epoch": 1.21, + "grad_norm": 0.58984375, + "learning_rate": 0.00017582881296556512, + "loss": 0.4119, + "step": 9096 + }, + { + "epoch": 1.21, + "grad_norm": 0.361328125, + "learning_rate": 0.00017582122083716127, + "loss": 0.1337, + "step": 9097 + }, + { + "epoch": 1.21, + "grad_norm": 0.546875, + "learning_rate": 0.00017581362768057302, + "loss": 0.4118, + "step": 9098 + }, + { + "epoch": 1.21, + "grad_norm": 0.40625, + "learning_rate": 0.0001758060334959033, + "loss": 0.2359, + "step": 9099 + }, + { + "epoch": 1.21, + "grad_norm": 0.5703125, + "learning_rate": 0.0001757984382832551, + "loss": 0.5531, + "step": 9100 + }, + { + "epoch": 1.21, + "grad_norm": 0.49609375, + "learning_rate": 0.0001757908420427315, + "loss": 0.3247, + "step": 9101 + }, + { + "epoch": 1.21, + "grad_norm": 0.55078125, + "learning_rate": 0.00017578324477443536, + "loss": 0.3812, + "step": 9102 + }, + { + "epoch": 1.21, + "grad_norm": 0.5625, + "learning_rate": 0.00017577564647846983, + "loss": 0.6567, + "step": 9103 + }, + { + "epoch": 1.21, + "grad_norm": 0.470703125, + "learning_rate": 0.0001757680471549379, + "loss": 0.3206, + "step": 9104 + }, + { + "epoch": 1.21, + "grad_norm": 0.75390625, + "learning_rate": 0.0001757604468039426, + "loss": 0.3781, + "step": 9105 + }, + { + "epoch": 1.22, + "grad_norm": 0.494140625, + "learning_rate": 0.000175752845425587, + "loss": 0.345, + "step": 9106 + }, + { + "epoch": 1.22, + "grad_norm": 0.5546875, + "learning_rate": 0.00017574524301997423, + "loss": 0.2313, + "step": 9107 + }, + { + "epoch": 1.22, + "grad_norm": 0.53125, + "learning_rate": 0.00017573763958720736, + "loss": 0.3365, + "step": 9108 + }, + { + "epoch": 1.22, + "grad_norm": 0.60546875, + "learning_rate": 0.00017573003512738947, + "loss": 0.4294, + "step": 9109 + }, + { + "epoch": 1.22, + "grad_norm": 0.57421875, + "learning_rate": 0.0001757224296406237, + "loss": 0.7285, + "step": 9110 + }, + { + "epoch": 1.22, + "grad_norm": 0.5234375, + "learning_rate": 0.00017571482312701318, + "loss": 0.3842, + "step": 9111 + }, + { + "epoch": 1.22, + "grad_norm": 0.6015625, + "learning_rate": 0.0001757072155866611, + "loss": 0.2973, + "step": 9112 + }, + { + "epoch": 1.22, + "grad_norm": 0.46875, + "learning_rate": 0.00017569960701967059, + "loss": 0.2028, + "step": 9113 + }, + { + "epoch": 1.22, + "grad_norm": 0.80078125, + "learning_rate": 0.0001756919974261448, + "loss": 0.9155, + "step": 9114 + }, + { + "epoch": 1.22, + "grad_norm": 0.5, + "learning_rate": 0.00017568438680618694, + "loss": 0.3659, + "step": 9115 + }, + { + "epoch": 1.22, + "grad_norm": 0.59765625, + "learning_rate": 0.0001756767751599002, + "loss": 0.6286, + "step": 9116 + }, + { + "epoch": 1.22, + "grad_norm": 0.80078125, + "learning_rate": 0.00017566916248738787, + "loss": 0.5518, + "step": 9117 + }, + { + "epoch": 1.22, + "grad_norm": 0.73046875, + "learning_rate": 0.0001756615487887531, + "loss": 0.6206, + "step": 9118 + }, + { + "epoch": 1.22, + "grad_norm": 0.58984375, + "learning_rate": 0.0001756539340640992, + "loss": 0.602, + "step": 9119 + }, + { + "epoch": 1.22, + "grad_norm": 0.85546875, + "learning_rate": 0.00017564631831352938, + "loss": 0.4158, + "step": 9120 + }, + { + "epoch": 1.22, + "grad_norm": 0.4765625, + "learning_rate": 0.0001756387015371469, + "loss": 0.2942, + "step": 9121 + }, + { + "epoch": 1.22, + "grad_norm": 0.4921875, + "learning_rate": 0.00017563108373505512, + "loss": 0.3473, + "step": 9122 + }, + { + "epoch": 1.22, + "grad_norm": 0.80078125, + "learning_rate": 0.00017562346490735732, + "loss": 0.4787, + "step": 9123 + }, + { + "epoch": 1.22, + "grad_norm": 0.54296875, + "learning_rate": 0.00017561584505415678, + "loss": 0.1834, + "step": 9124 + }, + { + "epoch": 1.22, + "grad_norm": 0.54296875, + "learning_rate": 0.00017560822417555686, + "loss": 0.4335, + "step": 9125 + }, + { + "epoch": 1.22, + "grad_norm": 0.41015625, + "learning_rate": 0.0001756006022716609, + "loss": 0.4448, + "step": 9126 + }, + { + "epoch": 1.22, + "grad_norm": 0.44921875, + "learning_rate": 0.00017559297934257223, + "loss": 0.1979, + "step": 9127 + }, + { + "epoch": 1.22, + "grad_norm": 0.546875, + "learning_rate": 0.00017558535538839424, + "loss": 0.3856, + "step": 9128 + }, + { + "epoch": 1.22, + "grad_norm": 0.51171875, + "learning_rate": 0.00017557773040923036, + "loss": 0.4059, + "step": 9129 + }, + { + "epoch": 1.22, + "grad_norm": 0.5390625, + "learning_rate": 0.0001755701044051839, + "loss": 0.4113, + "step": 9130 + }, + { + "epoch": 1.22, + "grad_norm": 0.80078125, + "learning_rate": 0.00017556247737635837, + "loss": 0.5355, + "step": 9131 + }, + { + "epoch": 1.22, + "grad_norm": 0.578125, + "learning_rate": 0.00017555484932285713, + "loss": 0.2889, + "step": 9132 + }, + { + "epoch": 1.22, + "grad_norm": 0.55078125, + "learning_rate": 0.00017554722024478364, + "loss": 0.2667, + "step": 9133 + }, + { + "epoch": 1.22, + "grad_norm": 0.4609375, + "learning_rate": 0.00017553959014224136, + "loss": 0.2599, + "step": 9134 + }, + { + "epoch": 1.22, + "grad_norm": 0.421875, + "learning_rate": 0.00017553195901533374, + "loss": 0.3174, + "step": 9135 + }, + { + "epoch": 1.22, + "grad_norm": 0.62109375, + "learning_rate": 0.00017552432686416432, + "loss": 0.6218, + "step": 9136 + }, + { + "epoch": 1.22, + "grad_norm": 0.69140625, + "learning_rate": 0.00017551669368883653, + "loss": 0.4908, + "step": 9137 + }, + { + "epoch": 1.22, + "grad_norm": 0.46875, + "learning_rate": 0.0001755090594894539, + "loss": 0.379, + "step": 9138 + }, + { + "epoch": 1.22, + "grad_norm": 0.55078125, + "learning_rate": 0.00017550142426611998, + "loss": 0.4611, + "step": 9139 + }, + { + "epoch": 1.22, + "grad_norm": 0.431640625, + "learning_rate": 0.0001754937880189383, + "loss": 0.2614, + "step": 9140 + }, + { + "epoch": 1.22, + "grad_norm": 0.353515625, + "learning_rate": 0.0001754861507480124, + "loss": 0.1985, + "step": 9141 + }, + { + "epoch": 1.22, + "grad_norm": 0.6015625, + "learning_rate": 0.00017547851245344582, + "loss": 0.3765, + "step": 9142 + }, + { + "epoch": 1.22, + "grad_norm": 0.4765625, + "learning_rate": 0.00017547087313534218, + "loss": 0.5143, + "step": 9143 + }, + { + "epoch": 1.22, + "grad_norm": 0.4140625, + "learning_rate": 0.0001754632327938051, + "loss": 0.3594, + "step": 9144 + }, + { + "epoch": 1.22, + "grad_norm": 0.65234375, + "learning_rate": 0.0001754555914289381, + "loss": 0.3786, + "step": 9145 + }, + { + "epoch": 1.22, + "grad_norm": 0.6171875, + "learning_rate": 0.00017544794904084487, + "loss": 0.6978, + "step": 9146 + }, + { + "epoch": 1.22, + "grad_norm": 0.734375, + "learning_rate": 0.00017544030562962904, + "loss": 0.555, + "step": 9147 + }, + { + "epoch": 1.22, + "grad_norm": 0.365234375, + "learning_rate": 0.00017543266119539422, + "loss": 0.1754, + "step": 9148 + }, + { + "epoch": 1.22, + "grad_norm": 0.63671875, + "learning_rate": 0.00017542501573824412, + "loss": 0.7209, + "step": 9149 + }, + { + "epoch": 1.22, + "grad_norm": 0.7578125, + "learning_rate": 0.00017541736925828243, + "loss": 0.4291, + "step": 9150 + }, + { + "epoch": 1.22, + "grad_norm": 0.7578125, + "learning_rate": 0.00017540972175561276, + "loss": 0.3843, + "step": 9151 + }, + { + "epoch": 1.22, + "grad_norm": 0.447265625, + "learning_rate": 0.00017540207323033888, + "loss": 0.4287, + "step": 9152 + }, + { + "epoch": 1.22, + "grad_norm": 0.48828125, + "learning_rate": 0.0001753944236825645, + "loss": 0.288, + "step": 9153 + }, + { + "epoch": 1.22, + "grad_norm": 0.46484375, + "learning_rate": 0.0001753867731123934, + "loss": 0.4448, + "step": 9154 + }, + { + "epoch": 1.22, + "grad_norm": 0.609375, + "learning_rate": 0.00017537912151992919, + "loss": 0.5951, + "step": 9155 + }, + { + "epoch": 1.22, + "grad_norm": 0.498046875, + "learning_rate": 0.00017537146890527576, + "loss": 0.5063, + "step": 9156 + }, + { + "epoch": 1.22, + "grad_norm": 0.498046875, + "learning_rate": 0.0001753638152685368, + "loss": 0.3107, + "step": 9157 + }, + { + "epoch": 1.22, + "grad_norm": 0.353515625, + "learning_rate": 0.00017535616060981618, + "loss": 0.2198, + "step": 9158 + }, + { + "epoch": 1.22, + "grad_norm": 0.59375, + "learning_rate": 0.00017534850492921763, + "loss": 0.7646, + "step": 9159 + }, + { + "epoch": 1.22, + "grad_norm": 0.4921875, + "learning_rate": 0.000175340848226845, + "loss": 0.2655, + "step": 9160 + }, + { + "epoch": 1.22, + "grad_norm": 0.6953125, + "learning_rate": 0.00017533319050280213, + "loss": 0.4136, + "step": 9161 + }, + { + "epoch": 1.22, + "grad_norm": 0.51171875, + "learning_rate": 0.00017532553175719285, + "loss": 0.2128, + "step": 9162 + }, + { + "epoch": 1.22, + "grad_norm": 0.5703125, + "learning_rate": 0.000175317871990121, + "loss": 0.5547, + "step": 9163 + }, + { + "epoch": 1.22, + "grad_norm": 0.52734375, + "learning_rate": 0.0001753102112016905, + "loss": 0.3976, + "step": 9164 + }, + { + "epoch": 1.22, + "grad_norm": 0.59375, + "learning_rate": 0.00017530254939200518, + "loss": 0.3922, + "step": 9165 + }, + { + "epoch": 1.22, + "grad_norm": 0.5390625, + "learning_rate": 0.00017529488656116893, + "loss": 0.5302, + "step": 9166 + }, + { + "epoch": 1.22, + "grad_norm": 0.50390625, + "learning_rate": 0.0001752872227092857, + "loss": 0.4317, + "step": 9167 + }, + { + "epoch": 1.22, + "grad_norm": 0.431640625, + "learning_rate": 0.00017527955783645945, + "loss": 0.197, + "step": 9168 + }, + { + "epoch": 1.22, + "grad_norm": 0.625, + "learning_rate": 0.00017527189194279405, + "loss": 0.3286, + "step": 9169 + }, + { + "epoch": 1.22, + "grad_norm": 0.515625, + "learning_rate": 0.0001752642250283935, + "loss": 0.2929, + "step": 9170 + }, + { + "epoch": 1.22, + "grad_norm": 0.43359375, + "learning_rate": 0.00017525655709336175, + "loss": 0.3684, + "step": 9171 + }, + { + "epoch": 1.22, + "grad_norm": 0.6171875, + "learning_rate": 0.0001752488881378028, + "loss": 0.693, + "step": 9172 + }, + { + "epoch": 1.22, + "grad_norm": 0.51953125, + "learning_rate": 0.0001752412181618206, + "loss": 0.5623, + "step": 9173 + }, + { + "epoch": 1.22, + "grad_norm": 0.69921875, + "learning_rate": 0.00017523354716551923, + "loss": 0.6258, + "step": 9174 + }, + { + "epoch": 1.22, + "grad_norm": 0.58203125, + "learning_rate": 0.00017522587514900265, + "loss": 0.4092, + "step": 9175 + }, + { + "epoch": 1.22, + "grad_norm": 1.0859375, + "learning_rate": 0.00017521820211237495, + "loss": 0.3581, + "step": 9176 + }, + { + "epoch": 1.22, + "grad_norm": 0.61328125, + "learning_rate": 0.00017521052805574014, + "loss": 0.3542, + "step": 9177 + }, + { + "epoch": 1.22, + "grad_norm": 0.546875, + "learning_rate": 0.00017520285297920228, + "loss": 0.3283, + "step": 9178 + }, + { + "epoch": 1.22, + "grad_norm": 0.515625, + "learning_rate": 0.0001751951768828655, + "loss": 0.4392, + "step": 9179 + }, + { + "epoch": 1.22, + "grad_norm": 0.42578125, + "learning_rate": 0.00017518749976683386, + "loss": 0.2104, + "step": 9180 + }, + { + "epoch": 1.23, + "grad_norm": 0.462890625, + "learning_rate": 0.00017517982163121144, + "loss": 0.2475, + "step": 9181 + }, + { + "epoch": 1.23, + "grad_norm": 0.62890625, + "learning_rate": 0.00017517214247610242, + "loss": 0.3271, + "step": 9182 + }, + { + "epoch": 1.23, + "grad_norm": 0.58984375, + "learning_rate": 0.00017516446230161087, + "loss": 0.5182, + "step": 9183 + }, + { + "epoch": 1.23, + "grad_norm": 0.58984375, + "learning_rate": 0.000175156781107841, + "loss": 0.6725, + "step": 9184 + }, + { + "epoch": 1.23, + "grad_norm": 0.62890625, + "learning_rate": 0.00017514909889489692, + "loss": 0.5072, + "step": 9185 + }, + { + "epoch": 1.23, + "grad_norm": 0.482421875, + "learning_rate": 0.00017514141566288288, + "loss": 0.4044, + "step": 9186 + }, + { + "epoch": 1.23, + "grad_norm": 0.64453125, + "learning_rate": 0.00017513373141190295, + "loss": 0.2687, + "step": 9187 + }, + { + "epoch": 1.23, + "grad_norm": 0.6484375, + "learning_rate": 0.00017512604614206142, + "loss": 0.5513, + "step": 9188 + }, + { + "epoch": 1.23, + "grad_norm": 0.703125, + "learning_rate": 0.00017511835985346253, + "loss": 0.3921, + "step": 9189 + }, + { + "epoch": 1.23, + "grad_norm": 0.5078125, + "learning_rate": 0.00017511067254621042, + "loss": 0.4184, + "step": 9190 + }, + { + "epoch": 1.23, + "grad_norm": 0.373046875, + "learning_rate": 0.00017510298422040941, + "loss": 0.2229, + "step": 9191 + }, + { + "epoch": 1.23, + "grad_norm": 0.93359375, + "learning_rate": 0.00017509529487616372, + "loss": 0.4888, + "step": 9192 + }, + { + "epoch": 1.23, + "grad_norm": 0.58984375, + "learning_rate": 0.00017508760451357764, + "loss": 0.2316, + "step": 9193 + }, + { + "epoch": 1.23, + "grad_norm": 0.66015625, + "learning_rate": 0.00017507991313275544, + "loss": 0.5973, + "step": 9194 + }, + { + "epoch": 1.23, + "grad_norm": 0.484375, + "learning_rate": 0.00017507222073380146, + "loss": 0.2268, + "step": 9195 + }, + { + "epoch": 1.23, + "grad_norm": 0.427734375, + "learning_rate": 0.00017506452731681996, + "loss": 0.2478, + "step": 9196 + }, + { + "epoch": 1.23, + "grad_norm": 0.62109375, + "learning_rate": 0.00017505683288191533, + "loss": 0.58, + "step": 9197 + }, + { + "epoch": 1.23, + "grad_norm": 0.54296875, + "learning_rate": 0.00017504913742919182, + "loss": 0.4386, + "step": 9198 + }, + { + "epoch": 1.23, + "grad_norm": 0.48046875, + "learning_rate": 0.00017504144095875388, + "loss": 0.2832, + "step": 9199 + }, + { + "epoch": 1.23, + "grad_norm": 0.609375, + "learning_rate": 0.00017503374347070585, + "loss": 0.3221, + "step": 9200 + }, + { + "epoch": 1.23, + "grad_norm": 0.6328125, + "learning_rate": 0.00017502604496515207, + "loss": 0.3721, + "step": 9201 + }, + { + "epoch": 1.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00017501834544219697, + "loss": 0.2548, + "step": 9202 + }, + { + "epoch": 1.23, + "grad_norm": 0.35546875, + "learning_rate": 0.000175010644901945, + "loss": 0.1138, + "step": 9203 + }, + { + "epoch": 1.23, + "grad_norm": 0.578125, + "learning_rate": 0.0001750029433445005, + "loss": 0.7618, + "step": 9204 + }, + { + "epoch": 1.23, + "grad_norm": 0.71875, + "learning_rate": 0.00017499524076996796, + "loss": 0.4766, + "step": 9205 + }, + { + "epoch": 1.23, + "grad_norm": 0.41015625, + "learning_rate": 0.00017498753717845182, + "loss": 0.1745, + "step": 9206 + }, + { + "epoch": 1.23, + "grad_norm": 0.478515625, + "learning_rate": 0.00017497983257005656, + "loss": 0.2481, + "step": 9207 + }, + { + "epoch": 1.23, + "grad_norm": 0.50390625, + "learning_rate": 0.00017497212694488664, + "loss": 0.2154, + "step": 9208 + }, + { + "epoch": 1.23, + "grad_norm": 0.62109375, + "learning_rate": 0.00017496442030304657, + "loss": 0.4089, + "step": 9209 + }, + { + "epoch": 1.23, + "grad_norm": 0.435546875, + "learning_rate": 0.00017495671264464085, + "loss": 0.3331, + "step": 9210 + }, + { + "epoch": 1.23, + "grad_norm": 0.462890625, + "learning_rate": 0.000174949003969774, + "loss": 0.4254, + "step": 9211 + }, + { + "epoch": 1.23, + "grad_norm": 0.52734375, + "learning_rate": 0.00017494129427855055, + "loss": 0.5807, + "step": 9212 + }, + { + "epoch": 1.23, + "grad_norm": 0.453125, + "learning_rate": 0.0001749335835710751, + "loss": 0.233, + "step": 9213 + }, + { + "epoch": 1.23, + "grad_norm": 0.5859375, + "learning_rate": 0.00017492587184745207, + "loss": 0.1171, + "step": 9214 + }, + { + "epoch": 1.23, + "grad_norm": 0.3984375, + "learning_rate": 0.0001749181591077862, + "loss": 0.2597, + "step": 9215 + }, + { + "epoch": 1.23, + "grad_norm": 0.546875, + "learning_rate": 0.00017491044535218196, + "loss": 0.6717, + "step": 9216 + }, + { + "epoch": 1.23, + "grad_norm": 0.5703125, + "learning_rate": 0.00017490273058074404, + "loss": 0.4716, + "step": 9217 + }, + { + "epoch": 1.23, + "grad_norm": 0.6875, + "learning_rate": 0.000174895014793577, + "loss": 0.4576, + "step": 9218 + }, + { + "epoch": 1.23, + "grad_norm": 0.40234375, + "learning_rate": 0.00017488729799078554, + "loss": 0.3771, + "step": 9219 + }, + { + "epoch": 1.23, + "grad_norm": 0.87109375, + "learning_rate": 0.0001748795801724742, + "loss": 0.448, + "step": 9220 + }, + { + "epoch": 1.23, + "grad_norm": 0.91796875, + "learning_rate": 0.00017487186133874772, + "loss": 0.6793, + "step": 9221 + }, + { + "epoch": 1.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00017486414148971076, + "loss": 0.4199, + "step": 9222 + }, + { + "epoch": 1.23, + "grad_norm": 0.63671875, + "learning_rate": 0.00017485642062546798, + "loss": 0.3434, + "step": 9223 + }, + { + "epoch": 1.23, + "grad_norm": 0.64453125, + "learning_rate": 0.0001748486987461241, + "loss": 0.4585, + "step": 9224 + }, + { + "epoch": 1.23, + "grad_norm": 0.48828125, + "learning_rate": 0.0001748409758517838, + "loss": 0.2887, + "step": 9225 + }, + { + "epoch": 1.23, + "grad_norm": 0.375, + "learning_rate": 0.00017483325194255186, + "loss": 0.2478, + "step": 9226 + }, + { + "epoch": 1.23, + "grad_norm": 0.5390625, + "learning_rate": 0.00017482552701853302, + "loss": 0.3893, + "step": 9227 + }, + { + "epoch": 1.23, + "grad_norm": 0.7578125, + "learning_rate": 0.00017481780107983196, + "loss": 0.5468, + "step": 9228 + }, + { + "epoch": 1.23, + "grad_norm": 0.5546875, + "learning_rate": 0.00017481007412655352, + "loss": 0.5061, + "step": 9229 + }, + { + "epoch": 1.23, + "grad_norm": 0.486328125, + "learning_rate": 0.00017480234615880247, + "loss": 0.5106, + "step": 9230 + }, + { + "epoch": 1.23, + "grad_norm": 0.5078125, + "learning_rate": 0.0001747946171766836, + "loss": 0.2452, + "step": 9231 + }, + { + "epoch": 1.23, + "grad_norm": 0.5625, + "learning_rate": 0.0001747868871803017, + "loss": 0.5346, + "step": 9232 + }, + { + "epoch": 1.23, + "grad_norm": 0.5859375, + "learning_rate": 0.00017477915616976163, + "loss": 0.3942, + "step": 9233 + }, + { + "epoch": 1.23, + "grad_norm": 0.380859375, + "learning_rate": 0.0001747714241451682, + "loss": 0.2481, + "step": 9234 + }, + { + "epoch": 1.23, + "grad_norm": 0.66796875, + "learning_rate": 0.00017476369110662628, + "loss": 0.4506, + "step": 9235 + }, + { + "epoch": 1.23, + "grad_norm": 0.7109375, + "learning_rate": 0.0001747559570542407, + "loss": 0.7778, + "step": 9236 + }, + { + "epoch": 1.23, + "grad_norm": 0.875, + "learning_rate": 0.0001747482219881164, + "loss": 0.6318, + "step": 9237 + }, + { + "epoch": 1.23, + "grad_norm": 0.53515625, + "learning_rate": 0.00017474048590835822, + "loss": 0.5241, + "step": 9238 + }, + { + "epoch": 1.23, + "grad_norm": 0.46484375, + "learning_rate": 0.0001747327488150711, + "loss": 0.3824, + "step": 9239 + }, + { + "epoch": 1.23, + "grad_norm": 0.5546875, + "learning_rate": 0.0001747250107083599, + "loss": 0.6324, + "step": 9240 + }, + { + "epoch": 1.23, + "grad_norm": 0.51953125, + "learning_rate": 0.0001747172715883296, + "loss": 0.3884, + "step": 9241 + }, + { + "epoch": 1.23, + "grad_norm": 0.59765625, + "learning_rate": 0.0001747095314550852, + "loss": 0.5652, + "step": 9242 + }, + { + "epoch": 1.23, + "grad_norm": 0.55078125, + "learning_rate": 0.00017470179030873158, + "loss": 0.4605, + "step": 9243 + }, + { + "epoch": 1.23, + "grad_norm": 0.7109375, + "learning_rate": 0.00017469404814937373, + "loss": 0.5452, + "step": 9244 + }, + { + "epoch": 1.23, + "grad_norm": 0.5703125, + "learning_rate": 0.00017468630497711665, + "loss": 0.3366, + "step": 9245 + }, + { + "epoch": 1.23, + "grad_norm": 0.59375, + "learning_rate": 0.00017467856079206534, + "loss": 0.5499, + "step": 9246 + }, + { + "epoch": 1.23, + "grad_norm": 0.55078125, + "learning_rate": 0.0001746708155943248, + "loss": 0.5561, + "step": 9247 + }, + { + "epoch": 1.23, + "grad_norm": 0.5703125, + "learning_rate": 0.00017466306938400012, + "loss": 0.3928, + "step": 9248 + }, + { + "epoch": 1.23, + "grad_norm": 0.578125, + "learning_rate": 0.00017465532216119625, + "loss": 0.4093, + "step": 9249 + }, + { + "epoch": 1.23, + "grad_norm": 0.6015625, + "learning_rate": 0.00017464757392601836, + "loss": 0.4401, + "step": 9250 + }, + { + "epoch": 1.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00017463982467857142, + "loss": 0.3351, + "step": 9251 + }, + { + "epoch": 1.23, + "grad_norm": 0.6796875, + "learning_rate": 0.00017463207441896055, + "loss": 0.259, + "step": 9252 + }, + { + "epoch": 1.23, + "grad_norm": 0.470703125, + "learning_rate": 0.00017462432314729087, + "loss": 0.2576, + "step": 9253 + }, + { + "epoch": 1.23, + "grad_norm": 0.5, + "learning_rate": 0.00017461657086366748, + "loss": 0.3497, + "step": 9254 + }, + { + "epoch": 1.23, + "grad_norm": 0.5390625, + "learning_rate": 0.00017460881756819547, + "loss": 0.4854, + "step": 9255 + }, + { + "epoch": 1.24, + "grad_norm": 0.50390625, + "learning_rate": 0.00017460106326098004, + "loss": 0.274, + "step": 9256 + }, + { + "epoch": 1.24, + "grad_norm": 0.51171875, + "learning_rate": 0.00017459330794212627, + "loss": 0.3713, + "step": 9257 + }, + { + "epoch": 1.24, + "grad_norm": 0.6640625, + "learning_rate": 0.00017458555161173938, + "loss": 0.3128, + "step": 9258 + }, + { + "epoch": 1.24, + "grad_norm": 0.59375, + "learning_rate": 0.00017457779426992458, + "loss": 0.4701, + "step": 9259 + }, + { + "epoch": 1.24, + "grad_norm": 0.470703125, + "learning_rate": 0.00017457003591678702, + "loss": 0.424, + "step": 9260 + }, + { + "epoch": 1.24, + "grad_norm": 0.53125, + "learning_rate": 0.0001745622765524319, + "loss": 0.2642, + "step": 9261 + }, + { + "epoch": 1.24, + "grad_norm": 0.55859375, + "learning_rate": 0.00017455451617696448, + "loss": 0.431, + "step": 9262 + }, + { + "epoch": 1.24, + "grad_norm": 0.4296875, + "learning_rate": 0.00017454675479048994, + "loss": 0.282, + "step": 9263 + }, + { + "epoch": 1.24, + "grad_norm": 0.5390625, + "learning_rate": 0.00017453899239311356, + "loss": 0.5259, + "step": 9264 + }, + { + "epoch": 1.24, + "grad_norm": 0.333984375, + "learning_rate": 0.00017453122898494061, + "loss": 0.3519, + "step": 9265 + }, + { + "epoch": 1.24, + "grad_norm": 0.76953125, + "learning_rate": 0.00017452346456607636, + "loss": 0.475, + "step": 9266 + }, + { + "epoch": 1.24, + "grad_norm": 0.494140625, + "learning_rate": 0.00017451569913662612, + "loss": 0.408, + "step": 9267 + }, + { + "epoch": 1.24, + "grad_norm": 0.8828125, + "learning_rate": 0.00017450793269669517, + "loss": 0.4118, + "step": 9268 + }, + { + "epoch": 1.24, + "grad_norm": 0.55078125, + "learning_rate": 0.00017450016524638882, + "loss": 0.2699, + "step": 9269 + }, + { + "epoch": 1.24, + "grad_norm": 0.53515625, + "learning_rate": 0.00017449239678581243, + "loss": 0.4175, + "step": 9270 + }, + { + "epoch": 1.24, + "grad_norm": 0.5546875, + "learning_rate": 0.0001744846273150713, + "loss": 0.3331, + "step": 9271 + }, + { + "epoch": 1.24, + "grad_norm": 0.48828125, + "learning_rate": 0.00017447685683427087, + "loss": 0.2791, + "step": 9272 + }, + { + "epoch": 1.24, + "grad_norm": 0.59375, + "learning_rate": 0.00017446908534351642, + "loss": 0.3782, + "step": 9273 + }, + { + "epoch": 1.24, + "grad_norm": 0.609375, + "learning_rate": 0.0001744613128429134, + "loss": 0.4506, + "step": 9274 + }, + { + "epoch": 1.24, + "grad_norm": 0.453125, + "learning_rate": 0.0001744535393325672, + "loss": 0.3501, + "step": 9275 + }, + { + "epoch": 1.24, + "grad_norm": 0.5546875, + "learning_rate": 0.00017444576481258318, + "loss": 0.4157, + "step": 9276 + }, + { + "epoch": 1.24, + "grad_norm": 0.4140625, + "learning_rate": 0.00017443798928306684, + "loss": 0.2634, + "step": 9277 + }, + { + "epoch": 1.24, + "grad_norm": 0.64453125, + "learning_rate": 0.0001744302127441236, + "loss": 0.5421, + "step": 9278 + }, + { + "epoch": 1.24, + "grad_norm": 0.5546875, + "learning_rate": 0.0001744224351958589, + "loss": 0.508, + "step": 9279 + }, + { + "epoch": 1.24, + "grad_norm": 0.443359375, + "learning_rate": 0.00017441465663837818, + "loss": 0.4355, + "step": 9280 + }, + { + "epoch": 1.24, + "grad_norm": 0.470703125, + "learning_rate": 0.000174406877071787, + "loss": 0.4899, + "step": 9281 + }, + { + "epoch": 1.24, + "grad_norm": 0.80078125, + "learning_rate": 0.0001743990964961908, + "loss": 0.345, + "step": 9282 + }, + { + "epoch": 1.24, + "grad_norm": 0.703125, + "learning_rate": 0.0001743913149116951, + "loss": 0.4092, + "step": 9283 + }, + { + "epoch": 1.24, + "grad_norm": 0.5546875, + "learning_rate": 0.00017438353231840543, + "loss": 0.6215, + "step": 9284 + }, + { + "epoch": 1.24, + "grad_norm": 0.58984375, + "learning_rate": 0.00017437574871642731, + "loss": 0.5469, + "step": 9285 + }, + { + "epoch": 1.24, + "grad_norm": 0.6328125, + "learning_rate": 0.00017436796410586633, + "loss": 0.326, + "step": 9286 + }, + { + "epoch": 1.24, + "grad_norm": 0.50390625, + "learning_rate": 0.00017436017848682802, + "loss": 0.2548, + "step": 9287 + }, + { + "epoch": 1.24, + "grad_norm": 0.5, + "learning_rate": 0.00017435239185941795, + "loss": 0.2253, + "step": 9288 + }, + { + "epoch": 1.24, + "grad_norm": 0.53125, + "learning_rate": 0.00017434460422374173, + "loss": 0.2775, + "step": 9289 + }, + { + "epoch": 1.24, + "grad_norm": 0.703125, + "learning_rate": 0.00017433681557990497, + "loss": 0.5908, + "step": 9290 + }, + { + "epoch": 1.24, + "grad_norm": 0.6015625, + "learning_rate": 0.00017432902592801327, + "loss": 0.3442, + "step": 9291 + }, + { + "epoch": 1.24, + "grad_norm": 0.6171875, + "learning_rate": 0.00017432123526817228, + "loss": 0.2311, + "step": 9292 + }, + { + "epoch": 1.24, + "grad_norm": 0.447265625, + "learning_rate": 0.00017431344360048764, + "loss": 0.3772, + "step": 9293 + }, + { + "epoch": 1.24, + "grad_norm": 0.45703125, + "learning_rate": 0.00017430565092506502, + "loss": 0.549, + "step": 9294 + }, + { + "epoch": 1.24, + "grad_norm": 0.474609375, + "learning_rate": 0.00017429785724201009, + "loss": 0.2626, + "step": 9295 + }, + { + "epoch": 1.24, + "grad_norm": 0.5703125, + "learning_rate": 0.00017429006255142851, + "loss": 0.2171, + "step": 9296 + }, + { + "epoch": 1.24, + "grad_norm": 0.46484375, + "learning_rate": 0.00017428226685342602, + "loss": 0.5262, + "step": 9297 + }, + { + "epoch": 1.24, + "grad_norm": 0.60546875, + "learning_rate": 0.0001742744701481083, + "loss": 0.6648, + "step": 9298 + }, + { + "epoch": 1.24, + "grad_norm": 0.65234375, + "learning_rate": 0.00017426667243558114, + "loss": 0.2519, + "step": 9299 + }, + { + "epoch": 1.24, + "grad_norm": 0.451171875, + "learning_rate": 0.00017425887371595015, + "loss": 0.3508, + "step": 9300 + }, + { + "epoch": 1.24, + "grad_norm": 0.48828125, + "learning_rate": 0.00017425107398932126, + "loss": 0.4192, + "step": 9301 + }, + { + "epoch": 1.24, + "grad_norm": 0.6015625, + "learning_rate": 0.0001742432732558001, + "loss": 0.5012, + "step": 9302 + }, + { + "epoch": 1.24, + "grad_norm": 0.5390625, + "learning_rate": 0.00017423547151549255, + "loss": 0.6578, + "step": 9303 + }, + { + "epoch": 1.24, + "grad_norm": 0.5859375, + "learning_rate": 0.0001742276687685043, + "loss": 0.4088, + "step": 9304 + }, + { + "epoch": 1.24, + "grad_norm": 0.369140625, + "learning_rate": 0.00017421986501494127, + "loss": 0.3162, + "step": 9305 + }, + { + "epoch": 1.24, + "grad_norm": 0.51953125, + "learning_rate": 0.00017421206025490925, + "loss": 0.3592, + "step": 9306 + }, + { + "epoch": 1.24, + "grad_norm": 0.52734375, + "learning_rate": 0.00017420425448851402, + "loss": 0.5217, + "step": 9307 + }, + { + "epoch": 1.24, + "grad_norm": 0.5234375, + "learning_rate": 0.00017419644771586153, + "loss": 0.4026, + "step": 9308 + }, + { + "epoch": 1.24, + "grad_norm": 0.58203125, + "learning_rate": 0.00017418863993705753, + "loss": 0.4666, + "step": 9309 + }, + { + "epoch": 1.24, + "grad_norm": 0.5078125, + "learning_rate": 0.000174180831152208, + "loss": 0.3708, + "step": 9310 + }, + { + "epoch": 1.24, + "grad_norm": 0.486328125, + "learning_rate": 0.00017417302136141878, + "loss": 0.3094, + "step": 9311 + }, + { + "epoch": 1.24, + "grad_norm": 0.55078125, + "learning_rate": 0.00017416521056479577, + "loss": 0.2278, + "step": 9312 + }, + { + "epoch": 1.24, + "grad_norm": 0.52734375, + "learning_rate": 0.00017415739876244492, + "loss": 0.3298, + "step": 9313 + }, + { + "epoch": 1.24, + "grad_norm": 0.52734375, + "learning_rate": 0.00017414958595447213, + "loss": 0.5572, + "step": 9314 + }, + { + "epoch": 1.24, + "grad_norm": 0.453125, + "learning_rate": 0.0001741417721409834, + "loss": 0.3568, + "step": 9315 + }, + { + "epoch": 1.24, + "grad_norm": 0.50390625, + "learning_rate": 0.00017413395732208462, + "loss": 0.2557, + "step": 9316 + }, + { + "epoch": 1.24, + "grad_norm": 0.703125, + "learning_rate": 0.00017412614149788183, + "loss": 0.7755, + "step": 9317 + }, + { + "epoch": 1.24, + "grad_norm": 0.7421875, + "learning_rate": 0.00017411832466848097, + "loss": 0.3784, + "step": 9318 + }, + { + "epoch": 1.24, + "grad_norm": 0.5390625, + "learning_rate": 0.00017411050683398805, + "loss": 0.3597, + "step": 9319 + }, + { + "epoch": 1.24, + "grad_norm": 0.44921875, + "learning_rate": 0.00017410268799450914, + "loss": 0.288, + "step": 9320 + }, + { + "epoch": 1.24, + "grad_norm": 0.400390625, + "learning_rate": 0.00017409486815015017, + "loss": 0.3215, + "step": 9321 + }, + { + "epoch": 1.24, + "grad_norm": 0.53125, + "learning_rate": 0.00017408704730101726, + "loss": 0.3661, + "step": 9322 + }, + { + "epoch": 1.24, + "grad_norm": 0.376953125, + "learning_rate": 0.00017407922544721642, + "loss": 0.3255, + "step": 9323 + }, + { + "epoch": 1.24, + "grad_norm": 0.671875, + "learning_rate": 0.00017407140258885378, + "loss": 0.1886, + "step": 9324 + }, + { + "epoch": 1.24, + "grad_norm": 0.5625, + "learning_rate": 0.00017406357872603532, + "loss": 0.3454, + "step": 9325 + }, + { + "epoch": 1.24, + "grad_norm": 0.55859375, + "learning_rate": 0.00017405575385886723, + "loss": 0.3936, + "step": 9326 + }, + { + "epoch": 1.24, + "grad_norm": 0.5625, + "learning_rate": 0.0001740479279874556, + "loss": 0.3163, + "step": 9327 + }, + { + "epoch": 1.24, + "grad_norm": 0.5859375, + "learning_rate": 0.00017404010111190653, + "loss": 0.4218, + "step": 9328 + }, + { + "epoch": 1.24, + "grad_norm": 0.625, + "learning_rate": 0.00017403227323232619, + "loss": 0.1994, + "step": 9329 + }, + { + "epoch": 1.24, + "grad_norm": 0.6484375, + "learning_rate": 0.00017402444434882068, + "loss": 0.9015, + "step": 9330 + }, + { + "epoch": 1.25, + "grad_norm": 0.546875, + "learning_rate": 0.0001740166144614962, + "loss": 0.4533, + "step": 9331 + }, + { + "epoch": 1.25, + "grad_norm": 0.64453125, + "learning_rate": 0.00017400878357045894, + "loss": 0.3161, + "step": 9332 + }, + { + "epoch": 1.25, + "grad_norm": 0.53125, + "learning_rate": 0.00017400095167581509, + "loss": 0.2313, + "step": 9333 + }, + { + "epoch": 1.25, + "grad_norm": 0.46484375, + "learning_rate": 0.00017399311877767083, + "loss": 0.3318, + "step": 9334 + }, + { + "epoch": 1.25, + "grad_norm": 0.6171875, + "learning_rate": 0.0001739852848761324, + "loss": 0.3806, + "step": 9335 + }, + { + "epoch": 1.25, + "grad_norm": 0.7265625, + "learning_rate": 0.000173977449971306, + "loss": 0.452, + "step": 9336 + }, + { + "epoch": 1.25, + "grad_norm": 0.72265625, + "learning_rate": 0.0001739696140632979, + "loss": 0.3326, + "step": 9337 + }, + { + "epoch": 1.25, + "grad_norm": 0.578125, + "learning_rate": 0.0001739617771522144, + "loss": 0.5787, + "step": 9338 + }, + { + "epoch": 1.25, + "grad_norm": 0.50390625, + "learning_rate": 0.00017395393923816169, + "loss": 0.3278, + "step": 9339 + }, + { + "epoch": 1.25, + "grad_norm": 0.484375, + "learning_rate": 0.0001739461003212461, + "loss": 0.5217, + "step": 9340 + }, + { + "epoch": 1.25, + "grad_norm": 0.4375, + "learning_rate": 0.00017393826040157398, + "loss": 0.5539, + "step": 9341 + }, + { + "epoch": 1.25, + "grad_norm": 0.466796875, + "learning_rate": 0.00017393041947925153, + "loss": 0.2042, + "step": 9342 + }, + { + "epoch": 1.25, + "grad_norm": 0.5546875, + "learning_rate": 0.00017392257755438517, + "loss": 0.2745, + "step": 9343 + }, + { + "epoch": 1.25, + "grad_norm": 0.458984375, + "learning_rate": 0.00017391473462708123, + "loss": 0.3805, + "step": 9344 + }, + { + "epoch": 1.25, + "grad_norm": 0.6484375, + "learning_rate": 0.00017390689069744607, + "loss": 0.6473, + "step": 9345 + }, + { + "epoch": 1.25, + "grad_norm": 0.57421875, + "learning_rate": 0.00017389904576558597, + "loss": 0.4875, + "step": 9346 + }, + { + "epoch": 1.25, + "grad_norm": 0.486328125, + "learning_rate": 0.00017389119983160742, + "loss": 0.2764, + "step": 9347 + }, + { + "epoch": 1.25, + "grad_norm": 0.58203125, + "learning_rate": 0.00017388335289561678, + "loss": 0.5538, + "step": 9348 + }, + { + "epoch": 1.25, + "grad_norm": 0.482421875, + "learning_rate": 0.00017387550495772045, + "loss": 0.3192, + "step": 9349 + }, + { + "epoch": 1.25, + "grad_norm": 0.6640625, + "learning_rate": 0.0001738676560180249, + "loss": 0.3178, + "step": 9350 + }, + { + "epoch": 1.25, + "grad_norm": 0.470703125, + "learning_rate": 0.00017385980607663646, + "loss": 0.3612, + "step": 9351 + }, + { + "epoch": 1.25, + "grad_norm": 0.75, + "learning_rate": 0.00017385195513366168, + "loss": 0.4294, + "step": 9352 + }, + { + "epoch": 1.25, + "grad_norm": 0.53515625, + "learning_rate": 0.00017384410318920697, + "loss": 0.448, + "step": 9353 + }, + { + "epoch": 1.25, + "grad_norm": 0.59375, + "learning_rate": 0.00017383625024337886, + "loss": 0.5835, + "step": 9354 + }, + { + "epoch": 1.25, + "grad_norm": 0.486328125, + "learning_rate": 0.0001738283962962838, + "loss": 0.4994, + "step": 9355 + }, + { + "epoch": 1.25, + "grad_norm": 0.44921875, + "learning_rate": 0.0001738205413480283, + "loss": 0.4279, + "step": 9356 + }, + { + "epoch": 1.25, + "grad_norm": 0.55078125, + "learning_rate": 0.0001738126853987189, + "loss": 0.5122, + "step": 9357 + }, + { + "epoch": 1.25, + "grad_norm": 0.50390625, + "learning_rate": 0.00017380482844846207, + "loss": 0.4471, + "step": 9358 + }, + { + "epoch": 1.25, + "grad_norm": 0.484375, + "learning_rate": 0.00017379697049736443, + "loss": 0.3474, + "step": 9359 + }, + { + "epoch": 1.25, + "grad_norm": 0.5390625, + "learning_rate": 0.00017378911154553248, + "loss": 0.3444, + "step": 9360 + }, + { + "epoch": 1.25, + "grad_norm": 0.6640625, + "learning_rate": 0.00017378125159307287, + "loss": 0.5871, + "step": 9361 + }, + { + "epoch": 1.25, + "grad_norm": 0.65234375, + "learning_rate": 0.00017377339064009208, + "loss": 0.4861, + "step": 9362 + }, + { + "epoch": 1.25, + "grad_norm": 0.51171875, + "learning_rate": 0.0001737655286866968, + "loss": 0.4455, + "step": 9363 + }, + { + "epoch": 1.25, + "grad_norm": 0.4296875, + "learning_rate": 0.0001737576657329936, + "loss": 0.3222, + "step": 9364 + }, + { + "epoch": 1.25, + "grad_norm": 0.447265625, + "learning_rate": 0.00017374980177908912, + "loss": 0.4115, + "step": 9365 + }, + { + "epoch": 1.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00017374193682508996, + "loss": 0.3441, + "step": 9366 + }, + { + "epoch": 1.25, + "grad_norm": 0.65625, + "learning_rate": 0.00017373407087110286, + "loss": 0.5886, + "step": 9367 + }, + { + "epoch": 1.25, + "grad_norm": 0.64453125, + "learning_rate": 0.0001737262039172344, + "loss": 0.5311, + "step": 9368 + }, + { + "epoch": 1.25, + "grad_norm": 0.6484375, + "learning_rate": 0.00017371833596359134, + "loss": 0.5787, + "step": 9369 + }, + { + "epoch": 1.25, + "grad_norm": 0.60546875, + "learning_rate": 0.00017371046701028032, + "loss": 0.3317, + "step": 9370 + }, + { + "epoch": 1.25, + "grad_norm": 0.484375, + "learning_rate": 0.00017370259705740804, + "loss": 0.1541, + "step": 9371 + }, + { + "epoch": 1.25, + "grad_norm": 0.703125, + "learning_rate": 0.00017369472610508123, + "loss": 0.1487, + "step": 9372 + }, + { + "epoch": 1.25, + "grad_norm": 0.59375, + "learning_rate": 0.00017368685415340663, + "loss": 0.6452, + "step": 9373 + }, + { + "epoch": 1.25, + "grad_norm": 0.412109375, + "learning_rate": 0.00017367898120249103, + "loss": 0.3892, + "step": 9374 + }, + { + "epoch": 1.25, + "grad_norm": 0.466796875, + "learning_rate": 0.00017367110725244117, + "loss": 0.4207, + "step": 9375 + }, + { + "epoch": 1.25, + "grad_norm": 0.5234375, + "learning_rate": 0.00017366323230336378, + "loss": 0.6198, + "step": 9376 + }, + { + "epoch": 1.25, + "grad_norm": 0.6015625, + "learning_rate": 0.00017365535635536572, + "loss": 0.2661, + "step": 9377 + }, + { + "epoch": 1.25, + "grad_norm": 0.4921875, + "learning_rate": 0.00017364747940855373, + "loss": 0.2554, + "step": 9378 + }, + { + "epoch": 1.25, + "grad_norm": 0.51953125, + "learning_rate": 0.00017363960146303463, + "loss": 0.4342, + "step": 9379 + }, + { + "epoch": 1.25, + "grad_norm": 0.5625, + "learning_rate": 0.0001736317225189153, + "loss": 0.2321, + "step": 9380 + }, + { + "epoch": 1.25, + "grad_norm": 0.5625, + "learning_rate": 0.00017362384257630257, + "loss": 0.8043, + "step": 9381 + }, + { + "epoch": 1.25, + "grad_norm": 0.3828125, + "learning_rate": 0.00017361596163530324, + "loss": 0.3813, + "step": 9382 + }, + { + "epoch": 1.25, + "grad_norm": 0.5234375, + "learning_rate": 0.00017360807969602426, + "loss": 0.316, + "step": 9383 + }, + { + "epoch": 1.25, + "grad_norm": 0.7890625, + "learning_rate": 0.00017360019675857248, + "loss": 0.4946, + "step": 9384 + }, + { + "epoch": 1.25, + "grad_norm": 0.72265625, + "learning_rate": 0.00017359231282305476, + "loss": 0.4724, + "step": 9385 + }, + { + "epoch": 1.25, + "grad_norm": 0.59765625, + "learning_rate": 0.00017358442788957808, + "loss": 0.3609, + "step": 9386 + }, + { + "epoch": 1.25, + "grad_norm": 0.5703125, + "learning_rate": 0.00017357654195824935, + "loss": 0.6151, + "step": 9387 + }, + { + "epoch": 1.25, + "grad_norm": 0.494140625, + "learning_rate": 0.00017356865502917546, + "loss": 0.4714, + "step": 9388 + }, + { + "epoch": 1.25, + "grad_norm": 0.57421875, + "learning_rate": 0.00017356076710246338, + "loss": 0.3504, + "step": 9389 + }, + { + "epoch": 1.25, + "grad_norm": 0.69921875, + "learning_rate": 0.00017355287817822013, + "loss": 0.6214, + "step": 9390 + }, + { + "epoch": 1.25, + "grad_norm": 0.4375, + "learning_rate": 0.0001735449882565526, + "loss": 0.245, + "step": 9391 + }, + { + "epoch": 1.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00017353709733756783, + "loss": 0.5736, + "step": 9392 + }, + { + "epoch": 1.25, + "grad_norm": 0.5, + "learning_rate": 0.00017352920542137285, + "loss": 0.4079, + "step": 9393 + }, + { + "epoch": 1.25, + "grad_norm": 0.447265625, + "learning_rate": 0.00017352131250807467, + "loss": 0.3428, + "step": 9394 + }, + { + "epoch": 1.25, + "grad_norm": 0.47265625, + "learning_rate": 0.00017351341859778025, + "loss": 0.2863, + "step": 9395 + }, + { + "epoch": 1.25, + "grad_norm": 0.55859375, + "learning_rate": 0.00017350552369059673, + "loss": 0.6137, + "step": 9396 + }, + { + "epoch": 1.25, + "grad_norm": 0.6953125, + "learning_rate": 0.00017349762778663113, + "loss": 0.3826, + "step": 9397 + }, + { + "epoch": 1.25, + "grad_norm": 0.61328125, + "learning_rate": 0.00017348973088599054, + "loss": 0.2684, + "step": 9398 + }, + { + "epoch": 1.25, + "grad_norm": 0.419921875, + "learning_rate": 0.000173481832988782, + "loss": 0.1725, + "step": 9399 + }, + { + "epoch": 1.25, + "grad_norm": 0.62109375, + "learning_rate": 0.00017347393409511267, + "loss": 0.6734, + "step": 9400 + }, + { + "epoch": 1.25, + "grad_norm": 0.341796875, + "learning_rate": 0.00017346603420508963, + "loss": 0.2253, + "step": 9401 + }, + { + "epoch": 1.25, + "grad_norm": 0.58984375, + "learning_rate": 0.00017345813331882002, + "loss": 0.4977, + "step": 9402 + }, + { + "epoch": 1.25, + "grad_norm": 0.578125, + "learning_rate": 0.00017345023143641096, + "loss": 0.3338, + "step": 9403 + }, + { + "epoch": 1.25, + "grad_norm": 0.69140625, + "learning_rate": 0.00017344232855796965, + "loss": 0.2139, + "step": 9404 + }, + { + "epoch": 1.26, + "grad_norm": 0.435546875, + "learning_rate": 0.0001734344246836032, + "loss": 0.1688, + "step": 9405 + }, + { + "epoch": 1.26, + "grad_norm": 0.5234375, + "learning_rate": 0.00017342651981341883, + "loss": 0.5909, + "step": 9406 + }, + { + "epoch": 1.26, + "grad_norm": 0.54296875, + "learning_rate": 0.00017341861394752373, + "loss": 0.3429, + "step": 9407 + }, + { + "epoch": 1.26, + "grad_norm": 0.431640625, + "learning_rate": 0.00017341070708602513, + "loss": 0.2289, + "step": 9408 + }, + { + "epoch": 1.26, + "grad_norm": 0.486328125, + "learning_rate": 0.0001734027992290302, + "loss": 0.4592, + "step": 9409 + }, + { + "epoch": 1.26, + "grad_norm": 0.43359375, + "learning_rate": 0.0001733948903766462, + "loss": 0.2085, + "step": 9410 + }, + { + "epoch": 1.26, + "grad_norm": 0.484375, + "learning_rate": 0.0001733869805289804, + "loss": 0.3276, + "step": 9411 + }, + { + "epoch": 1.26, + "grad_norm": 0.69140625, + "learning_rate": 0.00017337906968614, + "loss": 0.5456, + "step": 9412 + }, + { + "epoch": 1.26, + "grad_norm": 0.8359375, + "learning_rate": 0.0001733711578482324, + "loss": 0.4471, + "step": 9413 + }, + { + "epoch": 1.26, + "grad_norm": 0.515625, + "learning_rate": 0.00017336324501536474, + "loss": 0.3186, + "step": 9414 + }, + { + "epoch": 1.26, + "grad_norm": 0.74609375, + "learning_rate": 0.00017335533118764443, + "loss": 0.4531, + "step": 9415 + }, + { + "epoch": 1.26, + "grad_norm": 0.515625, + "learning_rate": 0.00017334741636517874, + "loss": 0.5495, + "step": 9416 + }, + { + "epoch": 1.26, + "grad_norm": 0.478515625, + "learning_rate": 0.00017333950054807502, + "loss": 0.4804, + "step": 9417 + }, + { + "epoch": 1.26, + "grad_norm": 0.5, + "learning_rate": 0.00017333158373644057, + "loss": 0.2743, + "step": 9418 + }, + { + "epoch": 1.26, + "grad_norm": 0.53125, + "learning_rate": 0.00017332366593038283, + "loss": 0.3002, + "step": 9419 + }, + { + "epoch": 1.26, + "grad_norm": 0.5078125, + "learning_rate": 0.0001733157471300091, + "loss": 0.3286, + "step": 9420 + }, + { + "epoch": 1.26, + "grad_norm": 0.6484375, + "learning_rate": 0.00017330782733542677, + "loss": 0.5053, + "step": 9421 + }, + { + "epoch": 1.26, + "grad_norm": 1.6796875, + "learning_rate": 0.00017329990654674324, + "loss": 0.3842, + "step": 9422 + }, + { + "epoch": 1.26, + "grad_norm": 0.6796875, + "learning_rate": 0.00017329198476406598, + "loss": 0.3919, + "step": 9423 + }, + { + "epoch": 1.26, + "grad_norm": 0.80859375, + "learning_rate": 0.00017328406198750232, + "loss": 0.4639, + "step": 9424 + }, + { + "epoch": 1.26, + "grad_norm": 0.68359375, + "learning_rate": 0.0001732761382171598, + "loss": 0.5638, + "step": 9425 + }, + { + "epoch": 1.26, + "grad_norm": 0.62890625, + "learning_rate": 0.00017326821345314578, + "loss": 0.6202, + "step": 9426 + }, + { + "epoch": 1.26, + "grad_norm": 0.85546875, + "learning_rate": 0.00017326028769556776, + "loss": 0.3486, + "step": 9427 + }, + { + "epoch": 1.26, + "grad_norm": 0.53125, + "learning_rate": 0.00017325236094453323, + "loss": 0.4153, + "step": 9428 + }, + { + "epoch": 1.26, + "grad_norm": 0.458984375, + "learning_rate": 0.00017324443320014966, + "loss": 0.4073, + "step": 9429 + }, + { + "epoch": 1.26, + "grad_norm": 0.4140625, + "learning_rate": 0.0001732365044625246, + "loss": 0.5158, + "step": 9430 + }, + { + "epoch": 1.26, + "grad_norm": 0.330078125, + "learning_rate": 0.0001732285747317655, + "loss": 0.23, + "step": 9431 + }, + { + "epoch": 1.26, + "grad_norm": 0.6953125, + "learning_rate": 0.00017322064400797997, + "loss": 0.4426, + "step": 9432 + }, + { + "epoch": 1.26, + "grad_norm": 0.498046875, + "learning_rate": 0.0001732127122912755, + "loss": 0.3924, + "step": 9433 + }, + { + "epoch": 1.26, + "grad_norm": 0.5859375, + "learning_rate": 0.00017320477958175965, + "loss": 0.2215, + "step": 9434 + }, + { + "epoch": 1.26, + "grad_norm": 0.51171875, + "learning_rate": 0.00017319684587954002, + "loss": 0.3242, + "step": 9435 + }, + { + "epoch": 1.26, + "grad_norm": 0.390625, + "learning_rate": 0.00017318891118472414, + "loss": 0.2038, + "step": 9436 + }, + { + "epoch": 1.26, + "grad_norm": 0.51171875, + "learning_rate": 0.0001731809754974197, + "loss": 0.4442, + "step": 9437 + }, + { + "epoch": 1.26, + "grad_norm": 0.6171875, + "learning_rate": 0.00017317303881773425, + "loss": 0.4067, + "step": 9438 + }, + { + "epoch": 1.26, + "grad_norm": 0.53515625, + "learning_rate": 0.00017316510114577543, + "loss": 0.5897, + "step": 9439 + }, + { + "epoch": 1.26, + "grad_norm": 0.48046875, + "learning_rate": 0.0001731571624816509, + "loss": 0.3536, + "step": 9440 + }, + { + "epoch": 1.26, + "grad_norm": 0.54296875, + "learning_rate": 0.00017314922282546827, + "loss": 0.253, + "step": 9441 + }, + { + "epoch": 1.26, + "grad_norm": 0.50390625, + "learning_rate": 0.00017314128217733524, + "loss": 0.3514, + "step": 9442 + }, + { + "epoch": 1.26, + "grad_norm": 0.515625, + "learning_rate": 0.00017313334053735947, + "loss": 0.5441, + "step": 9443 + }, + { + "epoch": 1.26, + "grad_norm": 0.53515625, + "learning_rate": 0.0001731253979056487, + "loss": 0.3787, + "step": 9444 + }, + { + "epoch": 1.26, + "grad_norm": 0.4609375, + "learning_rate": 0.00017311745428231056, + "loss": 0.4204, + "step": 9445 + }, + { + "epoch": 1.26, + "grad_norm": 0.58984375, + "learning_rate": 0.00017310950966745284, + "loss": 0.3581, + "step": 9446 + }, + { + "epoch": 1.26, + "grad_norm": 0.3984375, + "learning_rate": 0.00017310156406118323, + "loss": 0.3817, + "step": 9447 + }, + { + "epoch": 1.26, + "grad_norm": 0.52734375, + "learning_rate": 0.00017309361746360952, + "loss": 0.2903, + "step": 9448 + }, + { + "epoch": 1.26, + "grad_norm": 0.39453125, + "learning_rate": 0.00017308566987483942, + "loss": 0.2093, + "step": 9449 + }, + { + "epoch": 1.26, + "grad_norm": 0.59765625, + "learning_rate": 0.00017307772129498077, + "loss": 0.6916, + "step": 9450 + }, + { + "epoch": 1.26, + "grad_norm": 0.71484375, + "learning_rate": 0.00017306977172414128, + "loss": 0.4985, + "step": 9451 + }, + { + "epoch": 1.26, + "grad_norm": 0.5625, + "learning_rate": 0.0001730618211624288, + "loss": 0.5396, + "step": 9452 + }, + { + "epoch": 1.26, + "grad_norm": 0.546875, + "learning_rate": 0.00017305386960995113, + "loss": 0.4223, + "step": 9453 + }, + { + "epoch": 1.26, + "grad_norm": 0.53515625, + "learning_rate": 0.00017304591706681613, + "loss": 0.5529, + "step": 9454 + }, + { + "epoch": 1.26, + "grad_norm": 0.6171875, + "learning_rate": 0.00017303796353313158, + "loss": 0.615, + "step": 9455 + }, + { + "epoch": 1.26, + "grad_norm": 0.953125, + "learning_rate": 0.0001730300090090054, + "loss": 0.516, + "step": 9456 + }, + { + "epoch": 1.26, + "grad_norm": 0.54296875, + "learning_rate": 0.00017302205349454538, + "loss": 0.5128, + "step": 9457 + }, + { + "epoch": 1.26, + "grad_norm": 0.431640625, + "learning_rate": 0.00017301409698985948, + "loss": 0.2968, + "step": 9458 + }, + { + "epoch": 1.26, + "grad_norm": 0.47265625, + "learning_rate": 0.00017300613949505555, + "loss": 0.5143, + "step": 9459 + }, + { + "epoch": 1.26, + "grad_norm": 0.59375, + "learning_rate": 0.00017299818101024154, + "loss": 0.3805, + "step": 9460 + }, + { + "epoch": 1.26, + "grad_norm": 0.5, + "learning_rate": 0.00017299022153552534, + "loss": 0.3096, + "step": 9461 + }, + { + "epoch": 1.26, + "grad_norm": 0.392578125, + "learning_rate": 0.00017298226107101487, + "loss": 0.3043, + "step": 9462 + }, + { + "epoch": 1.26, + "grad_norm": 0.45703125, + "learning_rate": 0.00017297429961681812, + "loss": 0.4677, + "step": 9463 + }, + { + "epoch": 1.26, + "grad_norm": 0.5625, + "learning_rate": 0.000172966337173043, + "loss": 0.5456, + "step": 9464 + }, + { + "epoch": 1.26, + "grad_norm": 0.54296875, + "learning_rate": 0.00017295837373979754, + "loss": 0.5545, + "step": 9465 + }, + { + "epoch": 1.26, + "grad_norm": 0.59375, + "learning_rate": 0.00017295040931718974, + "loss": 0.4518, + "step": 9466 + }, + { + "epoch": 1.26, + "grad_norm": 0.318359375, + "learning_rate": 0.00017294244390532755, + "loss": 0.1971, + "step": 9467 + }, + { + "epoch": 1.26, + "grad_norm": 0.5859375, + "learning_rate": 0.00017293447750431898, + "loss": 0.3374, + "step": 9468 + }, + { + "epoch": 1.26, + "grad_norm": 0.58984375, + "learning_rate": 0.00017292651011427212, + "loss": 0.2795, + "step": 9469 + }, + { + "epoch": 1.26, + "grad_norm": 0.45703125, + "learning_rate": 0.00017291854173529493, + "loss": 0.3069, + "step": 9470 + }, + { + "epoch": 1.26, + "grad_norm": 0.6171875, + "learning_rate": 0.00017291057236749557, + "loss": 0.5264, + "step": 9471 + }, + { + "epoch": 1.26, + "grad_norm": 0.515625, + "learning_rate": 0.00017290260201098203, + "loss": 0.3006, + "step": 9472 + }, + { + "epoch": 1.26, + "grad_norm": 0.83203125, + "learning_rate": 0.00017289463066586243, + "loss": 0.3551, + "step": 9473 + }, + { + "epoch": 1.26, + "grad_norm": 0.828125, + "learning_rate": 0.00017288665833224486, + "loss": 0.4277, + "step": 9474 + }, + { + "epoch": 1.26, + "grad_norm": 0.60546875, + "learning_rate": 0.0001728786850102374, + "loss": 0.3405, + "step": 9475 + }, + { + "epoch": 1.26, + "grad_norm": 0.390625, + "learning_rate": 0.0001728707106999482, + "loss": 0.2313, + "step": 9476 + }, + { + "epoch": 1.26, + "grad_norm": 0.478515625, + "learning_rate": 0.00017286273540148542, + "loss": 0.4731, + "step": 9477 + }, + { + "epoch": 1.26, + "grad_norm": 0.56640625, + "learning_rate": 0.00017285475911495716, + "loss": 0.3649, + "step": 9478 + }, + { + "epoch": 1.26, + "grad_norm": 0.66015625, + "learning_rate": 0.00017284678184047161, + "loss": 0.4088, + "step": 9479 + }, + { + "epoch": 1.27, + "grad_norm": 0.431640625, + "learning_rate": 0.00017283880357813698, + "loss": 0.3152, + "step": 9480 + }, + { + "epoch": 1.27, + "grad_norm": 0.5703125, + "learning_rate": 0.00017283082432806136, + "loss": 0.9076, + "step": 9481 + }, + { + "epoch": 1.27, + "grad_norm": 0.5390625, + "learning_rate": 0.00017282284409035308, + "loss": 0.2887, + "step": 9482 + }, + { + "epoch": 1.27, + "grad_norm": 0.59765625, + "learning_rate": 0.00017281486286512029, + "loss": 0.5069, + "step": 9483 + }, + { + "epoch": 1.27, + "grad_norm": 1.109375, + "learning_rate": 0.00017280688065247118, + "loss": 0.6918, + "step": 9484 + }, + { + "epoch": 1.27, + "grad_norm": 0.478515625, + "learning_rate": 0.0001727988974525141, + "loss": 0.2657, + "step": 9485 + }, + { + "epoch": 1.27, + "grad_norm": 0.63671875, + "learning_rate": 0.00017279091326535722, + "loss": 0.4262, + "step": 9486 + }, + { + "epoch": 1.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00017278292809110883, + "loss": 0.3072, + "step": 9487 + }, + { + "epoch": 1.27, + "grad_norm": 0.5078125, + "learning_rate": 0.00017277494192987725, + "loss": 0.295, + "step": 9488 + }, + { + "epoch": 1.27, + "grad_norm": 0.5703125, + "learning_rate": 0.00017276695478177071, + "loss": 0.2974, + "step": 9489 + }, + { + "epoch": 1.27, + "grad_norm": 0.53515625, + "learning_rate": 0.0001727589666468976, + "loss": 0.5516, + "step": 9490 + }, + { + "epoch": 1.27, + "grad_norm": 0.4609375, + "learning_rate": 0.0001727509775253662, + "loss": 0.4795, + "step": 9491 + }, + { + "epoch": 1.27, + "grad_norm": 0.515625, + "learning_rate": 0.00017274298741728483, + "loss": 0.2435, + "step": 9492 + }, + { + "epoch": 1.27, + "grad_norm": 0.515625, + "learning_rate": 0.00017273499632276189, + "loss": 0.4826, + "step": 9493 + }, + { + "epoch": 1.27, + "grad_norm": 0.765625, + "learning_rate": 0.0001727270042419057, + "loss": 0.3595, + "step": 9494 + }, + { + "epoch": 1.27, + "grad_norm": 0.62890625, + "learning_rate": 0.00017271901117482468, + "loss": 0.3009, + "step": 9495 + }, + { + "epoch": 1.27, + "grad_norm": 0.55078125, + "learning_rate": 0.00017271101712162718, + "loss": 0.525, + "step": 9496 + }, + { + "epoch": 1.27, + "grad_norm": 0.53515625, + "learning_rate": 0.00017270302208242163, + "loss": 0.3883, + "step": 9497 + }, + { + "epoch": 1.27, + "grad_norm": 0.5, + "learning_rate": 0.00017269502605731643, + "loss": 0.3953, + "step": 9498 + }, + { + "epoch": 1.27, + "grad_norm": 0.640625, + "learning_rate": 0.00017268702904642004, + "loss": 0.3133, + "step": 9499 + }, + { + "epoch": 1.27, + "grad_norm": 0.4921875, + "learning_rate": 0.00017267903104984086, + "loss": 0.3081, + "step": 9500 + }, + { + "epoch": 1.27, + "grad_norm": 0.458984375, + "learning_rate": 0.00017267103206768737, + "loss": 0.4057, + "step": 9501 + }, + { + "epoch": 1.27, + "grad_norm": 0.62890625, + "learning_rate": 0.00017266303210006808, + "loss": 0.2615, + "step": 9502 + }, + { + "epoch": 1.27, + "grad_norm": 0.61328125, + "learning_rate": 0.00017265503114709144, + "loss": 0.5328, + "step": 9503 + }, + { + "epoch": 1.27, + "grad_norm": 0.53515625, + "learning_rate": 0.0001726470292088659, + "loss": 0.3655, + "step": 9504 + }, + { + "epoch": 1.27, + "grad_norm": 0.494140625, + "learning_rate": 0.00017263902628550007, + "loss": 0.4364, + "step": 9505 + }, + { + "epoch": 1.27, + "grad_norm": 0.71875, + "learning_rate": 0.0001726310223771024, + "loss": 0.4609, + "step": 9506 + }, + { + "epoch": 1.27, + "grad_norm": 0.5546875, + "learning_rate": 0.00017262301748378146, + "loss": 0.4387, + "step": 9507 + }, + { + "epoch": 1.27, + "grad_norm": 0.49609375, + "learning_rate": 0.0001726150116056458, + "loss": 0.32, + "step": 9508 + }, + { + "epoch": 1.27, + "grad_norm": 0.46875, + "learning_rate": 0.00017260700474280396, + "loss": 0.3824, + "step": 9509 + }, + { + "epoch": 1.27, + "grad_norm": 0.609375, + "learning_rate": 0.00017259899689536453, + "loss": 0.5635, + "step": 9510 + }, + { + "epoch": 1.27, + "grad_norm": 0.39453125, + "learning_rate": 0.00017259098806343614, + "loss": 0.2705, + "step": 9511 + }, + { + "epoch": 1.27, + "grad_norm": 0.578125, + "learning_rate": 0.00017258297824712736, + "loss": 0.5221, + "step": 9512 + }, + { + "epoch": 1.27, + "grad_norm": 0.6640625, + "learning_rate": 0.00017257496744654677, + "loss": 0.3482, + "step": 9513 + }, + { + "epoch": 1.27, + "grad_norm": 0.76953125, + "learning_rate": 0.00017256695566180308, + "loss": 0.3945, + "step": 9514 + }, + { + "epoch": 1.27, + "grad_norm": 0.55859375, + "learning_rate": 0.0001725589428930049, + "loss": 0.4173, + "step": 9515 + }, + { + "epoch": 1.27, + "grad_norm": 0.55859375, + "learning_rate": 0.00017255092914026087, + "loss": 0.5127, + "step": 9516 + }, + { + "epoch": 1.27, + "grad_norm": 0.94921875, + "learning_rate": 0.00017254291440367968, + "loss": 0.4361, + "step": 9517 + }, + { + "epoch": 1.27, + "grad_norm": 0.478515625, + "learning_rate": 0.00017253489868337, + "loss": 0.3254, + "step": 9518 + }, + { + "epoch": 1.27, + "grad_norm": 0.478515625, + "learning_rate": 0.0001725268819794406, + "loss": 0.2684, + "step": 9519 + }, + { + "epoch": 1.27, + "grad_norm": 0.65234375, + "learning_rate": 0.0001725188642920001, + "loss": 0.2505, + "step": 9520 + }, + { + "epoch": 1.27, + "grad_norm": 0.51953125, + "learning_rate": 0.00017251084562115723, + "loss": 0.4855, + "step": 9521 + }, + { + "epoch": 1.27, + "grad_norm": 0.451171875, + "learning_rate": 0.00017250282596702077, + "loss": 0.2561, + "step": 9522 + }, + { + "epoch": 1.27, + "grad_norm": 0.5859375, + "learning_rate": 0.00017249480532969947, + "loss": 0.4252, + "step": 9523 + }, + { + "epoch": 1.27, + "grad_norm": 0.4140625, + "learning_rate": 0.00017248678370930206, + "loss": 0.3881, + "step": 9524 + }, + { + "epoch": 1.27, + "grad_norm": 0.65234375, + "learning_rate": 0.00017247876110593736, + "loss": 0.5713, + "step": 9525 + }, + { + "epoch": 1.27, + "grad_norm": 0.490234375, + "learning_rate": 0.00017247073751971414, + "loss": 0.3169, + "step": 9526 + }, + { + "epoch": 1.27, + "grad_norm": 0.466796875, + "learning_rate": 0.0001724627129507412, + "loss": 0.5579, + "step": 9527 + }, + { + "epoch": 1.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0001724546873991274, + "loss": 0.2306, + "step": 9528 + }, + { + "epoch": 1.27, + "grad_norm": 0.5234375, + "learning_rate": 0.0001724466608649815, + "loss": 0.4033, + "step": 9529 + }, + { + "epoch": 1.27, + "grad_norm": 1.0859375, + "learning_rate": 0.0001724386333484124, + "loss": 0.736, + "step": 9530 + }, + { + "epoch": 1.27, + "grad_norm": 0.40234375, + "learning_rate": 0.00017243060484952894, + "loss": 0.3183, + "step": 9531 + }, + { + "epoch": 1.27, + "grad_norm": 0.5625, + "learning_rate": 0.00017242257536844, + "loss": 0.2836, + "step": 9532 + }, + { + "epoch": 1.27, + "grad_norm": 0.451171875, + "learning_rate": 0.00017241454490525443, + "loss": 0.4331, + "step": 9533 + }, + { + "epoch": 1.27, + "grad_norm": 0.61328125, + "learning_rate": 0.00017240651346008118, + "loss": 0.57, + "step": 9534 + }, + { + "epoch": 1.27, + "grad_norm": 0.59765625, + "learning_rate": 0.00017239848103302912, + "loss": 0.5678, + "step": 9535 + }, + { + "epoch": 1.27, + "grad_norm": 0.58203125, + "learning_rate": 0.0001723904476242072, + "loss": 0.4197, + "step": 9536 + }, + { + "epoch": 1.27, + "grad_norm": 0.5078125, + "learning_rate": 0.00017238241323372434, + "loss": 0.4213, + "step": 9537 + }, + { + "epoch": 1.27, + "grad_norm": 0.55859375, + "learning_rate": 0.00017237437786168952, + "loss": 0.2349, + "step": 9538 + }, + { + "epoch": 1.27, + "grad_norm": 0.60546875, + "learning_rate": 0.00017236634150821168, + "loss": 0.6021, + "step": 9539 + }, + { + "epoch": 1.27, + "grad_norm": 0.4375, + "learning_rate": 0.00017235830417339983, + "loss": 0.3601, + "step": 9540 + }, + { + "epoch": 1.27, + "grad_norm": 0.734375, + "learning_rate": 0.0001723502658573629, + "loss": 0.357, + "step": 9541 + }, + { + "epoch": 1.27, + "grad_norm": 0.5703125, + "learning_rate": 0.00017234222656020993, + "loss": 0.4599, + "step": 9542 + }, + { + "epoch": 1.27, + "grad_norm": 0.68359375, + "learning_rate": 0.00017233418628204996, + "loss": 0.4804, + "step": 9543 + }, + { + "epoch": 1.27, + "grad_norm": 0.546875, + "learning_rate": 0.000172326145022992, + "loss": 0.2421, + "step": 9544 + }, + { + "epoch": 1.27, + "grad_norm": 0.55859375, + "learning_rate": 0.00017231810278314509, + "loss": 0.2766, + "step": 9545 + }, + { + "epoch": 1.27, + "grad_norm": 0.66015625, + "learning_rate": 0.00017231005956261827, + "loss": 0.5284, + "step": 9546 + }, + { + "epoch": 1.27, + "grad_norm": 0.5390625, + "learning_rate": 0.00017230201536152067, + "loss": 0.6096, + "step": 9547 + }, + { + "epoch": 1.27, + "grad_norm": 0.56640625, + "learning_rate": 0.00017229397017996128, + "loss": 0.5515, + "step": 9548 + }, + { + "epoch": 1.27, + "grad_norm": 0.47265625, + "learning_rate": 0.0001722859240180493, + "loss": 0.3093, + "step": 9549 + }, + { + "epoch": 1.27, + "grad_norm": 0.765625, + "learning_rate": 0.00017227787687589384, + "loss": 0.4707, + "step": 9550 + }, + { + "epoch": 1.27, + "grad_norm": 0.5390625, + "learning_rate": 0.00017226982875360392, + "loss": 0.3701, + "step": 9551 + }, + { + "epoch": 1.27, + "grad_norm": 0.6171875, + "learning_rate": 0.00017226177965128873, + "loss": 0.7022, + "step": 9552 + }, + { + "epoch": 1.27, + "grad_norm": 0.65234375, + "learning_rate": 0.0001722537295690575, + "loss": 0.3571, + "step": 9553 + }, + { + "epoch": 1.27, + "grad_norm": 1.2734375, + "learning_rate": 0.00017224567850701928, + "loss": 0.5804, + "step": 9554 + }, + { + "epoch": 1.28, + "grad_norm": 0.443359375, + "learning_rate": 0.00017223762646528327, + "loss": 0.2703, + "step": 9555 + }, + { + "epoch": 1.28, + "grad_norm": 0.5390625, + "learning_rate": 0.00017222957344395872, + "loss": 0.4661, + "step": 9556 + }, + { + "epoch": 1.28, + "grad_norm": 0.57421875, + "learning_rate": 0.00017222151944315477, + "loss": 0.3727, + "step": 9557 + }, + { + "epoch": 1.28, + "grad_norm": 0.59375, + "learning_rate": 0.0001722134644629807, + "loss": 0.4114, + "step": 9558 + }, + { + "epoch": 1.28, + "grad_norm": 0.6796875, + "learning_rate": 0.0001722054085035457, + "loss": 0.3522, + "step": 9559 + }, + { + "epoch": 1.28, + "grad_norm": 0.68359375, + "learning_rate": 0.000172197351564959, + "loss": 0.4921, + "step": 9560 + }, + { + "epoch": 1.28, + "grad_norm": 0.443359375, + "learning_rate": 0.00017218929364732984, + "loss": 0.3812, + "step": 9561 + }, + { + "epoch": 1.28, + "grad_norm": 0.62890625, + "learning_rate": 0.00017218123475076758, + "loss": 0.4164, + "step": 9562 + }, + { + "epoch": 1.28, + "grad_norm": 0.6015625, + "learning_rate": 0.00017217317487538148, + "loss": 0.6736, + "step": 9563 + }, + { + "epoch": 1.28, + "grad_norm": 0.58203125, + "learning_rate": 0.00017216511402128076, + "loss": 0.567, + "step": 9564 + }, + { + "epoch": 1.28, + "grad_norm": 0.423828125, + "learning_rate": 0.0001721570521885748, + "loss": 0.2856, + "step": 9565 + }, + { + "epoch": 1.28, + "grad_norm": 0.546875, + "learning_rate": 0.00017214898937737284, + "loss": 0.3386, + "step": 9566 + }, + { + "epoch": 1.28, + "grad_norm": 0.60546875, + "learning_rate": 0.00017214092558778433, + "loss": 0.5067, + "step": 9567 + }, + { + "epoch": 1.28, + "grad_norm": 0.8671875, + "learning_rate": 0.0001721328608199186, + "loss": 0.4236, + "step": 9568 + }, + { + "epoch": 1.28, + "grad_norm": 0.62890625, + "learning_rate": 0.00017212479507388494, + "loss": 0.3063, + "step": 9569 + }, + { + "epoch": 1.28, + "grad_norm": 0.44921875, + "learning_rate": 0.00017211672834979277, + "loss": 0.3208, + "step": 9570 + }, + { + "epoch": 1.28, + "grad_norm": 0.5625, + "learning_rate": 0.0001721086606477515, + "loss": 0.7307, + "step": 9571 + }, + { + "epoch": 1.28, + "grad_norm": 0.55859375, + "learning_rate": 0.0001721005919678705, + "loss": 0.2191, + "step": 9572 + }, + { + "epoch": 1.28, + "grad_norm": 0.81640625, + "learning_rate": 0.00017209252231025922, + "loss": 0.5132, + "step": 9573 + }, + { + "epoch": 1.28, + "grad_norm": 0.6484375, + "learning_rate": 0.00017208445167502703, + "loss": 0.43, + "step": 9574 + }, + { + "epoch": 1.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00017207638006228347, + "loss": 0.6042, + "step": 9575 + }, + { + "epoch": 1.28, + "grad_norm": 0.921875, + "learning_rate": 0.00017206830747213786, + "loss": 0.6093, + "step": 9576 + }, + { + "epoch": 1.28, + "grad_norm": 0.83203125, + "learning_rate": 0.0001720602339046998, + "loss": 0.3373, + "step": 9577 + }, + { + "epoch": 1.28, + "grad_norm": 0.6875, + "learning_rate": 0.0001720521593600787, + "loss": 0.5522, + "step": 9578 + }, + { + "epoch": 1.28, + "grad_norm": 0.6484375, + "learning_rate": 0.00017204408383838412, + "loss": 0.5149, + "step": 9579 + }, + { + "epoch": 1.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00017203600733972546, + "loss": 0.3543, + "step": 9580 + }, + { + "epoch": 1.28, + "grad_norm": 0.67578125, + "learning_rate": 0.00017202792986421235, + "loss": 0.5383, + "step": 9581 + }, + { + "epoch": 1.28, + "grad_norm": 0.70703125, + "learning_rate": 0.00017201985141195424, + "loss": 0.3065, + "step": 9582 + }, + { + "epoch": 1.28, + "grad_norm": 0.7421875, + "learning_rate": 0.00017201177198306077, + "loss": 0.5577, + "step": 9583 + }, + { + "epoch": 1.28, + "grad_norm": 0.59375, + "learning_rate": 0.00017200369157764148, + "loss": 0.2971, + "step": 9584 + }, + { + "epoch": 1.28, + "grad_norm": 0.6328125, + "learning_rate": 0.00017199561019580587, + "loss": 0.3135, + "step": 9585 + }, + { + "epoch": 1.28, + "grad_norm": 0.55859375, + "learning_rate": 0.00017198752783766357, + "loss": 0.4274, + "step": 9586 + }, + { + "epoch": 1.28, + "grad_norm": 0.703125, + "learning_rate": 0.0001719794445033242, + "loss": 0.5631, + "step": 9587 + }, + { + "epoch": 1.28, + "grad_norm": 0.5703125, + "learning_rate": 0.00017197136019289738, + "loss": 0.5661, + "step": 9588 + }, + { + "epoch": 1.28, + "grad_norm": 0.5859375, + "learning_rate": 0.00017196327490649275, + "loss": 0.6787, + "step": 9589 + }, + { + "epoch": 1.28, + "grad_norm": 0.453125, + "learning_rate": 0.00017195518864421988, + "loss": 0.2139, + "step": 9590 + }, + { + "epoch": 1.28, + "grad_norm": 0.51171875, + "learning_rate": 0.0001719471014061885, + "loss": 0.4023, + "step": 9591 + }, + { + "epoch": 1.28, + "grad_norm": 0.546875, + "learning_rate": 0.00017193901319250822, + "loss": 0.4356, + "step": 9592 + }, + { + "epoch": 1.28, + "grad_norm": 0.55859375, + "learning_rate": 0.00017193092400328877, + "loss": 0.5207, + "step": 9593 + }, + { + "epoch": 1.28, + "grad_norm": 0.625, + "learning_rate": 0.00017192283383863982, + "loss": 0.5438, + "step": 9594 + }, + { + "epoch": 1.28, + "grad_norm": 0.494140625, + "learning_rate": 0.0001719147426986711, + "loss": 0.4703, + "step": 9595 + }, + { + "epoch": 1.28, + "grad_norm": 0.439453125, + "learning_rate": 0.00017190665058349232, + "loss": 0.4259, + "step": 9596 + }, + { + "epoch": 1.28, + "grad_norm": 0.54296875, + "learning_rate": 0.0001718985574932132, + "loss": 0.5162, + "step": 9597 + }, + { + "epoch": 1.28, + "grad_norm": 0.8125, + "learning_rate": 0.00017189046342794346, + "loss": 0.9508, + "step": 9598 + }, + { + "epoch": 1.28, + "grad_norm": 0.52734375, + "learning_rate": 0.00017188236838779295, + "loss": 0.5172, + "step": 9599 + }, + { + "epoch": 1.28, + "grad_norm": 0.7109375, + "learning_rate": 0.00017187427237287138, + "loss": 0.4917, + "step": 9600 + }, + { + "epoch": 1.28, + "grad_norm": 0.75390625, + "learning_rate": 0.0001718661753832885, + "loss": 0.3524, + "step": 9601 + }, + { + "epoch": 1.28, + "grad_norm": 0.47265625, + "learning_rate": 0.00017185807741915422, + "loss": 0.261, + "step": 9602 + }, + { + "epoch": 1.28, + "grad_norm": 0.451171875, + "learning_rate": 0.00017184997848057826, + "loss": 0.3731, + "step": 9603 + }, + { + "epoch": 1.28, + "grad_norm": 0.42578125, + "learning_rate": 0.0001718418785676705, + "loss": 0.343, + "step": 9604 + }, + { + "epoch": 1.28, + "grad_norm": 0.56640625, + "learning_rate": 0.0001718337776805407, + "loss": 0.5618, + "step": 9605 + }, + { + "epoch": 1.28, + "grad_norm": 0.451171875, + "learning_rate": 0.00017182567581929881, + "loss": 0.6, + "step": 9606 + }, + { + "epoch": 1.28, + "grad_norm": 0.66015625, + "learning_rate": 0.00017181757298405468, + "loss": 0.369, + "step": 9607 + }, + { + "epoch": 1.28, + "grad_norm": 0.73828125, + "learning_rate": 0.0001718094691749182, + "loss": 0.297, + "step": 9608 + }, + { + "epoch": 1.28, + "grad_norm": 0.69140625, + "learning_rate": 0.00017180136439199915, + "loss": 0.5942, + "step": 9609 + }, + { + "epoch": 1.28, + "grad_norm": 0.59765625, + "learning_rate": 0.00017179325863540755, + "loss": 0.5031, + "step": 9610 + }, + { + "epoch": 1.28, + "grad_norm": 0.68359375, + "learning_rate": 0.00017178515190525328, + "loss": 0.4325, + "step": 9611 + }, + { + "epoch": 1.28, + "grad_norm": 0.58203125, + "learning_rate": 0.0001717770442016463, + "loss": 0.5819, + "step": 9612 + }, + { + "epoch": 1.28, + "grad_norm": 0.7421875, + "learning_rate": 0.00017176893552469651, + "loss": 0.372, + "step": 9613 + }, + { + "epoch": 1.28, + "grad_norm": 0.68359375, + "learning_rate": 0.0001717608258745139, + "loss": 0.3977, + "step": 9614 + }, + { + "epoch": 1.28, + "grad_norm": 0.6015625, + "learning_rate": 0.00017175271525120844, + "loss": 0.4769, + "step": 9615 + }, + { + "epoch": 1.28, + "grad_norm": 0.498046875, + "learning_rate": 0.0001717446036548901, + "loss": 0.6959, + "step": 9616 + }, + { + "epoch": 1.28, + "grad_norm": 0.625, + "learning_rate": 0.00017173649108566893, + "loss": 0.651, + "step": 9617 + }, + { + "epoch": 1.28, + "grad_norm": 0.515625, + "learning_rate": 0.00017172837754365485, + "loss": 0.2708, + "step": 9618 + }, + { + "epoch": 1.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00017172026302895795, + "loss": 0.2104, + "step": 9619 + }, + { + "epoch": 1.28, + "grad_norm": 0.55859375, + "learning_rate": 0.00017171214754168826, + "loss": 0.4217, + "step": 9620 + }, + { + "epoch": 1.28, + "grad_norm": 0.54296875, + "learning_rate": 0.0001717040310819558, + "loss": 0.5821, + "step": 9621 + }, + { + "epoch": 1.28, + "grad_norm": 0.46484375, + "learning_rate": 0.0001716959136498707, + "loss": 0.3725, + "step": 9622 + }, + { + "epoch": 1.28, + "grad_norm": 0.431640625, + "learning_rate": 0.00017168779524554299, + "loss": 0.4598, + "step": 9623 + }, + { + "epoch": 1.28, + "grad_norm": 0.5546875, + "learning_rate": 0.00017167967586908275, + "loss": 0.6109, + "step": 9624 + }, + { + "epoch": 1.28, + "grad_norm": 0.423828125, + "learning_rate": 0.0001716715555206001, + "loss": 0.2328, + "step": 9625 + }, + { + "epoch": 1.28, + "grad_norm": 0.5859375, + "learning_rate": 0.00017166343420020515, + "loss": 0.3178, + "step": 9626 + }, + { + "epoch": 1.28, + "grad_norm": 0.314453125, + "learning_rate": 0.00017165531190800807, + "loss": 0.2179, + "step": 9627 + }, + { + "epoch": 1.28, + "grad_norm": 0.56640625, + "learning_rate": 0.00017164718864411897, + "loss": 0.3703, + "step": 9628 + }, + { + "epoch": 1.28, + "grad_norm": 0.55078125, + "learning_rate": 0.000171639064408648, + "loss": 0.2451, + "step": 9629 + }, + { + "epoch": 1.29, + "grad_norm": 0.62890625, + "learning_rate": 0.00017163093920170532, + "loss": 0.5739, + "step": 9630 + }, + { + "epoch": 1.29, + "grad_norm": 0.51171875, + "learning_rate": 0.00017162281302340112, + "loss": 0.2619, + "step": 9631 + }, + { + "epoch": 1.29, + "grad_norm": 0.443359375, + "learning_rate": 0.00017161468587384565, + "loss": 0.2234, + "step": 9632 + }, + { + "epoch": 1.29, + "grad_norm": 0.419921875, + "learning_rate": 0.00017160655775314907, + "loss": 0.4478, + "step": 9633 + }, + { + "epoch": 1.29, + "grad_norm": 0.609375, + "learning_rate": 0.00017159842866142162, + "loss": 0.4016, + "step": 9634 + }, + { + "epoch": 1.29, + "grad_norm": 0.73828125, + "learning_rate": 0.0001715902985987735, + "loss": 0.3453, + "step": 9635 + }, + { + "epoch": 1.29, + "grad_norm": 0.57421875, + "learning_rate": 0.000171582167565315, + "loss": 0.5897, + "step": 9636 + }, + { + "epoch": 1.29, + "grad_norm": 0.447265625, + "learning_rate": 0.00017157403556115636, + "loss": 0.3655, + "step": 9637 + }, + { + "epoch": 1.29, + "grad_norm": 0.421875, + "learning_rate": 0.00017156590258640786, + "loss": 0.3054, + "step": 9638 + }, + { + "epoch": 1.29, + "grad_norm": 0.5703125, + "learning_rate": 0.00017155776864117978, + "loss": 0.2858, + "step": 9639 + }, + { + "epoch": 1.29, + "grad_norm": 0.7265625, + "learning_rate": 0.00017154963372558246, + "loss": 0.5095, + "step": 9640 + }, + { + "epoch": 1.29, + "grad_norm": 0.55078125, + "learning_rate": 0.00017154149783972614, + "loss": 0.4683, + "step": 9641 + }, + { + "epoch": 1.29, + "grad_norm": 0.51171875, + "learning_rate": 0.00017153336098372127, + "loss": 0.3953, + "step": 9642 + }, + { + "epoch": 1.29, + "grad_norm": 0.5234375, + "learning_rate": 0.00017152522315767805, + "loss": 0.4202, + "step": 9643 + }, + { + "epoch": 1.29, + "grad_norm": 0.73046875, + "learning_rate": 0.00017151708436170693, + "loss": 0.3693, + "step": 9644 + }, + { + "epoch": 1.29, + "grad_norm": 0.58203125, + "learning_rate": 0.00017150894459591823, + "loss": 0.2978, + "step": 9645 + }, + { + "epoch": 1.29, + "grad_norm": 0.71875, + "learning_rate": 0.00017150080386042237, + "loss": 0.5859, + "step": 9646 + }, + { + "epoch": 1.29, + "grad_norm": 0.6015625, + "learning_rate": 0.00017149266215532968, + "loss": 0.3163, + "step": 9647 + }, + { + "epoch": 1.29, + "grad_norm": 0.4921875, + "learning_rate": 0.00017148451948075064, + "loss": 0.4838, + "step": 9648 + }, + { + "epoch": 1.29, + "grad_norm": 0.53515625, + "learning_rate": 0.0001714763758367956, + "loss": 0.5176, + "step": 9649 + }, + { + "epoch": 1.29, + "grad_norm": 0.462890625, + "learning_rate": 0.0001714682312235751, + "loss": 0.3089, + "step": 9650 + }, + { + "epoch": 1.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017146008564119947, + "loss": 0.1576, + "step": 9651 + }, + { + "epoch": 1.29, + "grad_norm": 0.60546875, + "learning_rate": 0.0001714519390897792, + "loss": 0.4285, + "step": 9652 + }, + { + "epoch": 1.29, + "grad_norm": 0.52734375, + "learning_rate": 0.0001714437915694248, + "loss": 0.382, + "step": 9653 + }, + { + "epoch": 1.29, + "grad_norm": 0.62109375, + "learning_rate": 0.00017143564308024674, + "loss": 0.6887, + "step": 9654 + }, + { + "epoch": 1.29, + "grad_norm": 0.7109375, + "learning_rate": 0.0001714274936223555, + "loss": 0.3409, + "step": 9655 + }, + { + "epoch": 1.29, + "grad_norm": 0.546875, + "learning_rate": 0.00017141934319586164, + "loss": 0.2495, + "step": 9656 + }, + { + "epoch": 1.29, + "grad_norm": 0.318359375, + "learning_rate": 0.0001714111918008756, + "loss": 0.1857, + "step": 9657 + }, + { + "epoch": 1.29, + "grad_norm": 0.53125, + "learning_rate": 0.00017140303943750799, + "loss": 0.5554, + "step": 9658 + }, + { + "epoch": 1.29, + "grad_norm": 0.60546875, + "learning_rate": 0.00017139488610586935, + "loss": 0.273, + "step": 9659 + }, + { + "epoch": 1.29, + "grad_norm": 0.337890625, + "learning_rate": 0.00017138673180607024, + "loss": 0.2615, + "step": 9660 + }, + { + "epoch": 1.29, + "grad_norm": 0.4140625, + "learning_rate": 0.00017137857653822121, + "loss": 0.2736, + "step": 9661 + }, + { + "epoch": 1.29, + "grad_norm": 0.6640625, + "learning_rate": 0.00017137042030243284, + "loss": 0.4884, + "step": 9662 + }, + { + "epoch": 1.29, + "grad_norm": 0.439453125, + "learning_rate": 0.00017136226309881584, + "loss": 0.2799, + "step": 9663 + }, + { + "epoch": 1.29, + "grad_norm": 0.6328125, + "learning_rate": 0.00017135410492748068, + "loss": 0.5319, + "step": 9664 + }, + { + "epoch": 1.29, + "grad_norm": 0.51171875, + "learning_rate": 0.0001713459457885381, + "loss": 0.5448, + "step": 9665 + }, + { + "epoch": 1.29, + "grad_norm": 0.67578125, + "learning_rate": 0.0001713377856820987, + "loss": 0.5798, + "step": 9666 + }, + { + "epoch": 1.29, + "grad_norm": 0.68359375, + "learning_rate": 0.00017132962460827313, + "loss": 0.2267, + "step": 9667 + }, + { + "epoch": 1.29, + "grad_norm": 0.77734375, + "learning_rate": 0.0001713214625671721, + "loss": 0.6379, + "step": 9668 + }, + { + "epoch": 1.29, + "grad_norm": 0.49609375, + "learning_rate": 0.00017131329955890625, + "loss": 0.3521, + "step": 9669 + }, + { + "epoch": 1.29, + "grad_norm": 0.609375, + "learning_rate": 0.00017130513558358628, + "loss": 0.6188, + "step": 9670 + }, + { + "epoch": 1.29, + "grad_norm": 0.455078125, + "learning_rate": 0.0001712969706413229, + "loss": 0.3989, + "step": 9671 + }, + { + "epoch": 1.29, + "grad_norm": 0.5390625, + "learning_rate": 0.0001712888047322269, + "loss": 0.3287, + "step": 9672 + }, + { + "epoch": 1.29, + "grad_norm": 0.46875, + "learning_rate": 0.00017128063785640886, + "loss": 0.6255, + "step": 9673 + }, + { + "epoch": 1.29, + "grad_norm": 0.486328125, + "learning_rate": 0.0001712724700139797, + "loss": 0.4223, + "step": 9674 + }, + { + "epoch": 1.29, + "grad_norm": 0.62890625, + "learning_rate": 0.00017126430120505007, + "loss": 0.5455, + "step": 9675 + }, + { + "epoch": 1.29, + "grad_norm": 0.41796875, + "learning_rate": 0.0001712561314297308, + "loss": 0.2928, + "step": 9676 + }, + { + "epoch": 1.29, + "grad_norm": 0.349609375, + "learning_rate": 0.00017124796068813264, + "loss": 0.3062, + "step": 9677 + }, + { + "epoch": 1.29, + "grad_norm": 0.44921875, + "learning_rate": 0.0001712397889803664, + "loss": 0.418, + "step": 9678 + }, + { + "epoch": 1.29, + "grad_norm": 0.6328125, + "learning_rate": 0.00017123161630654292, + "loss": 0.2239, + "step": 9679 + }, + { + "epoch": 1.29, + "grad_norm": 0.4921875, + "learning_rate": 0.00017122344266677297, + "loss": 0.2902, + "step": 9680 + }, + { + "epoch": 1.29, + "grad_norm": 0.5625, + "learning_rate": 0.00017121526806116748, + "loss": 0.3892, + "step": 9681 + }, + { + "epoch": 1.29, + "grad_norm": 0.7109375, + "learning_rate": 0.0001712070924898372, + "loss": 0.2033, + "step": 9682 + }, + { + "epoch": 1.29, + "grad_norm": 0.3125, + "learning_rate": 0.00017119891595289305, + "loss": 0.1696, + "step": 9683 + }, + { + "epoch": 1.29, + "grad_norm": 0.65234375, + "learning_rate": 0.00017119073845044597, + "loss": 0.4312, + "step": 9684 + }, + { + "epoch": 1.29, + "grad_norm": 0.474609375, + "learning_rate": 0.00017118255998260673, + "loss": 0.4563, + "step": 9685 + }, + { + "epoch": 1.29, + "grad_norm": 0.498046875, + "learning_rate": 0.00017117438054948628, + "loss": 0.3091, + "step": 9686 + }, + { + "epoch": 1.29, + "grad_norm": 0.400390625, + "learning_rate": 0.00017116620015119555, + "loss": 0.3919, + "step": 9687 + }, + { + "epoch": 1.29, + "grad_norm": 0.494140625, + "learning_rate": 0.00017115801878784552, + "loss": 0.298, + "step": 9688 + }, + { + "epoch": 1.29, + "grad_norm": 1.203125, + "learning_rate": 0.00017114983645954705, + "loss": 0.4233, + "step": 9689 + }, + { + "epoch": 1.29, + "grad_norm": 0.451171875, + "learning_rate": 0.00017114165316641113, + "loss": 0.2883, + "step": 9690 + }, + { + "epoch": 1.29, + "grad_norm": 0.53125, + "learning_rate": 0.00017113346890854875, + "loss": 0.436, + "step": 9691 + }, + { + "epoch": 1.29, + "grad_norm": 0.55078125, + "learning_rate": 0.0001711252836860709, + "loss": 0.38, + "step": 9692 + }, + { + "epoch": 1.29, + "grad_norm": 0.58203125, + "learning_rate": 0.0001711170974990885, + "loss": 0.5113, + "step": 9693 + }, + { + "epoch": 1.29, + "grad_norm": 0.58203125, + "learning_rate": 0.00017110891034771266, + "loss": 0.1864, + "step": 9694 + }, + { + "epoch": 1.29, + "grad_norm": 0.65234375, + "learning_rate": 0.00017110072223205434, + "loss": 0.3899, + "step": 9695 + }, + { + "epoch": 1.29, + "grad_norm": 0.5, + "learning_rate": 0.0001710925331522246, + "loss": 0.2508, + "step": 9696 + }, + { + "epoch": 1.29, + "grad_norm": 0.58984375, + "learning_rate": 0.00017108434310833447, + "loss": 0.5407, + "step": 9697 + }, + { + "epoch": 1.29, + "grad_norm": 0.625, + "learning_rate": 0.00017107615210049505, + "loss": 0.3503, + "step": 9698 + }, + { + "epoch": 1.29, + "grad_norm": 0.5078125, + "learning_rate": 0.0001710679601288174, + "loss": 0.3786, + "step": 9699 + }, + { + "epoch": 1.29, + "grad_norm": 0.443359375, + "learning_rate": 0.00017105976719341258, + "loss": 0.3985, + "step": 9700 + }, + { + "epoch": 1.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017105157329439167, + "loss": 0.508, + "step": 9701 + }, + { + "epoch": 1.29, + "grad_norm": 0.55078125, + "learning_rate": 0.00017104337843186588, + "loss": 0.3697, + "step": 9702 + }, + { + "epoch": 1.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017103518260594625, + "loss": 0.4978, + "step": 9703 + }, + { + "epoch": 1.29, + "grad_norm": 0.5625, + "learning_rate": 0.00017102698581674398, + "loss": 0.6333, + "step": 9704 + }, + { + "epoch": 1.3, + "grad_norm": 0.4765625, + "learning_rate": 0.00017101878806437018, + "loss": 0.5845, + "step": 9705 + }, + { + "epoch": 1.3, + "grad_norm": 0.58203125, + "learning_rate": 0.00017101058934893609, + "loss": 0.6271, + "step": 9706 + }, + { + "epoch": 1.3, + "grad_norm": 0.59765625, + "learning_rate": 0.00017100238967055278, + "loss": 0.4097, + "step": 9707 + }, + { + "epoch": 1.3, + "grad_norm": 0.6640625, + "learning_rate": 0.0001709941890293315, + "loss": 0.5936, + "step": 9708 + }, + { + "epoch": 1.3, + "grad_norm": 0.4609375, + "learning_rate": 0.00017098598742538345, + "loss": 0.5509, + "step": 9709 + }, + { + "epoch": 1.3, + "grad_norm": 0.48828125, + "learning_rate": 0.00017097778485881986, + "loss": 0.368, + "step": 9710 + }, + { + "epoch": 1.3, + "grad_norm": 0.57421875, + "learning_rate": 0.00017096958132975195, + "loss": 0.3171, + "step": 9711 + }, + { + "epoch": 1.3, + "grad_norm": 0.474609375, + "learning_rate": 0.00017096137683829098, + "loss": 0.4406, + "step": 9712 + }, + { + "epoch": 1.3, + "grad_norm": 0.75, + "learning_rate": 0.00017095317138454818, + "loss": 0.4343, + "step": 9713 + }, + { + "epoch": 1.3, + "grad_norm": 0.6015625, + "learning_rate": 0.00017094496496863486, + "loss": 0.304, + "step": 9714 + }, + { + "epoch": 1.3, + "grad_norm": 0.6015625, + "learning_rate": 0.00017093675759066227, + "loss": 0.5694, + "step": 9715 + }, + { + "epoch": 1.3, + "grad_norm": 0.546875, + "learning_rate": 0.00017092854925074172, + "loss": 0.3883, + "step": 9716 + }, + { + "epoch": 1.3, + "grad_norm": 0.49609375, + "learning_rate": 0.00017092033994898454, + "loss": 0.3729, + "step": 9717 + }, + { + "epoch": 1.3, + "grad_norm": 0.77734375, + "learning_rate": 0.00017091212968550198, + "loss": 0.2249, + "step": 9718 + }, + { + "epoch": 1.3, + "grad_norm": 0.498046875, + "learning_rate": 0.00017090391846040547, + "loss": 0.2471, + "step": 9719 + }, + { + "epoch": 1.3, + "grad_norm": 0.51171875, + "learning_rate": 0.00017089570627380632, + "loss": 0.2128, + "step": 9720 + }, + { + "epoch": 1.3, + "grad_norm": 0.859375, + "learning_rate": 0.0001708874931258159, + "loss": 0.4429, + "step": 9721 + }, + { + "epoch": 1.3, + "grad_norm": 0.73046875, + "learning_rate": 0.00017087927901654557, + "loss": 0.3691, + "step": 9722 + }, + { + "epoch": 1.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017087106394610672, + "loss": 0.6201, + "step": 9723 + }, + { + "epoch": 1.3, + "grad_norm": 0.4609375, + "learning_rate": 0.0001708628479146108, + "loss": 0.4194, + "step": 9724 + }, + { + "epoch": 1.3, + "grad_norm": 0.61328125, + "learning_rate": 0.00017085463092216912, + "loss": 0.5526, + "step": 9725 + }, + { + "epoch": 1.3, + "grad_norm": 0.61328125, + "learning_rate": 0.00017084641296889318, + "loss": 0.5267, + "step": 9726 + }, + { + "epoch": 1.3, + "grad_norm": 0.5859375, + "learning_rate": 0.00017083819405489442, + "loss": 0.5156, + "step": 9727 + }, + { + "epoch": 1.3, + "grad_norm": 0.70703125, + "learning_rate": 0.0001708299741802843, + "loss": 0.39, + "step": 9728 + }, + { + "epoch": 1.3, + "grad_norm": 0.478515625, + "learning_rate": 0.00017082175334517428, + "loss": 0.3266, + "step": 9729 + }, + { + "epoch": 1.3, + "grad_norm": 0.51953125, + "learning_rate": 0.00017081353154967582, + "loss": 0.509, + "step": 9730 + }, + { + "epoch": 1.3, + "grad_norm": 0.41796875, + "learning_rate": 0.0001708053087939004, + "loss": 0.249, + "step": 9731 + }, + { + "epoch": 1.3, + "grad_norm": 0.37109375, + "learning_rate": 0.00017079708507795958, + "loss": 0.2602, + "step": 9732 + }, + { + "epoch": 1.3, + "grad_norm": 0.65234375, + "learning_rate": 0.00017078886040196482, + "loss": 0.4698, + "step": 9733 + }, + { + "epoch": 1.3, + "grad_norm": 0.75390625, + "learning_rate": 0.0001707806347660277, + "loss": 0.2602, + "step": 9734 + }, + { + "epoch": 1.3, + "grad_norm": 0.6328125, + "learning_rate": 0.00017077240817025974, + "loss": 0.7554, + "step": 9735 + }, + { + "epoch": 1.3, + "grad_norm": 0.640625, + "learning_rate": 0.00017076418061477252, + "loss": 0.5099, + "step": 9736 + }, + { + "epoch": 1.3, + "grad_norm": 0.515625, + "learning_rate": 0.00017075595209967758, + "loss": 0.2996, + "step": 9737 + }, + { + "epoch": 1.3, + "grad_norm": 0.376953125, + "learning_rate": 0.00017074772262508652, + "loss": 0.2637, + "step": 9738 + }, + { + "epoch": 1.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017073949219111097, + "loss": 0.4057, + "step": 9739 + }, + { + "epoch": 1.3, + "grad_norm": 0.50390625, + "learning_rate": 0.00017073126079786245, + "loss": 0.299, + "step": 9740 + }, + { + "epoch": 1.3, + "grad_norm": 0.56640625, + "learning_rate": 0.00017072302844545268, + "loss": 0.309, + "step": 9741 + }, + { + "epoch": 1.3, + "grad_norm": 0.57421875, + "learning_rate": 0.0001707147951339932, + "loss": 0.4501, + "step": 9742 + }, + { + "epoch": 1.3, + "grad_norm": 0.6875, + "learning_rate": 0.0001707065608635958, + "loss": 0.7076, + "step": 9743 + }, + { + "epoch": 1.3, + "grad_norm": 0.4609375, + "learning_rate": 0.00017069832563437198, + "loss": 0.438, + "step": 9744 + }, + { + "epoch": 1.3, + "grad_norm": 0.484375, + "learning_rate": 0.0001706900894464335, + "loss": 0.3014, + "step": 9745 + }, + { + "epoch": 1.3, + "grad_norm": 0.51953125, + "learning_rate": 0.00017068185229989207, + "loss": 0.5424, + "step": 9746 + }, + { + "epoch": 1.3, + "grad_norm": 0.5, + "learning_rate": 0.00017067361419485934, + "loss": 0.2936, + "step": 9747 + }, + { + "epoch": 1.3, + "grad_norm": 0.51953125, + "learning_rate": 0.00017066537513144706, + "loss": 0.5333, + "step": 9748 + }, + { + "epoch": 1.3, + "grad_norm": 0.62890625, + "learning_rate": 0.0001706571351097669, + "loss": 0.4293, + "step": 9749 + }, + { + "epoch": 1.3, + "grad_norm": 0.275390625, + "learning_rate": 0.00017064889412993068, + "loss": 0.2104, + "step": 9750 + }, + { + "epoch": 1.3, + "grad_norm": 0.66796875, + "learning_rate": 0.0001706406521920501, + "loss": 0.3556, + "step": 9751 + }, + { + "epoch": 1.3, + "grad_norm": 0.48046875, + "learning_rate": 0.00017063240929623695, + "loss": 0.3042, + "step": 9752 + }, + { + "epoch": 1.3, + "grad_norm": 0.6875, + "learning_rate": 0.00017062416544260297, + "loss": 0.3556, + "step": 9753 + }, + { + "epoch": 1.3, + "grad_norm": 0.7890625, + "learning_rate": 0.00017061592063126002, + "loss": 0.4294, + "step": 9754 + }, + { + "epoch": 1.3, + "grad_norm": 0.67578125, + "learning_rate": 0.00017060767486231986, + "loss": 0.3472, + "step": 9755 + }, + { + "epoch": 1.3, + "grad_norm": 0.57421875, + "learning_rate": 0.0001705994281358943, + "loss": 0.3319, + "step": 9756 + }, + { + "epoch": 1.3, + "grad_norm": 0.478515625, + "learning_rate": 0.00017059118045209515, + "loss": 0.2757, + "step": 9757 + }, + { + "epoch": 1.3, + "grad_norm": 0.42578125, + "learning_rate": 0.00017058293181103433, + "loss": 0.245, + "step": 9758 + }, + { + "epoch": 1.3, + "grad_norm": 0.57421875, + "learning_rate": 0.00017057468221282364, + "loss": 0.4332, + "step": 9759 + }, + { + "epoch": 1.3, + "grad_norm": 0.671875, + "learning_rate": 0.00017056643165757498, + "loss": 0.3099, + "step": 9760 + }, + { + "epoch": 1.3, + "grad_norm": 0.671875, + "learning_rate": 0.0001705581801454002, + "loss": 0.6114, + "step": 9761 + }, + { + "epoch": 1.3, + "grad_norm": 0.52734375, + "learning_rate": 0.00017054992767641123, + "loss": 0.5332, + "step": 9762 + }, + { + "epoch": 1.3, + "grad_norm": 0.63671875, + "learning_rate": 0.00017054167425071995, + "loss": 0.2909, + "step": 9763 + }, + { + "epoch": 1.3, + "grad_norm": 0.55859375, + "learning_rate": 0.0001705334198684383, + "loss": 0.5533, + "step": 9764 + }, + { + "epoch": 1.3, + "grad_norm": 0.7109375, + "learning_rate": 0.0001705251645296782, + "loss": 0.2858, + "step": 9765 + }, + { + "epoch": 1.3, + "grad_norm": 0.59375, + "learning_rate": 0.00017051690823455162, + "loss": 0.3417, + "step": 9766 + }, + { + "epoch": 1.3, + "grad_norm": 0.5390625, + "learning_rate": 0.00017050865098317051, + "loss": 0.5898, + "step": 9767 + }, + { + "epoch": 1.3, + "grad_norm": 0.64453125, + "learning_rate": 0.00017050039277564683, + "loss": 0.3007, + "step": 9768 + }, + { + "epoch": 1.3, + "grad_norm": 0.466796875, + "learning_rate": 0.00017049213361209256, + "loss": 0.2207, + "step": 9769 + }, + { + "epoch": 1.3, + "grad_norm": 0.578125, + "learning_rate": 0.00017048387349261973, + "loss": 0.4796, + "step": 9770 + }, + { + "epoch": 1.3, + "grad_norm": 0.462890625, + "learning_rate": 0.00017047561241734035, + "loss": 0.3834, + "step": 9771 + }, + { + "epoch": 1.3, + "grad_norm": 0.50390625, + "learning_rate": 0.00017046735038636643, + "loss": 0.3586, + "step": 9772 + }, + { + "epoch": 1.3, + "grad_norm": 0.416015625, + "learning_rate": 0.00017045908739981, + "loss": 0.2139, + "step": 9773 + }, + { + "epoch": 1.3, + "grad_norm": 0.5546875, + "learning_rate": 0.00017045082345778313, + "loss": 0.3962, + "step": 9774 + }, + { + "epoch": 1.3, + "grad_norm": 0.55859375, + "learning_rate": 0.0001704425585603979, + "loss": 0.3569, + "step": 9775 + }, + { + "epoch": 1.3, + "grad_norm": 0.431640625, + "learning_rate": 0.00017043429270776635, + "loss": 0.4659, + "step": 9776 + }, + { + "epoch": 1.3, + "grad_norm": 0.453125, + "learning_rate": 0.00017042602590000057, + "loss": 0.36, + "step": 9777 + }, + { + "epoch": 1.3, + "grad_norm": 0.45703125, + "learning_rate": 0.0001704177581372127, + "loss": 0.2841, + "step": 9778 + }, + { + "epoch": 1.3, + "grad_norm": 0.412109375, + "learning_rate": 0.00017040948941951482, + "loss": 0.2502, + "step": 9779 + }, + { + "epoch": 1.31, + "grad_norm": 0.71875, + "learning_rate": 0.0001704012197470191, + "loss": 0.4785, + "step": 9780 + }, + { + "epoch": 1.31, + "grad_norm": 0.498046875, + "learning_rate": 0.00017039294911983763, + "loss": 0.1567, + "step": 9781 + }, + { + "epoch": 1.31, + "grad_norm": 0.44140625, + "learning_rate": 0.00017038467753808262, + "loss": 0.377, + "step": 9782 + }, + { + "epoch": 1.31, + "grad_norm": 0.89453125, + "learning_rate": 0.0001703764050018662, + "loss": 0.2979, + "step": 9783 + }, + { + "epoch": 1.31, + "grad_norm": 0.6015625, + "learning_rate": 0.00017036813151130055, + "loss": 0.545, + "step": 9784 + }, + { + "epoch": 1.31, + "grad_norm": 0.55859375, + "learning_rate": 0.00017035985706649788, + "loss": 0.4517, + "step": 9785 + }, + { + "epoch": 1.31, + "grad_norm": 0.5, + "learning_rate": 0.0001703515816675704, + "loss": 0.4252, + "step": 9786 + }, + { + "epoch": 1.31, + "grad_norm": 0.474609375, + "learning_rate": 0.00017034330531463029, + "loss": 0.307, + "step": 9787 + }, + { + "epoch": 1.31, + "grad_norm": 0.71484375, + "learning_rate": 0.00017033502800778986, + "loss": 0.3059, + "step": 9788 + }, + { + "epoch": 1.31, + "grad_norm": 0.53515625, + "learning_rate": 0.00017032674974716127, + "loss": 0.4605, + "step": 9789 + }, + { + "epoch": 1.31, + "grad_norm": 0.57421875, + "learning_rate": 0.00017031847053285685, + "loss": 0.3317, + "step": 9790 + }, + { + "epoch": 1.31, + "grad_norm": 0.57421875, + "learning_rate": 0.00017031019036498881, + "loss": 0.1931, + "step": 9791 + }, + { + "epoch": 1.31, + "grad_norm": 0.54296875, + "learning_rate": 0.00017030190924366946, + "loss": 0.2927, + "step": 9792 + }, + { + "epoch": 1.31, + "grad_norm": 0.59375, + "learning_rate": 0.00017029362716901113, + "loss": 0.5111, + "step": 9793 + }, + { + "epoch": 1.31, + "grad_norm": 0.5078125, + "learning_rate": 0.0001702853441411261, + "loss": 0.4149, + "step": 9794 + }, + { + "epoch": 1.31, + "grad_norm": 0.7265625, + "learning_rate": 0.0001702770601601267, + "loss": 0.3712, + "step": 9795 + }, + { + "epoch": 1.31, + "grad_norm": 0.7578125, + "learning_rate": 0.00017026877522612527, + "loss": 0.39, + "step": 9796 + }, + { + "epoch": 1.31, + "grad_norm": 0.52734375, + "learning_rate": 0.0001702604893392341, + "loss": 0.5683, + "step": 9797 + }, + { + "epoch": 1.31, + "grad_norm": 0.5625, + "learning_rate": 0.00017025220249956564, + "loss": 0.1866, + "step": 9798 + }, + { + "epoch": 1.31, + "grad_norm": 0.5546875, + "learning_rate": 0.0001702439147072322, + "loss": 0.2188, + "step": 9799 + }, + { + "epoch": 1.31, + "grad_norm": 0.7265625, + "learning_rate": 0.0001702356259623462, + "loss": 0.4969, + "step": 9800 + }, + { + "epoch": 1.31, + "grad_norm": 1.0390625, + "learning_rate": 0.00017022733626502004, + "loss": 0.2762, + "step": 9801 + }, + { + "epoch": 1.31, + "grad_norm": 0.54296875, + "learning_rate": 0.00017021904561536616, + "loss": 0.348, + "step": 9802 + }, + { + "epoch": 1.31, + "grad_norm": 0.58984375, + "learning_rate": 0.00017021075401349692, + "loss": 0.304, + "step": 9803 + }, + { + "epoch": 1.31, + "grad_norm": 0.65625, + "learning_rate": 0.00017020246145952477, + "loss": 0.463, + "step": 9804 + }, + { + "epoch": 1.31, + "grad_norm": 0.8046875, + "learning_rate": 0.00017019416795356225, + "loss": 0.3274, + "step": 9805 + }, + { + "epoch": 1.31, + "grad_norm": 0.578125, + "learning_rate": 0.00017018587349572172, + "loss": 0.6464, + "step": 9806 + }, + { + "epoch": 1.31, + "grad_norm": 0.48046875, + "learning_rate": 0.00017017757808611571, + "loss": 0.5713, + "step": 9807 + }, + { + "epoch": 1.31, + "grad_norm": 0.55078125, + "learning_rate": 0.0001701692817248567, + "loss": 0.4118, + "step": 9808 + }, + { + "epoch": 1.31, + "grad_norm": 0.435546875, + "learning_rate": 0.00017016098441205722, + "loss": 0.2368, + "step": 9809 + }, + { + "epoch": 1.31, + "grad_norm": 0.490234375, + "learning_rate": 0.00017015268614782974, + "loss": 0.3472, + "step": 9810 + }, + { + "epoch": 1.31, + "grad_norm": 0.5625, + "learning_rate": 0.0001701443869322868, + "loss": 0.3091, + "step": 9811 + }, + { + "epoch": 1.31, + "grad_norm": 0.427734375, + "learning_rate": 0.00017013608676554097, + "loss": 0.3277, + "step": 9812 + }, + { + "epoch": 1.31, + "grad_norm": 0.388671875, + "learning_rate": 0.00017012778564770483, + "loss": 0.1766, + "step": 9813 + }, + { + "epoch": 1.31, + "grad_norm": 0.314453125, + "learning_rate": 0.00017011948357889084, + "loss": 0.1427, + "step": 9814 + }, + { + "epoch": 1.31, + "grad_norm": 0.470703125, + "learning_rate": 0.00017011118055921168, + "loss": 0.5409, + "step": 9815 + }, + { + "epoch": 1.31, + "grad_norm": 0.55859375, + "learning_rate": 0.0001701028765887799, + "loss": 0.4071, + "step": 9816 + }, + { + "epoch": 1.31, + "grad_norm": 0.61328125, + "learning_rate": 0.00017009457166770813, + "loss": 0.5408, + "step": 9817 + }, + { + "epoch": 1.31, + "grad_norm": 0.59765625, + "learning_rate": 0.00017008626579610898, + "loss": 0.4825, + "step": 9818 + }, + { + "epoch": 1.31, + "grad_norm": 0.50390625, + "learning_rate": 0.0001700779589740951, + "loss": 0.4785, + "step": 9819 + }, + { + "epoch": 1.31, + "grad_norm": 0.47265625, + "learning_rate": 0.00017006965120177912, + "loss": 0.3337, + "step": 9820 + }, + { + "epoch": 1.31, + "grad_norm": 0.462890625, + "learning_rate": 0.00017006134247927367, + "loss": 0.2314, + "step": 9821 + }, + { + "epoch": 1.31, + "grad_norm": 0.54296875, + "learning_rate": 0.00017005303280669148, + "loss": 0.5077, + "step": 9822 + }, + { + "epoch": 1.31, + "grad_norm": 0.5390625, + "learning_rate": 0.0001700447221841452, + "loss": 0.269, + "step": 9823 + }, + { + "epoch": 1.31, + "grad_norm": 0.6640625, + "learning_rate": 0.00017003641061174752, + "loss": 0.4485, + "step": 9824 + }, + { + "epoch": 1.31, + "grad_norm": 0.3828125, + "learning_rate": 0.0001700280980896112, + "loss": 0.2513, + "step": 9825 + }, + { + "epoch": 1.31, + "grad_norm": 0.55078125, + "learning_rate": 0.0001700197846178489, + "loss": 0.5424, + "step": 9826 + }, + { + "epoch": 1.31, + "grad_norm": 0.72265625, + "learning_rate": 0.00017001147019657338, + "loss": 0.3497, + "step": 9827 + }, + { + "epoch": 1.31, + "grad_norm": 0.58984375, + "learning_rate": 0.0001700031548258974, + "loss": 0.1924, + "step": 9828 + }, + { + "epoch": 1.31, + "grad_norm": 0.58984375, + "learning_rate": 0.00016999483850593369, + "loss": 0.3096, + "step": 9829 + }, + { + "epoch": 1.31, + "grad_norm": 0.7109375, + "learning_rate": 0.00016998652123679503, + "loss": 0.373, + "step": 9830 + }, + { + "epoch": 1.31, + "grad_norm": 0.486328125, + "learning_rate": 0.00016997820301859428, + "loss": 0.2957, + "step": 9831 + }, + { + "epoch": 1.31, + "grad_norm": 0.578125, + "learning_rate": 0.00016996988385144417, + "loss": 0.6333, + "step": 9832 + }, + { + "epoch": 1.31, + "grad_norm": 0.5, + "learning_rate": 0.0001699615637354575, + "loss": 0.2838, + "step": 9833 + }, + { + "epoch": 1.31, + "grad_norm": 0.54296875, + "learning_rate": 0.00016995324267074716, + "loss": 0.1701, + "step": 9834 + }, + { + "epoch": 1.31, + "grad_norm": 0.3984375, + "learning_rate": 0.00016994492065742594, + "loss": 0.2274, + "step": 9835 + }, + { + "epoch": 1.31, + "grad_norm": 0.78515625, + "learning_rate": 0.0001699365976956067, + "loss": 0.2717, + "step": 9836 + }, + { + "epoch": 1.31, + "grad_norm": 0.5703125, + "learning_rate": 0.00016992827378540235, + "loss": 0.3217, + "step": 9837 + }, + { + "epoch": 1.31, + "grad_norm": 0.46484375, + "learning_rate": 0.00016991994892692568, + "loss": 0.6606, + "step": 9838 + }, + { + "epoch": 1.31, + "grad_norm": 0.58203125, + "learning_rate": 0.00016991162312028963, + "loss": 0.4235, + "step": 9839 + }, + { + "epoch": 1.31, + "grad_norm": 0.5390625, + "learning_rate": 0.0001699032963656071, + "loss": 0.3065, + "step": 9840 + }, + { + "epoch": 1.31, + "grad_norm": 0.61328125, + "learning_rate": 0.00016989496866299102, + "loss": 0.3636, + "step": 9841 + }, + { + "epoch": 1.31, + "grad_norm": 0.55078125, + "learning_rate": 0.00016988664001255433, + "loss": 0.6237, + "step": 9842 + }, + { + "epoch": 1.31, + "grad_norm": 0.44140625, + "learning_rate": 0.00016987831041440993, + "loss": 0.5805, + "step": 9843 + }, + { + "epoch": 1.31, + "grad_norm": 0.8125, + "learning_rate": 0.0001698699798686708, + "loss": 0.4775, + "step": 9844 + }, + { + "epoch": 1.31, + "grad_norm": 0.5546875, + "learning_rate": 0.00016986164837544987, + "loss": 0.3295, + "step": 9845 + }, + { + "epoch": 1.31, + "grad_norm": 0.4921875, + "learning_rate": 0.00016985331593486017, + "loss": 0.2352, + "step": 9846 + }, + { + "epoch": 1.31, + "grad_norm": 0.5703125, + "learning_rate": 0.00016984498254701467, + "loss": 0.4762, + "step": 9847 + }, + { + "epoch": 1.31, + "grad_norm": 0.625, + "learning_rate": 0.00016983664821202638, + "loss": 0.4927, + "step": 9848 + }, + { + "epoch": 1.31, + "grad_norm": 0.431640625, + "learning_rate": 0.00016982831293000834, + "loss": 0.4427, + "step": 9849 + }, + { + "epoch": 1.31, + "grad_norm": 0.6484375, + "learning_rate": 0.00016981997670107354, + "loss": 0.5609, + "step": 9850 + }, + { + "epoch": 1.31, + "grad_norm": 0.498046875, + "learning_rate": 0.00016981163952533505, + "loss": 0.318, + "step": 9851 + }, + { + "epoch": 1.31, + "grad_norm": 0.65625, + "learning_rate": 0.00016980330140290594, + "loss": 0.3283, + "step": 9852 + }, + { + "epoch": 1.31, + "grad_norm": 0.52734375, + "learning_rate": 0.00016979496233389925, + "loss": 0.6145, + "step": 9853 + }, + { + "epoch": 1.31, + "grad_norm": 0.46875, + "learning_rate": 0.00016978662231842805, + "loss": 0.3159, + "step": 9854 + }, + { + "epoch": 1.32, + "grad_norm": 0.443359375, + "learning_rate": 0.0001697782813566055, + "loss": 0.1606, + "step": 9855 + }, + { + "epoch": 1.32, + "grad_norm": 0.59765625, + "learning_rate": 0.00016976993944854468, + "loss": 0.3854, + "step": 9856 + }, + { + "epoch": 1.32, + "grad_norm": 0.67578125, + "learning_rate": 0.00016976159659435868, + "loss": 0.2886, + "step": 9857 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546875, + "learning_rate": 0.00016975325279416063, + "loss": 0.4684, + "step": 9858 + }, + { + "epoch": 1.32, + "grad_norm": 0.5, + "learning_rate": 0.00016974490804806376, + "loss": 0.4612, + "step": 9859 + }, + { + "epoch": 1.32, + "grad_norm": 0.5, + "learning_rate": 0.00016973656235618113, + "loss": 0.3448, + "step": 9860 + }, + { + "epoch": 1.32, + "grad_norm": 0.61328125, + "learning_rate": 0.000169728215718626, + "loss": 0.3504, + "step": 9861 + }, + { + "epoch": 1.32, + "grad_norm": 0.58203125, + "learning_rate": 0.00016971986813551148, + "loss": 0.4057, + "step": 9862 + }, + { + "epoch": 1.32, + "grad_norm": 0.58203125, + "learning_rate": 0.00016971151960695082, + "loss": 0.4992, + "step": 9863 + }, + { + "epoch": 1.32, + "grad_norm": 0.55078125, + "learning_rate": 0.0001697031701330572, + "loss": 0.514, + "step": 9864 + }, + { + "epoch": 1.32, + "grad_norm": 0.478515625, + "learning_rate": 0.00016969481971394385, + "loss": 0.3184, + "step": 9865 + }, + { + "epoch": 1.32, + "grad_norm": 0.53125, + "learning_rate": 0.00016968646834972405, + "loss": 0.2346, + "step": 9866 + }, + { + "epoch": 1.32, + "grad_norm": 0.5234375, + "learning_rate": 0.00016967811604051098, + "loss": 0.3971, + "step": 9867 + }, + { + "epoch": 1.32, + "grad_norm": 0.4921875, + "learning_rate": 0.00016966976278641796, + "loss": 0.3918, + "step": 9868 + }, + { + "epoch": 1.32, + "grad_norm": 0.54296875, + "learning_rate": 0.00016966140858755822, + "loss": 0.4403, + "step": 9869 + }, + { + "epoch": 1.32, + "grad_norm": 0.828125, + "learning_rate": 0.0001696530534440451, + "loss": 0.373, + "step": 9870 + }, + { + "epoch": 1.32, + "grad_norm": 0.8984375, + "learning_rate": 0.00016964469735599184, + "loss": 0.3102, + "step": 9871 + }, + { + "epoch": 1.32, + "grad_norm": 0.486328125, + "learning_rate": 0.0001696363403235118, + "loss": 0.4924, + "step": 9872 + }, + { + "epoch": 1.32, + "grad_norm": 0.458984375, + "learning_rate": 0.00016962798234671831, + "loss": 0.2323, + "step": 9873 + }, + { + "epoch": 1.32, + "grad_norm": 0.57421875, + "learning_rate": 0.0001696196234257247, + "loss": 0.4271, + "step": 9874 + }, + { + "epoch": 1.32, + "grad_norm": 0.58984375, + "learning_rate": 0.0001696112635606443, + "loss": 0.377, + "step": 9875 + }, + { + "epoch": 1.32, + "grad_norm": 0.51171875, + "learning_rate": 0.00016960290275159045, + "loss": 0.2716, + "step": 9876 + }, + { + "epoch": 1.32, + "grad_norm": 0.63671875, + "learning_rate": 0.00016959454099867662, + "loss": 0.4193, + "step": 9877 + }, + { + "epoch": 1.32, + "grad_norm": 0.58984375, + "learning_rate": 0.00016958617830201614, + "loss": 0.3145, + "step": 9878 + }, + { + "epoch": 1.32, + "grad_norm": 0.4765625, + "learning_rate": 0.0001695778146617224, + "loss": 0.2598, + "step": 9879 + }, + { + "epoch": 1.32, + "grad_norm": 0.9140625, + "learning_rate": 0.00016956945007790886, + "loss": 0.4626, + "step": 9880 + }, + { + "epoch": 1.32, + "grad_norm": 0.5625, + "learning_rate": 0.00016956108455068892, + "loss": 0.6181, + "step": 9881 + }, + { + "epoch": 1.32, + "grad_norm": 0.55859375, + "learning_rate": 0.000169552718080176, + "loss": 0.5761, + "step": 9882 + }, + { + "epoch": 1.32, + "grad_norm": 0.68359375, + "learning_rate": 0.00016954435066648363, + "loss": 0.4141, + "step": 9883 + }, + { + "epoch": 1.32, + "grad_norm": 0.5703125, + "learning_rate": 0.00016953598230972522, + "loss": 0.3657, + "step": 9884 + }, + { + "epoch": 1.32, + "grad_norm": 0.462890625, + "learning_rate": 0.00016952761301001424, + "loss": 0.4769, + "step": 9885 + }, + { + "epoch": 1.32, + "grad_norm": 0.60546875, + "learning_rate": 0.00016951924276746425, + "loss": 0.3478, + "step": 9886 + }, + { + "epoch": 1.32, + "grad_norm": 0.61328125, + "learning_rate": 0.0001695108715821887, + "loss": 0.2769, + "step": 9887 + }, + { + "epoch": 1.32, + "grad_norm": 0.546875, + "learning_rate": 0.00016950249945430108, + "loss": 0.3805, + "step": 9888 + }, + { + "epoch": 1.32, + "grad_norm": 0.58984375, + "learning_rate": 0.00016949412638391498, + "loss": 0.3358, + "step": 9889 + }, + { + "epoch": 1.32, + "grad_norm": 0.66015625, + "learning_rate": 0.0001694857523711439, + "loss": 0.2918, + "step": 9890 + }, + { + "epoch": 1.32, + "grad_norm": 0.5078125, + "learning_rate": 0.00016947737741610145, + "loss": 0.3122, + "step": 9891 + }, + { + "epoch": 1.32, + "grad_norm": 0.640625, + "learning_rate": 0.0001694690015189012, + "loss": 0.4471, + "step": 9892 + }, + { + "epoch": 1.32, + "grad_norm": 0.546875, + "learning_rate": 0.00016946062467965666, + "loss": 0.2933, + "step": 9893 + }, + { + "epoch": 1.32, + "grad_norm": 0.5234375, + "learning_rate": 0.0001694522468984815, + "loss": 0.3731, + "step": 9894 + }, + { + "epoch": 1.32, + "grad_norm": 0.5234375, + "learning_rate": 0.0001694438681754893, + "loss": 0.6362, + "step": 9895 + }, + { + "epoch": 1.32, + "grad_norm": 0.765625, + "learning_rate": 0.00016943548851079365, + "loss": 0.472, + "step": 9896 + }, + { + "epoch": 1.32, + "grad_norm": 0.55859375, + "learning_rate": 0.0001694271079045082, + "loss": 0.4678, + "step": 9897 + }, + { + "epoch": 1.32, + "grad_norm": 0.66796875, + "learning_rate": 0.00016941872635674665, + "loss": 0.2567, + "step": 9898 + }, + { + "epoch": 1.32, + "grad_norm": 0.392578125, + "learning_rate": 0.00016941034386762263, + "loss": 0.3391, + "step": 9899 + }, + { + "epoch": 1.32, + "grad_norm": 0.75, + "learning_rate": 0.00016940196043724974, + "loss": 0.4849, + "step": 9900 + }, + { + "epoch": 1.32, + "grad_norm": 0.54296875, + "learning_rate": 0.0001693935760657418, + "loss": 0.4105, + "step": 9901 + }, + { + "epoch": 1.32, + "grad_norm": 0.41796875, + "learning_rate": 0.00016938519075321236, + "loss": 0.2438, + "step": 9902 + }, + { + "epoch": 1.32, + "grad_norm": 0.5390625, + "learning_rate": 0.00016937680449977523, + "loss": 0.4473, + "step": 9903 + }, + { + "epoch": 1.32, + "grad_norm": 0.67578125, + "learning_rate": 0.00016936841730554408, + "loss": 0.6767, + "step": 9904 + }, + { + "epoch": 1.32, + "grad_norm": 0.369140625, + "learning_rate": 0.00016936002917063272, + "loss": 0.1424, + "step": 9905 + }, + { + "epoch": 1.32, + "grad_norm": 0.609375, + "learning_rate": 0.00016935164009515482, + "loss": 0.6608, + "step": 9906 + }, + { + "epoch": 1.32, + "grad_norm": 0.474609375, + "learning_rate": 0.00016934325007922417, + "loss": 0.3115, + "step": 9907 + }, + { + "epoch": 1.32, + "grad_norm": 0.609375, + "learning_rate": 0.00016933485912295457, + "loss": 0.474, + "step": 9908 + }, + { + "epoch": 1.32, + "grad_norm": 0.57421875, + "learning_rate": 0.00016932646722645975, + "loss": 0.7183, + "step": 9909 + }, + { + "epoch": 1.32, + "grad_norm": 0.423828125, + "learning_rate": 0.00016931807438985358, + "loss": 0.3227, + "step": 9910 + }, + { + "epoch": 1.32, + "grad_norm": 0.61328125, + "learning_rate": 0.0001693096806132498, + "loss": 0.4439, + "step": 9911 + }, + { + "epoch": 1.32, + "grad_norm": 0.6328125, + "learning_rate": 0.00016930128589676228, + "loss": 0.7772, + "step": 9912 + }, + { + "epoch": 1.32, + "grad_norm": 0.4609375, + "learning_rate": 0.00016929289024050483, + "loss": 0.4311, + "step": 9913 + }, + { + "epoch": 1.32, + "grad_norm": 0.60546875, + "learning_rate": 0.00016928449364459134, + "loss": 0.3531, + "step": 9914 + }, + { + "epoch": 1.32, + "grad_norm": 0.6640625, + "learning_rate": 0.00016927609610913562, + "loss": 0.5308, + "step": 9915 + }, + { + "epoch": 1.32, + "grad_norm": 0.60546875, + "learning_rate": 0.0001692676976342516, + "loss": 0.4082, + "step": 9916 + }, + { + "epoch": 1.32, + "grad_norm": 0.431640625, + "learning_rate": 0.00016925929822005314, + "loss": 0.3328, + "step": 9917 + }, + { + "epoch": 1.32, + "grad_norm": 0.62890625, + "learning_rate": 0.00016925089786665412, + "loss": 0.3714, + "step": 9918 + }, + { + "epoch": 1.32, + "grad_norm": 0.6171875, + "learning_rate": 0.00016924249657416853, + "loss": 0.6348, + "step": 9919 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546875, + "learning_rate": 0.0001692340943427102, + "loss": 0.3185, + "step": 9920 + }, + { + "epoch": 1.32, + "grad_norm": 0.671875, + "learning_rate": 0.00016922569117239314, + "loss": 0.2191, + "step": 9921 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546875, + "learning_rate": 0.0001692172870633313, + "loss": 0.2732, + "step": 9922 + }, + { + "epoch": 1.32, + "grad_norm": 0.44140625, + "learning_rate": 0.0001692088820156386, + "loss": 0.2183, + "step": 9923 + }, + { + "epoch": 1.32, + "grad_norm": 0.59375, + "learning_rate": 0.00016920047602942903, + "loss": 0.3405, + "step": 9924 + }, + { + "epoch": 1.32, + "grad_norm": 0.404296875, + "learning_rate": 0.0001691920691048166, + "loss": 0.2942, + "step": 9925 + }, + { + "epoch": 1.32, + "grad_norm": 0.396484375, + "learning_rate": 0.0001691836612419153, + "loss": 0.1299, + "step": 9926 + }, + { + "epoch": 1.32, + "grad_norm": 0.65625, + "learning_rate": 0.00016917525244083918, + "loss": 0.5963, + "step": 9927 + }, + { + "epoch": 1.32, + "grad_norm": 0.6015625, + "learning_rate": 0.00016916684270170223, + "loss": 0.3385, + "step": 9928 + }, + { + "epoch": 1.32, + "grad_norm": 0.6953125, + "learning_rate": 0.0001691584320246185, + "loss": 0.5941, + "step": 9929 + }, + { + "epoch": 1.33, + "grad_norm": 0.4140625, + "learning_rate": 0.00016915002040970202, + "loss": 0.155, + "step": 9930 + }, + { + "epoch": 1.33, + "grad_norm": 0.546875, + "learning_rate": 0.0001691416078570669, + "loss": 0.513, + "step": 9931 + }, + { + "epoch": 1.33, + "grad_norm": 0.6640625, + "learning_rate": 0.0001691331943668272, + "loss": 0.4664, + "step": 9932 + }, + { + "epoch": 1.33, + "grad_norm": 0.7265625, + "learning_rate": 0.00016912477993909705, + "loss": 0.7589, + "step": 9933 + }, + { + "epoch": 1.33, + "grad_norm": 0.640625, + "learning_rate": 0.0001691163645739905, + "loss": 0.4732, + "step": 9934 + }, + { + "epoch": 1.33, + "grad_norm": 0.55859375, + "learning_rate": 0.00016910794827162167, + "loss": 0.2999, + "step": 9935 + }, + { + "epoch": 1.33, + "grad_norm": 0.6015625, + "learning_rate": 0.0001690995310321047, + "loss": 0.3575, + "step": 9936 + }, + { + "epoch": 1.33, + "grad_norm": 0.5546875, + "learning_rate": 0.00016909111285555377, + "loss": 0.3759, + "step": 9937 + }, + { + "epoch": 1.33, + "grad_norm": 0.51953125, + "learning_rate": 0.000169082693742083, + "loss": 0.1986, + "step": 9938 + }, + { + "epoch": 1.33, + "grad_norm": 0.7265625, + "learning_rate": 0.00016907427369180654, + "loss": 0.3049, + "step": 9939 + }, + { + "epoch": 1.33, + "grad_norm": 0.427734375, + "learning_rate": 0.00016906585270483866, + "loss": 0.27, + "step": 9940 + }, + { + "epoch": 1.33, + "grad_norm": 0.546875, + "learning_rate": 0.00016905743078129347, + "loss": 0.3548, + "step": 9941 + }, + { + "epoch": 1.33, + "grad_norm": 0.482421875, + "learning_rate": 0.00016904900792128516, + "loss": 0.2288, + "step": 9942 + }, + { + "epoch": 1.33, + "grad_norm": 0.5703125, + "learning_rate": 0.00016904058412492797, + "loss": 0.4328, + "step": 9943 + }, + { + "epoch": 1.33, + "grad_norm": 0.5859375, + "learning_rate": 0.00016903215939233622, + "loss": 0.4354, + "step": 9944 + }, + { + "epoch": 1.33, + "grad_norm": 0.62109375, + "learning_rate": 0.00016902373372362404, + "loss": 0.3269, + "step": 9945 + }, + { + "epoch": 1.33, + "grad_norm": 0.4375, + "learning_rate": 0.00016901530711890575, + "loss": 0.313, + "step": 9946 + }, + { + "epoch": 1.33, + "grad_norm": 0.625, + "learning_rate": 0.00016900687957829556, + "loss": 0.4532, + "step": 9947 + }, + { + "epoch": 1.33, + "grad_norm": 0.53515625, + "learning_rate": 0.00016899845110190787, + "loss": 0.2901, + "step": 9948 + }, + { + "epoch": 1.33, + "grad_norm": 0.6171875, + "learning_rate": 0.00016899002168985683, + "loss": 0.3539, + "step": 9949 + }, + { + "epoch": 1.33, + "grad_norm": 0.4921875, + "learning_rate": 0.00016898159134225686, + "loss": 0.6065, + "step": 9950 + }, + { + "epoch": 1.33, + "grad_norm": 0.470703125, + "learning_rate": 0.00016897316005922222, + "loss": 0.3803, + "step": 9951 + }, + { + "epoch": 1.33, + "grad_norm": 0.5625, + "learning_rate": 0.00016896472784086726, + "loss": 0.3892, + "step": 9952 + }, + { + "epoch": 1.33, + "grad_norm": 0.55859375, + "learning_rate": 0.00016895629468730635, + "loss": 0.3461, + "step": 9953 + }, + { + "epoch": 1.33, + "grad_norm": 0.48828125, + "learning_rate": 0.00016894786059865383, + "loss": 0.1902, + "step": 9954 + }, + { + "epoch": 1.33, + "grad_norm": 0.84375, + "learning_rate": 0.00016893942557502404, + "loss": 0.4034, + "step": 9955 + }, + { + "epoch": 1.33, + "grad_norm": 0.640625, + "learning_rate": 0.0001689309896165314, + "loss": 0.3602, + "step": 9956 + }, + { + "epoch": 1.33, + "grad_norm": 0.59375, + "learning_rate": 0.00016892255272329032, + "loss": 0.2184, + "step": 9957 + }, + { + "epoch": 1.33, + "grad_norm": 0.609375, + "learning_rate": 0.00016891411489541517, + "loss": 0.5382, + "step": 9958 + }, + { + "epoch": 1.33, + "grad_norm": 0.54296875, + "learning_rate": 0.00016890567613302043, + "loss": 0.4576, + "step": 9959 + }, + { + "epoch": 1.33, + "grad_norm": 0.37109375, + "learning_rate": 0.00016889723643622048, + "loss": 0.2625, + "step": 9960 + }, + { + "epoch": 1.33, + "grad_norm": 0.4921875, + "learning_rate": 0.00016888879580512976, + "loss": 0.2041, + "step": 9961 + }, + { + "epoch": 1.33, + "grad_norm": 0.5859375, + "learning_rate": 0.00016888035423986278, + "loss": 0.4014, + "step": 9962 + }, + { + "epoch": 1.33, + "grad_norm": 0.6328125, + "learning_rate": 0.00016887191174053397, + "loss": 0.3519, + "step": 9963 + }, + { + "epoch": 1.33, + "grad_norm": 0.57421875, + "learning_rate": 0.00016886346830725785, + "loss": 0.4628, + "step": 9964 + }, + { + "epoch": 1.33, + "grad_norm": 0.376953125, + "learning_rate": 0.00016885502394014892, + "loss": 0.2827, + "step": 9965 + }, + { + "epoch": 1.33, + "grad_norm": 0.6640625, + "learning_rate": 0.00016884657863932165, + "loss": 0.2248, + "step": 9966 + }, + { + "epoch": 1.33, + "grad_norm": 0.67578125, + "learning_rate": 0.00016883813240489058, + "loss": 0.3268, + "step": 9967 + }, + { + "epoch": 1.33, + "grad_norm": 0.5078125, + "learning_rate": 0.00016882968523697028, + "loss": 0.2862, + "step": 9968 + }, + { + "epoch": 1.33, + "grad_norm": 0.65234375, + "learning_rate": 0.00016882123713567526, + "loss": 0.3717, + "step": 9969 + }, + { + "epoch": 1.33, + "grad_norm": 0.482421875, + "learning_rate": 0.0001688127881011201, + "loss": 0.4404, + "step": 9970 + }, + { + "epoch": 1.33, + "grad_norm": 0.416015625, + "learning_rate": 0.0001688043381334194, + "loss": 0.434, + "step": 9971 + }, + { + "epoch": 1.33, + "grad_norm": 0.66015625, + "learning_rate": 0.00016879588723268767, + "loss": 0.3558, + "step": 9972 + }, + { + "epoch": 1.33, + "grad_norm": 0.65625, + "learning_rate": 0.00016878743539903957, + "loss": 0.3725, + "step": 9973 + }, + { + "epoch": 1.33, + "grad_norm": 0.5390625, + "learning_rate": 0.0001687789826325897, + "loss": 0.2123, + "step": 9974 + }, + { + "epoch": 1.33, + "grad_norm": 0.6484375, + "learning_rate": 0.0001687705289334527, + "loss": 0.3576, + "step": 9975 + }, + { + "epoch": 1.33, + "grad_norm": 0.53125, + "learning_rate": 0.00016876207430174315, + "loss": 0.3965, + "step": 9976 + }, + { + "epoch": 1.33, + "grad_norm": 0.53125, + "learning_rate": 0.00016875361873757577, + "loss": 0.5595, + "step": 9977 + }, + { + "epoch": 1.33, + "grad_norm": 0.59375, + "learning_rate": 0.00016874516224106518, + "loss": 0.702, + "step": 9978 + }, + { + "epoch": 1.33, + "grad_norm": 0.53515625, + "learning_rate": 0.0001687367048123261, + "loss": 0.6286, + "step": 9979 + }, + { + "epoch": 1.33, + "grad_norm": 0.392578125, + "learning_rate": 0.00016872824645147315, + "loss": 0.1925, + "step": 9980 + }, + { + "epoch": 1.33, + "grad_norm": 0.390625, + "learning_rate": 0.0001687197871586211, + "loss": 0.1996, + "step": 9981 + }, + { + "epoch": 1.33, + "grad_norm": 0.474609375, + "learning_rate": 0.0001687113269338846, + "loss": 0.707, + "step": 9982 + }, + { + "epoch": 1.33, + "grad_norm": 0.42578125, + "learning_rate": 0.00016870286577737846, + "loss": 0.2119, + "step": 9983 + }, + { + "epoch": 1.33, + "grad_norm": 0.55078125, + "learning_rate": 0.00016869440368921733, + "loss": 0.4869, + "step": 9984 + }, + { + "epoch": 1.33, + "grad_norm": 0.58203125, + "learning_rate": 0.00016868594066951598, + "loss": 0.3167, + "step": 9985 + }, + { + "epoch": 1.33, + "grad_norm": 0.484375, + "learning_rate": 0.00016867747671838924, + "loss": 0.2584, + "step": 9986 + }, + { + "epoch": 1.33, + "grad_norm": 0.51953125, + "learning_rate": 0.0001686690118359518, + "loss": 0.3755, + "step": 9987 + }, + { + "epoch": 1.33, + "grad_norm": 0.48046875, + "learning_rate": 0.00016866054602231848, + "loss": 0.3032, + "step": 9988 + }, + { + "epoch": 1.33, + "grad_norm": 0.5, + "learning_rate": 0.00016865207927760413, + "loss": 0.3418, + "step": 9989 + }, + { + "epoch": 1.33, + "grad_norm": 0.5390625, + "learning_rate": 0.00016864361160192352, + "loss": 0.3362, + "step": 9990 + }, + { + "epoch": 1.33, + "grad_norm": 0.43359375, + "learning_rate": 0.00016863514299539145, + "loss": 0.4241, + "step": 9991 + }, + { + "epoch": 1.33, + "grad_norm": 0.63671875, + "learning_rate": 0.00016862667345812283, + "loss": 0.4938, + "step": 9992 + }, + { + "epoch": 1.33, + "grad_norm": 0.63671875, + "learning_rate": 0.00016861820299023244, + "loss": 0.3677, + "step": 9993 + }, + { + "epoch": 1.33, + "grad_norm": 0.93359375, + "learning_rate": 0.00016860973159183517, + "loss": 0.5809, + "step": 9994 + }, + { + "epoch": 1.33, + "grad_norm": 0.6796875, + "learning_rate": 0.00016860125926304592, + "loss": 0.4523, + "step": 9995 + }, + { + "epoch": 1.33, + "grad_norm": 0.431640625, + "learning_rate": 0.00016859278600397958, + "loss": 0.4359, + "step": 9996 + }, + { + "epoch": 1.33, + "grad_norm": 0.423828125, + "learning_rate": 0.00016858431181475102, + "loss": 0.3026, + "step": 9997 + }, + { + "epoch": 1.33, + "grad_norm": 0.5234375, + "learning_rate": 0.0001685758366954752, + "loss": 0.5239, + "step": 9998 + }, + { + "epoch": 1.33, + "grad_norm": 0.5078125, + "learning_rate": 0.000168567360646267, + "loss": 0.806, + "step": 9999 + }, + { + "epoch": 1.33, + "grad_norm": 0.5546875, + "learning_rate": 0.00016855888366724143, + "loss": 0.488, + "step": 10000 + }, + { + "epoch": 1.33, + "grad_norm": 0.3828125, + "learning_rate": 0.00016855040575851335, + "loss": 0.1793, + "step": 10001 + }, + { + "epoch": 1.33, + "grad_norm": 0.5703125, + "learning_rate": 0.00016854192692019777, + "loss": 0.4476, + "step": 10002 + }, + { + "epoch": 1.33, + "grad_norm": 0.61328125, + "learning_rate": 0.0001685334471524097, + "loss": 0.3906, + "step": 10003 + }, + { + "epoch": 1.33, + "grad_norm": 0.462890625, + "learning_rate": 0.00016852496645526407, + "loss": 0.372, + "step": 10004 + }, + { + "epoch": 1.34, + "grad_norm": 0.54296875, + "learning_rate": 0.00016851648482887592, + "loss": 0.3261, + "step": 10005 + }, + { + "epoch": 1.34, + "grad_norm": 0.498046875, + "learning_rate": 0.00016850800227336029, + "loss": 0.302, + "step": 10006 + }, + { + "epoch": 1.34, + "grad_norm": 0.65234375, + "learning_rate": 0.00016849951878883217, + "loss": 0.2585, + "step": 10007 + }, + { + "epoch": 1.34, + "grad_norm": 0.60546875, + "learning_rate": 0.0001684910343754066, + "loss": 0.7402, + "step": 10008 + }, + { + "epoch": 1.34, + "grad_norm": 0.58984375, + "learning_rate": 0.00016848254903319867, + "loss": 0.1575, + "step": 10009 + }, + { + "epoch": 1.34, + "grad_norm": 0.33203125, + "learning_rate": 0.00016847406276232341, + "loss": 0.1886, + "step": 10010 + }, + { + "epoch": 1.34, + "grad_norm": 0.4296875, + "learning_rate": 0.0001684655755628959, + "loss": 0.3705, + "step": 10011 + }, + { + "epoch": 1.34, + "grad_norm": 0.44921875, + "learning_rate": 0.00016845708743503122, + "loss": 0.2977, + "step": 10012 + }, + { + "epoch": 1.34, + "grad_norm": 0.59375, + "learning_rate": 0.00016844859837884455, + "loss": 0.3956, + "step": 10013 + }, + { + "epoch": 1.34, + "grad_norm": 0.8046875, + "learning_rate": 0.00016844010839445095, + "loss": 0.6038, + "step": 10014 + }, + { + "epoch": 1.34, + "grad_norm": 0.70703125, + "learning_rate": 0.00016843161748196554, + "loss": 0.5083, + "step": 10015 + }, + { + "epoch": 1.34, + "grad_norm": 0.57421875, + "learning_rate": 0.00016842312564150348, + "loss": 0.3894, + "step": 10016 + }, + { + "epoch": 1.34, + "grad_norm": 0.67578125, + "learning_rate": 0.00016841463287317995, + "loss": 0.6052, + "step": 10017 + }, + { + "epoch": 1.34, + "grad_norm": 0.55078125, + "learning_rate": 0.00016840613917711004, + "loss": 0.5369, + "step": 10018 + }, + { + "epoch": 1.34, + "grad_norm": 0.4296875, + "learning_rate": 0.000168397644553409, + "loss": 0.2091, + "step": 10019 + }, + { + "epoch": 1.34, + "grad_norm": 0.5234375, + "learning_rate": 0.000168389149002192, + "loss": 0.2585, + "step": 10020 + }, + { + "epoch": 1.34, + "grad_norm": 0.546875, + "learning_rate": 0.00016838065252357427, + "loss": 0.2081, + "step": 10021 + }, + { + "epoch": 1.34, + "grad_norm": 0.453125, + "learning_rate": 0.00016837215511767098, + "loss": 0.4319, + "step": 10022 + }, + { + "epoch": 1.34, + "grad_norm": 0.52734375, + "learning_rate": 0.0001683636567845974, + "loss": 0.6882, + "step": 10023 + }, + { + "epoch": 1.34, + "grad_norm": 0.48046875, + "learning_rate": 0.00016835515752446875, + "loss": 0.228, + "step": 10024 + }, + { + "epoch": 1.34, + "grad_norm": 0.478515625, + "learning_rate": 0.00016834665733740032, + "loss": 0.4351, + "step": 10025 + }, + { + "epoch": 1.34, + "grad_norm": 0.70703125, + "learning_rate": 0.0001683381562235073, + "loss": 0.5234, + "step": 10026 + }, + { + "epoch": 1.34, + "grad_norm": 0.62109375, + "learning_rate": 0.00016832965418290505, + "loss": 0.4386, + "step": 10027 + }, + { + "epoch": 1.34, + "grad_norm": 0.69140625, + "learning_rate": 0.00016832115121570885, + "loss": 0.191, + "step": 10028 + }, + { + "epoch": 1.34, + "grad_norm": 0.640625, + "learning_rate": 0.00016831264732203397, + "loss": 0.4432, + "step": 10029 + }, + { + "epoch": 1.34, + "grad_norm": 0.74609375, + "learning_rate": 0.00016830414250199578, + "loss": 0.4111, + "step": 10030 + }, + { + "epoch": 1.34, + "grad_norm": 0.5859375, + "learning_rate": 0.00016829563675570958, + "loss": 0.5824, + "step": 10031 + }, + { + "epoch": 1.34, + "grad_norm": 0.59375, + "learning_rate": 0.00016828713008329067, + "loss": 0.5334, + "step": 10032 + }, + { + "epoch": 1.34, + "grad_norm": 0.6015625, + "learning_rate": 0.0001682786224848545, + "loss": 0.5433, + "step": 10033 + }, + { + "epoch": 1.34, + "grad_norm": 0.59765625, + "learning_rate": 0.00016827011396051633, + "loss": 0.3043, + "step": 10034 + }, + { + "epoch": 1.34, + "grad_norm": 0.6640625, + "learning_rate": 0.00016826160451039162, + "loss": 0.4067, + "step": 10035 + }, + { + "epoch": 1.34, + "grad_norm": 0.5859375, + "learning_rate": 0.00016825309413459575, + "loss": 0.5624, + "step": 10036 + }, + { + "epoch": 1.34, + "grad_norm": 0.828125, + "learning_rate": 0.00016824458283324413, + "loss": 0.5178, + "step": 10037 + }, + { + "epoch": 1.34, + "grad_norm": 0.52734375, + "learning_rate": 0.00016823607060645216, + "loss": 0.3552, + "step": 10038 + }, + { + "epoch": 1.34, + "grad_norm": 0.515625, + "learning_rate": 0.00016822755745433528, + "loss": 0.4755, + "step": 10039 + }, + { + "epoch": 1.34, + "grad_norm": 0.498046875, + "learning_rate": 0.00016821904337700893, + "loss": 0.3847, + "step": 10040 + }, + { + "epoch": 1.34, + "grad_norm": 0.59375, + "learning_rate": 0.0001682105283745886, + "loss": 0.5436, + "step": 10041 + }, + { + "epoch": 1.34, + "grad_norm": 0.57421875, + "learning_rate": 0.00016820201244718968, + "loss": 0.2869, + "step": 10042 + }, + { + "epoch": 1.34, + "grad_norm": 0.52734375, + "learning_rate": 0.00016819349559492772, + "loss": 0.4364, + "step": 10043 + }, + { + "epoch": 1.34, + "grad_norm": 0.54296875, + "learning_rate": 0.00016818497781791818, + "loss": 0.2334, + "step": 10044 + }, + { + "epoch": 1.34, + "grad_norm": 0.6484375, + "learning_rate": 0.0001681764591162766, + "loss": 0.2332, + "step": 10045 + }, + { + "epoch": 1.34, + "grad_norm": 0.56640625, + "learning_rate": 0.00016816793949011848, + "loss": 0.5124, + "step": 10046 + }, + { + "epoch": 1.34, + "grad_norm": 0.41015625, + "learning_rate": 0.00016815941893955934, + "loss": 0.1858, + "step": 10047 + }, + { + "epoch": 1.34, + "grad_norm": 0.76171875, + "learning_rate": 0.0001681508974647147, + "loss": 0.5316, + "step": 10048 + }, + { + "epoch": 1.34, + "grad_norm": 0.6953125, + "learning_rate": 0.0001681423750657002, + "loss": 0.7304, + "step": 10049 + }, + { + "epoch": 1.34, + "grad_norm": 0.40625, + "learning_rate": 0.00016813385174263137, + "loss": 0.2268, + "step": 10050 + }, + { + "epoch": 1.34, + "grad_norm": 0.515625, + "learning_rate": 0.00016812532749562375, + "loss": 0.354, + "step": 10051 + }, + { + "epoch": 1.34, + "grad_norm": 0.765625, + "learning_rate": 0.00016811680232479298, + "loss": 0.3894, + "step": 10052 + }, + { + "epoch": 1.34, + "grad_norm": 0.48828125, + "learning_rate": 0.00016810827623025464, + "loss": 0.4007, + "step": 10053 + }, + { + "epoch": 1.34, + "grad_norm": 0.58984375, + "learning_rate": 0.0001680997492121244, + "loss": 0.5844, + "step": 10054 + }, + { + "epoch": 1.34, + "grad_norm": 0.5, + "learning_rate": 0.00016809122127051782, + "loss": 0.498, + "step": 10055 + }, + { + "epoch": 1.34, + "grad_norm": 0.67578125, + "learning_rate": 0.0001680826924055506, + "loss": 0.51, + "step": 10056 + }, + { + "epoch": 1.34, + "grad_norm": 0.609375, + "learning_rate": 0.00016807416261733834, + "loss": 0.3452, + "step": 10057 + }, + { + "epoch": 1.34, + "grad_norm": 0.490234375, + "learning_rate": 0.00016806563190599678, + "loss": 0.538, + "step": 10058 + }, + { + "epoch": 1.34, + "grad_norm": 0.447265625, + "learning_rate": 0.00016805710027164154, + "loss": 0.446, + "step": 10059 + }, + { + "epoch": 1.34, + "grad_norm": 0.59375, + "learning_rate": 0.00016804856771438835, + "loss": 0.4131, + "step": 10060 + }, + { + "epoch": 1.34, + "grad_norm": 0.8359375, + "learning_rate": 0.00016804003423435293, + "loss": 0.3691, + "step": 10061 + }, + { + "epoch": 1.34, + "grad_norm": 0.54296875, + "learning_rate": 0.00016803149983165097, + "loss": 0.4517, + "step": 10062 + }, + { + "epoch": 1.34, + "grad_norm": 0.609375, + "learning_rate": 0.0001680229645063982, + "loss": 0.7129, + "step": 10063 + }, + { + "epoch": 1.34, + "grad_norm": 0.6015625, + "learning_rate": 0.0001680144282587104, + "loss": 0.3829, + "step": 10064 + }, + { + "epoch": 1.34, + "grad_norm": 0.64453125, + "learning_rate": 0.00016800589108870325, + "loss": 0.3962, + "step": 10065 + }, + { + "epoch": 1.34, + "grad_norm": 0.60546875, + "learning_rate": 0.0001679973529964926, + "loss": 0.5383, + "step": 10066 + }, + { + "epoch": 1.34, + "grad_norm": 0.59765625, + "learning_rate": 0.0001679888139821942, + "loss": 0.4089, + "step": 10067 + }, + { + "epoch": 1.34, + "grad_norm": 0.65234375, + "learning_rate": 0.00016798027404592384, + "loss": 0.353, + "step": 10068 + }, + { + "epoch": 1.34, + "grad_norm": 0.482421875, + "learning_rate": 0.00016797173318779736, + "loss": 0.2761, + "step": 10069 + }, + { + "epoch": 1.34, + "grad_norm": 0.671875, + "learning_rate": 0.00016796319140793053, + "loss": 0.7816, + "step": 10070 + }, + { + "epoch": 1.34, + "grad_norm": 0.63671875, + "learning_rate": 0.00016795464870643923, + "loss": 0.4171, + "step": 10071 + }, + { + "epoch": 1.34, + "grad_norm": 0.62109375, + "learning_rate": 0.00016794610508343927, + "loss": 0.4235, + "step": 10072 + }, + { + "epoch": 1.34, + "grad_norm": 0.5390625, + "learning_rate": 0.00016793756053904647, + "loss": 0.2412, + "step": 10073 + }, + { + "epoch": 1.34, + "grad_norm": 0.57421875, + "learning_rate": 0.00016792901507337683, + "loss": 0.3226, + "step": 10074 + }, + { + "epoch": 1.34, + "grad_norm": 0.66796875, + "learning_rate": 0.0001679204686865461, + "loss": 0.3576, + "step": 10075 + }, + { + "epoch": 1.34, + "grad_norm": 0.6953125, + "learning_rate": 0.00016791192137867022, + "loss": 0.4565, + "step": 10076 + }, + { + "epoch": 1.34, + "grad_norm": 0.640625, + "learning_rate": 0.0001679033731498651, + "loss": 0.4684, + "step": 10077 + }, + { + "epoch": 1.34, + "grad_norm": 0.53125, + "learning_rate": 0.0001678948240002467, + "loss": 0.4093, + "step": 10078 + }, + { + "epoch": 1.34, + "grad_norm": 0.55078125, + "learning_rate": 0.00016788627392993086, + "loss": 0.4299, + "step": 10079 + }, + { + "epoch": 1.35, + "grad_norm": 0.640625, + "learning_rate": 0.00016787772293903362, + "loss": 0.6004, + "step": 10080 + }, + { + "epoch": 1.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00016786917102767088, + "loss": 0.248, + "step": 10081 + }, + { + "epoch": 1.35, + "grad_norm": 0.50390625, + "learning_rate": 0.00016786061819595863, + "loss": 0.3457, + "step": 10082 + }, + { + "epoch": 1.35, + "grad_norm": 0.478515625, + "learning_rate": 0.00016785206444401283, + "loss": 0.2612, + "step": 10083 + }, + { + "epoch": 1.35, + "grad_norm": 0.5546875, + "learning_rate": 0.00016784350977194946, + "loss": 0.3048, + "step": 10084 + }, + { + "epoch": 1.35, + "grad_norm": 0.7578125, + "learning_rate": 0.0001678349541798846, + "loss": 0.4079, + "step": 10085 + }, + { + "epoch": 1.35, + "grad_norm": 0.515625, + "learning_rate": 0.00016782639766793422, + "loss": 0.3836, + "step": 10086 + }, + { + "epoch": 1.35, + "grad_norm": 0.5234375, + "learning_rate": 0.00016781784023621436, + "loss": 0.3978, + "step": 10087 + }, + { + "epoch": 1.35, + "grad_norm": 0.5234375, + "learning_rate": 0.00016780928188484104, + "loss": 0.4985, + "step": 10088 + }, + { + "epoch": 1.35, + "grad_norm": 0.68359375, + "learning_rate": 0.00016780072261393035, + "loss": 0.3091, + "step": 10089 + }, + { + "epoch": 1.35, + "grad_norm": 0.55078125, + "learning_rate": 0.00016779216242359833, + "loss": 0.3923, + "step": 10090 + }, + { + "epoch": 1.35, + "grad_norm": 0.4296875, + "learning_rate": 0.0001677836013139611, + "loss": 0.2132, + "step": 10091 + }, + { + "epoch": 1.35, + "grad_norm": 0.462890625, + "learning_rate": 0.00016777503928513474, + "loss": 0.3372, + "step": 10092 + }, + { + "epoch": 1.35, + "grad_norm": 0.4921875, + "learning_rate": 0.00016776647633723535, + "loss": 0.3125, + "step": 10093 + }, + { + "epoch": 1.35, + "grad_norm": 0.423828125, + "learning_rate": 0.00016775791247037903, + "loss": 0.444, + "step": 10094 + }, + { + "epoch": 1.35, + "grad_norm": 0.578125, + "learning_rate": 0.0001677493476846819, + "loss": 0.4819, + "step": 10095 + }, + { + "epoch": 1.35, + "grad_norm": 0.5390625, + "learning_rate": 0.00016774078198026018, + "loss": 0.5196, + "step": 10096 + }, + { + "epoch": 1.35, + "grad_norm": 0.86328125, + "learning_rate": 0.00016773221535722996, + "loss": 0.5188, + "step": 10097 + }, + { + "epoch": 1.35, + "grad_norm": 0.486328125, + "learning_rate": 0.00016772364781570746, + "loss": 0.2221, + "step": 10098 + }, + { + "epoch": 1.35, + "grad_norm": 0.486328125, + "learning_rate": 0.00016771507935580882, + "loss": 0.219, + "step": 10099 + }, + { + "epoch": 1.35, + "grad_norm": 0.8046875, + "learning_rate": 0.0001677065099776502, + "loss": 0.6644, + "step": 10100 + }, + { + "epoch": 1.35, + "grad_norm": 0.83203125, + "learning_rate": 0.00016769793968134791, + "loss": 0.7164, + "step": 10101 + }, + { + "epoch": 1.35, + "grad_norm": 0.546875, + "learning_rate": 0.00016768936846701807, + "loss": 0.6194, + "step": 10102 + }, + { + "epoch": 1.35, + "grad_norm": 0.53515625, + "learning_rate": 0.00016768079633477698, + "loss": 0.4621, + "step": 10103 + }, + { + "epoch": 1.35, + "grad_norm": 0.49609375, + "learning_rate": 0.00016767222328474082, + "loss": 0.4085, + "step": 10104 + }, + { + "epoch": 1.35, + "grad_norm": 0.439453125, + "learning_rate": 0.00016766364931702593, + "loss": 0.1797, + "step": 10105 + }, + { + "epoch": 1.35, + "grad_norm": 0.5078125, + "learning_rate": 0.0001676550744317485, + "loss": 0.4177, + "step": 10106 + }, + { + "epoch": 1.35, + "grad_norm": 0.4609375, + "learning_rate": 0.00016764649862902484, + "loss": 0.4607, + "step": 10107 + }, + { + "epoch": 1.35, + "grad_norm": 0.50390625, + "learning_rate": 0.00016763792190897122, + "loss": 0.2421, + "step": 10108 + }, + { + "epoch": 1.35, + "grad_norm": 0.5546875, + "learning_rate": 0.000167629344271704, + "loss": 0.5136, + "step": 10109 + }, + { + "epoch": 1.35, + "grad_norm": 0.73828125, + "learning_rate": 0.00016762076571733947, + "loss": 0.5273, + "step": 10110 + }, + { + "epoch": 1.35, + "grad_norm": 0.5546875, + "learning_rate": 0.00016761218624599395, + "loss": 0.3715, + "step": 10111 + }, + { + "epoch": 1.35, + "grad_norm": 0.72265625, + "learning_rate": 0.0001676036058577838, + "loss": 0.4329, + "step": 10112 + }, + { + "epoch": 1.35, + "grad_norm": 0.39453125, + "learning_rate": 0.00016759502455282536, + "loss": 0.26, + "step": 10113 + }, + { + "epoch": 1.35, + "grad_norm": 0.41796875, + "learning_rate": 0.00016758644233123499, + "loss": 0.2431, + "step": 10114 + }, + { + "epoch": 1.35, + "grad_norm": 0.56640625, + "learning_rate": 0.0001675778591931291, + "loss": 0.3488, + "step": 10115 + }, + { + "epoch": 1.35, + "grad_norm": 0.4921875, + "learning_rate": 0.00016756927513862405, + "loss": 0.2394, + "step": 10116 + }, + { + "epoch": 1.35, + "grad_norm": 0.5546875, + "learning_rate": 0.0001675606901678363, + "loss": 0.312, + "step": 10117 + }, + { + "epoch": 1.35, + "grad_norm": 0.474609375, + "learning_rate": 0.00016755210428088223, + "loss": 0.397, + "step": 10118 + }, + { + "epoch": 1.35, + "grad_norm": 0.53515625, + "learning_rate": 0.0001675435174778782, + "loss": 0.2741, + "step": 10119 + }, + { + "epoch": 1.35, + "grad_norm": 0.734375, + "learning_rate": 0.00016753492975894082, + "loss": 0.1729, + "step": 10120 + }, + { + "epoch": 1.35, + "grad_norm": 0.515625, + "learning_rate": 0.0001675263411241864, + "loss": 0.4084, + "step": 10121 + }, + { + "epoch": 1.35, + "grad_norm": 0.44140625, + "learning_rate": 0.00016751775157373145, + "loss": 0.356, + "step": 10122 + }, + { + "epoch": 1.35, + "grad_norm": 0.6484375, + "learning_rate": 0.00016750916110769247, + "loss": 0.6811, + "step": 10123 + }, + { + "epoch": 1.35, + "grad_norm": 0.45703125, + "learning_rate": 0.0001675005697261859, + "loss": 0.3566, + "step": 10124 + }, + { + "epoch": 1.35, + "grad_norm": 0.427734375, + "learning_rate": 0.00016749197742932834, + "loss": 0.2481, + "step": 10125 + }, + { + "epoch": 1.35, + "grad_norm": 0.67578125, + "learning_rate": 0.00016748338421723622, + "loss": 0.385, + "step": 10126 + }, + { + "epoch": 1.35, + "grad_norm": 0.44921875, + "learning_rate": 0.00016747479009002612, + "loss": 0.1823, + "step": 10127 + }, + { + "epoch": 1.35, + "grad_norm": 0.71875, + "learning_rate": 0.00016746619504781455, + "loss": 0.3161, + "step": 10128 + }, + { + "epoch": 1.35, + "grad_norm": 0.435546875, + "learning_rate": 0.00016745759909071807, + "loss": 0.2952, + "step": 10129 + }, + { + "epoch": 1.35, + "grad_norm": 0.462890625, + "learning_rate": 0.00016744900221885324, + "loss": 0.2591, + "step": 10130 + }, + { + "epoch": 1.35, + "grad_norm": 0.58203125, + "learning_rate": 0.0001674404044323367, + "loss": 0.3916, + "step": 10131 + }, + { + "epoch": 1.35, + "grad_norm": 0.51953125, + "learning_rate": 0.00016743180573128495, + "loss": 0.2156, + "step": 10132 + }, + { + "epoch": 1.35, + "grad_norm": 0.6953125, + "learning_rate": 0.00016742320611581464, + "loss": 0.545, + "step": 10133 + }, + { + "epoch": 1.35, + "grad_norm": 0.455078125, + "learning_rate": 0.0001674146055860424, + "loss": 0.2883, + "step": 10134 + }, + { + "epoch": 1.35, + "grad_norm": 0.640625, + "learning_rate": 0.00016740600414208483, + "loss": 0.4822, + "step": 10135 + }, + { + "epoch": 1.35, + "grad_norm": 0.51171875, + "learning_rate": 0.0001673974017840586, + "loss": 0.4168, + "step": 10136 + }, + { + "epoch": 1.35, + "grad_norm": 0.59765625, + "learning_rate": 0.0001673887985120803, + "loss": 0.5572, + "step": 10137 + }, + { + "epoch": 1.35, + "grad_norm": 0.6171875, + "learning_rate": 0.00016738019432626668, + "loss": 0.363, + "step": 10138 + }, + { + "epoch": 1.35, + "grad_norm": 0.5234375, + "learning_rate": 0.00016737158922673436, + "loss": 0.3746, + "step": 10139 + }, + { + "epoch": 1.35, + "grad_norm": 0.453125, + "learning_rate": 0.00016736298321360008, + "loss": 0.2351, + "step": 10140 + }, + { + "epoch": 1.35, + "grad_norm": 0.57421875, + "learning_rate": 0.0001673543762869805, + "loss": 0.4195, + "step": 10141 + }, + { + "epoch": 1.35, + "grad_norm": 0.5859375, + "learning_rate": 0.00016734576844699235, + "loss": 0.572, + "step": 10142 + }, + { + "epoch": 1.35, + "grad_norm": 0.65625, + "learning_rate": 0.00016733715969375238, + "loss": 0.5016, + "step": 10143 + }, + { + "epoch": 1.35, + "grad_norm": 0.498046875, + "learning_rate": 0.00016732855002737725, + "loss": 0.4135, + "step": 10144 + }, + { + "epoch": 1.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00016731993944798382, + "loss": 0.6254, + "step": 10145 + }, + { + "epoch": 1.35, + "grad_norm": 0.51171875, + "learning_rate": 0.00016731132795568877, + "loss": 0.3891, + "step": 10146 + }, + { + "epoch": 1.35, + "grad_norm": 0.67578125, + "learning_rate": 0.00016730271555060898, + "loss": 0.3663, + "step": 10147 + }, + { + "epoch": 1.35, + "grad_norm": 0.62109375, + "learning_rate": 0.00016729410223286113, + "loss": 0.4621, + "step": 10148 + }, + { + "epoch": 1.35, + "grad_norm": 0.49609375, + "learning_rate": 0.00016728548800256206, + "loss": 0.2736, + "step": 10149 + }, + { + "epoch": 1.35, + "grad_norm": 0.5546875, + "learning_rate": 0.00016727687285982858, + "loss": 0.8333, + "step": 10150 + }, + { + "epoch": 1.35, + "grad_norm": 0.5546875, + "learning_rate": 0.00016726825680477751, + "loss": 0.5488, + "step": 10151 + }, + { + "epoch": 1.35, + "grad_norm": 0.546875, + "learning_rate": 0.00016725963983752574, + "loss": 0.2849, + "step": 10152 + }, + { + "epoch": 1.35, + "grad_norm": 0.6484375, + "learning_rate": 0.0001672510219581901, + "loss": 0.4583, + "step": 10153 + }, + { + "epoch": 1.35, + "grad_norm": 0.73828125, + "learning_rate": 0.00016724240316688742, + "loss": 0.3403, + "step": 10154 + }, + { + "epoch": 1.36, + "grad_norm": 0.68359375, + "learning_rate": 0.0001672337834637346, + "loss": 0.3685, + "step": 10155 + }, + { + "epoch": 1.36, + "grad_norm": 0.69921875, + "learning_rate": 0.00016722516284884851, + "loss": 0.7692, + "step": 10156 + }, + { + "epoch": 1.36, + "grad_norm": 0.51953125, + "learning_rate": 0.00016721654132234609, + "loss": 0.343, + "step": 10157 + }, + { + "epoch": 1.36, + "grad_norm": 0.5390625, + "learning_rate": 0.00016720791888434418, + "loss": 0.4403, + "step": 10158 + }, + { + "epoch": 1.36, + "grad_norm": 0.55078125, + "learning_rate": 0.00016719929553495982, + "loss": 0.3139, + "step": 10159 + }, + { + "epoch": 1.36, + "grad_norm": 0.494140625, + "learning_rate": 0.00016719067127430988, + "loss": 0.2676, + "step": 10160 + }, + { + "epoch": 1.36, + "grad_norm": 0.73046875, + "learning_rate": 0.00016718204610251128, + "loss": 0.2771, + "step": 10161 + }, + { + "epoch": 1.36, + "grad_norm": 0.578125, + "learning_rate": 0.00016717342001968104, + "loss": 0.4891, + "step": 10162 + }, + { + "epoch": 1.36, + "grad_norm": 0.6015625, + "learning_rate": 0.0001671647930259361, + "loss": 0.394, + "step": 10163 + }, + { + "epoch": 1.36, + "grad_norm": 0.484375, + "learning_rate": 0.00016715616512139348, + "loss": 0.3142, + "step": 10164 + }, + { + "epoch": 1.36, + "grad_norm": 0.376953125, + "learning_rate": 0.00016714753630617013, + "loss": 0.1488, + "step": 10165 + }, + { + "epoch": 1.36, + "grad_norm": 0.578125, + "learning_rate": 0.0001671389065803831, + "loss": 0.325, + "step": 10166 + }, + { + "epoch": 1.36, + "grad_norm": 0.50390625, + "learning_rate": 0.00016713027594414946, + "loss": 0.474, + "step": 10167 + }, + { + "epoch": 1.36, + "grad_norm": 0.578125, + "learning_rate": 0.00016712164439758613, + "loss": 0.3629, + "step": 10168 + }, + { + "epoch": 1.36, + "grad_norm": 0.58203125, + "learning_rate": 0.00016711301194081025, + "loss": 0.345, + "step": 10169 + }, + { + "epoch": 1.36, + "grad_norm": 0.984375, + "learning_rate": 0.00016710437857393883, + "loss": 0.4381, + "step": 10170 + }, + { + "epoch": 1.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016709574429708902, + "loss": 0.2897, + "step": 10171 + }, + { + "epoch": 1.36, + "grad_norm": 0.5546875, + "learning_rate": 0.0001670871091103778, + "loss": 0.5893, + "step": 10172 + }, + { + "epoch": 1.36, + "grad_norm": 0.6015625, + "learning_rate": 0.00016707847301392236, + "loss": 0.2665, + "step": 10173 + }, + { + "epoch": 1.36, + "grad_norm": 0.50390625, + "learning_rate": 0.00016706983600783977, + "loss": 0.3686, + "step": 10174 + }, + { + "epoch": 1.36, + "grad_norm": 0.3984375, + "learning_rate": 0.00016706119809224715, + "loss": 0.236, + "step": 10175 + }, + { + "epoch": 1.36, + "grad_norm": 0.40625, + "learning_rate": 0.00016705255926726165, + "loss": 0.1712, + "step": 10176 + }, + { + "epoch": 1.36, + "grad_norm": 0.62109375, + "learning_rate": 0.0001670439195330004, + "loss": 0.5416, + "step": 10177 + }, + { + "epoch": 1.36, + "grad_norm": 0.55078125, + "learning_rate": 0.0001670352788895806, + "loss": 0.2618, + "step": 10178 + }, + { + "epoch": 1.36, + "grad_norm": 0.60546875, + "learning_rate": 0.00016702663733711936, + "loss": 0.4753, + "step": 10179 + }, + { + "epoch": 1.36, + "grad_norm": 0.640625, + "learning_rate": 0.0001670179948757339, + "loss": 0.3455, + "step": 10180 + }, + { + "epoch": 1.36, + "grad_norm": 0.388671875, + "learning_rate": 0.00016700935150554144, + "loss": 0.3253, + "step": 10181 + }, + { + "epoch": 1.36, + "grad_norm": 0.640625, + "learning_rate": 0.00016700070722665917, + "loss": 0.369, + "step": 10182 + }, + { + "epoch": 1.36, + "grad_norm": 0.58203125, + "learning_rate": 0.0001669920620392043, + "loss": 0.557, + "step": 10183 + }, + { + "epoch": 1.36, + "grad_norm": 0.54296875, + "learning_rate": 0.0001669834159432941, + "loss": 0.4488, + "step": 10184 + }, + { + "epoch": 1.36, + "grad_norm": 0.59375, + "learning_rate": 0.00016697476893904576, + "loss": 0.5149, + "step": 10185 + }, + { + "epoch": 1.36, + "grad_norm": 0.63671875, + "learning_rate": 0.00016696612102657656, + "loss": 0.4431, + "step": 10186 + }, + { + "epoch": 1.36, + "grad_norm": 0.44921875, + "learning_rate": 0.00016695747220600378, + "loss": 0.309, + "step": 10187 + }, + { + "epoch": 1.36, + "grad_norm": 0.63671875, + "learning_rate": 0.00016694882247744475, + "loss": 0.3977, + "step": 10188 + }, + { + "epoch": 1.36, + "grad_norm": 0.42578125, + "learning_rate": 0.00016694017184101667, + "loss": 0.2838, + "step": 10189 + }, + { + "epoch": 1.36, + "grad_norm": 0.78515625, + "learning_rate": 0.00016693152029683694, + "loss": 0.3782, + "step": 10190 + }, + { + "epoch": 1.36, + "grad_norm": 0.50390625, + "learning_rate": 0.0001669228678450228, + "loss": 0.3215, + "step": 10191 + }, + { + "epoch": 1.36, + "grad_norm": 0.5625, + "learning_rate": 0.00016691421448569163, + "loss": 0.3342, + "step": 10192 + }, + { + "epoch": 1.36, + "grad_norm": 0.451171875, + "learning_rate": 0.00016690556021896078, + "loss": 0.4161, + "step": 10193 + }, + { + "epoch": 1.36, + "grad_norm": 0.482421875, + "learning_rate": 0.0001668969050449476, + "loss": 0.3712, + "step": 10194 + }, + { + "epoch": 1.36, + "grad_norm": 0.64453125, + "learning_rate": 0.00016688824896376943, + "loss": 0.3181, + "step": 10195 + }, + { + "epoch": 1.36, + "grad_norm": 0.447265625, + "learning_rate": 0.00016687959197554368, + "loss": 0.3439, + "step": 10196 + }, + { + "epoch": 1.36, + "grad_norm": 0.5234375, + "learning_rate": 0.00016687093408038775, + "loss": 0.2037, + "step": 10197 + }, + { + "epoch": 1.36, + "grad_norm": 0.455078125, + "learning_rate": 0.00016686227527841903, + "loss": 0.3036, + "step": 10198 + }, + { + "epoch": 1.36, + "grad_norm": 0.50390625, + "learning_rate": 0.00016685361556975493, + "loss": 0.3934, + "step": 10199 + }, + { + "epoch": 1.36, + "grad_norm": 0.3984375, + "learning_rate": 0.00016684495495451293, + "loss": 0.3084, + "step": 10200 + }, + { + "epoch": 1.36, + "grad_norm": 0.60546875, + "learning_rate": 0.00016683629343281045, + "loss": 0.464, + "step": 10201 + }, + { + "epoch": 1.36, + "grad_norm": 0.765625, + "learning_rate": 0.00016682763100476489, + "loss": 0.6404, + "step": 10202 + }, + { + "epoch": 1.36, + "grad_norm": 0.490234375, + "learning_rate": 0.00016681896767049378, + "loss": 0.4405, + "step": 10203 + }, + { + "epoch": 1.36, + "grad_norm": 0.671875, + "learning_rate": 0.0001668103034301146, + "loss": 0.4788, + "step": 10204 + }, + { + "epoch": 1.36, + "grad_norm": 0.490234375, + "learning_rate": 0.00016680163828374476, + "loss": 0.456, + "step": 10205 + }, + { + "epoch": 1.36, + "grad_norm": 0.875, + "learning_rate": 0.0001667929722315019, + "loss": 0.4167, + "step": 10206 + }, + { + "epoch": 1.36, + "grad_norm": 0.546875, + "learning_rate": 0.00016678430527350348, + "loss": 0.3468, + "step": 10207 + }, + { + "epoch": 1.36, + "grad_norm": 0.515625, + "learning_rate": 0.000166775637409867, + "loss": 0.3368, + "step": 10208 + }, + { + "epoch": 1.36, + "grad_norm": 0.60546875, + "learning_rate": 0.00016676696864071, + "loss": 0.5097, + "step": 10209 + }, + { + "epoch": 1.36, + "grad_norm": 0.48828125, + "learning_rate": 0.00016675829896615007, + "loss": 0.4564, + "step": 10210 + }, + { + "epoch": 1.36, + "grad_norm": 0.6640625, + "learning_rate": 0.00016674962838630477, + "loss": 0.4058, + "step": 10211 + }, + { + "epoch": 1.36, + "grad_norm": 0.66796875, + "learning_rate": 0.00016674095690129165, + "loss": 0.4268, + "step": 10212 + }, + { + "epoch": 1.36, + "grad_norm": 0.5390625, + "learning_rate": 0.00016673228451122833, + "loss": 0.2577, + "step": 10213 + }, + { + "epoch": 1.36, + "grad_norm": 0.64453125, + "learning_rate": 0.00016672361121623238, + "loss": 0.4202, + "step": 10214 + }, + { + "epoch": 1.36, + "grad_norm": 0.51953125, + "learning_rate": 0.0001667149370164215, + "loss": 0.5337, + "step": 10215 + }, + { + "epoch": 1.36, + "grad_norm": 0.60546875, + "learning_rate": 0.00016670626191191322, + "loss": 0.4172, + "step": 10216 + }, + { + "epoch": 1.36, + "grad_norm": 0.53515625, + "learning_rate": 0.0001666975859028252, + "loss": 0.2855, + "step": 10217 + }, + { + "epoch": 1.36, + "grad_norm": 0.443359375, + "learning_rate": 0.0001666889089892751, + "loss": 0.5429, + "step": 10218 + }, + { + "epoch": 1.36, + "grad_norm": 0.546875, + "learning_rate": 0.00016668023117138062, + "loss": 0.3606, + "step": 10219 + }, + { + "epoch": 1.36, + "grad_norm": 0.5390625, + "learning_rate": 0.00016667155244925942, + "loss": 0.5731, + "step": 10220 + }, + { + "epoch": 1.36, + "grad_norm": 0.6484375, + "learning_rate": 0.00016666287282302917, + "loss": 0.3626, + "step": 10221 + }, + { + "epoch": 1.36, + "grad_norm": 0.56640625, + "learning_rate": 0.0001666541922928076, + "loss": 0.2377, + "step": 10222 + }, + { + "epoch": 1.36, + "grad_norm": 0.5625, + "learning_rate": 0.00016664551085871236, + "loss": 0.4516, + "step": 10223 + }, + { + "epoch": 1.36, + "grad_norm": 0.439453125, + "learning_rate": 0.00016663682852086125, + "loss": 0.1651, + "step": 10224 + }, + { + "epoch": 1.36, + "grad_norm": 0.482421875, + "learning_rate": 0.00016662814527937197, + "loss": 0.4003, + "step": 10225 + }, + { + "epoch": 1.36, + "grad_norm": 0.484375, + "learning_rate": 0.0001666194611343623, + "loss": 0.4433, + "step": 10226 + }, + { + "epoch": 1.36, + "grad_norm": 0.48828125, + "learning_rate": 0.00016661077608594993, + "loss": 0.4009, + "step": 10227 + }, + { + "epoch": 1.36, + "grad_norm": 0.69921875, + "learning_rate": 0.00016660209013425272, + "loss": 0.7188, + "step": 10228 + }, + { + "epoch": 1.36, + "grad_norm": 0.6015625, + "learning_rate": 0.0001665934032793884, + "loss": 0.2671, + "step": 10229 + }, + { + "epoch": 1.37, + "grad_norm": 0.451171875, + "learning_rate": 0.0001665847155214748, + "loss": 0.1972, + "step": 10230 + }, + { + "epoch": 1.37, + "grad_norm": 0.58984375, + "learning_rate": 0.00016657602686062975, + "loss": 0.3458, + "step": 10231 + }, + { + "epoch": 1.37, + "grad_norm": 0.62890625, + "learning_rate": 0.00016656733729697104, + "loss": 0.4268, + "step": 10232 + }, + { + "epoch": 1.37, + "grad_norm": 0.59765625, + "learning_rate": 0.0001665586468306165, + "loss": 0.4288, + "step": 10233 + }, + { + "epoch": 1.37, + "grad_norm": 0.65234375, + "learning_rate": 0.00016654995546168402, + "loss": 0.5792, + "step": 10234 + }, + { + "epoch": 1.37, + "grad_norm": 0.62109375, + "learning_rate": 0.0001665412631902914, + "loss": 0.464, + "step": 10235 + }, + { + "epoch": 1.37, + "grad_norm": 0.6484375, + "learning_rate": 0.00016653257001655652, + "loss": 0.5926, + "step": 10236 + }, + { + "epoch": 1.37, + "grad_norm": 0.94921875, + "learning_rate": 0.00016652387594059736, + "loss": 0.5724, + "step": 10237 + }, + { + "epoch": 1.37, + "grad_norm": 0.58203125, + "learning_rate": 0.0001665151809625317, + "loss": 0.3053, + "step": 10238 + }, + { + "epoch": 1.37, + "grad_norm": 0.56640625, + "learning_rate": 0.00016650648508247752, + "loss": 0.7317, + "step": 10239 + }, + { + "epoch": 1.37, + "grad_norm": 0.65234375, + "learning_rate": 0.0001664977883005527, + "loss": 0.3495, + "step": 10240 + }, + { + "epoch": 1.37, + "grad_norm": 0.56640625, + "learning_rate": 0.0001664890906168752, + "loss": 0.5727, + "step": 10241 + }, + { + "epoch": 1.37, + "grad_norm": 0.45703125, + "learning_rate": 0.00016648039203156297, + "loss": 0.5125, + "step": 10242 + }, + { + "epoch": 1.37, + "grad_norm": 0.53125, + "learning_rate": 0.00016647169254473395, + "loss": 0.5379, + "step": 10243 + }, + { + "epoch": 1.37, + "grad_norm": 0.6953125, + "learning_rate": 0.0001664629921565061, + "loss": 0.4203, + "step": 10244 + }, + { + "epoch": 1.37, + "grad_norm": 0.53125, + "learning_rate": 0.00016645429086699744, + "loss": 0.4974, + "step": 10245 + }, + { + "epoch": 1.37, + "grad_norm": 0.484375, + "learning_rate": 0.00016644558867632593, + "loss": 0.2776, + "step": 10246 + }, + { + "epoch": 1.37, + "grad_norm": 0.5625, + "learning_rate": 0.0001664368855846096, + "loss": 0.2306, + "step": 10247 + }, + { + "epoch": 1.37, + "grad_norm": 0.64453125, + "learning_rate": 0.00016642818159196647, + "loss": 0.4786, + "step": 10248 + }, + { + "epoch": 1.37, + "grad_norm": 0.5234375, + "learning_rate": 0.00016641947669851457, + "loss": 0.3106, + "step": 10249 + }, + { + "epoch": 1.37, + "grad_norm": 0.447265625, + "learning_rate": 0.00016641077090437194, + "loss": 0.3119, + "step": 10250 + }, + { + "epoch": 1.37, + "grad_norm": 0.5078125, + "learning_rate": 0.00016640206420965662, + "loss": 0.4073, + "step": 10251 + }, + { + "epoch": 1.37, + "grad_norm": 0.5703125, + "learning_rate": 0.0001663933566144867, + "loss": 0.3932, + "step": 10252 + }, + { + "epoch": 1.37, + "grad_norm": 0.423828125, + "learning_rate": 0.00016638464811898025, + "loss": 0.2301, + "step": 10253 + }, + { + "epoch": 1.37, + "grad_norm": 0.5078125, + "learning_rate": 0.00016637593872325534, + "loss": 0.3674, + "step": 10254 + }, + { + "epoch": 1.37, + "grad_norm": 0.6796875, + "learning_rate": 0.00016636722842743013, + "loss": 0.2507, + "step": 10255 + }, + { + "epoch": 1.37, + "grad_norm": 0.51171875, + "learning_rate": 0.00016635851723162273, + "loss": 0.297, + "step": 10256 + }, + { + "epoch": 1.37, + "grad_norm": 0.4609375, + "learning_rate": 0.0001663498051359512, + "loss": 0.2405, + "step": 10257 + }, + { + "epoch": 1.37, + "grad_norm": 0.6171875, + "learning_rate": 0.00016634109214053378, + "loss": 0.3514, + "step": 10258 + }, + { + "epoch": 1.37, + "grad_norm": 0.546875, + "learning_rate": 0.00016633237824548853, + "loss": 0.4707, + "step": 10259 + }, + { + "epoch": 1.37, + "grad_norm": 0.46875, + "learning_rate": 0.00016632366345093366, + "loss": 0.3043, + "step": 10260 + }, + { + "epoch": 1.37, + "grad_norm": 0.59375, + "learning_rate": 0.00016631494775698737, + "loss": 0.419, + "step": 10261 + }, + { + "epoch": 1.37, + "grad_norm": 0.6953125, + "learning_rate": 0.00016630623116376783, + "loss": 0.4347, + "step": 10262 + }, + { + "epoch": 1.37, + "grad_norm": 0.65625, + "learning_rate": 0.00016629751367139322, + "loss": 0.3005, + "step": 10263 + }, + { + "epoch": 1.37, + "grad_norm": 0.4609375, + "learning_rate": 0.00016628879527998177, + "loss": 0.3725, + "step": 10264 + }, + { + "epoch": 1.37, + "grad_norm": 0.72265625, + "learning_rate": 0.00016628007598965173, + "loss": 0.2666, + "step": 10265 + }, + { + "epoch": 1.37, + "grad_norm": 0.61328125, + "learning_rate": 0.00016627135580052132, + "loss": 0.6678, + "step": 10266 + }, + { + "epoch": 1.37, + "grad_norm": 0.54296875, + "learning_rate": 0.0001662626347127088, + "loss": 0.524, + "step": 10267 + }, + { + "epoch": 1.37, + "grad_norm": 0.578125, + "learning_rate": 0.00016625391272633242, + "loss": 0.3786, + "step": 10268 + }, + { + "epoch": 1.37, + "grad_norm": 0.5234375, + "learning_rate": 0.00016624518984151046, + "loss": 0.3047, + "step": 10269 + }, + { + "epoch": 1.37, + "grad_norm": 0.78515625, + "learning_rate": 0.0001662364660583612, + "loss": 0.4347, + "step": 10270 + }, + { + "epoch": 1.37, + "grad_norm": 0.447265625, + "learning_rate": 0.00016622774137700297, + "loss": 0.3192, + "step": 10271 + }, + { + "epoch": 1.37, + "grad_norm": 0.451171875, + "learning_rate": 0.00016621901579755405, + "loss": 0.5738, + "step": 10272 + }, + { + "epoch": 1.37, + "grad_norm": 0.55859375, + "learning_rate": 0.00016621028932013276, + "loss": 0.3637, + "step": 10273 + }, + { + "epoch": 1.37, + "grad_norm": 0.45703125, + "learning_rate": 0.0001662015619448575, + "loss": 0.2122, + "step": 10274 + }, + { + "epoch": 1.37, + "grad_norm": 0.44140625, + "learning_rate": 0.00016619283367184655, + "loss": 0.3217, + "step": 10275 + }, + { + "epoch": 1.37, + "grad_norm": 0.466796875, + "learning_rate": 0.00016618410450121827, + "loss": 0.1842, + "step": 10276 + }, + { + "epoch": 1.37, + "grad_norm": 0.53125, + "learning_rate": 0.00016617537443309107, + "loss": 0.4093, + "step": 10277 + }, + { + "epoch": 1.37, + "grad_norm": 0.65234375, + "learning_rate": 0.00016616664346758332, + "loss": 0.41, + "step": 10278 + }, + { + "epoch": 1.37, + "grad_norm": 0.48046875, + "learning_rate": 0.00016615791160481345, + "loss": 0.2905, + "step": 10279 + }, + { + "epoch": 1.37, + "grad_norm": 0.58203125, + "learning_rate": 0.0001661491788448998, + "loss": 0.4358, + "step": 10280 + }, + { + "epoch": 1.37, + "grad_norm": 0.55078125, + "learning_rate": 0.00016614044518796086, + "loss": 0.57, + "step": 10281 + }, + { + "epoch": 1.37, + "grad_norm": 0.70703125, + "learning_rate": 0.000166131710634115, + "loss": 0.4277, + "step": 10282 + }, + { + "epoch": 1.37, + "grad_norm": 0.5546875, + "learning_rate": 0.00016612297518348073, + "loss": 0.3379, + "step": 10283 + }, + { + "epoch": 1.37, + "grad_norm": 0.38671875, + "learning_rate": 0.00016611423883617645, + "loss": 0.441, + "step": 10284 + }, + { + "epoch": 1.37, + "grad_norm": 0.67578125, + "learning_rate": 0.00016610550159232068, + "loss": 0.5028, + "step": 10285 + }, + { + "epoch": 1.37, + "grad_norm": 0.451171875, + "learning_rate": 0.0001660967634520319, + "loss": 0.2381, + "step": 10286 + }, + { + "epoch": 1.37, + "grad_norm": 0.5234375, + "learning_rate": 0.00016608802441542855, + "loss": 0.4062, + "step": 10287 + }, + { + "epoch": 1.37, + "grad_norm": 0.75390625, + "learning_rate": 0.0001660792844826292, + "loss": 0.2686, + "step": 10288 + }, + { + "epoch": 1.37, + "grad_norm": 0.484375, + "learning_rate": 0.00016607054365375232, + "loss": 0.4656, + "step": 10289 + }, + { + "epoch": 1.37, + "grad_norm": 0.65625, + "learning_rate": 0.00016606180192891648, + "loss": 0.2783, + "step": 10290 + }, + { + "epoch": 1.37, + "grad_norm": 0.671875, + "learning_rate": 0.0001660530593082402, + "loss": 0.5645, + "step": 10291 + }, + { + "epoch": 1.37, + "grad_norm": 0.51171875, + "learning_rate": 0.00016604431579184204, + "loss": 0.5432, + "step": 10292 + }, + { + "epoch": 1.37, + "grad_norm": 0.99609375, + "learning_rate": 0.00016603557137984058, + "loss": 0.224, + "step": 10293 + }, + { + "epoch": 1.37, + "grad_norm": 0.73828125, + "learning_rate": 0.0001660268260723544, + "loss": 0.3724, + "step": 10294 + }, + { + "epoch": 1.37, + "grad_norm": 0.71875, + "learning_rate": 0.00016601807986950208, + "loss": 0.5169, + "step": 10295 + }, + { + "epoch": 1.37, + "grad_norm": 0.421875, + "learning_rate": 0.0001660093327714022, + "loss": 0.376, + "step": 10296 + }, + { + "epoch": 1.37, + "grad_norm": 0.5546875, + "learning_rate": 0.00016600058477817345, + "loss": 0.2111, + "step": 10297 + }, + { + "epoch": 1.37, + "grad_norm": 0.515625, + "learning_rate": 0.00016599183588993438, + "loss": 0.3471, + "step": 10298 + }, + { + "epoch": 1.37, + "grad_norm": 0.609375, + "learning_rate": 0.00016598308610680368, + "loss": 0.328, + "step": 10299 + }, + { + "epoch": 1.37, + "grad_norm": 0.75, + "learning_rate": 0.00016597433542889998, + "loss": 0.307, + "step": 10300 + }, + { + "epoch": 1.37, + "grad_norm": 0.47265625, + "learning_rate": 0.00016596558385634192, + "loss": 0.34, + "step": 10301 + }, + { + "epoch": 1.37, + "grad_norm": 0.5546875, + "learning_rate": 0.00016595683138924823, + "loss": 0.403, + "step": 10302 + }, + { + "epoch": 1.37, + "grad_norm": 0.58984375, + "learning_rate": 0.00016594807802773756, + "loss": 0.4158, + "step": 10303 + }, + { + "epoch": 1.37, + "grad_norm": 0.455078125, + "learning_rate": 0.00016593932377192863, + "loss": 0.3129, + "step": 10304 + }, + { + "epoch": 1.38, + "grad_norm": 0.70703125, + "learning_rate": 0.00016593056862194017, + "loss": 0.3349, + "step": 10305 + }, + { + "epoch": 1.38, + "grad_norm": 0.515625, + "learning_rate": 0.00016592181257789088, + "loss": 0.375, + "step": 10306 + }, + { + "epoch": 1.38, + "grad_norm": 0.408203125, + "learning_rate": 0.00016591305563989952, + "loss": 0.1909, + "step": 10307 + }, + { + "epoch": 1.38, + "grad_norm": 0.72265625, + "learning_rate": 0.00016590429780808478, + "loss": 0.3909, + "step": 10308 + }, + { + "epoch": 1.38, + "grad_norm": 0.6328125, + "learning_rate": 0.00016589553908256548, + "loss": 0.4004, + "step": 10309 + }, + { + "epoch": 1.38, + "grad_norm": 0.51171875, + "learning_rate": 0.00016588677946346037, + "loss": 0.4892, + "step": 10310 + }, + { + "epoch": 1.38, + "grad_norm": 0.515625, + "learning_rate": 0.00016587801895088828, + "loss": 0.5068, + "step": 10311 + }, + { + "epoch": 1.38, + "grad_norm": 0.5234375, + "learning_rate": 0.00016586925754496793, + "loss": 0.2199, + "step": 10312 + }, + { + "epoch": 1.38, + "grad_norm": 0.546875, + "learning_rate": 0.00016586049524581822, + "loss": 0.4404, + "step": 10313 + }, + { + "epoch": 1.38, + "grad_norm": 0.51171875, + "learning_rate": 0.0001658517320535579, + "loss": 0.479, + "step": 10314 + }, + { + "epoch": 1.38, + "grad_norm": 0.5625, + "learning_rate": 0.00016584296796830583, + "loss": 0.4028, + "step": 10315 + }, + { + "epoch": 1.38, + "grad_norm": 0.48828125, + "learning_rate": 0.00016583420299018086, + "loss": 0.2955, + "step": 10316 + }, + { + "epoch": 1.38, + "grad_norm": 0.55859375, + "learning_rate": 0.00016582543711930186, + "loss": 0.3807, + "step": 10317 + }, + { + "epoch": 1.38, + "grad_norm": 0.5078125, + "learning_rate": 0.00016581667035578764, + "loss": 0.2856, + "step": 10318 + }, + { + "epoch": 1.38, + "grad_norm": 0.6015625, + "learning_rate": 0.00016580790269975717, + "loss": 0.2668, + "step": 10319 + }, + { + "epoch": 1.38, + "grad_norm": 0.40625, + "learning_rate": 0.0001657991341513293, + "loss": 0.2357, + "step": 10320 + }, + { + "epoch": 1.38, + "grad_norm": 0.421875, + "learning_rate": 0.00016579036471062292, + "loss": 0.2067, + "step": 10321 + }, + { + "epoch": 1.38, + "grad_norm": 0.61328125, + "learning_rate": 0.00016578159437775696, + "loss": 0.6443, + "step": 10322 + }, + { + "epoch": 1.38, + "grad_norm": 0.5, + "learning_rate": 0.00016577282315285042, + "loss": 0.3933, + "step": 10323 + }, + { + "epoch": 1.38, + "grad_norm": 0.62890625, + "learning_rate": 0.00016576405103602216, + "loss": 0.5077, + "step": 10324 + }, + { + "epoch": 1.38, + "grad_norm": 0.38671875, + "learning_rate": 0.00016575527802739114, + "loss": 0.2879, + "step": 10325 + }, + { + "epoch": 1.38, + "grad_norm": 0.6171875, + "learning_rate": 0.00016574650412707636, + "loss": 0.3648, + "step": 10326 + }, + { + "epoch": 1.38, + "grad_norm": 0.57421875, + "learning_rate": 0.00016573772933519676, + "loss": 0.3137, + "step": 10327 + }, + { + "epoch": 1.38, + "grad_norm": 0.58203125, + "learning_rate": 0.0001657289536518714, + "loss": 0.3581, + "step": 10328 + }, + { + "epoch": 1.38, + "grad_norm": 0.60546875, + "learning_rate": 0.00016572017707721924, + "loss": 0.318, + "step": 10329 + }, + { + "epoch": 1.38, + "grad_norm": 0.4453125, + "learning_rate": 0.00016571139961135927, + "loss": 0.2563, + "step": 10330 + }, + { + "epoch": 1.38, + "grad_norm": 0.40234375, + "learning_rate": 0.0001657026212544106, + "loss": 0.3007, + "step": 10331 + }, + { + "epoch": 1.38, + "grad_norm": 0.54296875, + "learning_rate": 0.00016569384200649219, + "loss": 0.5035, + "step": 10332 + }, + { + "epoch": 1.38, + "grad_norm": 0.474609375, + "learning_rate": 0.0001656850618677231, + "loss": 0.3446, + "step": 10333 + }, + { + "epoch": 1.38, + "grad_norm": 0.6015625, + "learning_rate": 0.00016567628083822243, + "loss": 0.4994, + "step": 10334 + }, + { + "epoch": 1.38, + "grad_norm": 0.609375, + "learning_rate": 0.00016566749891810926, + "loss": 0.3369, + "step": 10335 + }, + { + "epoch": 1.38, + "grad_norm": 0.482421875, + "learning_rate": 0.00016565871610750262, + "loss": 0.3737, + "step": 10336 + }, + { + "epoch": 1.38, + "grad_norm": 0.55859375, + "learning_rate": 0.0001656499324065217, + "loss": 0.2307, + "step": 10337 + }, + { + "epoch": 1.38, + "grad_norm": 0.5859375, + "learning_rate": 0.00016564114781528552, + "loss": 0.537, + "step": 10338 + }, + { + "epoch": 1.38, + "grad_norm": 0.66015625, + "learning_rate": 0.00016563236233391328, + "loss": 0.3257, + "step": 10339 + }, + { + "epoch": 1.38, + "grad_norm": 0.55859375, + "learning_rate": 0.00016562357596252406, + "loss": 0.2862, + "step": 10340 + }, + { + "epoch": 1.38, + "grad_norm": 0.59765625, + "learning_rate": 0.00016561478870123707, + "loss": 0.3739, + "step": 10341 + }, + { + "epoch": 1.38, + "grad_norm": 0.5234375, + "learning_rate": 0.0001656060005501714, + "loss": 0.2232, + "step": 10342 + }, + { + "epoch": 1.38, + "grad_norm": 0.5625, + "learning_rate": 0.00016559721150944624, + "loss": 0.5204, + "step": 10343 + }, + { + "epoch": 1.38, + "grad_norm": 0.447265625, + "learning_rate": 0.0001655884215791808, + "loss": 0.2083, + "step": 10344 + }, + { + "epoch": 1.38, + "grad_norm": 0.5, + "learning_rate": 0.00016557963075949429, + "loss": 0.4891, + "step": 10345 + }, + { + "epoch": 1.38, + "grad_norm": 0.54296875, + "learning_rate": 0.0001655708390505059, + "loss": 0.5745, + "step": 10346 + }, + { + "epoch": 1.38, + "grad_norm": 0.52734375, + "learning_rate": 0.00016556204645233486, + "loss": 0.6547, + "step": 10347 + }, + { + "epoch": 1.38, + "grad_norm": 0.390625, + "learning_rate": 0.00016555325296510037, + "loss": 0.1611, + "step": 10348 + }, + { + "epoch": 1.38, + "grad_norm": 0.46484375, + "learning_rate": 0.0001655444585889217, + "loss": 0.3376, + "step": 10349 + }, + { + "epoch": 1.38, + "grad_norm": 0.6171875, + "learning_rate": 0.0001655356633239181, + "loss": 0.4025, + "step": 10350 + }, + { + "epoch": 1.38, + "grad_norm": 0.828125, + "learning_rate": 0.00016552686717020884, + "loss": 0.5756, + "step": 10351 + }, + { + "epoch": 1.38, + "grad_norm": 0.43359375, + "learning_rate": 0.00016551807012791324, + "loss": 0.3187, + "step": 10352 + }, + { + "epoch": 1.38, + "grad_norm": 0.68359375, + "learning_rate": 0.00016550927219715052, + "loss": 0.3635, + "step": 10353 + }, + { + "epoch": 1.38, + "grad_norm": 0.515625, + "learning_rate": 0.00016550047337804004, + "loss": 0.214, + "step": 10354 + }, + { + "epoch": 1.38, + "grad_norm": 0.69140625, + "learning_rate": 0.0001654916736707011, + "loss": 0.6424, + "step": 10355 + }, + { + "epoch": 1.38, + "grad_norm": 0.671875, + "learning_rate": 0.00016548287307525304, + "loss": 0.7943, + "step": 10356 + }, + { + "epoch": 1.38, + "grad_norm": 0.640625, + "learning_rate": 0.0001654740715918152, + "loss": 0.3785, + "step": 10357 + }, + { + "epoch": 1.38, + "grad_norm": 0.69140625, + "learning_rate": 0.00016546526922050695, + "loss": 0.379, + "step": 10358 + }, + { + "epoch": 1.38, + "grad_norm": 0.546875, + "learning_rate": 0.0001654564659614476, + "loss": 0.3538, + "step": 10359 + }, + { + "epoch": 1.38, + "grad_norm": 0.80078125, + "learning_rate": 0.00016544766181475655, + "loss": 0.458, + "step": 10360 + }, + { + "epoch": 1.38, + "grad_norm": 0.380859375, + "learning_rate": 0.00016543885678055323, + "loss": 0.1737, + "step": 10361 + }, + { + "epoch": 1.38, + "grad_norm": 0.61328125, + "learning_rate": 0.00016543005085895699, + "loss": 0.5773, + "step": 10362 + }, + { + "epoch": 1.38, + "grad_norm": 0.47265625, + "learning_rate": 0.0001654212440500873, + "loss": 0.3853, + "step": 10363 + }, + { + "epoch": 1.38, + "grad_norm": 0.71484375, + "learning_rate": 0.0001654124363540635, + "loss": 0.2206, + "step": 10364 + }, + { + "epoch": 1.38, + "grad_norm": 0.70703125, + "learning_rate": 0.00016540362777100516, + "loss": 0.6505, + "step": 10365 + }, + { + "epoch": 1.38, + "grad_norm": 0.447265625, + "learning_rate": 0.00016539481830103159, + "loss": 0.1789, + "step": 10366 + }, + { + "epoch": 1.38, + "grad_norm": 0.671875, + "learning_rate": 0.0001653860079442623, + "loss": 0.5732, + "step": 10367 + }, + { + "epoch": 1.38, + "grad_norm": 0.5, + "learning_rate": 0.00016537719670081682, + "loss": 0.4488, + "step": 10368 + }, + { + "epoch": 1.38, + "grad_norm": 0.6640625, + "learning_rate": 0.00016536838457081459, + "loss": 0.4725, + "step": 10369 + }, + { + "epoch": 1.38, + "grad_norm": 1.1015625, + "learning_rate": 0.0001653595715543751, + "loss": 0.4466, + "step": 10370 + }, + { + "epoch": 1.38, + "grad_norm": 0.58203125, + "learning_rate": 0.0001653507576516179, + "loss": 0.5491, + "step": 10371 + }, + { + "epoch": 1.38, + "grad_norm": 1.0078125, + "learning_rate": 0.00016534194286266246, + "loss": 0.3539, + "step": 10372 + }, + { + "epoch": 1.38, + "grad_norm": 0.515625, + "learning_rate": 0.00016533312718762833, + "loss": 0.5831, + "step": 10373 + }, + { + "epoch": 1.38, + "grad_norm": 0.4765625, + "learning_rate": 0.00016532431062663507, + "loss": 0.2682, + "step": 10374 + }, + { + "epoch": 1.38, + "grad_norm": 0.482421875, + "learning_rate": 0.00016531549317980225, + "loss": 0.6221, + "step": 10375 + }, + { + "epoch": 1.38, + "grad_norm": 0.578125, + "learning_rate": 0.0001653066748472494, + "loss": 0.311, + "step": 10376 + }, + { + "epoch": 1.38, + "grad_norm": 0.5078125, + "learning_rate": 0.00016529785562909615, + "loss": 0.2502, + "step": 10377 + }, + { + "epoch": 1.38, + "grad_norm": 0.392578125, + "learning_rate": 0.00016528903552546207, + "loss": 0.3523, + "step": 10378 + }, + { + "epoch": 1.38, + "grad_norm": 0.625, + "learning_rate": 0.00016528021453646673, + "loss": 0.5275, + "step": 10379 + }, + { + "epoch": 1.39, + "grad_norm": 0.439453125, + "learning_rate": 0.0001652713926622298, + "loss": 0.5435, + "step": 10380 + }, + { + "epoch": 1.39, + "grad_norm": 0.478515625, + "learning_rate": 0.0001652625699028709, + "loss": 0.3832, + "step": 10381 + }, + { + "epoch": 1.39, + "grad_norm": 0.5859375, + "learning_rate": 0.00016525374625850968, + "loss": 0.6111, + "step": 10382 + }, + { + "epoch": 1.39, + "grad_norm": 0.447265625, + "learning_rate": 0.00016524492172926577, + "loss": 0.3792, + "step": 10383 + }, + { + "epoch": 1.39, + "grad_norm": 0.5234375, + "learning_rate": 0.00016523609631525886, + "loss": 0.324, + "step": 10384 + }, + { + "epoch": 1.39, + "grad_norm": 0.5234375, + "learning_rate": 0.0001652272700166086, + "loss": 0.4834, + "step": 10385 + }, + { + "epoch": 1.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00016521844283343468, + "loss": 0.3141, + "step": 10386 + }, + { + "epoch": 1.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00016520961476585686, + "loss": 0.4188, + "step": 10387 + }, + { + "epoch": 1.39, + "grad_norm": 0.3671875, + "learning_rate": 0.0001652007858139948, + "loss": 0.3136, + "step": 10388 + }, + { + "epoch": 1.39, + "grad_norm": 0.75390625, + "learning_rate": 0.00016519195597796823, + "loss": 0.3053, + "step": 10389 + }, + { + "epoch": 1.39, + "grad_norm": 0.447265625, + "learning_rate": 0.00016518312525789686, + "loss": 0.3138, + "step": 10390 + }, + { + "epoch": 1.39, + "grad_norm": 0.71875, + "learning_rate": 0.00016517429365390054, + "loss": 0.5979, + "step": 10391 + }, + { + "epoch": 1.39, + "grad_norm": 0.5234375, + "learning_rate": 0.00016516546116609893, + "loss": 0.3575, + "step": 10392 + }, + { + "epoch": 1.39, + "grad_norm": 0.671875, + "learning_rate": 0.00016515662779461186, + "loss": 0.2292, + "step": 10393 + }, + { + "epoch": 1.39, + "grad_norm": 0.58203125, + "learning_rate": 0.0001651477935395591, + "loss": 0.4501, + "step": 10394 + }, + { + "epoch": 1.39, + "grad_norm": 0.53515625, + "learning_rate": 0.00016513895840106045, + "loss": 0.4793, + "step": 10395 + }, + { + "epoch": 1.39, + "grad_norm": 0.482421875, + "learning_rate": 0.00016513012237923572, + "loss": 0.3014, + "step": 10396 + }, + { + "epoch": 1.39, + "grad_norm": 0.50390625, + "learning_rate": 0.00016512128547420472, + "loss": 0.4054, + "step": 10397 + }, + { + "epoch": 1.39, + "grad_norm": 0.5859375, + "learning_rate": 0.00016511244768608729, + "loss": 0.4434, + "step": 10398 + }, + { + "epoch": 1.39, + "grad_norm": 0.44921875, + "learning_rate": 0.00016510360901500327, + "loss": 0.4152, + "step": 10399 + }, + { + "epoch": 1.39, + "grad_norm": 0.47265625, + "learning_rate": 0.00016509476946107254, + "loss": 0.4745, + "step": 10400 + }, + { + "epoch": 1.39, + "grad_norm": 0.47265625, + "learning_rate": 0.00016508592902441497, + "loss": 0.6178, + "step": 10401 + }, + { + "epoch": 1.39, + "grad_norm": 0.6328125, + "learning_rate": 0.00016507708770515045, + "loss": 0.6626, + "step": 10402 + }, + { + "epoch": 1.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00016506824550339882, + "loss": 0.4912, + "step": 10403 + }, + { + "epoch": 1.39, + "grad_norm": 0.47265625, + "learning_rate": 0.00016505940241928, + "loss": 0.2605, + "step": 10404 + }, + { + "epoch": 1.39, + "grad_norm": 0.61328125, + "learning_rate": 0.00016505055845291399, + "loss": 0.4644, + "step": 10405 + }, + { + "epoch": 1.39, + "grad_norm": 0.671875, + "learning_rate": 0.0001650417136044206, + "loss": 0.5127, + "step": 10406 + }, + { + "epoch": 1.39, + "grad_norm": 0.57421875, + "learning_rate": 0.00016503286787391988, + "loss": 0.3392, + "step": 10407 + }, + { + "epoch": 1.39, + "grad_norm": 0.38671875, + "learning_rate": 0.0001650240212615317, + "loss": 0.2352, + "step": 10408 + }, + { + "epoch": 1.39, + "grad_norm": 0.53515625, + "learning_rate": 0.0001650151737673761, + "loss": 0.3453, + "step": 10409 + }, + { + "epoch": 1.39, + "grad_norm": 0.43359375, + "learning_rate": 0.00016500632539157296, + "loss": 0.4202, + "step": 10410 + }, + { + "epoch": 1.39, + "grad_norm": 0.71484375, + "learning_rate": 0.00016499747613424236, + "loss": 0.5532, + "step": 10411 + }, + { + "epoch": 1.39, + "grad_norm": 0.66796875, + "learning_rate": 0.00016498862599550427, + "loss": 0.3256, + "step": 10412 + }, + { + "epoch": 1.39, + "grad_norm": 0.470703125, + "learning_rate": 0.00016497977497547867, + "loss": 0.2291, + "step": 10413 + }, + { + "epoch": 1.39, + "grad_norm": 0.578125, + "learning_rate": 0.00016497092307428564, + "loss": 0.4717, + "step": 10414 + }, + { + "epoch": 1.39, + "grad_norm": 0.6875, + "learning_rate": 0.0001649620702920452, + "loss": 0.2842, + "step": 10415 + }, + { + "epoch": 1.39, + "grad_norm": 0.498046875, + "learning_rate": 0.00016495321662887742, + "loss": 0.3154, + "step": 10416 + }, + { + "epoch": 1.39, + "grad_norm": 0.703125, + "learning_rate": 0.0001649443620849023, + "loss": 0.2779, + "step": 10417 + }, + { + "epoch": 1.39, + "grad_norm": 0.5, + "learning_rate": 0.00016493550666023995, + "loss": 0.5377, + "step": 10418 + }, + { + "epoch": 1.39, + "grad_norm": 0.62890625, + "learning_rate": 0.00016492665035501046, + "loss": 0.4151, + "step": 10419 + }, + { + "epoch": 1.39, + "grad_norm": 0.51953125, + "learning_rate": 0.00016491779316933394, + "loss": 0.4868, + "step": 10420 + }, + { + "epoch": 1.39, + "grad_norm": 0.4375, + "learning_rate": 0.00016490893510333048, + "loss": 0.1932, + "step": 10421 + }, + { + "epoch": 1.39, + "grad_norm": 0.5390625, + "learning_rate": 0.00016490007615712018, + "loss": 0.4558, + "step": 10422 + }, + { + "epoch": 1.39, + "grad_norm": 0.5546875, + "learning_rate": 0.00016489121633082317, + "loss": 0.4215, + "step": 10423 + }, + { + "epoch": 1.39, + "grad_norm": 0.5078125, + "learning_rate": 0.00016488235562455965, + "loss": 0.2122, + "step": 10424 + }, + { + "epoch": 1.39, + "grad_norm": 0.73046875, + "learning_rate": 0.00016487349403844975, + "loss": 0.2771, + "step": 10425 + }, + { + "epoch": 1.39, + "grad_norm": 0.57421875, + "learning_rate": 0.00016486463157261365, + "loss": 0.2143, + "step": 10426 + }, + { + "epoch": 1.39, + "grad_norm": 0.455078125, + "learning_rate": 0.0001648557682271715, + "loss": 0.3548, + "step": 10427 + }, + { + "epoch": 1.39, + "grad_norm": 0.51171875, + "learning_rate": 0.0001648469040022435, + "loss": 0.365, + "step": 10428 + }, + { + "epoch": 1.39, + "grad_norm": 0.578125, + "learning_rate": 0.00016483803889794985, + "loss": 0.3143, + "step": 10429 + }, + { + "epoch": 1.39, + "grad_norm": 0.6796875, + "learning_rate": 0.00016482917291441083, + "loss": 0.3812, + "step": 10430 + }, + { + "epoch": 1.39, + "grad_norm": 0.52734375, + "learning_rate": 0.00016482030605174657, + "loss": 0.2661, + "step": 10431 + }, + { + "epoch": 1.39, + "grad_norm": 0.49609375, + "learning_rate": 0.0001648114383100774, + "loss": 0.3005, + "step": 10432 + }, + { + "epoch": 1.39, + "grad_norm": 0.490234375, + "learning_rate": 0.00016480256968952347, + "loss": 0.6312, + "step": 10433 + }, + { + "epoch": 1.39, + "grad_norm": 0.578125, + "learning_rate": 0.00016479370019020518, + "loss": 0.3539, + "step": 10434 + }, + { + "epoch": 1.39, + "grad_norm": 0.42578125, + "learning_rate": 0.00016478482981224267, + "loss": 0.4595, + "step": 10435 + }, + { + "epoch": 1.39, + "grad_norm": 0.466796875, + "learning_rate": 0.00016477595855575632, + "loss": 0.5822, + "step": 10436 + }, + { + "epoch": 1.39, + "grad_norm": 0.5859375, + "learning_rate": 0.00016476708642086638, + "loss": 0.5481, + "step": 10437 + }, + { + "epoch": 1.39, + "grad_norm": 0.47265625, + "learning_rate": 0.0001647582134076932, + "loss": 0.1824, + "step": 10438 + }, + { + "epoch": 1.39, + "grad_norm": 0.5625, + "learning_rate": 0.00016474933951635706, + "loss": 0.3571, + "step": 10439 + }, + { + "epoch": 1.39, + "grad_norm": 0.54296875, + "learning_rate": 0.00016474046474697834, + "loss": 0.4887, + "step": 10440 + }, + { + "epoch": 1.39, + "grad_norm": 0.8125, + "learning_rate": 0.00016473158909967737, + "loss": 0.4303, + "step": 10441 + }, + { + "epoch": 1.39, + "grad_norm": 0.6484375, + "learning_rate": 0.0001647227125745745, + "loss": 0.516, + "step": 10442 + }, + { + "epoch": 1.39, + "grad_norm": 0.98046875, + "learning_rate": 0.0001647138351717901, + "loss": 0.3129, + "step": 10443 + }, + { + "epoch": 1.39, + "grad_norm": 0.64453125, + "learning_rate": 0.00016470495689144458, + "loss": 0.3761, + "step": 10444 + }, + { + "epoch": 1.39, + "grad_norm": 0.6796875, + "learning_rate": 0.0001646960777336583, + "loss": 0.3264, + "step": 10445 + }, + { + "epoch": 1.39, + "grad_norm": 0.6640625, + "learning_rate": 0.00016468719769855173, + "loss": 0.5784, + "step": 10446 + }, + { + "epoch": 1.39, + "grad_norm": 0.4296875, + "learning_rate": 0.00016467831678624518, + "loss": 0.1889, + "step": 10447 + }, + { + "epoch": 1.39, + "grad_norm": 0.458984375, + "learning_rate": 0.0001646694349968592, + "loss": 0.4017, + "step": 10448 + }, + { + "epoch": 1.39, + "grad_norm": 0.37890625, + "learning_rate": 0.00016466055233051413, + "loss": 0.4317, + "step": 10449 + }, + { + "epoch": 1.39, + "grad_norm": 0.6171875, + "learning_rate": 0.00016465166878733048, + "loss": 0.5383, + "step": 10450 + }, + { + "epoch": 1.39, + "grad_norm": 0.39453125, + "learning_rate": 0.00016464278436742876, + "loss": 0.3853, + "step": 10451 + }, + { + "epoch": 1.39, + "grad_norm": 0.609375, + "learning_rate": 0.00016463389907092933, + "loss": 0.4547, + "step": 10452 + }, + { + "epoch": 1.39, + "grad_norm": 0.484375, + "learning_rate": 0.00016462501289795278, + "loss": 0.3544, + "step": 10453 + }, + { + "epoch": 1.39, + "grad_norm": 0.5, + "learning_rate": 0.00016461612584861953, + "loss": 0.3073, + "step": 10454 + }, + { + "epoch": 1.4, + "grad_norm": 0.57421875, + "learning_rate": 0.0001646072379230502, + "loss": 0.5649, + "step": 10455 + }, + { + "epoch": 1.4, + "grad_norm": 0.484375, + "learning_rate": 0.00016459834912136527, + "loss": 0.4448, + "step": 10456 + }, + { + "epoch": 1.4, + "grad_norm": 0.52734375, + "learning_rate": 0.00016458945944368522, + "loss": 0.3888, + "step": 10457 + }, + { + "epoch": 1.4, + "grad_norm": 0.515625, + "learning_rate": 0.00016458056889013067, + "loss": 0.6433, + "step": 10458 + }, + { + "epoch": 1.4, + "grad_norm": 0.5546875, + "learning_rate": 0.00016457167746082213, + "loss": 0.3816, + "step": 10459 + }, + { + "epoch": 1.4, + "grad_norm": 0.57421875, + "learning_rate": 0.00016456278515588024, + "loss": 0.3202, + "step": 10460 + }, + { + "epoch": 1.4, + "grad_norm": 0.55859375, + "learning_rate": 0.00016455389197542555, + "loss": 0.5075, + "step": 10461 + }, + { + "epoch": 1.4, + "grad_norm": 0.451171875, + "learning_rate": 0.00016454499791957865, + "loss": 0.2601, + "step": 10462 + }, + { + "epoch": 1.4, + "grad_norm": 0.490234375, + "learning_rate": 0.00016453610298846014, + "loss": 0.4781, + "step": 10463 + }, + { + "epoch": 1.4, + "grad_norm": 0.56640625, + "learning_rate": 0.00016452720718219063, + "loss": 0.3221, + "step": 10464 + }, + { + "epoch": 1.4, + "grad_norm": 0.578125, + "learning_rate": 0.0001645183105008908, + "loss": 0.3621, + "step": 10465 + }, + { + "epoch": 1.4, + "grad_norm": 0.50390625, + "learning_rate": 0.00016450941294468126, + "loss": 0.3855, + "step": 10466 + }, + { + "epoch": 1.4, + "grad_norm": 0.53515625, + "learning_rate": 0.0001645005145136827, + "loss": 0.386, + "step": 10467 + }, + { + "epoch": 1.4, + "grad_norm": 0.46875, + "learning_rate": 0.0001644916152080158, + "loss": 0.3782, + "step": 10468 + }, + { + "epoch": 1.4, + "grad_norm": 0.56640625, + "learning_rate": 0.00016448271502780118, + "loss": 0.4125, + "step": 10469 + }, + { + "epoch": 1.4, + "grad_norm": 0.54296875, + "learning_rate": 0.00016447381397315954, + "loss": 0.3353, + "step": 10470 + }, + { + "epoch": 1.4, + "grad_norm": 0.6015625, + "learning_rate": 0.0001644649120442116, + "loss": 0.5266, + "step": 10471 + }, + { + "epoch": 1.4, + "grad_norm": 0.7734375, + "learning_rate": 0.0001644560092410781, + "loss": 0.4546, + "step": 10472 + }, + { + "epoch": 1.4, + "grad_norm": 0.59765625, + "learning_rate": 0.0001644471055638797, + "loss": 0.6631, + "step": 10473 + }, + { + "epoch": 1.4, + "grad_norm": 0.9296875, + "learning_rate": 0.00016443820101273723, + "loss": 0.4055, + "step": 10474 + }, + { + "epoch": 1.4, + "grad_norm": 0.9765625, + "learning_rate": 0.00016442929558777142, + "loss": 0.361, + "step": 10475 + }, + { + "epoch": 1.4, + "grad_norm": 0.64453125, + "learning_rate": 0.00016442038928910296, + "loss": 0.4088, + "step": 10476 + }, + { + "epoch": 1.4, + "grad_norm": 0.578125, + "learning_rate": 0.0001644114821168527, + "loss": 0.5598, + "step": 10477 + }, + { + "epoch": 1.4, + "grad_norm": 0.53515625, + "learning_rate": 0.00016440257407114142, + "loss": 0.2807, + "step": 10478 + }, + { + "epoch": 1.4, + "grad_norm": 0.50390625, + "learning_rate": 0.00016439366515208988, + "loss": 0.2664, + "step": 10479 + }, + { + "epoch": 1.4, + "grad_norm": 0.73046875, + "learning_rate": 0.00016438475535981891, + "loss": 0.6224, + "step": 10480 + }, + { + "epoch": 1.4, + "grad_norm": 0.46875, + "learning_rate": 0.00016437584469444937, + "loss": 0.3291, + "step": 10481 + }, + { + "epoch": 1.4, + "grad_norm": 0.55078125, + "learning_rate": 0.00016436693315610202, + "loss": 0.3748, + "step": 10482 + }, + { + "epoch": 1.4, + "grad_norm": 0.578125, + "learning_rate": 0.00016435802074489777, + "loss": 0.441, + "step": 10483 + }, + { + "epoch": 1.4, + "grad_norm": 0.3359375, + "learning_rate": 0.00016434910746095744, + "loss": 0.175, + "step": 10484 + }, + { + "epoch": 1.4, + "grad_norm": 0.69921875, + "learning_rate": 0.00016434019330440193, + "loss": 0.5324, + "step": 10485 + }, + { + "epoch": 1.4, + "grad_norm": 0.41796875, + "learning_rate": 0.0001643312782753521, + "loss": 0.4058, + "step": 10486 + }, + { + "epoch": 1.4, + "grad_norm": 0.62109375, + "learning_rate": 0.00016432236237392883, + "loss": 0.4476, + "step": 10487 + }, + { + "epoch": 1.4, + "grad_norm": 0.49609375, + "learning_rate": 0.00016431344560025306, + "loss": 0.3516, + "step": 10488 + }, + { + "epoch": 1.4, + "grad_norm": 0.671875, + "learning_rate": 0.0001643045279544457, + "loss": 0.3468, + "step": 10489 + }, + { + "epoch": 1.4, + "grad_norm": 0.44921875, + "learning_rate": 0.00016429560943662764, + "loss": 0.289, + "step": 10490 + }, + { + "epoch": 1.4, + "grad_norm": 0.484375, + "learning_rate": 0.00016428669004691987, + "loss": 0.4146, + "step": 10491 + }, + { + "epoch": 1.4, + "grad_norm": 0.75, + "learning_rate": 0.00016427776978544331, + "loss": 0.3964, + "step": 10492 + }, + { + "epoch": 1.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00016426884865231898, + "loss": 0.2469, + "step": 10493 + }, + { + "epoch": 1.4, + "grad_norm": 0.466796875, + "learning_rate": 0.0001642599266476678, + "loss": 0.6008, + "step": 10494 + }, + { + "epoch": 1.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00016425100377161076, + "loss": 0.226, + "step": 10495 + }, + { + "epoch": 1.4, + "grad_norm": 0.6328125, + "learning_rate": 0.00016424208002426886, + "loss": 0.6002, + "step": 10496 + }, + { + "epoch": 1.4, + "grad_norm": 0.73828125, + "learning_rate": 0.00016423315540576314, + "loss": 0.5145, + "step": 10497 + }, + { + "epoch": 1.4, + "grad_norm": 0.57421875, + "learning_rate": 0.0001642242299162146, + "loss": 0.6637, + "step": 10498 + }, + { + "epoch": 1.4, + "grad_norm": 0.4765625, + "learning_rate": 0.00016421530355574431, + "loss": 0.2599, + "step": 10499 + }, + { + "epoch": 1.4, + "grad_norm": 0.55859375, + "learning_rate": 0.00016420637632447326, + "loss": 0.486, + "step": 10500 + }, + { + "epoch": 1.4, + "grad_norm": 0.5078125, + "learning_rate": 0.00016419744822252253, + "loss": 0.4829, + "step": 10501 + }, + { + "epoch": 1.4, + "grad_norm": 0.60546875, + "learning_rate": 0.00016418851925001325, + "loss": 0.7766, + "step": 10502 + }, + { + "epoch": 1.4, + "grad_norm": 0.609375, + "learning_rate": 0.00016417958940706642, + "loss": 0.3845, + "step": 10503 + }, + { + "epoch": 1.4, + "grad_norm": 0.58984375, + "learning_rate": 0.00016417065869380316, + "loss": 0.361, + "step": 10504 + }, + { + "epoch": 1.4, + "grad_norm": 0.482421875, + "learning_rate": 0.0001641617271103446, + "loss": 0.3502, + "step": 10505 + }, + { + "epoch": 1.4, + "grad_norm": 0.60546875, + "learning_rate": 0.00016415279465681185, + "loss": 0.3708, + "step": 10506 + }, + { + "epoch": 1.4, + "grad_norm": 0.5078125, + "learning_rate": 0.000164143861333326, + "loss": 0.3818, + "step": 10507 + }, + { + "epoch": 1.4, + "grad_norm": 0.498046875, + "learning_rate": 0.00016413492714000824, + "loss": 0.3935, + "step": 10508 + }, + { + "epoch": 1.4, + "grad_norm": 0.640625, + "learning_rate": 0.00016412599207697972, + "loss": 0.3245, + "step": 10509 + }, + { + "epoch": 1.4, + "grad_norm": 0.58203125, + "learning_rate": 0.0001641170561443616, + "loss": 0.373, + "step": 10510 + }, + { + "epoch": 1.4, + "grad_norm": 0.42578125, + "learning_rate": 0.00016410811934227502, + "loss": 0.2479, + "step": 10511 + }, + { + "epoch": 1.4, + "grad_norm": 0.890625, + "learning_rate": 0.0001640991816708412, + "loss": 0.4125, + "step": 10512 + }, + { + "epoch": 1.4, + "grad_norm": 0.54296875, + "learning_rate": 0.00016409024313018136, + "loss": 0.4053, + "step": 10513 + }, + { + "epoch": 1.4, + "grad_norm": 0.66015625, + "learning_rate": 0.00016408130372041666, + "loss": 0.2175, + "step": 10514 + }, + { + "epoch": 1.4, + "grad_norm": 0.55859375, + "learning_rate": 0.00016407236344166836, + "loss": 0.4794, + "step": 10515 + }, + { + "epoch": 1.4, + "grad_norm": 0.390625, + "learning_rate": 0.00016406342229405768, + "loss": 0.219, + "step": 10516 + }, + { + "epoch": 1.4, + "grad_norm": 0.57421875, + "learning_rate": 0.00016405448027770594, + "loss": 0.3691, + "step": 10517 + }, + { + "epoch": 1.4, + "grad_norm": 0.64453125, + "learning_rate": 0.00016404553739273427, + "loss": 0.3777, + "step": 10518 + }, + { + "epoch": 1.4, + "grad_norm": 0.4453125, + "learning_rate": 0.00016403659363926405, + "loss": 0.2694, + "step": 10519 + }, + { + "epoch": 1.4, + "grad_norm": 0.5234375, + "learning_rate": 0.0001640276490174165, + "loss": 0.2606, + "step": 10520 + }, + { + "epoch": 1.4, + "grad_norm": 0.4921875, + "learning_rate": 0.00016401870352731295, + "loss": 0.5207, + "step": 10521 + }, + { + "epoch": 1.4, + "grad_norm": 0.376953125, + "learning_rate": 0.00016400975716907468, + "loss": 0.1724, + "step": 10522 + }, + { + "epoch": 1.4, + "grad_norm": 0.61328125, + "learning_rate": 0.00016400080994282302, + "loss": 0.588, + "step": 10523 + }, + { + "epoch": 1.4, + "grad_norm": 0.62109375, + "learning_rate": 0.0001639918618486793, + "loss": 0.3299, + "step": 10524 + }, + { + "epoch": 1.4, + "grad_norm": 0.55859375, + "learning_rate": 0.0001639829128867649, + "loss": 0.2752, + "step": 10525 + }, + { + "epoch": 1.4, + "grad_norm": 0.578125, + "learning_rate": 0.00016397396305720108, + "loss": 0.4163, + "step": 10526 + }, + { + "epoch": 1.4, + "grad_norm": 0.73828125, + "learning_rate": 0.0001639650123601093, + "loss": 0.2153, + "step": 10527 + }, + { + "epoch": 1.4, + "grad_norm": 0.828125, + "learning_rate": 0.0001639560607956109, + "loss": 0.4134, + "step": 10528 + }, + { + "epoch": 1.4, + "grad_norm": 0.60546875, + "learning_rate": 0.00016394710836382725, + "loss": 0.3797, + "step": 10529 + }, + { + "epoch": 1.41, + "grad_norm": 0.52734375, + "learning_rate": 0.0001639381550648798, + "loss": 0.3912, + "step": 10530 + }, + { + "epoch": 1.41, + "grad_norm": 0.50390625, + "learning_rate": 0.00016392920089888989, + "loss": 0.4045, + "step": 10531 + }, + { + "epoch": 1.41, + "grad_norm": 0.61328125, + "learning_rate": 0.00016392024586597904, + "loss": 0.48, + "step": 10532 + }, + { + "epoch": 1.41, + "grad_norm": 0.609375, + "learning_rate": 0.00016391128996626856, + "loss": 0.295, + "step": 10533 + }, + { + "epoch": 1.41, + "grad_norm": 0.63671875, + "learning_rate": 0.00016390233319988002, + "loss": 0.4145, + "step": 10534 + }, + { + "epoch": 1.41, + "grad_norm": 0.55078125, + "learning_rate": 0.00016389337556693484, + "loss": 0.566, + "step": 10535 + }, + { + "epoch": 1.41, + "grad_norm": 0.42578125, + "learning_rate": 0.00016388441706755447, + "loss": 0.4999, + "step": 10536 + }, + { + "epoch": 1.41, + "grad_norm": 0.486328125, + "learning_rate": 0.00016387545770186037, + "loss": 0.5413, + "step": 10537 + }, + { + "epoch": 1.41, + "grad_norm": 0.458984375, + "learning_rate": 0.0001638664974699741, + "loss": 0.2008, + "step": 10538 + }, + { + "epoch": 1.41, + "grad_norm": 0.69921875, + "learning_rate": 0.00016385753637201714, + "loss": 0.3462, + "step": 10539 + }, + { + "epoch": 1.41, + "grad_norm": 0.462890625, + "learning_rate": 0.000163848574408111, + "loss": 0.396, + "step": 10540 + }, + { + "epoch": 1.41, + "grad_norm": 0.546875, + "learning_rate": 0.0001638396115783772, + "loss": 0.6936, + "step": 10541 + }, + { + "epoch": 1.41, + "grad_norm": 0.51953125, + "learning_rate": 0.0001638306478829373, + "loss": 0.4752, + "step": 10542 + }, + { + "epoch": 1.41, + "grad_norm": 0.3671875, + "learning_rate": 0.00016382168332191284, + "loss": 0.2516, + "step": 10543 + }, + { + "epoch": 1.41, + "grad_norm": 0.5390625, + "learning_rate": 0.0001638127178954254, + "loss": 0.4998, + "step": 10544 + }, + { + "epoch": 1.41, + "grad_norm": 0.51171875, + "learning_rate": 0.00016380375160359654, + "loss": 0.2848, + "step": 10545 + }, + { + "epoch": 1.41, + "grad_norm": 0.443359375, + "learning_rate": 0.00016379478444654788, + "loss": 0.3638, + "step": 10546 + }, + { + "epoch": 1.41, + "grad_norm": 0.439453125, + "learning_rate": 0.00016378581642440102, + "loss": 0.2651, + "step": 10547 + }, + { + "epoch": 1.41, + "grad_norm": 0.49609375, + "learning_rate": 0.0001637768475372775, + "loss": 0.3491, + "step": 10548 + }, + { + "epoch": 1.41, + "grad_norm": 0.55078125, + "learning_rate": 0.00016376787778529903, + "loss": 0.5391, + "step": 10549 + }, + { + "epoch": 1.41, + "grad_norm": 0.44140625, + "learning_rate": 0.0001637589071685872, + "loss": 0.3266, + "step": 10550 + }, + { + "epoch": 1.41, + "grad_norm": 0.451171875, + "learning_rate": 0.00016374993568726368, + "loss": 0.1684, + "step": 10551 + }, + { + "epoch": 1.41, + "grad_norm": 0.482421875, + "learning_rate": 0.00016374096334145007, + "loss": 0.1804, + "step": 10552 + }, + { + "epoch": 1.41, + "grad_norm": 0.75, + "learning_rate": 0.00016373199013126818, + "loss": 0.4487, + "step": 10553 + }, + { + "epoch": 1.41, + "grad_norm": 0.51953125, + "learning_rate": 0.00016372301605683954, + "loss": 0.2851, + "step": 10554 + }, + { + "epoch": 1.41, + "grad_norm": 0.5390625, + "learning_rate": 0.0001637140411182859, + "loss": 0.4621, + "step": 10555 + }, + { + "epoch": 1.41, + "grad_norm": 0.494140625, + "learning_rate": 0.00016370506531572895, + "loss": 0.5765, + "step": 10556 + }, + { + "epoch": 1.41, + "grad_norm": 0.486328125, + "learning_rate": 0.00016369608864929048, + "loss": 0.2777, + "step": 10557 + }, + { + "epoch": 1.41, + "grad_norm": 0.470703125, + "learning_rate": 0.00016368711111909215, + "loss": 0.467, + "step": 10558 + }, + { + "epoch": 1.41, + "grad_norm": 0.5078125, + "learning_rate": 0.00016367813272525573, + "loss": 0.3142, + "step": 10559 + }, + { + "epoch": 1.41, + "grad_norm": 0.421875, + "learning_rate": 0.00016366915346790296, + "loss": 0.2176, + "step": 10560 + }, + { + "epoch": 1.41, + "grad_norm": 0.5390625, + "learning_rate": 0.00016366017334715557, + "loss": 0.3516, + "step": 10561 + }, + { + "epoch": 1.41, + "grad_norm": 0.462890625, + "learning_rate": 0.0001636511923631354, + "loss": 0.3303, + "step": 10562 + }, + { + "epoch": 1.41, + "grad_norm": 0.4921875, + "learning_rate": 0.0001636422105159642, + "loss": 0.5735, + "step": 10563 + }, + { + "epoch": 1.41, + "grad_norm": 0.4921875, + "learning_rate": 0.0001636332278057638, + "loss": 0.233, + "step": 10564 + }, + { + "epoch": 1.41, + "grad_norm": 0.455078125, + "learning_rate": 0.00016362424423265598, + "loss": 0.3371, + "step": 10565 + }, + { + "epoch": 1.41, + "grad_norm": 0.8046875, + "learning_rate": 0.0001636152597967626, + "loss": 0.3519, + "step": 10566 + }, + { + "epoch": 1.41, + "grad_norm": 0.51171875, + "learning_rate": 0.00016360627449820542, + "loss": 0.5155, + "step": 10567 + }, + { + "epoch": 1.41, + "grad_norm": 0.41796875, + "learning_rate": 0.00016359728833710636, + "loss": 0.2223, + "step": 10568 + }, + { + "epoch": 1.41, + "grad_norm": 0.48046875, + "learning_rate": 0.00016358830131358727, + "loss": 0.1725, + "step": 10569 + }, + { + "epoch": 1.41, + "grad_norm": 0.5, + "learning_rate": 0.00016357931342776997, + "loss": 0.322, + "step": 10570 + }, + { + "epoch": 1.41, + "grad_norm": 0.66015625, + "learning_rate": 0.0001635703246797764, + "loss": 0.644, + "step": 10571 + }, + { + "epoch": 1.41, + "grad_norm": 1.0859375, + "learning_rate": 0.00016356133506972845, + "loss": 0.5036, + "step": 10572 + }, + { + "epoch": 1.41, + "grad_norm": 0.41796875, + "learning_rate": 0.00016355234459774796, + "loss": 0.2945, + "step": 10573 + }, + { + "epoch": 1.41, + "grad_norm": 0.625, + "learning_rate": 0.0001635433532639569, + "loss": 0.3089, + "step": 10574 + }, + { + "epoch": 1.41, + "grad_norm": 0.3984375, + "learning_rate": 0.00016353436106847718, + "loss": 0.1831, + "step": 10575 + }, + { + "epoch": 1.41, + "grad_norm": 0.498046875, + "learning_rate": 0.00016352536801143077, + "loss": 0.5003, + "step": 10576 + }, + { + "epoch": 1.41, + "grad_norm": 0.5703125, + "learning_rate": 0.0001635163740929396, + "loss": 0.474, + "step": 10577 + }, + { + "epoch": 1.41, + "grad_norm": 0.55078125, + "learning_rate": 0.0001635073793131256, + "loss": 0.3407, + "step": 10578 + }, + { + "epoch": 1.41, + "grad_norm": 0.671875, + "learning_rate": 0.0001634983836721108, + "loss": 0.4653, + "step": 10579 + }, + { + "epoch": 1.41, + "grad_norm": 0.75390625, + "learning_rate": 0.0001634893871700172, + "loss": 0.2565, + "step": 10580 + }, + { + "epoch": 1.41, + "grad_norm": 0.4609375, + "learning_rate": 0.0001634803898069667, + "loss": 0.2525, + "step": 10581 + }, + { + "epoch": 1.41, + "grad_norm": 0.419921875, + "learning_rate": 0.0001634713915830814, + "loss": 0.2295, + "step": 10582 + }, + { + "epoch": 1.41, + "grad_norm": 0.421875, + "learning_rate": 0.0001634623924984833, + "loss": 0.27, + "step": 10583 + }, + { + "epoch": 1.41, + "grad_norm": 0.56640625, + "learning_rate": 0.00016345339255329443, + "loss": 0.3581, + "step": 10584 + }, + { + "epoch": 1.41, + "grad_norm": 0.75, + "learning_rate": 0.00016344439174763683, + "loss": 0.4803, + "step": 10585 + }, + { + "epoch": 1.41, + "grad_norm": 0.5625, + "learning_rate": 0.00016343539008163255, + "loss": 0.4602, + "step": 10586 + }, + { + "epoch": 1.41, + "grad_norm": 0.6328125, + "learning_rate": 0.00016342638755540367, + "loss": 0.2062, + "step": 10587 + }, + { + "epoch": 1.41, + "grad_norm": 0.5703125, + "learning_rate": 0.00016341738416907227, + "loss": 0.4917, + "step": 10588 + }, + { + "epoch": 1.41, + "grad_norm": 0.53515625, + "learning_rate": 0.0001634083799227604, + "loss": 0.4115, + "step": 10589 + }, + { + "epoch": 1.41, + "grad_norm": 0.53515625, + "learning_rate": 0.00016339937481659028, + "loss": 0.4896, + "step": 10590 + }, + { + "epoch": 1.41, + "grad_norm": 0.58203125, + "learning_rate": 0.00016339036885068393, + "loss": 0.3063, + "step": 10591 + }, + { + "epoch": 1.41, + "grad_norm": 0.6328125, + "learning_rate": 0.00016338136202516344, + "loss": 0.3862, + "step": 10592 + }, + { + "epoch": 1.41, + "grad_norm": 0.5859375, + "learning_rate": 0.00016337235434015106, + "loss": 0.4515, + "step": 10593 + }, + { + "epoch": 1.41, + "grad_norm": 0.4765625, + "learning_rate": 0.00016336334579576886, + "loss": 0.4618, + "step": 10594 + }, + { + "epoch": 1.41, + "grad_norm": 0.37109375, + "learning_rate": 0.00016335433639213902, + "loss": 0.2937, + "step": 10595 + }, + { + "epoch": 1.41, + "grad_norm": 0.5625, + "learning_rate": 0.00016334532612938374, + "loss": 0.3977, + "step": 10596 + }, + { + "epoch": 1.41, + "grad_norm": 1.4375, + "learning_rate": 0.00016333631500762516, + "loss": 0.2028, + "step": 10597 + }, + { + "epoch": 1.41, + "grad_norm": 0.46875, + "learning_rate": 0.0001633273030269855, + "loss": 0.4062, + "step": 10598 + }, + { + "epoch": 1.41, + "grad_norm": 0.546875, + "learning_rate": 0.00016331829018758698, + "loss": 0.6228, + "step": 10599 + }, + { + "epoch": 1.41, + "grad_norm": 0.53125, + "learning_rate": 0.0001633092764895518, + "loss": 0.2819, + "step": 10600 + }, + { + "epoch": 1.41, + "grad_norm": 0.7109375, + "learning_rate": 0.00016330026193300218, + "loss": 0.4468, + "step": 10601 + }, + { + "epoch": 1.41, + "grad_norm": 0.498046875, + "learning_rate": 0.00016329124651806042, + "loss": 0.576, + "step": 10602 + }, + { + "epoch": 1.41, + "grad_norm": 0.55859375, + "learning_rate": 0.0001632822302448487, + "loss": 0.384, + "step": 10603 + }, + { + "epoch": 1.41, + "grad_norm": 0.53515625, + "learning_rate": 0.0001632732131134894, + "loss": 0.4672, + "step": 10604 + }, + { + "epoch": 1.42, + "grad_norm": 0.48828125, + "learning_rate": 0.00016326419512410466, + "loss": 0.3453, + "step": 10605 + }, + { + "epoch": 1.42, + "grad_norm": 0.609375, + "learning_rate": 0.0001632551762768168, + "loss": 0.2886, + "step": 10606 + }, + { + "epoch": 1.42, + "grad_norm": 0.51171875, + "learning_rate": 0.0001632461565717482, + "loss": 0.4252, + "step": 10607 + }, + { + "epoch": 1.42, + "grad_norm": 0.484375, + "learning_rate": 0.00016323713600902113, + "loss": 0.4155, + "step": 10608 + }, + { + "epoch": 1.42, + "grad_norm": 0.609375, + "learning_rate": 0.00016322811458875788, + "loss": 0.4164, + "step": 10609 + }, + { + "epoch": 1.42, + "grad_norm": 0.54296875, + "learning_rate": 0.00016321909231108083, + "loss": 0.3113, + "step": 10610 + }, + { + "epoch": 1.42, + "grad_norm": 0.43359375, + "learning_rate": 0.00016321006917611235, + "loss": 0.1419, + "step": 10611 + }, + { + "epoch": 1.42, + "grad_norm": 0.412109375, + "learning_rate": 0.00016320104518397472, + "loss": 0.2777, + "step": 10612 + }, + { + "epoch": 1.42, + "grad_norm": 0.74609375, + "learning_rate": 0.00016319202033479035, + "loss": 0.7253, + "step": 10613 + }, + { + "epoch": 1.42, + "grad_norm": 0.59375, + "learning_rate": 0.0001631829946286817, + "loss": 0.4986, + "step": 10614 + }, + { + "epoch": 1.42, + "grad_norm": 0.443359375, + "learning_rate": 0.00016317396806577098, + "loss": 0.4088, + "step": 10615 + }, + { + "epoch": 1.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00016316494064618078, + "loss": 0.4412, + "step": 10616 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.00016315591237003342, + "loss": 0.3152, + "step": 10617 + }, + { + "epoch": 1.42, + "grad_norm": 0.474609375, + "learning_rate": 0.00016314688323745138, + "loss": 0.2982, + "step": 10618 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.00016313785324855706, + "loss": 0.4922, + "step": 10619 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.00016312882240347295, + "loss": 0.3696, + "step": 10620 + }, + { + "epoch": 1.42, + "grad_norm": 0.59765625, + "learning_rate": 0.00016311979070232147, + "loss": 0.3077, + "step": 10621 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.0001631107581452251, + "loss": 0.277, + "step": 10622 + }, + { + "epoch": 1.42, + "grad_norm": 0.31640625, + "learning_rate": 0.0001631017247323064, + "loss": 0.2459, + "step": 10623 + }, + { + "epoch": 1.42, + "grad_norm": 0.44921875, + "learning_rate": 0.00016309269046368776, + "loss": 0.2565, + "step": 10624 + }, + { + "epoch": 1.42, + "grad_norm": 0.455078125, + "learning_rate": 0.00016308365533949175, + "loss": 0.3176, + "step": 10625 + }, + { + "epoch": 1.42, + "grad_norm": 0.69140625, + "learning_rate": 0.00016307461935984093, + "loss": 0.2283, + "step": 10626 + }, + { + "epoch": 1.42, + "grad_norm": 0.56640625, + "learning_rate": 0.00016306558252485775, + "loss": 0.3528, + "step": 10627 + }, + { + "epoch": 1.42, + "grad_norm": 0.57421875, + "learning_rate": 0.00016305654483466482, + "loss": 0.1733, + "step": 10628 + }, + { + "epoch": 1.42, + "grad_norm": 0.60546875, + "learning_rate": 0.00016304750628938464, + "loss": 0.3157, + "step": 10629 + }, + { + "epoch": 1.42, + "grad_norm": 0.50390625, + "learning_rate": 0.00016303846688913985, + "loss": 0.2456, + "step": 10630 + }, + { + "epoch": 1.42, + "grad_norm": 0.6328125, + "learning_rate": 0.00016302942663405296, + "loss": 0.4022, + "step": 10631 + }, + { + "epoch": 1.42, + "grad_norm": 0.462890625, + "learning_rate": 0.0001630203855242466, + "loss": 0.3044, + "step": 10632 + }, + { + "epoch": 1.42, + "grad_norm": 0.451171875, + "learning_rate": 0.00016301134355984338, + "loss": 0.2751, + "step": 10633 + }, + { + "epoch": 1.42, + "grad_norm": 0.6875, + "learning_rate": 0.00016300230074096589, + "loss": 0.2505, + "step": 10634 + }, + { + "epoch": 1.42, + "grad_norm": 0.5390625, + "learning_rate": 0.00016299325706773678, + "loss": 0.4947, + "step": 10635 + }, + { + "epoch": 1.42, + "grad_norm": 0.7421875, + "learning_rate": 0.00016298421254027864, + "loss": 0.4037, + "step": 10636 + }, + { + "epoch": 1.42, + "grad_norm": 0.57421875, + "learning_rate": 0.0001629751671587142, + "loss": 0.2913, + "step": 10637 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.00016296612092316605, + "loss": 0.2127, + "step": 10638 + }, + { + "epoch": 1.42, + "grad_norm": 0.53125, + "learning_rate": 0.0001629570738337569, + "loss": 0.2036, + "step": 10639 + }, + { + "epoch": 1.42, + "grad_norm": 0.61328125, + "learning_rate": 0.00016294802589060945, + "loss": 0.5601, + "step": 10640 + }, + { + "epoch": 1.42, + "grad_norm": 0.466796875, + "learning_rate": 0.00016293897709384632, + "loss": 0.4602, + "step": 10641 + }, + { + "epoch": 1.42, + "grad_norm": 0.416015625, + "learning_rate": 0.00016292992744359027, + "loss": 0.2435, + "step": 10642 + }, + { + "epoch": 1.42, + "grad_norm": 0.7109375, + "learning_rate": 0.00016292087693996402, + "loss": 0.4541, + "step": 10643 + }, + { + "epoch": 1.42, + "grad_norm": 0.8515625, + "learning_rate": 0.00016291182558309032, + "loss": 0.3853, + "step": 10644 + }, + { + "epoch": 1.42, + "grad_norm": 0.66796875, + "learning_rate": 0.0001629027733730919, + "loss": 0.4185, + "step": 10645 + }, + { + "epoch": 1.42, + "grad_norm": 0.484375, + "learning_rate": 0.00016289372031009144, + "loss": 0.3179, + "step": 10646 + }, + { + "epoch": 1.42, + "grad_norm": 0.451171875, + "learning_rate": 0.00016288466639421183, + "loss": 0.2929, + "step": 10647 + }, + { + "epoch": 1.42, + "grad_norm": 0.490234375, + "learning_rate": 0.00016287561162557578, + "loss": 0.4228, + "step": 10648 + }, + { + "epoch": 1.42, + "grad_norm": 0.54296875, + "learning_rate": 0.00016286655600430603, + "loss": 0.6467, + "step": 10649 + }, + { + "epoch": 1.42, + "grad_norm": 0.39453125, + "learning_rate": 0.00016285749953052548, + "loss": 0.2728, + "step": 10650 + }, + { + "epoch": 1.42, + "grad_norm": 0.375, + "learning_rate": 0.0001628484422043569, + "loss": 0.224, + "step": 10651 + }, + { + "epoch": 1.42, + "grad_norm": 0.443359375, + "learning_rate": 0.00016283938402592306, + "loss": 0.1737, + "step": 10652 + }, + { + "epoch": 1.42, + "grad_norm": 0.8828125, + "learning_rate": 0.00016283032499534686, + "loss": 0.5055, + "step": 10653 + }, + { + "epoch": 1.42, + "grad_norm": 0.640625, + "learning_rate": 0.00016282126511275115, + "loss": 0.3923, + "step": 10654 + }, + { + "epoch": 1.42, + "grad_norm": 0.447265625, + "learning_rate": 0.00016281220437825878, + "loss": 0.4828, + "step": 10655 + }, + { + "epoch": 1.42, + "grad_norm": 0.4921875, + "learning_rate": 0.00016280314279199255, + "loss": 0.2654, + "step": 10656 + }, + { + "epoch": 1.42, + "grad_norm": 0.609375, + "learning_rate": 0.00016279408035407546, + "loss": 0.3494, + "step": 10657 + }, + { + "epoch": 1.42, + "grad_norm": 0.62109375, + "learning_rate": 0.0001627850170646303, + "loss": 0.3381, + "step": 10658 + }, + { + "epoch": 1.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00016277595292378002, + "loss": 0.2907, + "step": 10659 + }, + { + "epoch": 1.42, + "grad_norm": 0.640625, + "learning_rate": 0.0001627668879316475, + "loss": 0.4084, + "step": 10660 + }, + { + "epoch": 1.42, + "grad_norm": 0.46484375, + "learning_rate": 0.00016275782208835575, + "loss": 0.2189, + "step": 10661 + }, + { + "epoch": 1.42, + "grad_norm": 0.57421875, + "learning_rate": 0.00016274875539402762, + "loss": 0.3591, + "step": 10662 + }, + { + "epoch": 1.42, + "grad_norm": 0.60546875, + "learning_rate": 0.00016273968784878614, + "loss": 0.4478, + "step": 10663 + }, + { + "epoch": 1.42, + "grad_norm": 0.5, + "learning_rate": 0.0001627306194527542, + "loss": 0.4019, + "step": 10664 + }, + { + "epoch": 1.42, + "grad_norm": 0.51953125, + "learning_rate": 0.0001627215502060548, + "loss": 0.6714, + "step": 10665 + }, + { + "epoch": 1.42, + "grad_norm": 0.482421875, + "learning_rate": 0.0001627124801088109, + "loss": 0.4405, + "step": 10666 + }, + { + "epoch": 1.42, + "grad_norm": 0.6328125, + "learning_rate": 0.00016270340916114555, + "loss": 0.6529, + "step": 10667 + }, + { + "epoch": 1.42, + "grad_norm": 0.42578125, + "learning_rate": 0.0001626943373631817, + "loss": 0.2924, + "step": 10668 + }, + { + "epoch": 1.42, + "grad_norm": 0.6328125, + "learning_rate": 0.00016268526471504245, + "loss": 0.1999, + "step": 10669 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.0001626761912168507, + "loss": 0.4414, + "step": 10670 + }, + { + "epoch": 1.42, + "grad_norm": 0.85546875, + "learning_rate": 0.00016266711686872964, + "loss": 0.2184, + "step": 10671 + }, + { + "epoch": 1.42, + "grad_norm": 0.498046875, + "learning_rate": 0.00016265804167080224, + "loss": 0.5455, + "step": 10672 + }, + { + "epoch": 1.42, + "grad_norm": 0.578125, + "learning_rate": 0.0001626489656231916, + "loss": 0.4143, + "step": 10673 + }, + { + "epoch": 1.42, + "grad_norm": 0.39453125, + "learning_rate": 0.0001626398887260207, + "loss": 0.2671, + "step": 10674 + }, + { + "epoch": 1.42, + "grad_norm": 0.640625, + "learning_rate": 0.0001626308109794128, + "loss": 0.402, + "step": 10675 + }, + { + "epoch": 1.42, + "grad_norm": 0.69140625, + "learning_rate": 0.00016262173238349085, + "loss": 0.4365, + "step": 10676 + }, + { + "epoch": 1.42, + "grad_norm": 0.64453125, + "learning_rate": 0.00016261265293837803, + "loss": 0.5285, + "step": 10677 + }, + { + "epoch": 1.42, + "grad_norm": 0.83984375, + "learning_rate": 0.00016260357264419746, + "loss": 0.5507, + "step": 10678 + }, + { + "epoch": 1.43, + "grad_norm": 0.60546875, + "learning_rate": 0.0001625944915010723, + "loss": 0.4816, + "step": 10679 + }, + { + "epoch": 1.43, + "grad_norm": 0.400390625, + "learning_rate": 0.0001625854095091256, + "loss": 0.2408, + "step": 10680 + }, + { + "epoch": 1.43, + "grad_norm": 0.58984375, + "learning_rate": 0.00016257632666848062, + "loss": 0.5496, + "step": 10681 + }, + { + "epoch": 1.43, + "grad_norm": 0.64453125, + "learning_rate": 0.00016256724297926048, + "loss": 0.3805, + "step": 10682 + }, + { + "epoch": 1.43, + "grad_norm": 0.92578125, + "learning_rate": 0.00016255815844158834, + "loss": 0.49, + "step": 10683 + }, + { + "epoch": 1.43, + "grad_norm": 0.4296875, + "learning_rate": 0.00016254907305558746, + "loss": 0.2581, + "step": 10684 + }, + { + "epoch": 1.43, + "grad_norm": 0.58203125, + "learning_rate": 0.000162539986821381, + "loss": 0.5702, + "step": 10685 + }, + { + "epoch": 1.43, + "grad_norm": 0.33984375, + "learning_rate": 0.00016253089973909215, + "loss": 0.232, + "step": 10686 + }, + { + "epoch": 1.43, + "grad_norm": 0.345703125, + "learning_rate": 0.0001625218118088442, + "loss": 0.1905, + "step": 10687 + }, + { + "epoch": 1.43, + "grad_norm": 0.703125, + "learning_rate": 0.00016251272303076036, + "loss": 0.2162, + "step": 10688 + }, + { + "epoch": 1.43, + "grad_norm": 0.64453125, + "learning_rate": 0.00016250363340496385, + "loss": 0.4924, + "step": 10689 + }, + { + "epoch": 1.43, + "grad_norm": 0.53125, + "learning_rate": 0.00016249454293157798, + "loss": 0.6655, + "step": 10690 + }, + { + "epoch": 1.43, + "grad_norm": 0.47265625, + "learning_rate": 0.000162485451610726, + "loss": 0.313, + "step": 10691 + }, + { + "epoch": 1.43, + "grad_norm": 0.55859375, + "learning_rate": 0.00016247635944253115, + "loss": 0.3198, + "step": 10692 + }, + { + "epoch": 1.43, + "grad_norm": 0.80859375, + "learning_rate": 0.00016246726642711682, + "loss": 0.4706, + "step": 10693 + }, + { + "epoch": 1.43, + "grad_norm": 0.578125, + "learning_rate": 0.00016245817256460625, + "loss": 0.2742, + "step": 10694 + }, + { + "epoch": 1.43, + "grad_norm": 0.490234375, + "learning_rate": 0.00016244907785512277, + "loss": 0.3087, + "step": 10695 + }, + { + "epoch": 1.43, + "grad_norm": 0.58984375, + "learning_rate": 0.0001624399822987897, + "loss": 0.2867, + "step": 10696 + }, + { + "epoch": 1.43, + "grad_norm": 0.45703125, + "learning_rate": 0.00016243088589573042, + "loss": 0.1828, + "step": 10697 + }, + { + "epoch": 1.43, + "grad_norm": 0.53125, + "learning_rate": 0.00016242178864606826, + "loss": 0.2008, + "step": 10698 + }, + { + "epoch": 1.43, + "grad_norm": 0.486328125, + "learning_rate": 0.00016241269054992658, + "loss": 0.3206, + "step": 10699 + }, + { + "epoch": 1.43, + "grad_norm": 0.47265625, + "learning_rate": 0.00016240359160742875, + "loss": 0.535, + "step": 10700 + }, + { + "epoch": 1.43, + "grad_norm": 0.447265625, + "learning_rate": 0.00016239449181869816, + "loss": 0.5018, + "step": 10701 + }, + { + "epoch": 1.43, + "grad_norm": 0.61328125, + "learning_rate": 0.00016238539118385825, + "loss": 0.2387, + "step": 10702 + }, + { + "epoch": 1.43, + "grad_norm": 0.50390625, + "learning_rate": 0.00016237628970303236, + "loss": 0.4028, + "step": 10703 + }, + { + "epoch": 1.43, + "grad_norm": 0.49609375, + "learning_rate": 0.00016236718737634397, + "loss": 0.3382, + "step": 10704 + }, + { + "epoch": 1.43, + "grad_norm": 0.65625, + "learning_rate": 0.00016235808420391652, + "loss": 0.1987, + "step": 10705 + }, + { + "epoch": 1.43, + "grad_norm": 0.416015625, + "learning_rate": 0.00016234898018587337, + "loss": 0.2338, + "step": 10706 + }, + { + "epoch": 1.43, + "grad_norm": 0.671875, + "learning_rate": 0.00016233987532233805, + "loss": 0.3066, + "step": 10707 + }, + { + "epoch": 1.43, + "grad_norm": 0.4921875, + "learning_rate": 0.00016233076961343402, + "loss": 0.4544, + "step": 10708 + }, + { + "epoch": 1.43, + "grad_norm": 0.671875, + "learning_rate": 0.00016232166305928474, + "loss": 0.4659, + "step": 10709 + }, + { + "epoch": 1.43, + "grad_norm": 0.412109375, + "learning_rate": 0.0001623125556600137, + "loss": 0.2427, + "step": 10710 + }, + { + "epoch": 1.43, + "grad_norm": 0.482421875, + "learning_rate": 0.00016230344741574446, + "loss": 0.1582, + "step": 10711 + }, + { + "epoch": 1.43, + "grad_norm": 1.140625, + "learning_rate": 0.00016229433832660046, + "loss": 0.2665, + "step": 10712 + }, + { + "epoch": 1.43, + "grad_norm": 0.451171875, + "learning_rate": 0.00016228522839270527, + "loss": 0.2767, + "step": 10713 + }, + { + "epoch": 1.43, + "grad_norm": 0.72265625, + "learning_rate": 0.00016227611761418238, + "loss": 0.3835, + "step": 10714 + }, + { + "epoch": 1.43, + "grad_norm": 0.51171875, + "learning_rate": 0.0001622670059911554, + "loss": 0.6233, + "step": 10715 + }, + { + "epoch": 1.43, + "grad_norm": 0.4375, + "learning_rate": 0.00016225789352374782, + "loss": 0.229, + "step": 10716 + }, + { + "epoch": 1.43, + "grad_norm": 0.59375, + "learning_rate": 0.00016224878021208326, + "loss": 0.4031, + "step": 10717 + }, + { + "epoch": 1.43, + "grad_norm": 0.65625, + "learning_rate": 0.0001622396660562853, + "loss": 0.398, + "step": 10718 + }, + { + "epoch": 1.43, + "grad_norm": 0.50390625, + "learning_rate": 0.0001622305510564775, + "loss": 0.3508, + "step": 10719 + }, + { + "epoch": 1.43, + "grad_norm": 0.478515625, + "learning_rate": 0.0001622214352127835, + "loss": 0.5353, + "step": 10720 + }, + { + "epoch": 1.43, + "grad_norm": 0.6015625, + "learning_rate": 0.0001622123185253269, + "loss": 0.769, + "step": 10721 + }, + { + "epoch": 1.43, + "grad_norm": 0.546875, + "learning_rate": 0.00016220320099423133, + "loss": 0.4241, + "step": 10722 + }, + { + "epoch": 1.43, + "grad_norm": 0.51171875, + "learning_rate": 0.00016219408261962045, + "loss": 0.4986, + "step": 10723 + }, + { + "epoch": 1.43, + "grad_norm": 0.52734375, + "learning_rate": 0.00016218496340161787, + "loss": 0.337, + "step": 10724 + }, + { + "epoch": 1.43, + "grad_norm": 0.59765625, + "learning_rate": 0.0001621758433403473, + "loss": 0.278, + "step": 10725 + }, + { + "epoch": 1.43, + "grad_norm": 0.45703125, + "learning_rate": 0.0001621667224359324, + "loss": 0.5403, + "step": 10726 + }, + { + "epoch": 1.43, + "grad_norm": 0.466796875, + "learning_rate": 0.00016215760068849684, + "loss": 0.4357, + "step": 10727 + }, + { + "epoch": 1.43, + "grad_norm": 0.5546875, + "learning_rate": 0.00016214847809816428, + "loss": 0.6338, + "step": 10728 + }, + { + "epoch": 1.43, + "grad_norm": 0.51953125, + "learning_rate": 0.00016213935466505853, + "loss": 0.1828, + "step": 10729 + }, + { + "epoch": 1.43, + "grad_norm": 0.3984375, + "learning_rate": 0.00016213023038930322, + "loss": 0.2365, + "step": 10730 + }, + { + "epoch": 1.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00016212110527102209, + "loss": 0.5403, + "step": 10731 + }, + { + "epoch": 1.43, + "grad_norm": 0.57421875, + "learning_rate": 0.00016211197931033895, + "loss": 0.5154, + "step": 10732 + }, + { + "epoch": 1.43, + "grad_norm": 0.734375, + "learning_rate": 0.00016210285250737748, + "loss": 0.4758, + "step": 10733 + }, + { + "epoch": 1.43, + "grad_norm": 0.58203125, + "learning_rate": 0.00016209372486226148, + "loss": 0.3572, + "step": 10734 + }, + { + "epoch": 1.43, + "grad_norm": 0.455078125, + "learning_rate": 0.0001620845963751147, + "loss": 0.5463, + "step": 10735 + }, + { + "epoch": 1.43, + "grad_norm": 0.484375, + "learning_rate": 0.00016207546704606095, + "loss": 0.3561, + "step": 10736 + }, + { + "epoch": 1.43, + "grad_norm": 0.51171875, + "learning_rate": 0.00016206633687522405, + "loss": 0.3067, + "step": 10737 + }, + { + "epoch": 1.43, + "grad_norm": 0.5078125, + "learning_rate": 0.00016205720586272778, + "loss": 0.393, + "step": 10738 + }, + { + "epoch": 1.43, + "grad_norm": 0.578125, + "learning_rate": 0.00016204807400869596, + "loss": 0.1857, + "step": 10739 + }, + { + "epoch": 1.43, + "grad_norm": 0.466796875, + "learning_rate": 0.00016203894131325245, + "loss": 0.3621, + "step": 10740 + }, + { + "epoch": 1.43, + "grad_norm": 0.453125, + "learning_rate": 0.00016202980777652105, + "loss": 0.3898, + "step": 10741 + }, + { + "epoch": 1.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00016202067339862568, + "loss": 0.5171, + "step": 10742 + }, + { + "epoch": 1.43, + "grad_norm": 0.3984375, + "learning_rate": 0.00016201153817969018, + "loss": 0.273, + "step": 10743 + }, + { + "epoch": 1.43, + "grad_norm": 0.4609375, + "learning_rate": 0.0001620024021198384, + "loss": 0.3187, + "step": 10744 + }, + { + "epoch": 1.43, + "grad_norm": 0.67578125, + "learning_rate": 0.00016199326521919427, + "loss": 0.6162, + "step": 10745 + }, + { + "epoch": 1.43, + "grad_norm": 0.490234375, + "learning_rate": 0.00016198412747788166, + "loss": 0.4699, + "step": 10746 + }, + { + "epoch": 1.43, + "grad_norm": 0.48046875, + "learning_rate": 0.00016197498889602448, + "loss": 0.2452, + "step": 10747 + }, + { + "epoch": 1.43, + "grad_norm": 0.56640625, + "learning_rate": 0.00016196584947374674, + "loss": 0.3434, + "step": 10748 + }, + { + "epoch": 1.43, + "grad_norm": 0.46875, + "learning_rate": 0.00016195670921117226, + "loss": 0.6178, + "step": 10749 + }, + { + "epoch": 1.43, + "grad_norm": 0.5625, + "learning_rate": 0.00016194756810842504, + "loss": 0.6036, + "step": 10750 + }, + { + "epoch": 1.43, + "grad_norm": 0.470703125, + "learning_rate": 0.00016193842616562905, + "loss": 0.5062, + "step": 10751 + }, + { + "epoch": 1.43, + "grad_norm": 0.48828125, + "learning_rate": 0.00016192928338290826, + "loss": 0.4953, + "step": 10752 + }, + { + "epoch": 1.43, + "grad_norm": 0.62109375, + "learning_rate": 0.00016192013976038664, + "loss": 0.2279, + "step": 10753 + }, + { + "epoch": 1.44, + "grad_norm": 0.6171875, + "learning_rate": 0.00016191099529818819, + "loss": 0.3646, + "step": 10754 + }, + { + "epoch": 1.44, + "grad_norm": 0.78515625, + "learning_rate": 0.00016190184999643686, + "loss": 0.3021, + "step": 10755 + }, + { + "epoch": 1.44, + "grad_norm": 0.484375, + "learning_rate": 0.00016189270385525675, + "loss": 0.4425, + "step": 10756 + }, + { + "epoch": 1.44, + "grad_norm": 0.609375, + "learning_rate": 0.00016188355687477186, + "loss": 0.2959, + "step": 10757 + }, + { + "epoch": 1.44, + "grad_norm": 0.6640625, + "learning_rate": 0.00016187440905510622, + "loss": 0.4971, + "step": 10758 + }, + { + "epoch": 1.44, + "grad_norm": 0.56640625, + "learning_rate": 0.0001618652603963839, + "loss": 0.428, + "step": 10759 + }, + { + "epoch": 1.44, + "grad_norm": 0.380859375, + "learning_rate": 0.0001618561108987289, + "loss": 0.2874, + "step": 10760 + }, + { + "epoch": 1.44, + "grad_norm": 0.466796875, + "learning_rate": 0.0001618469605622654, + "loss": 0.2818, + "step": 10761 + }, + { + "epoch": 1.44, + "grad_norm": 0.8359375, + "learning_rate": 0.00016183780938711738, + "loss": 0.3708, + "step": 10762 + }, + { + "epoch": 1.44, + "grad_norm": 0.609375, + "learning_rate": 0.00016182865737340897, + "loss": 0.5085, + "step": 10763 + }, + { + "epoch": 1.44, + "grad_norm": 0.578125, + "learning_rate": 0.00016181950452126432, + "loss": 0.3812, + "step": 10764 + }, + { + "epoch": 1.44, + "grad_norm": 0.6953125, + "learning_rate": 0.00016181035083080744, + "loss": 0.3197, + "step": 10765 + }, + { + "epoch": 1.44, + "grad_norm": 0.609375, + "learning_rate": 0.0001618011963021626, + "loss": 0.4127, + "step": 10766 + }, + { + "epoch": 1.44, + "grad_norm": 0.75, + "learning_rate": 0.00016179204093545385, + "loss": 0.469, + "step": 10767 + }, + { + "epoch": 1.44, + "grad_norm": 0.54296875, + "learning_rate": 0.00016178288473080537, + "loss": 0.6796, + "step": 10768 + }, + { + "epoch": 1.44, + "grad_norm": 0.56640625, + "learning_rate": 0.00016177372768834134, + "loss": 0.6104, + "step": 10769 + }, + { + "epoch": 1.44, + "grad_norm": 0.69921875, + "learning_rate": 0.0001617645698081859, + "loss": 0.4766, + "step": 10770 + }, + { + "epoch": 1.44, + "grad_norm": 0.5703125, + "learning_rate": 0.00016175541109046324, + "loss": 0.4969, + "step": 10771 + }, + { + "epoch": 1.44, + "grad_norm": 0.546875, + "learning_rate": 0.0001617462515352976, + "loss": 0.3174, + "step": 10772 + }, + { + "epoch": 1.44, + "grad_norm": 0.5703125, + "learning_rate": 0.00016173709114281315, + "loss": 0.385, + "step": 10773 + }, + { + "epoch": 1.44, + "grad_norm": 0.48046875, + "learning_rate": 0.0001617279299131341, + "loss": 0.2915, + "step": 10774 + }, + { + "epoch": 1.44, + "grad_norm": 0.51953125, + "learning_rate": 0.00016171876784638474, + "loss": 0.1863, + "step": 10775 + }, + { + "epoch": 1.44, + "grad_norm": 0.81640625, + "learning_rate": 0.00016170960494268926, + "loss": 0.5572, + "step": 10776 + }, + { + "epoch": 1.44, + "grad_norm": 0.4921875, + "learning_rate": 0.0001617004412021719, + "loss": 0.3275, + "step": 10777 + }, + { + "epoch": 1.44, + "grad_norm": 0.671875, + "learning_rate": 0.000161691276624957, + "loss": 0.2583, + "step": 10778 + }, + { + "epoch": 1.44, + "grad_norm": 0.65234375, + "learning_rate": 0.0001616821112111688, + "loss": 0.4077, + "step": 10779 + }, + { + "epoch": 1.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00016167294496093157, + "loss": 0.276, + "step": 10780 + }, + { + "epoch": 1.44, + "grad_norm": 0.52734375, + "learning_rate": 0.0001616637778743696, + "loss": 0.5598, + "step": 10781 + }, + { + "epoch": 1.44, + "grad_norm": 0.69140625, + "learning_rate": 0.0001616546099516073, + "loss": 0.4413, + "step": 10782 + }, + { + "epoch": 1.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00016164544119276884, + "loss": 0.4272, + "step": 10783 + }, + { + "epoch": 1.44, + "grad_norm": 0.61328125, + "learning_rate": 0.00016163627159797867, + "loss": 0.2273, + "step": 10784 + }, + { + "epoch": 1.44, + "grad_norm": 0.6796875, + "learning_rate": 0.00016162710116736107, + "loss": 0.4979, + "step": 10785 + }, + { + "epoch": 1.44, + "grad_norm": 0.6484375, + "learning_rate": 0.00016161792990104047, + "loss": 0.3813, + "step": 10786 + }, + { + "epoch": 1.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00016160875779914116, + "loss": 0.2847, + "step": 10787 + }, + { + "epoch": 1.44, + "grad_norm": 0.435546875, + "learning_rate": 0.0001615995848617876, + "loss": 0.1977, + "step": 10788 + }, + { + "epoch": 1.44, + "grad_norm": 0.65234375, + "learning_rate": 0.00016159041108910409, + "loss": 0.3445, + "step": 10789 + }, + { + "epoch": 1.44, + "grad_norm": 0.546875, + "learning_rate": 0.0001615812364812151, + "loss": 0.3123, + "step": 10790 + }, + { + "epoch": 1.44, + "grad_norm": 0.5078125, + "learning_rate": 0.000161572061038245, + "loss": 0.3824, + "step": 10791 + }, + { + "epoch": 1.44, + "grad_norm": 0.58203125, + "learning_rate": 0.00016156288476031825, + "loss": 0.4204, + "step": 10792 + }, + { + "epoch": 1.44, + "grad_norm": 0.427734375, + "learning_rate": 0.00016155370764755927, + "loss": 0.2365, + "step": 10793 + }, + { + "epoch": 1.44, + "grad_norm": 0.5078125, + "learning_rate": 0.0001615445297000925, + "loss": 0.3213, + "step": 10794 + }, + { + "epoch": 1.44, + "grad_norm": 0.515625, + "learning_rate": 0.0001615353509180424, + "loss": 0.4538, + "step": 10795 + }, + { + "epoch": 1.44, + "grad_norm": 0.44921875, + "learning_rate": 0.00016152617130153348, + "loss": 0.2233, + "step": 10796 + }, + { + "epoch": 1.44, + "grad_norm": 0.56640625, + "learning_rate": 0.00016151699085069016, + "loss": 0.3308, + "step": 10797 + }, + { + "epoch": 1.44, + "grad_norm": 0.7109375, + "learning_rate": 0.000161507809565637, + "loss": 0.2845, + "step": 10798 + }, + { + "epoch": 1.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00016149862744649842, + "loss": 0.27, + "step": 10799 + }, + { + "epoch": 1.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00016148944449339902, + "loss": 0.556, + "step": 10800 + }, + { + "epoch": 1.44, + "grad_norm": 0.55859375, + "learning_rate": 0.00016148026070646326, + "loss": 0.2625, + "step": 10801 + }, + { + "epoch": 1.44, + "grad_norm": 0.55078125, + "learning_rate": 0.00016147107608581576, + "loss": 0.3935, + "step": 10802 + }, + { + "epoch": 1.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00016146189063158094, + "loss": 0.4147, + "step": 10803 + }, + { + "epoch": 1.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00016145270434388348, + "loss": 0.3967, + "step": 10804 + }, + { + "epoch": 1.44, + "grad_norm": 0.52734375, + "learning_rate": 0.0001614435172228479, + "loss": 0.3974, + "step": 10805 + }, + { + "epoch": 1.44, + "grad_norm": 0.65625, + "learning_rate": 0.0001614343292685988, + "loss": 0.4255, + "step": 10806 + }, + { + "epoch": 1.44, + "grad_norm": 0.578125, + "learning_rate": 0.00016142514048126077, + "loss": 0.2302, + "step": 10807 + }, + { + "epoch": 1.44, + "grad_norm": 0.5078125, + "learning_rate": 0.00016141595086095842, + "loss": 0.3416, + "step": 10808 + }, + { + "epoch": 1.44, + "grad_norm": 0.37109375, + "learning_rate": 0.00016140676040781634, + "loss": 0.2287, + "step": 10809 + }, + { + "epoch": 1.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00016139756912195917, + "loss": 0.4092, + "step": 10810 + }, + { + "epoch": 1.44, + "grad_norm": 0.6015625, + "learning_rate": 0.00016138837700351157, + "loss": 0.3053, + "step": 10811 + }, + { + "epoch": 1.44, + "grad_norm": 0.44140625, + "learning_rate": 0.00016137918405259818, + "loss": 0.2265, + "step": 10812 + }, + { + "epoch": 1.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00016136999026934368, + "loss": 0.5409, + "step": 10813 + }, + { + "epoch": 1.44, + "grad_norm": 0.59375, + "learning_rate": 0.0001613607956538727, + "loss": 0.3361, + "step": 10814 + }, + { + "epoch": 1.44, + "grad_norm": 0.6015625, + "learning_rate": 0.00016135160020630998, + "loss": 0.4202, + "step": 10815 + }, + { + "epoch": 1.44, + "grad_norm": 0.4453125, + "learning_rate": 0.00016134240392678013, + "loss": 0.5337, + "step": 10816 + }, + { + "epoch": 1.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00016133320681540795, + "loss": 0.1819, + "step": 10817 + }, + { + "epoch": 1.44, + "grad_norm": 0.470703125, + "learning_rate": 0.0001613240088723181, + "loss": 0.5311, + "step": 10818 + }, + { + "epoch": 1.44, + "grad_norm": 0.4921875, + "learning_rate": 0.00016131481009763537, + "loss": 0.2343, + "step": 10819 + }, + { + "epoch": 1.44, + "grad_norm": 0.57421875, + "learning_rate": 0.00016130561049148445, + "loss": 0.4028, + "step": 10820 + }, + { + "epoch": 1.44, + "grad_norm": 0.34765625, + "learning_rate": 0.00016129641005399012, + "loss": 0.2387, + "step": 10821 + }, + { + "epoch": 1.44, + "grad_norm": 0.56640625, + "learning_rate": 0.0001612872087852771, + "loss": 0.4446, + "step": 10822 + }, + { + "epoch": 1.44, + "grad_norm": 0.43359375, + "learning_rate": 0.0001612780066854702, + "loss": 0.274, + "step": 10823 + }, + { + "epoch": 1.44, + "grad_norm": 0.55078125, + "learning_rate": 0.0001612688037546942, + "loss": 0.4201, + "step": 10824 + }, + { + "epoch": 1.44, + "grad_norm": 0.546875, + "learning_rate": 0.00016125959999307388, + "loss": 0.6458, + "step": 10825 + }, + { + "epoch": 1.44, + "grad_norm": 0.5546875, + "learning_rate": 0.00016125039540073412, + "loss": 0.2617, + "step": 10826 + }, + { + "epoch": 1.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00016124118997779967, + "loss": 0.2511, + "step": 10827 + }, + { + "epoch": 1.44, + "grad_norm": 0.546875, + "learning_rate": 0.00016123198372439536, + "loss": 0.2513, + "step": 10828 + }, + { + "epoch": 1.45, + "grad_norm": 0.62109375, + "learning_rate": 0.0001612227766406461, + "loss": 0.2927, + "step": 10829 + }, + { + "epoch": 1.45, + "grad_norm": 0.55859375, + "learning_rate": 0.00016121356872667662, + "loss": 0.5064, + "step": 10830 + }, + { + "epoch": 1.45, + "grad_norm": 0.384765625, + "learning_rate": 0.00016120435998261189, + "loss": 0.2, + "step": 10831 + }, + { + "epoch": 1.45, + "grad_norm": 0.40234375, + "learning_rate": 0.00016119515040857678, + "loss": 0.3045, + "step": 10832 + }, + { + "epoch": 1.45, + "grad_norm": 0.494140625, + "learning_rate": 0.00016118594000469617, + "loss": 0.322, + "step": 10833 + }, + { + "epoch": 1.45, + "grad_norm": 0.66015625, + "learning_rate": 0.00016117672877109492, + "loss": 0.6663, + "step": 10834 + }, + { + "epoch": 1.45, + "grad_norm": 0.59765625, + "learning_rate": 0.00016116751670789797, + "loss": 0.475, + "step": 10835 + }, + { + "epoch": 1.45, + "grad_norm": 0.431640625, + "learning_rate": 0.00016115830381523022, + "loss": 0.3648, + "step": 10836 + }, + { + "epoch": 1.45, + "grad_norm": 0.62109375, + "learning_rate": 0.0001611490900932166, + "loss": 0.6783, + "step": 10837 + }, + { + "epoch": 1.45, + "grad_norm": 0.435546875, + "learning_rate": 0.00016113987554198213, + "loss": 0.352, + "step": 10838 + }, + { + "epoch": 1.45, + "grad_norm": 0.5625, + "learning_rate": 0.00016113066016165171, + "loss": 0.4996, + "step": 10839 + }, + { + "epoch": 1.45, + "grad_norm": 0.5078125, + "learning_rate": 0.00016112144395235024, + "loss": 0.2511, + "step": 10840 + }, + { + "epoch": 1.45, + "grad_norm": 0.515625, + "learning_rate": 0.00016111222691420283, + "loss": 0.341, + "step": 10841 + }, + { + "epoch": 1.45, + "grad_norm": 0.578125, + "learning_rate": 0.00016110300904733434, + "loss": 0.5719, + "step": 10842 + }, + { + "epoch": 1.45, + "grad_norm": 0.625, + "learning_rate": 0.0001610937903518699, + "loss": 0.4171, + "step": 10843 + }, + { + "epoch": 1.45, + "grad_norm": 0.52734375, + "learning_rate": 0.0001610845708279344, + "loss": 0.6594, + "step": 10844 + }, + { + "epoch": 1.45, + "grad_norm": 0.56640625, + "learning_rate": 0.00016107535047565295, + "loss": 0.4376, + "step": 10845 + }, + { + "epoch": 1.45, + "grad_norm": 0.375, + "learning_rate": 0.0001610661292951505, + "loss": 0.3648, + "step": 10846 + }, + { + "epoch": 1.45, + "grad_norm": 0.76953125, + "learning_rate": 0.00016105690728655217, + "loss": 0.691, + "step": 10847 + }, + { + "epoch": 1.45, + "grad_norm": 0.671875, + "learning_rate": 0.00016104768444998298, + "loss": 0.3284, + "step": 10848 + }, + { + "epoch": 1.45, + "grad_norm": 0.61328125, + "learning_rate": 0.00016103846078556803, + "loss": 0.3632, + "step": 10849 + }, + { + "epoch": 1.45, + "grad_norm": 0.59375, + "learning_rate": 0.0001610292362934324, + "loss": 0.3206, + "step": 10850 + }, + { + "epoch": 1.45, + "grad_norm": 0.44921875, + "learning_rate": 0.00016102001097370114, + "loss": 0.2069, + "step": 10851 + }, + { + "epoch": 1.45, + "grad_norm": 0.63671875, + "learning_rate": 0.00016101078482649937, + "loss": 0.5795, + "step": 10852 + }, + { + "epoch": 1.45, + "grad_norm": 0.63671875, + "learning_rate": 0.00016100155785195216, + "loss": 0.4515, + "step": 10853 + }, + { + "epoch": 1.45, + "grad_norm": 0.63671875, + "learning_rate": 0.0001609923300501847, + "loss": 0.376, + "step": 10854 + }, + { + "epoch": 1.45, + "grad_norm": 0.404296875, + "learning_rate": 0.00016098310142132212, + "loss": 0.2524, + "step": 10855 + }, + { + "epoch": 1.45, + "grad_norm": 0.51171875, + "learning_rate": 0.00016097387196548952, + "loss": 0.3521, + "step": 10856 + }, + { + "epoch": 1.45, + "grad_norm": 0.474609375, + "learning_rate": 0.00016096464168281212, + "loss": 0.2028, + "step": 10857 + }, + { + "epoch": 1.45, + "grad_norm": 0.5859375, + "learning_rate": 0.000160955410573415, + "loss": 0.5018, + "step": 10858 + }, + { + "epoch": 1.45, + "grad_norm": 0.640625, + "learning_rate": 0.00016094617863742343, + "loss": 0.3836, + "step": 10859 + }, + { + "epoch": 1.45, + "grad_norm": 0.65234375, + "learning_rate": 0.00016093694587496256, + "loss": 0.264, + "step": 10860 + }, + { + "epoch": 1.45, + "grad_norm": 0.609375, + "learning_rate": 0.00016092771228615757, + "loss": 0.5375, + "step": 10861 + }, + { + "epoch": 1.45, + "grad_norm": 0.466796875, + "learning_rate": 0.00016091847787113372, + "loss": 0.2122, + "step": 10862 + }, + { + "epoch": 1.45, + "grad_norm": 0.5234375, + "learning_rate": 0.00016090924263001624, + "loss": 0.722, + "step": 10863 + }, + { + "epoch": 1.45, + "grad_norm": 0.51953125, + "learning_rate": 0.0001609000065629303, + "loss": 0.3176, + "step": 10864 + }, + { + "epoch": 1.45, + "grad_norm": 0.66015625, + "learning_rate": 0.0001608907696700012, + "loss": 0.7127, + "step": 10865 + }, + { + "epoch": 1.45, + "grad_norm": 0.5859375, + "learning_rate": 0.00016088153195135415, + "loss": 0.4725, + "step": 10866 + }, + { + "epoch": 1.45, + "grad_norm": 0.71484375, + "learning_rate": 0.00016087229340711447, + "loss": 0.7881, + "step": 10867 + }, + { + "epoch": 1.45, + "grad_norm": 0.62890625, + "learning_rate": 0.00016086305403740744, + "loss": 0.397, + "step": 10868 + }, + { + "epoch": 1.45, + "grad_norm": 0.6015625, + "learning_rate": 0.0001608538138423583, + "loss": 0.4848, + "step": 10869 + }, + { + "epoch": 1.45, + "grad_norm": 0.578125, + "learning_rate": 0.00016084457282209243, + "loss": 0.3891, + "step": 10870 + }, + { + "epoch": 1.45, + "grad_norm": 0.72265625, + "learning_rate": 0.0001608353309767351, + "loss": 0.5066, + "step": 10871 + }, + { + "epoch": 1.45, + "grad_norm": 0.50390625, + "learning_rate": 0.00016082608830641162, + "loss": 0.3945, + "step": 10872 + }, + { + "epoch": 1.45, + "grad_norm": 0.46484375, + "learning_rate": 0.00016081684481124734, + "loss": 0.2238, + "step": 10873 + }, + { + "epoch": 1.45, + "grad_norm": 0.6171875, + "learning_rate": 0.00016080760049136763, + "loss": 0.5594, + "step": 10874 + }, + { + "epoch": 1.45, + "grad_norm": 0.5625, + "learning_rate": 0.00016079835534689788, + "loss": 0.3904, + "step": 10875 + }, + { + "epoch": 1.45, + "grad_norm": 0.55078125, + "learning_rate": 0.00016078910937796333, + "loss": 0.44, + "step": 10876 + }, + { + "epoch": 1.45, + "grad_norm": 0.5234375, + "learning_rate": 0.00016077986258468947, + "loss": 0.3708, + "step": 10877 + }, + { + "epoch": 1.45, + "grad_norm": 0.451171875, + "learning_rate": 0.00016077061496720164, + "loss": 0.3631, + "step": 10878 + }, + { + "epoch": 1.45, + "grad_norm": 0.5234375, + "learning_rate": 0.0001607613665256253, + "loss": 0.2379, + "step": 10879 + }, + { + "epoch": 1.45, + "grad_norm": 0.55859375, + "learning_rate": 0.00016075211726008585, + "loss": 0.5736, + "step": 10880 + }, + { + "epoch": 1.45, + "grad_norm": 0.453125, + "learning_rate": 0.00016074286717070865, + "loss": 0.3797, + "step": 10881 + }, + { + "epoch": 1.45, + "grad_norm": 0.63671875, + "learning_rate": 0.00016073361625761922, + "loss": 0.6331, + "step": 10882 + }, + { + "epoch": 1.45, + "grad_norm": 0.5546875, + "learning_rate": 0.000160724364520943, + "loss": 0.4998, + "step": 10883 + }, + { + "epoch": 1.45, + "grad_norm": 0.4375, + "learning_rate": 0.00016071511196080537, + "loss": 0.2594, + "step": 10884 + }, + { + "epoch": 1.45, + "grad_norm": 0.77734375, + "learning_rate": 0.00016070585857733187, + "loss": 0.4298, + "step": 10885 + }, + { + "epoch": 1.45, + "grad_norm": 0.6171875, + "learning_rate": 0.00016069660437064798, + "loss": 0.2609, + "step": 10886 + }, + { + "epoch": 1.45, + "grad_norm": 0.400390625, + "learning_rate": 0.00016068734934087918, + "loss": 0.463, + "step": 10887 + }, + { + "epoch": 1.45, + "grad_norm": 0.57421875, + "learning_rate": 0.00016067809348815096, + "loss": 0.4278, + "step": 10888 + }, + { + "epoch": 1.45, + "grad_norm": 0.435546875, + "learning_rate": 0.00016066883681258885, + "loss": 0.3171, + "step": 10889 + }, + { + "epoch": 1.45, + "grad_norm": 0.55078125, + "learning_rate": 0.00016065957931431842, + "loss": 0.3839, + "step": 10890 + }, + { + "epoch": 1.45, + "grad_norm": 0.5546875, + "learning_rate": 0.0001606503209934651, + "loss": 0.4111, + "step": 10891 + }, + { + "epoch": 1.45, + "grad_norm": 0.875, + "learning_rate": 0.00016064106185015452, + "loss": 0.2812, + "step": 10892 + }, + { + "epoch": 1.45, + "grad_norm": 0.52734375, + "learning_rate": 0.00016063180188451224, + "loss": 0.3542, + "step": 10893 + }, + { + "epoch": 1.45, + "grad_norm": 0.75390625, + "learning_rate": 0.0001606225410966638, + "loss": 0.801, + "step": 10894 + }, + { + "epoch": 1.45, + "grad_norm": 0.60546875, + "learning_rate": 0.00016061327948673482, + "loss": 0.6065, + "step": 10895 + }, + { + "epoch": 1.45, + "grad_norm": 0.69140625, + "learning_rate": 0.00016060401705485084, + "loss": 0.3624, + "step": 10896 + }, + { + "epoch": 1.45, + "grad_norm": 0.51953125, + "learning_rate": 0.00016059475380113752, + "loss": 0.4838, + "step": 10897 + }, + { + "epoch": 1.45, + "grad_norm": 0.498046875, + "learning_rate": 0.0001605854897257204, + "loss": 0.3246, + "step": 10898 + }, + { + "epoch": 1.45, + "grad_norm": 0.5, + "learning_rate": 0.00016057622482872522, + "loss": 0.2111, + "step": 10899 + }, + { + "epoch": 1.45, + "grad_norm": 0.439453125, + "learning_rate": 0.0001605669591102775, + "loss": 0.431, + "step": 10900 + }, + { + "epoch": 1.45, + "grad_norm": 0.4921875, + "learning_rate": 0.00016055769257050298, + "loss": 0.4054, + "step": 10901 + }, + { + "epoch": 1.45, + "grad_norm": 0.5859375, + "learning_rate": 0.00016054842520952725, + "loss": 0.4267, + "step": 10902 + }, + { + "epoch": 1.45, + "grad_norm": 0.64453125, + "learning_rate": 0.00016053915702747603, + "loss": 0.4235, + "step": 10903 + }, + { + "epoch": 1.46, + "grad_norm": 0.66015625, + "learning_rate": 0.000160529888024475, + "loss": 0.2585, + "step": 10904 + }, + { + "epoch": 1.46, + "grad_norm": 0.54296875, + "learning_rate": 0.00016052061820064984, + "loss": 0.3788, + "step": 10905 + }, + { + "epoch": 1.46, + "grad_norm": 0.67578125, + "learning_rate": 0.00016051134755612622, + "loss": 0.1529, + "step": 10906 + }, + { + "epoch": 1.46, + "grad_norm": 0.5234375, + "learning_rate": 0.00016050207609102994, + "loss": 0.2972, + "step": 10907 + }, + { + "epoch": 1.46, + "grad_norm": 0.46875, + "learning_rate": 0.00016049280380548667, + "loss": 0.4777, + "step": 10908 + }, + { + "epoch": 1.46, + "grad_norm": 0.62109375, + "learning_rate": 0.0001604835306996221, + "loss": 0.2793, + "step": 10909 + }, + { + "epoch": 1.46, + "grad_norm": 0.58984375, + "learning_rate": 0.0001604742567735621, + "loss": 0.8564, + "step": 10910 + }, + { + "epoch": 1.46, + "grad_norm": 0.58203125, + "learning_rate": 0.00016046498202743233, + "loss": 0.2894, + "step": 10911 + }, + { + "epoch": 1.46, + "grad_norm": 0.6015625, + "learning_rate": 0.00016045570646135862, + "loss": 0.5941, + "step": 10912 + }, + { + "epoch": 1.46, + "grad_norm": 0.59765625, + "learning_rate": 0.0001604464300754667, + "loss": 0.8196, + "step": 10913 + }, + { + "epoch": 1.46, + "grad_norm": 0.7109375, + "learning_rate": 0.00016043715286988242, + "loss": 0.45, + "step": 10914 + }, + { + "epoch": 1.46, + "grad_norm": 0.62109375, + "learning_rate": 0.00016042787484473156, + "loss": 0.2704, + "step": 10915 + }, + { + "epoch": 1.46, + "grad_norm": 0.5390625, + "learning_rate": 0.0001604185960001399, + "loss": 0.4989, + "step": 10916 + }, + { + "epoch": 1.46, + "grad_norm": 0.87890625, + "learning_rate": 0.00016040931633623336, + "loss": 0.2906, + "step": 10917 + }, + { + "epoch": 1.46, + "grad_norm": 0.5546875, + "learning_rate": 0.00016040003585313767, + "loss": 0.2741, + "step": 10918 + }, + { + "epoch": 1.46, + "grad_norm": 0.625, + "learning_rate": 0.00016039075455097877, + "loss": 0.4191, + "step": 10919 + }, + { + "epoch": 1.46, + "grad_norm": 0.62109375, + "learning_rate": 0.00016038147242988243, + "loss": 0.4586, + "step": 10920 + }, + { + "epoch": 1.46, + "grad_norm": 0.439453125, + "learning_rate": 0.00016037218948997462, + "loss": 0.3996, + "step": 10921 + }, + { + "epoch": 1.46, + "grad_norm": 0.45703125, + "learning_rate": 0.00016036290573138117, + "loss": 0.3245, + "step": 10922 + }, + { + "epoch": 1.46, + "grad_norm": 0.703125, + "learning_rate": 0.00016035362115422795, + "loss": 0.45, + "step": 10923 + }, + { + "epoch": 1.46, + "grad_norm": 0.5390625, + "learning_rate": 0.0001603443357586409, + "loss": 0.5205, + "step": 10924 + }, + { + "epoch": 1.46, + "grad_norm": 0.6796875, + "learning_rate": 0.00016033504954474594, + "loss": 0.3518, + "step": 10925 + }, + { + "epoch": 1.46, + "grad_norm": 0.44921875, + "learning_rate": 0.00016032576251266898, + "loss": 0.539, + "step": 10926 + }, + { + "epoch": 1.46, + "grad_norm": 0.5078125, + "learning_rate": 0.00016031647466253595, + "loss": 0.5476, + "step": 10927 + }, + { + "epoch": 1.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00016030718599447283, + "loss": 0.3607, + "step": 10928 + }, + { + "epoch": 1.46, + "grad_norm": 0.453125, + "learning_rate": 0.00016029789650860553, + "loss": 0.353, + "step": 10929 + }, + { + "epoch": 1.46, + "grad_norm": 0.62890625, + "learning_rate": 0.0001602886062050601, + "loss": 0.2153, + "step": 10930 + }, + { + "epoch": 1.46, + "grad_norm": 0.59375, + "learning_rate": 0.00016027931508396246, + "loss": 0.4781, + "step": 10931 + }, + { + "epoch": 1.46, + "grad_norm": 0.376953125, + "learning_rate": 0.00016027002314543862, + "loss": 0.1643, + "step": 10932 + }, + { + "epoch": 1.46, + "grad_norm": 0.5390625, + "learning_rate": 0.00016026073038961458, + "loss": 0.2801, + "step": 10933 + }, + { + "epoch": 1.46, + "grad_norm": 0.478515625, + "learning_rate": 0.00016025143681661636, + "loss": 0.3493, + "step": 10934 + }, + { + "epoch": 1.46, + "grad_norm": 0.73046875, + "learning_rate": 0.00016024214242657, + "loss": 0.4738, + "step": 10935 + }, + { + "epoch": 1.46, + "grad_norm": 0.60546875, + "learning_rate": 0.00016023284721960153, + "loss": 0.1754, + "step": 10936 + }, + { + "epoch": 1.46, + "grad_norm": 0.6015625, + "learning_rate": 0.000160223551195837, + "loss": 0.327, + "step": 10937 + }, + { + "epoch": 1.46, + "grad_norm": 0.61328125, + "learning_rate": 0.00016021425435540243, + "loss": 0.5697, + "step": 10938 + }, + { + "epoch": 1.46, + "grad_norm": 0.5703125, + "learning_rate": 0.00016020495669842393, + "loss": 0.4203, + "step": 10939 + }, + { + "epoch": 1.46, + "grad_norm": 0.412109375, + "learning_rate": 0.0001601956582250276, + "loss": 0.3581, + "step": 10940 + }, + { + "epoch": 1.46, + "grad_norm": 0.49609375, + "learning_rate": 0.0001601863589353395, + "loss": 0.408, + "step": 10941 + }, + { + "epoch": 1.46, + "grad_norm": 0.486328125, + "learning_rate": 0.00016017705882948575, + "loss": 0.3713, + "step": 10942 + }, + { + "epoch": 1.46, + "grad_norm": 0.486328125, + "learning_rate": 0.00016016775790759244, + "loss": 0.4163, + "step": 10943 + }, + { + "epoch": 1.46, + "grad_norm": 0.546875, + "learning_rate": 0.00016015845616978574, + "loss": 0.2975, + "step": 10944 + }, + { + "epoch": 1.46, + "grad_norm": 0.5546875, + "learning_rate": 0.00016014915361619174, + "loss": 0.5085, + "step": 10945 + }, + { + "epoch": 1.46, + "grad_norm": 0.578125, + "learning_rate": 0.00016013985024693664, + "loss": 0.6016, + "step": 10946 + }, + { + "epoch": 1.46, + "grad_norm": 0.498046875, + "learning_rate": 0.00016013054606214656, + "loss": 0.3673, + "step": 10947 + }, + { + "epoch": 1.46, + "grad_norm": 0.55859375, + "learning_rate": 0.00016012124106194771, + "loss": 0.5109, + "step": 10948 + }, + { + "epoch": 1.46, + "grad_norm": 0.52734375, + "learning_rate": 0.0001601119352464662, + "loss": 0.4153, + "step": 10949 + }, + { + "epoch": 1.46, + "grad_norm": 0.625, + "learning_rate": 0.00016010262861582833, + "loss": 0.4514, + "step": 10950 + }, + { + "epoch": 1.46, + "grad_norm": 0.40234375, + "learning_rate": 0.0001600933211701602, + "loss": 0.167, + "step": 10951 + }, + { + "epoch": 1.46, + "grad_norm": 0.47265625, + "learning_rate": 0.00016008401290958807, + "loss": 0.1806, + "step": 10952 + }, + { + "epoch": 1.46, + "grad_norm": 0.5078125, + "learning_rate": 0.00016007470383423814, + "loss": 0.4327, + "step": 10953 + }, + { + "epoch": 1.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00016006539394423675, + "loss": 0.3288, + "step": 10954 + }, + { + "epoch": 1.46, + "grad_norm": 0.625, + "learning_rate": 0.00016005608323971, + "loss": 0.3807, + "step": 10955 + }, + { + "epoch": 1.46, + "grad_norm": 0.59765625, + "learning_rate": 0.0001600467717207842, + "loss": 0.6466, + "step": 10956 + }, + { + "epoch": 1.46, + "grad_norm": 0.486328125, + "learning_rate": 0.0001600374593875857, + "loss": 0.3155, + "step": 10957 + }, + { + "epoch": 1.46, + "grad_norm": 0.45703125, + "learning_rate": 0.0001600281462402407, + "loss": 0.5353, + "step": 10958 + }, + { + "epoch": 1.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00016001883227887548, + "loss": 0.23, + "step": 10959 + }, + { + "epoch": 1.46, + "grad_norm": 0.60546875, + "learning_rate": 0.00016000951750361642, + "loss": 0.4821, + "step": 10960 + }, + { + "epoch": 1.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00016000020191458974, + "loss": 0.5813, + "step": 10961 + }, + { + "epoch": 1.46, + "grad_norm": 0.625, + "learning_rate": 0.00015999088551192186, + "loss": 0.6254, + "step": 10962 + }, + { + "epoch": 1.46, + "grad_norm": 0.73046875, + "learning_rate": 0.00015998156829573904, + "loss": 0.4249, + "step": 10963 + }, + { + "epoch": 1.46, + "grad_norm": 0.5703125, + "learning_rate": 0.00015997225026616766, + "loss": 0.4192, + "step": 10964 + }, + { + "epoch": 1.46, + "grad_norm": 0.546875, + "learning_rate": 0.0001599629314233341, + "loss": 0.3815, + "step": 10965 + }, + { + "epoch": 1.46, + "grad_norm": 0.55078125, + "learning_rate": 0.0001599536117673647, + "loss": 0.1999, + "step": 10966 + }, + { + "epoch": 1.46, + "grad_norm": 0.58203125, + "learning_rate": 0.00015994429129838583, + "loss": 0.4319, + "step": 10967 + }, + { + "epoch": 1.46, + "grad_norm": 0.5390625, + "learning_rate": 0.0001599349700165239, + "loss": 0.5338, + "step": 10968 + }, + { + "epoch": 1.46, + "grad_norm": 0.6484375, + "learning_rate": 0.0001599256479219053, + "loss": 0.4977, + "step": 10969 + }, + { + "epoch": 1.46, + "grad_norm": 0.7578125, + "learning_rate": 0.00015991632501465648, + "loss": 0.5105, + "step": 10970 + }, + { + "epoch": 1.46, + "grad_norm": 0.431640625, + "learning_rate": 0.0001599070012949038, + "loss": 0.3821, + "step": 10971 + }, + { + "epoch": 1.46, + "grad_norm": 0.421875, + "learning_rate": 0.0001598976767627738, + "loss": 0.1389, + "step": 10972 + }, + { + "epoch": 1.46, + "grad_norm": 0.50390625, + "learning_rate": 0.00015988835141839282, + "loss": 0.6019, + "step": 10973 + }, + { + "epoch": 1.46, + "grad_norm": 0.61328125, + "learning_rate": 0.00015987902526188737, + "loss": 0.6761, + "step": 10974 + }, + { + "epoch": 1.46, + "grad_norm": 0.6640625, + "learning_rate": 0.00015986969829338387, + "loss": 0.5863, + "step": 10975 + }, + { + "epoch": 1.46, + "grad_norm": 0.59765625, + "learning_rate": 0.0001598603705130089, + "loss": 0.4278, + "step": 10976 + }, + { + "epoch": 1.46, + "grad_norm": 0.55859375, + "learning_rate": 0.00015985104192088882, + "loss": 0.4702, + "step": 10977 + }, + { + "epoch": 1.46, + "grad_norm": 0.54296875, + "learning_rate": 0.00015984171251715026, + "loss": 0.3313, + "step": 10978 + }, + { + "epoch": 1.47, + "grad_norm": 0.7109375, + "learning_rate": 0.00015983238230191964, + "loss": 0.8021, + "step": 10979 + }, + { + "epoch": 1.47, + "grad_norm": 0.49609375, + "learning_rate": 0.00015982305127532352, + "loss": 0.435, + "step": 10980 + }, + { + "epoch": 1.47, + "grad_norm": 0.55859375, + "learning_rate": 0.00015981371943748842, + "loss": 0.6064, + "step": 10981 + }, + { + "epoch": 1.47, + "grad_norm": 0.4453125, + "learning_rate": 0.0001598043867885409, + "loss": 0.4838, + "step": 10982 + }, + { + "epoch": 1.47, + "grad_norm": 0.546875, + "learning_rate": 0.00015979505332860755, + "loss": 0.5168, + "step": 10983 + }, + { + "epoch": 1.47, + "grad_norm": 0.5078125, + "learning_rate": 0.00015978571905781488, + "loss": 0.537, + "step": 10984 + }, + { + "epoch": 1.47, + "grad_norm": 0.64453125, + "learning_rate": 0.00015977638397628947, + "loss": 0.7516, + "step": 10985 + }, + { + "epoch": 1.47, + "grad_norm": 0.6015625, + "learning_rate": 0.00015976704808415794, + "loss": 0.6553, + "step": 10986 + }, + { + "epoch": 1.47, + "grad_norm": 0.47265625, + "learning_rate": 0.00015975771138154686, + "loss": 0.271, + "step": 10987 + }, + { + "epoch": 1.47, + "grad_norm": 0.5625, + "learning_rate": 0.0001597483738685829, + "loss": 0.2377, + "step": 10988 + }, + { + "epoch": 1.47, + "grad_norm": 0.67578125, + "learning_rate": 0.00015973903554539263, + "loss": 0.3904, + "step": 10989 + }, + { + "epoch": 1.47, + "grad_norm": 0.498046875, + "learning_rate": 0.00015972969641210266, + "loss": 0.2107, + "step": 10990 + }, + { + "epoch": 1.47, + "grad_norm": 0.42578125, + "learning_rate": 0.00015972035646883974, + "loss": 0.2947, + "step": 10991 + }, + { + "epoch": 1.47, + "grad_norm": 0.60546875, + "learning_rate": 0.0001597110157157304, + "loss": 0.5914, + "step": 10992 + }, + { + "epoch": 1.47, + "grad_norm": 0.5625, + "learning_rate": 0.0001597016741529014, + "loss": 0.2893, + "step": 10993 + }, + { + "epoch": 1.47, + "grad_norm": 0.58984375, + "learning_rate": 0.00015969233178047935, + "loss": 0.5819, + "step": 10994 + }, + { + "epoch": 1.47, + "grad_norm": 0.5078125, + "learning_rate": 0.000159682988598591, + "loss": 0.4435, + "step": 10995 + }, + { + "epoch": 1.47, + "grad_norm": 0.6875, + "learning_rate": 0.000159673644607363, + "loss": 0.3766, + "step": 10996 + }, + { + "epoch": 1.47, + "grad_norm": 0.55859375, + "learning_rate": 0.00015966429980692213, + "loss": 0.3365, + "step": 10997 + }, + { + "epoch": 1.47, + "grad_norm": 0.57421875, + "learning_rate": 0.00015965495419739502, + "loss": 0.3701, + "step": 10998 + }, + { + "epoch": 1.47, + "grad_norm": 0.67578125, + "learning_rate": 0.00015964560777890845, + "loss": 0.6719, + "step": 10999 + }, + { + "epoch": 1.47, + "grad_norm": 0.41015625, + "learning_rate": 0.00015963626055158912, + "loss": 0.2631, + "step": 11000 + }, + { + "epoch": 1.47, + "grad_norm": 0.55859375, + "learning_rate": 0.00015962691251556388, + "loss": 0.4445, + "step": 11001 + }, + { + "epoch": 1.47, + "grad_norm": 0.57421875, + "learning_rate": 0.0001596175636709594, + "loss": 0.3347, + "step": 11002 + }, + { + "epoch": 1.47, + "grad_norm": 0.44921875, + "learning_rate": 0.00015960821401790254, + "loss": 0.4163, + "step": 11003 + }, + { + "epoch": 1.47, + "grad_norm": 0.5390625, + "learning_rate": 0.00015959886355652002, + "loss": 0.4759, + "step": 11004 + }, + { + "epoch": 1.47, + "grad_norm": 0.57421875, + "learning_rate": 0.00015958951228693867, + "loss": 0.3735, + "step": 11005 + }, + { + "epoch": 1.47, + "grad_norm": 0.5, + "learning_rate": 0.00015958016020928525, + "loss": 0.2989, + "step": 11006 + }, + { + "epoch": 1.47, + "grad_norm": 0.5546875, + "learning_rate": 0.00015957080732368665, + "loss": 0.4097, + "step": 11007 + }, + { + "epoch": 1.47, + "grad_norm": 0.42578125, + "learning_rate": 0.00015956145363026965, + "loss": 0.3581, + "step": 11008 + }, + { + "epoch": 1.47, + "grad_norm": 0.54296875, + "learning_rate": 0.00015955209912916116, + "loss": 0.5543, + "step": 11009 + }, + { + "epoch": 1.47, + "grad_norm": 0.412109375, + "learning_rate": 0.00015954274382048793, + "loss": 0.1778, + "step": 11010 + }, + { + "epoch": 1.47, + "grad_norm": 0.59765625, + "learning_rate": 0.00015953338770437688, + "loss": 0.6402, + "step": 11011 + }, + { + "epoch": 1.47, + "grad_norm": 0.447265625, + "learning_rate": 0.00015952403078095493, + "loss": 0.2988, + "step": 11012 + }, + { + "epoch": 1.47, + "grad_norm": 0.416015625, + "learning_rate": 0.00015951467305034888, + "loss": 0.3447, + "step": 11013 + }, + { + "epoch": 1.47, + "grad_norm": 0.5859375, + "learning_rate": 0.0001595053145126857, + "loss": 0.5974, + "step": 11014 + }, + { + "epoch": 1.47, + "grad_norm": 0.51953125, + "learning_rate": 0.00015949595516809225, + "loss": 0.309, + "step": 11015 + }, + { + "epoch": 1.47, + "grad_norm": 0.7265625, + "learning_rate": 0.00015948659501669544, + "loss": 0.3343, + "step": 11016 + }, + { + "epoch": 1.47, + "grad_norm": 0.5859375, + "learning_rate": 0.00015947723405862227, + "loss": 0.3639, + "step": 11017 + }, + { + "epoch": 1.47, + "grad_norm": 0.66796875, + "learning_rate": 0.0001594678722939996, + "loss": 0.6213, + "step": 11018 + }, + { + "epoch": 1.47, + "grad_norm": 0.95703125, + "learning_rate": 0.00015945850972295443, + "loss": 0.4742, + "step": 11019 + }, + { + "epoch": 1.47, + "grad_norm": 0.52734375, + "learning_rate": 0.00015944914634561367, + "loss": 0.3927, + "step": 11020 + }, + { + "epoch": 1.47, + "grad_norm": 0.375, + "learning_rate": 0.0001594397821621044, + "loss": 0.2698, + "step": 11021 + }, + { + "epoch": 1.47, + "grad_norm": 0.56640625, + "learning_rate": 0.00015943041717255348, + "loss": 0.5338, + "step": 11022 + }, + { + "epoch": 1.47, + "grad_norm": 0.470703125, + "learning_rate": 0.00015942105137708797, + "loss": 0.2806, + "step": 11023 + }, + { + "epoch": 1.47, + "grad_norm": 0.57421875, + "learning_rate": 0.0001594116847758349, + "loss": 0.3555, + "step": 11024 + }, + { + "epoch": 1.47, + "grad_norm": 0.5859375, + "learning_rate": 0.0001594023173689212, + "loss": 0.3907, + "step": 11025 + }, + { + "epoch": 1.47, + "grad_norm": 0.55078125, + "learning_rate": 0.000159392949156474, + "loss": 0.2756, + "step": 11026 + }, + { + "epoch": 1.47, + "grad_norm": 0.515625, + "learning_rate": 0.0001593835801386203, + "loss": 0.2224, + "step": 11027 + }, + { + "epoch": 1.47, + "grad_norm": 0.46875, + "learning_rate": 0.0001593742103154871, + "loss": 0.2253, + "step": 11028 + }, + { + "epoch": 1.47, + "grad_norm": 0.443359375, + "learning_rate": 0.00015936483968720153, + "loss": 0.3638, + "step": 11029 + }, + { + "epoch": 1.47, + "grad_norm": 0.56640625, + "learning_rate": 0.0001593554682538906, + "loss": 0.5764, + "step": 11030 + }, + { + "epoch": 1.47, + "grad_norm": 0.578125, + "learning_rate": 0.00015934609601568145, + "loss": 0.5109, + "step": 11031 + }, + { + "epoch": 1.47, + "grad_norm": 0.55859375, + "learning_rate": 0.00015933672297270115, + "loss": 0.3099, + "step": 11032 + }, + { + "epoch": 1.47, + "grad_norm": 0.61328125, + "learning_rate": 0.00015932734912507682, + "loss": 0.7662, + "step": 11033 + }, + { + "epoch": 1.47, + "grad_norm": 0.421875, + "learning_rate": 0.00015931797447293552, + "loss": 0.4455, + "step": 11034 + }, + { + "epoch": 1.47, + "grad_norm": 0.52734375, + "learning_rate": 0.00015930859901640445, + "loss": 0.4311, + "step": 11035 + }, + { + "epoch": 1.47, + "grad_norm": 0.6015625, + "learning_rate": 0.0001592992227556107, + "loss": 0.5559, + "step": 11036 + }, + { + "epoch": 1.47, + "grad_norm": 0.44921875, + "learning_rate": 0.00015928984569068145, + "loss": 0.2802, + "step": 11037 + }, + { + "epoch": 1.47, + "grad_norm": 0.5078125, + "learning_rate": 0.0001592804678217438, + "loss": 0.652, + "step": 11038 + }, + { + "epoch": 1.47, + "grad_norm": 0.5703125, + "learning_rate": 0.00015927108914892498, + "loss": 0.2422, + "step": 11039 + }, + { + "epoch": 1.47, + "grad_norm": 0.58203125, + "learning_rate": 0.0001592617096723522, + "loss": 0.2793, + "step": 11040 + }, + { + "epoch": 1.47, + "grad_norm": 0.34375, + "learning_rate": 0.00015925232939215252, + "loss": 0.2339, + "step": 11041 + }, + { + "epoch": 1.47, + "grad_norm": 0.5625, + "learning_rate": 0.00015924294830845328, + "loss": 0.3655, + "step": 11042 + }, + { + "epoch": 1.47, + "grad_norm": 0.353515625, + "learning_rate": 0.0001592335664213816, + "loss": 0.1795, + "step": 11043 + }, + { + "epoch": 1.47, + "grad_norm": 0.447265625, + "learning_rate": 0.00015922418373106478, + "loss": 0.2463, + "step": 11044 + }, + { + "epoch": 1.47, + "grad_norm": 0.67578125, + "learning_rate": 0.00015921480023763004, + "loss": 0.6133, + "step": 11045 + }, + { + "epoch": 1.47, + "grad_norm": 0.5078125, + "learning_rate": 0.00015920541594120454, + "loss": 0.3718, + "step": 11046 + }, + { + "epoch": 1.47, + "grad_norm": 0.51953125, + "learning_rate": 0.00015919603084191562, + "loss": 0.5458, + "step": 11047 + }, + { + "epoch": 1.47, + "grad_norm": 0.515625, + "learning_rate": 0.00015918664493989055, + "loss": 0.3403, + "step": 11048 + }, + { + "epoch": 1.47, + "grad_norm": 0.72265625, + "learning_rate": 0.00015917725823525657, + "loss": 0.7142, + "step": 11049 + }, + { + "epoch": 1.47, + "grad_norm": 0.3828125, + "learning_rate": 0.000159167870728141, + "loss": 0.3789, + "step": 11050 + }, + { + "epoch": 1.47, + "grad_norm": 0.84765625, + "learning_rate": 0.00015915848241867116, + "loss": 0.6033, + "step": 11051 + }, + { + "epoch": 1.47, + "grad_norm": 0.56640625, + "learning_rate": 0.0001591490933069743, + "loss": 0.4804, + "step": 11052 + }, + { + "epoch": 1.47, + "grad_norm": 0.58203125, + "learning_rate": 0.00015913970339317775, + "loss": 0.2318, + "step": 11053 + }, + { + "epoch": 1.48, + "grad_norm": 0.388671875, + "learning_rate": 0.0001591303126774089, + "loss": 0.266, + "step": 11054 + }, + { + "epoch": 1.48, + "grad_norm": 0.5546875, + "learning_rate": 0.00015912092115979504, + "loss": 0.8323, + "step": 11055 + }, + { + "epoch": 1.48, + "grad_norm": 0.7421875, + "learning_rate": 0.00015911152884046356, + "loss": 0.6251, + "step": 11056 + }, + { + "epoch": 1.48, + "grad_norm": 0.423828125, + "learning_rate": 0.00015910213571954183, + "loss": 0.3403, + "step": 11057 + }, + { + "epoch": 1.48, + "grad_norm": 0.67578125, + "learning_rate": 0.00015909274179715718, + "loss": 0.5989, + "step": 11058 + }, + { + "epoch": 1.48, + "grad_norm": 0.37890625, + "learning_rate": 0.00015908334707343702, + "loss": 0.2678, + "step": 11059 + }, + { + "epoch": 1.48, + "grad_norm": 0.40625, + "learning_rate": 0.00015907395154850877, + "loss": 0.2848, + "step": 11060 + }, + { + "epoch": 1.48, + "grad_norm": 0.515625, + "learning_rate": 0.0001590645552224998, + "loss": 0.3967, + "step": 11061 + }, + { + "epoch": 1.48, + "grad_norm": 0.453125, + "learning_rate": 0.0001590551580955376, + "loss": 0.2004, + "step": 11062 + }, + { + "epoch": 1.48, + "grad_norm": 0.55859375, + "learning_rate": 0.0001590457601677495, + "loss": 0.3815, + "step": 11063 + }, + { + "epoch": 1.48, + "grad_norm": 0.70703125, + "learning_rate": 0.00015903636143926303, + "loss": 0.4116, + "step": 11064 + }, + { + "epoch": 1.48, + "grad_norm": 0.49609375, + "learning_rate": 0.00015902696191020561, + "loss": 0.2731, + "step": 11065 + }, + { + "epoch": 1.48, + "grad_norm": 0.41015625, + "learning_rate": 0.00015901756158070472, + "loss": 0.4246, + "step": 11066 + }, + { + "epoch": 1.48, + "grad_norm": 0.5078125, + "learning_rate": 0.00015900816045088777, + "loss": 0.605, + "step": 11067 + }, + { + "epoch": 1.48, + "grad_norm": 0.47265625, + "learning_rate": 0.00015899875852088232, + "loss": 0.4038, + "step": 11068 + }, + { + "epoch": 1.48, + "grad_norm": 0.5, + "learning_rate": 0.00015898935579081584, + "loss": 0.5402, + "step": 11069 + }, + { + "epoch": 1.48, + "grad_norm": 0.36328125, + "learning_rate": 0.00015897995226081584, + "loss": 0.2066, + "step": 11070 + }, + { + "epoch": 1.48, + "grad_norm": 0.54296875, + "learning_rate": 0.00015897054793100984, + "loss": 0.3324, + "step": 11071 + }, + { + "epoch": 1.48, + "grad_norm": 0.640625, + "learning_rate": 0.00015896114280152532, + "loss": 0.3938, + "step": 11072 + }, + { + "epoch": 1.48, + "grad_norm": 0.66796875, + "learning_rate": 0.0001589517368724899, + "loss": 0.5835, + "step": 11073 + }, + { + "epoch": 1.48, + "grad_norm": 0.61328125, + "learning_rate": 0.0001589423301440311, + "loss": 0.2255, + "step": 11074 + }, + { + "epoch": 1.48, + "grad_norm": 0.490234375, + "learning_rate": 0.00015893292261627643, + "loss": 0.5382, + "step": 11075 + }, + { + "epoch": 1.48, + "grad_norm": 0.5234375, + "learning_rate": 0.00015892351428935356, + "loss": 0.4897, + "step": 11076 + }, + { + "epoch": 1.48, + "grad_norm": 0.53125, + "learning_rate": 0.00015891410516339, + "loss": 0.5029, + "step": 11077 + }, + { + "epoch": 1.48, + "grad_norm": 0.515625, + "learning_rate": 0.00015890469523851337, + "loss": 0.3097, + "step": 11078 + }, + { + "epoch": 1.48, + "grad_norm": 0.5546875, + "learning_rate": 0.00015889528451485126, + "loss": 0.4224, + "step": 11079 + }, + { + "epoch": 1.48, + "grad_norm": 0.451171875, + "learning_rate": 0.00015888587299253129, + "loss": 0.6371, + "step": 11080 + }, + { + "epoch": 1.48, + "grad_norm": 0.49609375, + "learning_rate": 0.00015887646067168105, + "loss": 0.3427, + "step": 11081 + }, + { + "epoch": 1.48, + "grad_norm": 0.53125, + "learning_rate": 0.00015886704755242829, + "loss": 0.3534, + "step": 11082 + }, + { + "epoch": 1.48, + "grad_norm": 0.412109375, + "learning_rate": 0.00015885763363490053, + "loss": 0.1906, + "step": 11083 + }, + { + "epoch": 1.48, + "grad_norm": 0.58203125, + "learning_rate": 0.00015884821891922553, + "loss": 0.4106, + "step": 11084 + }, + { + "epoch": 1.48, + "grad_norm": 0.314453125, + "learning_rate": 0.0001588388034055309, + "loss": 0.1223, + "step": 11085 + }, + { + "epoch": 1.48, + "grad_norm": 0.50390625, + "learning_rate": 0.0001588293870939443, + "loss": 0.4419, + "step": 11086 + }, + { + "epoch": 1.48, + "grad_norm": 0.5234375, + "learning_rate": 0.00015881996998459345, + "loss": 0.4966, + "step": 11087 + }, + { + "epoch": 1.48, + "grad_norm": 0.53125, + "learning_rate": 0.0001588105520776061, + "loss": 0.2971, + "step": 11088 + }, + { + "epoch": 1.48, + "grad_norm": 0.59765625, + "learning_rate": 0.00015880113337310992, + "loss": 0.5042, + "step": 11089 + }, + { + "epoch": 1.48, + "grad_norm": 0.578125, + "learning_rate": 0.0001587917138712326, + "loss": 0.3246, + "step": 11090 + }, + { + "epoch": 1.48, + "grad_norm": 0.474609375, + "learning_rate": 0.00015878229357210193, + "loss": 0.52, + "step": 11091 + }, + { + "epoch": 1.48, + "grad_norm": 0.5859375, + "learning_rate": 0.00015877287247584562, + "loss": 0.5457, + "step": 11092 + }, + { + "epoch": 1.48, + "grad_norm": 0.78515625, + "learning_rate": 0.00015876345058259145, + "loss": 0.4916, + "step": 11093 + }, + { + "epoch": 1.48, + "grad_norm": 0.8046875, + "learning_rate": 0.00015875402789246716, + "loss": 0.9228, + "step": 11094 + }, + { + "epoch": 1.48, + "grad_norm": 0.490234375, + "learning_rate": 0.00015874460440560054, + "loss": 0.5063, + "step": 11095 + }, + { + "epoch": 1.48, + "grad_norm": 0.53515625, + "learning_rate": 0.0001587351801221194, + "loss": 0.4733, + "step": 11096 + }, + { + "epoch": 1.48, + "grad_norm": 0.365234375, + "learning_rate": 0.00015872575504215152, + "loss": 0.1765, + "step": 11097 + }, + { + "epoch": 1.48, + "grad_norm": 0.69140625, + "learning_rate": 0.00015871632916582474, + "loss": 0.3779, + "step": 11098 + }, + { + "epoch": 1.48, + "grad_norm": 0.470703125, + "learning_rate": 0.0001587069024932668, + "loss": 0.4361, + "step": 11099 + }, + { + "epoch": 1.48, + "grad_norm": 0.58984375, + "learning_rate": 0.0001586974750246056, + "loss": 0.2784, + "step": 11100 + }, + { + "epoch": 1.48, + "grad_norm": 0.3515625, + "learning_rate": 0.000158688046759969, + "loss": 0.1857, + "step": 11101 + }, + { + "epoch": 1.48, + "grad_norm": 0.546875, + "learning_rate": 0.00015867861769948478, + "loss": 0.5771, + "step": 11102 + }, + { + "epoch": 1.48, + "grad_norm": 0.6015625, + "learning_rate": 0.00015866918784328087, + "loss": 0.7877, + "step": 11103 + }, + { + "epoch": 1.48, + "grad_norm": 0.46875, + "learning_rate": 0.0001586597571914851, + "loss": 0.2393, + "step": 11104 + }, + { + "epoch": 1.48, + "grad_norm": 0.515625, + "learning_rate": 0.00015865032574422538, + "loss": 0.4559, + "step": 11105 + }, + { + "epoch": 1.48, + "grad_norm": 0.416015625, + "learning_rate": 0.0001586408935016296, + "loss": 0.3224, + "step": 11106 + }, + { + "epoch": 1.48, + "grad_norm": 0.4921875, + "learning_rate": 0.00015863146046382566, + "loss": 0.4656, + "step": 11107 + }, + { + "epoch": 1.48, + "grad_norm": 0.5546875, + "learning_rate": 0.0001586220266309415, + "loss": 0.3506, + "step": 11108 + }, + { + "epoch": 1.48, + "grad_norm": 0.384765625, + "learning_rate": 0.00015861259200310504, + "loss": 0.3093, + "step": 11109 + }, + { + "epoch": 1.48, + "grad_norm": 0.474609375, + "learning_rate": 0.00015860315658044422, + "loss": 0.3915, + "step": 11110 + }, + { + "epoch": 1.48, + "grad_norm": 0.6015625, + "learning_rate": 0.00015859372036308698, + "loss": 0.7419, + "step": 11111 + }, + { + "epoch": 1.48, + "grad_norm": 0.52734375, + "learning_rate": 0.00015858428335116127, + "loss": 0.5163, + "step": 11112 + }, + { + "epoch": 1.48, + "grad_norm": 0.494140625, + "learning_rate": 0.0001585748455447951, + "loss": 0.3205, + "step": 11113 + }, + { + "epoch": 1.48, + "grad_norm": 0.51171875, + "learning_rate": 0.00015856540694411642, + "loss": 0.444, + "step": 11114 + }, + { + "epoch": 1.48, + "grad_norm": 0.6640625, + "learning_rate": 0.0001585559675492532, + "loss": 0.4004, + "step": 11115 + }, + { + "epoch": 1.48, + "grad_norm": 0.44921875, + "learning_rate": 0.00015854652736033354, + "loss": 0.5539, + "step": 11116 + }, + { + "epoch": 1.48, + "grad_norm": 0.51171875, + "learning_rate": 0.00015853708637748537, + "loss": 0.3475, + "step": 11117 + }, + { + "epoch": 1.48, + "grad_norm": 0.5625, + "learning_rate": 0.00015852764460083676, + "loss": 0.4779, + "step": 11118 + }, + { + "epoch": 1.48, + "grad_norm": 0.44140625, + "learning_rate": 0.00015851820203051568, + "loss": 0.3892, + "step": 11119 + }, + { + "epoch": 1.48, + "grad_norm": 0.423828125, + "learning_rate": 0.00015850875866665028, + "loss": 0.2633, + "step": 11120 + }, + { + "epoch": 1.48, + "grad_norm": 0.65625, + "learning_rate": 0.00015849931450936851, + "loss": 0.5847, + "step": 11121 + }, + { + "epoch": 1.48, + "grad_norm": 0.6640625, + "learning_rate": 0.0001584898695587985, + "loss": 0.4057, + "step": 11122 + }, + { + "epoch": 1.48, + "grad_norm": 0.83203125, + "learning_rate": 0.00015848042381506834, + "loss": 0.2379, + "step": 11123 + }, + { + "epoch": 1.48, + "grad_norm": 0.46875, + "learning_rate": 0.0001584709772783061, + "loss": 0.5448, + "step": 11124 + }, + { + "epoch": 1.48, + "grad_norm": 0.56640625, + "learning_rate": 0.00015846152994863986, + "loss": 0.4223, + "step": 11125 + }, + { + "epoch": 1.48, + "grad_norm": 0.4765625, + "learning_rate": 0.0001584520818261978, + "loss": 0.3386, + "step": 11126 + }, + { + "epoch": 1.48, + "grad_norm": 0.53125, + "learning_rate": 0.00015844263291110795, + "loss": 0.2913, + "step": 11127 + }, + { + "epoch": 1.48, + "grad_norm": 0.419921875, + "learning_rate": 0.00015843318320349852, + "loss": 0.4448, + "step": 11128 + }, + { + "epoch": 1.49, + "grad_norm": 0.40234375, + "learning_rate": 0.0001584237327034976, + "loss": 0.1544, + "step": 11129 + }, + { + "epoch": 1.49, + "grad_norm": 0.53125, + "learning_rate": 0.00015841428141123338, + "loss": 0.3401, + "step": 11130 + }, + { + "epoch": 1.49, + "grad_norm": 0.5234375, + "learning_rate": 0.000158404829326834, + "loss": 0.34, + "step": 11131 + }, + { + "epoch": 1.49, + "grad_norm": 0.392578125, + "learning_rate": 0.00015839537645042766, + "loss": 0.3735, + "step": 11132 + }, + { + "epoch": 1.49, + "grad_norm": 0.5625, + "learning_rate": 0.00015838592278214254, + "loss": 0.3856, + "step": 11133 + }, + { + "epoch": 1.49, + "grad_norm": 0.494140625, + "learning_rate": 0.00015837646832210682, + "loss": 0.3776, + "step": 11134 + }, + { + "epoch": 1.49, + "grad_norm": 0.546875, + "learning_rate": 0.00015836701307044875, + "loss": 0.334, + "step": 11135 + }, + { + "epoch": 1.49, + "grad_norm": 0.478515625, + "learning_rate": 0.00015835755702729652, + "loss": 0.5326, + "step": 11136 + }, + { + "epoch": 1.49, + "grad_norm": 0.5703125, + "learning_rate": 0.00015834810019277836, + "loss": 0.4282, + "step": 11137 + }, + { + "epoch": 1.49, + "grad_norm": 0.5, + "learning_rate": 0.0001583386425670225, + "loss": 0.4988, + "step": 11138 + }, + { + "epoch": 1.49, + "grad_norm": 0.65625, + "learning_rate": 0.0001583291841501572, + "loss": 0.3743, + "step": 11139 + }, + { + "epoch": 1.49, + "grad_norm": 0.443359375, + "learning_rate": 0.00015831972494231074, + "loss": 0.1754, + "step": 11140 + }, + { + "epoch": 1.49, + "grad_norm": 0.5546875, + "learning_rate": 0.00015831026494361137, + "loss": 0.3107, + "step": 11141 + }, + { + "epoch": 1.49, + "grad_norm": 0.390625, + "learning_rate": 0.0001583008041541874, + "loss": 0.2426, + "step": 11142 + }, + { + "epoch": 1.49, + "grad_norm": 0.59765625, + "learning_rate": 0.0001582913425741671, + "loss": 0.5514, + "step": 11143 + }, + { + "epoch": 1.49, + "grad_norm": 0.671875, + "learning_rate": 0.00015828188020367877, + "loss": 0.4678, + "step": 11144 + }, + { + "epoch": 1.49, + "grad_norm": 0.55859375, + "learning_rate": 0.00015827241704285077, + "loss": 0.3383, + "step": 11145 + }, + { + "epoch": 1.49, + "grad_norm": 0.58203125, + "learning_rate": 0.0001582629530918114, + "loss": 0.2929, + "step": 11146 + }, + { + "epoch": 1.49, + "grad_norm": 0.474609375, + "learning_rate": 0.00015825348835068895, + "loss": 0.2804, + "step": 11147 + }, + { + "epoch": 1.49, + "grad_norm": 0.50390625, + "learning_rate": 0.00015824402281961185, + "loss": 0.2496, + "step": 11148 + }, + { + "epoch": 1.49, + "grad_norm": 0.462890625, + "learning_rate": 0.0001582345564987084, + "loss": 0.2284, + "step": 11149 + }, + { + "epoch": 1.49, + "grad_norm": 0.396484375, + "learning_rate": 0.000158225089388107, + "loss": 0.3389, + "step": 11150 + }, + { + "epoch": 1.49, + "grad_norm": 0.99609375, + "learning_rate": 0.000158215621487936, + "loss": 0.3253, + "step": 11151 + }, + { + "epoch": 1.49, + "grad_norm": 0.51953125, + "learning_rate": 0.00015820615279832384, + "loss": 0.3079, + "step": 11152 + }, + { + "epoch": 1.49, + "grad_norm": 0.65234375, + "learning_rate": 0.00015819668331939888, + "loss": 0.5168, + "step": 11153 + }, + { + "epoch": 1.49, + "grad_norm": 0.54296875, + "learning_rate": 0.00015818721305128954, + "loss": 0.3042, + "step": 11154 + }, + { + "epoch": 1.49, + "grad_norm": 0.453125, + "learning_rate": 0.00015817774199412425, + "loss": 0.2655, + "step": 11155 + }, + { + "epoch": 1.49, + "grad_norm": 0.412109375, + "learning_rate": 0.00015816827014803142, + "loss": 0.2431, + "step": 11156 + }, + { + "epoch": 1.49, + "grad_norm": 0.43359375, + "learning_rate": 0.00015815879751313955, + "loss": 0.3414, + "step": 11157 + }, + { + "epoch": 1.49, + "grad_norm": 0.359375, + "learning_rate": 0.00015814932408957704, + "loss": 0.2319, + "step": 11158 + }, + { + "epoch": 1.49, + "grad_norm": 0.396484375, + "learning_rate": 0.00015813984987747236, + "loss": 0.2912, + "step": 11159 + }, + { + "epoch": 1.49, + "grad_norm": 0.6875, + "learning_rate": 0.00015813037487695403, + "loss": 0.5413, + "step": 11160 + }, + { + "epoch": 1.49, + "grad_norm": 0.3359375, + "learning_rate": 0.0001581208990881505, + "loss": 0.2456, + "step": 11161 + }, + { + "epoch": 1.49, + "grad_norm": 0.54296875, + "learning_rate": 0.00015811142251119026, + "loss": 0.4318, + "step": 11162 + }, + { + "epoch": 1.49, + "grad_norm": 0.4765625, + "learning_rate": 0.00015810194514620185, + "loss": 0.6397, + "step": 11163 + }, + { + "epoch": 1.49, + "grad_norm": 0.6640625, + "learning_rate": 0.00015809246699331376, + "loss": 0.4815, + "step": 11164 + }, + { + "epoch": 1.49, + "grad_norm": 0.462890625, + "learning_rate": 0.00015808298805265457, + "loss": 0.2854, + "step": 11165 + }, + { + "epoch": 1.49, + "grad_norm": 0.765625, + "learning_rate": 0.00015807350832435275, + "loss": 0.4604, + "step": 11166 + }, + { + "epoch": 1.49, + "grad_norm": 0.462890625, + "learning_rate": 0.0001580640278085369, + "loss": 0.3968, + "step": 11167 + }, + { + "epoch": 1.49, + "grad_norm": 0.73046875, + "learning_rate": 0.0001580545465053356, + "loss": 0.5298, + "step": 11168 + }, + { + "epoch": 1.49, + "grad_norm": 0.515625, + "learning_rate": 0.00015804506441487732, + "loss": 0.4834, + "step": 11169 + }, + { + "epoch": 1.49, + "grad_norm": 0.466796875, + "learning_rate": 0.00015803558153729074, + "loss": 0.2695, + "step": 11170 + }, + { + "epoch": 1.49, + "grad_norm": 0.50390625, + "learning_rate": 0.00015802609787270443, + "loss": 0.4989, + "step": 11171 + }, + { + "epoch": 1.49, + "grad_norm": 0.6328125, + "learning_rate": 0.000158016613421247, + "loss": 0.2561, + "step": 11172 + }, + { + "epoch": 1.49, + "grad_norm": 0.47265625, + "learning_rate": 0.00015800712818304707, + "loss": 0.2033, + "step": 11173 + }, + { + "epoch": 1.49, + "grad_norm": 0.51171875, + "learning_rate": 0.00015799764215823324, + "loss": 0.4917, + "step": 11174 + }, + { + "epoch": 1.49, + "grad_norm": 0.55078125, + "learning_rate": 0.00015798815534693417, + "loss": 0.5292, + "step": 11175 + }, + { + "epoch": 1.49, + "grad_norm": 0.66796875, + "learning_rate": 0.00015797866774927848, + "loss": 0.4168, + "step": 11176 + }, + { + "epoch": 1.49, + "grad_norm": 0.57421875, + "learning_rate": 0.00015796917936539484, + "loss": 0.3785, + "step": 11177 + }, + { + "epoch": 1.49, + "grad_norm": 0.4375, + "learning_rate": 0.00015795969019541194, + "loss": 0.2077, + "step": 11178 + }, + { + "epoch": 1.49, + "grad_norm": 0.470703125, + "learning_rate": 0.00015795020023945844, + "loss": 0.2675, + "step": 11179 + }, + { + "epoch": 1.49, + "grad_norm": 0.47265625, + "learning_rate": 0.000157940709497663, + "loss": 0.4612, + "step": 11180 + }, + { + "epoch": 1.49, + "grad_norm": 0.41015625, + "learning_rate": 0.0001579312179701544, + "loss": 0.2165, + "step": 11181 + }, + { + "epoch": 1.49, + "grad_norm": 0.64453125, + "learning_rate": 0.00015792172565706126, + "loss": 0.4269, + "step": 11182 + }, + { + "epoch": 1.49, + "grad_norm": 0.53515625, + "learning_rate": 0.0001579122325585124, + "loss": 0.3966, + "step": 11183 + }, + { + "epoch": 1.49, + "grad_norm": 0.56640625, + "learning_rate": 0.00015790273867463648, + "loss": 0.3338, + "step": 11184 + }, + { + "epoch": 1.49, + "grad_norm": 0.51171875, + "learning_rate": 0.00015789324400556224, + "loss": 0.385, + "step": 11185 + }, + { + "epoch": 1.49, + "grad_norm": 0.6328125, + "learning_rate": 0.0001578837485514185, + "loss": 0.2577, + "step": 11186 + }, + { + "epoch": 1.49, + "grad_norm": 0.404296875, + "learning_rate": 0.0001578742523123339, + "loss": 0.3412, + "step": 11187 + }, + { + "epoch": 1.49, + "grad_norm": 0.50390625, + "learning_rate": 0.00015786475528843736, + "loss": 0.3226, + "step": 11188 + }, + { + "epoch": 1.49, + "grad_norm": 0.490234375, + "learning_rate": 0.0001578552574798576, + "loss": 0.2289, + "step": 11189 + }, + { + "epoch": 1.49, + "grad_norm": 0.458984375, + "learning_rate": 0.0001578457588867234, + "loss": 0.281, + "step": 11190 + }, + { + "epoch": 1.49, + "grad_norm": 0.578125, + "learning_rate": 0.0001578362595091636, + "loss": 0.4277, + "step": 11191 + }, + { + "epoch": 1.49, + "grad_norm": 0.453125, + "learning_rate": 0.00015782675934730697, + "loss": 0.5221, + "step": 11192 + }, + { + "epoch": 1.49, + "grad_norm": 0.53125, + "learning_rate": 0.0001578172584012824, + "loss": 0.4718, + "step": 11193 + }, + { + "epoch": 1.49, + "grad_norm": 0.6328125, + "learning_rate": 0.00015780775667121865, + "loss": 0.4352, + "step": 11194 + }, + { + "epoch": 1.49, + "grad_norm": 0.4921875, + "learning_rate": 0.00015779825415724468, + "loss": 0.3835, + "step": 11195 + }, + { + "epoch": 1.49, + "grad_norm": 0.69921875, + "learning_rate": 0.00015778875085948926, + "loss": 0.4595, + "step": 11196 + }, + { + "epoch": 1.49, + "grad_norm": 0.373046875, + "learning_rate": 0.0001577792467780813, + "loss": 0.3075, + "step": 11197 + }, + { + "epoch": 1.49, + "grad_norm": 0.40234375, + "learning_rate": 0.0001577697419131497, + "loss": 0.1969, + "step": 11198 + }, + { + "epoch": 1.49, + "grad_norm": 0.62890625, + "learning_rate": 0.00015776023626482325, + "loss": 0.6051, + "step": 11199 + }, + { + "epoch": 1.49, + "grad_norm": 0.65625, + "learning_rate": 0.00015775072983323097, + "loss": 0.541, + "step": 11200 + }, + { + "epoch": 1.49, + "grad_norm": 0.357421875, + "learning_rate": 0.00015774122261850174, + "loss": 0.1711, + "step": 11201 + }, + { + "epoch": 1.49, + "grad_norm": 0.41796875, + "learning_rate": 0.00015773171462076443, + "loss": 0.26, + "step": 11202 + }, + { + "epoch": 1.49, + "grad_norm": 0.41015625, + "learning_rate": 0.00015772220584014805, + "loss": 0.3851, + "step": 11203 + }, + { + "epoch": 1.5, + "grad_norm": 0.44921875, + "learning_rate": 0.00015771269627678152, + "loss": 0.333, + "step": 11204 + }, + { + "epoch": 1.5, + "grad_norm": 0.5625, + "learning_rate": 0.00015770318593079377, + "loss": 0.4094, + "step": 11205 + }, + { + "epoch": 1.5, + "grad_norm": 0.54296875, + "learning_rate": 0.0001576936748023138, + "loss": 0.475, + "step": 11206 + }, + { + "epoch": 1.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00015768416289147058, + "loss": 0.4205, + "step": 11207 + }, + { + "epoch": 1.5, + "grad_norm": 0.734375, + "learning_rate": 0.00015767465019839305, + "loss": 0.479, + "step": 11208 + }, + { + "epoch": 1.5, + "grad_norm": 0.71484375, + "learning_rate": 0.00015766513672321028, + "loss": 0.2417, + "step": 11209 + }, + { + "epoch": 1.5, + "grad_norm": 0.5390625, + "learning_rate": 0.00015765562246605126, + "loss": 0.3066, + "step": 11210 + }, + { + "epoch": 1.5, + "grad_norm": 0.515625, + "learning_rate": 0.00015764610742704496, + "loss": 0.6104, + "step": 11211 + }, + { + "epoch": 1.5, + "grad_norm": 0.55078125, + "learning_rate": 0.00015763659160632044, + "loss": 0.4594, + "step": 11212 + }, + { + "epoch": 1.5, + "grad_norm": 0.46875, + "learning_rate": 0.0001576270750040068, + "loss": 0.4578, + "step": 11213 + }, + { + "epoch": 1.5, + "grad_norm": 0.69140625, + "learning_rate": 0.000157617557620233, + "loss": 0.2328, + "step": 11214 + }, + { + "epoch": 1.5, + "grad_norm": 0.5546875, + "learning_rate": 0.00015760803945512816, + "loss": 0.2857, + "step": 11215 + }, + { + "epoch": 1.5, + "grad_norm": 0.5625, + "learning_rate": 0.00015759852050882132, + "loss": 0.2616, + "step": 11216 + }, + { + "epoch": 1.5, + "grad_norm": 0.58203125, + "learning_rate": 0.00015758900078144156, + "loss": 0.8532, + "step": 11217 + }, + { + "epoch": 1.5, + "grad_norm": 0.439453125, + "learning_rate": 0.000157579480273118, + "loss": 0.3598, + "step": 11218 + }, + { + "epoch": 1.5, + "grad_norm": 0.67578125, + "learning_rate": 0.00015756995898397972, + "loss": 0.4115, + "step": 11219 + }, + { + "epoch": 1.5, + "grad_norm": 0.40234375, + "learning_rate": 0.00015756043691415587, + "loss": 0.2925, + "step": 11220 + }, + { + "epoch": 1.5, + "grad_norm": 0.59375, + "learning_rate": 0.00015755091406377555, + "loss": 0.4449, + "step": 11221 + }, + { + "epoch": 1.5, + "grad_norm": 0.5390625, + "learning_rate": 0.0001575413904329679, + "loss": 0.4501, + "step": 11222 + }, + { + "epoch": 1.5, + "grad_norm": 0.68359375, + "learning_rate": 0.00015753186602186209, + "loss": 0.7205, + "step": 11223 + }, + { + "epoch": 1.5, + "grad_norm": 0.58984375, + "learning_rate": 0.00015752234083058722, + "loss": 0.5966, + "step": 11224 + }, + { + "epoch": 1.5, + "grad_norm": 0.4765625, + "learning_rate": 0.00015751281485927248, + "loss": 0.2834, + "step": 11225 + }, + { + "epoch": 1.5, + "grad_norm": 0.56640625, + "learning_rate": 0.00015750328810804708, + "loss": 0.2392, + "step": 11226 + }, + { + "epoch": 1.5, + "grad_norm": 0.5390625, + "learning_rate": 0.00015749376057704018, + "loss": 0.5484, + "step": 11227 + }, + { + "epoch": 1.5, + "grad_norm": 0.400390625, + "learning_rate": 0.000157484232266381, + "loss": 0.1762, + "step": 11228 + }, + { + "epoch": 1.5, + "grad_norm": 0.55078125, + "learning_rate": 0.00015747470317619872, + "loss": 0.3425, + "step": 11229 + }, + { + "epoch": 1.5, + "grad_norm": 0.80859375, + "learning_rate": 0.00015746517330662258, + "loss": 0.417, + "step": 11230 + }, + { + "epoch": 1.5, + "grad_norm": 0.39453125, + "learning_rate": 0.00015745564265778184, + "loss": 0.2679, + "step": 11231 + }, + { + "epoch": 1.5, + "grad_norm": 0.74609375, + "learning_rate": 0.00015744611122980566, + "loss": 0.3624, + "step": 11232 + }, + { + "epoch": 1.5, + "grad_norm": 0.47265625, + "learning_rate": 0.00015743657902282337, + "loss": 0.2636, + "step": 11233 + }, + { + "epoch": 1.5, + "grad_norm": 0.65234375, + "learning_rate": 0.00015742704603696422, + "loss": 0.6254, + "step": 11234 + }, + { + "epoch": 1.5, + "grad_norm": 0.58984375, + "learning_rate": 0.00015741751227235746, + "loss": 0.3258, + "step": 11235 + }, + { + "epoch": 1.5, + "grad_norm": 0.73046875, + "learning_rate": 0.00015740797772913236, + "loss": 0.4751, + "step": 11236 + }, + { + "epoch": 1.5, + "grad_norm": 0.51953125, + "learning_rate": 0.0001573984424074183, + "loss": 0.2985, + "step": 11237 + }, + { + "epoch": 1.5, + "grad_norm": 0.33984375, + "learning_rate": 0.00015738890630734444, + "loss": 0.3195, + "step": 11238 + }, + { + "epoch": 1.5, + "grad_norm": 0.6796875, + "learning_rate": 0.00015737936942904023, + "loss": 0.7144, + "step": 11239 + }, + { + "epoch": 1.5, + "grad_norm": 0.6328125, + "learning_rate": 0.00015736983177263496, + "loss": 0.3664, + "step": 11240 + }, + { + "epoch": 1.5, + "grad_norm": 0.703125, + "learning_rate": 0.0001573602933382579, + "loss": 0.3851, + "step": 11241 + }, + { + "epoch": 1.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00015735075412603847, + "loss": 0.2325, + "step": 11242 + }, + { + "epoch": 1.5, + "grad_norm": 0.5, + "learning_rate": 0.000157341214136106, + "loss": 0.2057, + "step": 11243 + }, + { + "epoch": 1.5, + "grad_norm": 0.5859375, + "learning_rate": 0.00015733167336858987, + "loss": 0.2791, + "step": 11244 + }, + { + "epoch": 1.5, + "grad_norm": 0.400390625, + "learning_rate": 0.00015732213182361948, + "loss": 0.2946, + "step": 11245 + }, + { + "epoch": 1.5, + "grad_norm": 0.31640625, + "learning_rate": 0.00015731258950132417, + "loss": 0.2619, + "step": 11246 + }, + { + "epoch": 1.5, + "grad_norm": 0.494140625, + "learning_rate": 0.00015730304640183338, + "loss": 0.4711, + "step": 11247 + }, + { + "epoch": 1.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00015729350252527646, + "loss": 0.3961, + "step": 11248 + }, + { + "epoch": 1.5, + "grad_norm": 0.52734375, + "learning_rate": 0.0001572839578717829, + "loss": 0.3161, + "step": 11249 + }, + { + "epoch": 1.5, + "grad_norm": 0.33984375, + "learning_rate": 0.00015727441244148214, + "loss": 0.3168, + "step": 11250 + }, + { + "epoch": 1.5, + "grad_norm": 0.6640625, + "learning_rate": 0.00015726486623450354, + "loss": 0.4142, + "step": 11251 + }, + { + "epoch": 1.5, + "grad_norm": 0.427734375, + "learning_rate": 0.00015725531925097662, + "loss": 0.3535, + "step": 11252 + }, + { + "epoch": 1.5, + "grad_norm": 0.609375, + "learning_rate": 0.00015724577149103084, + "loss": 0.4795, + "step": 11253 + }, + { + "epoch": 1.5, + "grad_norm": 0.91015625, + "learning_rate": 0.00015723622295479563, + "loss": 0.4221, + "step": 11254 + }, + { + "epoch": 1.5, + "grad_norm": 0.6640625, + "learning_rate": 0.00015722667364240052, + "loss": 0.4733, + "step": 11255 + }, + { + "epoch": 1.5, + "grad_norm": 0.61328125, + "learning_rate": 0.000157217123553975, + "loss": 0.4672, + "step": 11256 + }, + { + "epoch": 1.5, + "grad_norm": 0.396484375, + "learning_rate": 0.00015720757268964853, + "loss": 0.3755, + "step": 11257 + }, + { + "epoch": 1.5, + "grad_norm": 0.65625, + "learning_rate": 0.0001571980210495507, + "loss": 0.4741, + "step": 11258 + }, + { + "epoch": 1.5, + "grad_norm": 0.65234375, + "learning_rate": 0.00015718846863381093, + "loss": 0.6413, + "step": 11259 + }, + { + "epoch": 1.5, + "grad_norm": 0.443359375, + "learning_rate": 0.00015717891544255887, + "loss": 0.2341, + "step": 11260 + }, + { + "epoch": 1.5, + "grad_norm": 0.6171875, + "learning_rate": 0.00015716936147592404, + "loss": 0.5425, + "step": 11261 + }, + { + "epoch": 1.5, + "grad_norm": 0.47265625, + "learning_rate": 0.00015715980673403594, + "loss": 0.2543, + "step": 11262 + }, + { + "epoch": 1.5, + "grad_norm": 0.44140625, + "learning_rate": 0.00015715025121702415, + "loss": 0.2578, + "step": 11263 + }, + { + "epoch": 1.5, + "grad_norm": 0.875, + "learning_rate": 0.00015714069492501836, + "loss": 0.4188, + "step": 11264 + }, + { + "epoch": 1.5, + "grad_norm": 0.4609375, + "learning_rate": 0.00015713113785814802, + "loss": 0.2203, + "step": 11265 + }, + { + "epoch": 1.5, + "grad_norm": 0.671875, + "learning_rate": 0.00015712158001654277, + "loss": 0.4657, + "step": 11266 + }, + { + "epoch": 1.5, + "grad_norm": 0.4765625, + "learning_rate": 0.00015711202140033224, + "loss": 0.2264, + "step": 11267 + }, + { + "epoch": 1.5, + "grad_norm": 0.46875, + "learning_rate": 0.00015710246200964606, + "loss": 0.4688, + "step": 11268 + }, + { + "epoch": 1.5, + "grad_norm": 0.357421875, + "learning_rate": 0.00015709290184461386, + "loss": 0.2028, + "step": 11269 + }, + { + "epoch": 1.5, + "grad_norm": 0.58203125, + "learning_rate": 0.00015708334090536527, + "loss": 0.4536, + "step": 11270 + }, + { + "epoch": 1.5, + "grad_norm": 0.59765625, + "learning_rate": 0.00015707377919202991, + "loss": 0.434, + "step": 11271 + }, + { + "epoch": 1.5, + "grad_norm": 0.5, + "learning_rate": 0.0001570642167047375, + "loss": 0.5811, + "step": 11272 + }, + { + "epoch": 1.5, + "grad_norm": 0.58984375, + "learning_rate": 0.00015705465344361766, + "loss": 0.5137, + "step": 11273 + }, + { + "epoch": 1.5, + "grad_norm": 0.5546875, + "learning_rate": 0.0001570450894088001, + "loss": 0.3024, + "step": 11274 + }, + { + "epoch": 1.5, + "grad_norm": 1.03125, + "learning_rate": 0.00015703552460041457, + "loss": 0.2628, + "step": 11275 + }, + { + "epoch": 1.5, + "grad_norm": 0.5546875, + "learning_rate": 0.0001570259590185907, + "loss": 0.276, + "step": 11276 + }, + { + "epoch": 1.5, + "grad_norm": 0.421875, + "learning_rate": 0.0001570163926634582, + "loss": 0.2171, + "step": 11277 + }, + { + "epoch": 1.5, + "grad_norm": 0.56640625, + "learning_rate": 0.00015700682553514684, + "loss": 0.4681, + "step": 11278 + }, + { + "epoch": 1.51, + "grad_norm": 0.63671875, + "learning_rate": 0.00015699725763378632, + "loss": 0.3726, + "step": 11279 + }, + { + "epoch": 1.51, + "grad_norm": 0.65234375, + "learning_rate": 0.00015698768895950642, + "loss": 0.4468, + "step": 11280 + }, + { + "epoch": 1.51, + "grad_norm": 0.431640625, + "learning_rate": 0.0001569781195124369, + "loss": 0.2623, + "step": 11281 + }, + { + "epoch": 1.51, + "grad_norm": 0.48046875, + "learning_rate": 0.0001569685492927075, + "loss": 0.2389, + "step": 11282 + }, + { + "epoch": 1.51, + "grad_norm": 0.4921875, + "learning_rate": 0.000156958978300448, + "loss": 0.4122, + "step": 11283 + }, + { + "epoch": 1.51, + "grad_norm": 0.5625, + "learning_rate": 0.0001569494065357882, + "loss": 0.2408, + "step": 11284 + }, + { + "epoch": 1.51, + "grad_norm": 0.59375, + "learning_rate": 0.0001569398339988579, + "loss": 0.3002, + "step": 11285 + }, + { + "epoch": 1.51, + "grad_norm": 0.48828125, + "learning_rate": 0.0001569302606897869, + "loss": 0.2613, + "step": 11286 + }, + { + "epoch": 1.51, + "grad_norm": 0.43359375, + "learning_rate": 0.00015692068660870504, + "loss": 0.2625, + "step": 11287 + }, + { + "epoch": 1.51, + "grad_norm": 0.5234375, + "learning_rate": 0.00015691111175574217, + "loss": 0.2095, + "step": 11288 + }, + { + "epoch": 1.51, + "grad_norm": 0.703125, + "learning_rate": 0.00015690153613102804, + "loss": 0.2784, + "step": 11289 + }, + { + "epoch": 1.51, + "grad_norm": 0.6640625, + "learning_rate": 0.00015689195973469258, + "loss": 0.4825, + "step": 11290 + }, + { + "epoch": 1.51, + "grad_norm": 0.439453125, + "learning_rate": 0.00015688238256686564, + "loss": 0.3073, + "step": 11291 + }, + { + "epoch": 1.51, + "grad_norm": 0.4765625, + "learning_rate": 0.00015687280462767708, + "loss": 0.5129, + "step": 11292 + }, + { + "epoch": 1.51, + "grad_norm": 0.5546875, + "learning_rate": 0.0001568632259172568, + "loss": 0.352, + "step": 11293 + }, + { + "epoch": 1.51, + "grad_norm": 0.5078125, + "learning_rate": 0.00015685364643573468, + "loss": 0.3991, + "step": 11294 + }, + { + "epoch": 1.51, + "grad_norm": 0.51171875, + "learning_rate": 0.00015684406618324061, + "loss": 0.4619, + "step": 11295 + }, + { + "epoch": 1.51, + "grad_norm": 0.46875, + "learning_rate": 0.00015683448515990453, + "loss": 0.3189, + "step": 11296 + }, + { + "epoch": 1.51, + "grad_norm": 0.59765625, + "learning_rate": 0.0001568249033658564, + "loss": 0.2406, + "step": 11297 + }, + { + "epoch": 1.51, + "grad_norm": 0.6796875, + "learning_rate": 0.00015681532080122603, + "loss": 0.346, + "step": 11298 + }, + { + "epoch": 1.51, + "grad_norm": 0.458984375, + "learning_rate": 0.00015680573746614348, + "loss": 0.5463, + "step": 11299 + }, + { + "epoch": 1.51, + "grad_norm": 0.322265625, + "learning_rate": 0.00015679615336073867, + "loss": 0.1865, + "step": 11300 + }, + { + "epoch": 1.51, + "grad_norm": 0.453125, + "learning_rate": 0.0001567865684851416, + "loss": 0.1722, + "step": 11301 + }, + { + "epoch": 1.51, + "grad_norm": 0.494140625, + "learning_rate": 0.00015677698283948215, + "loss": 0.3314, + "step": 11302 + }, + { + "epoch": 1.51, + "grad_norm": 0.458984375, + "learning_rate": 0.00015676739642389044, + "loss": 0.2717, + "step": 11303 + }, + { + "epoch": 1.51, + "grad_norm": 0.55859375, + "learning_rate": 0.00015675780923849635, + "loss": 0.7178, + "step": 11304 + }, + { + "epoch": 1.51, + "grad_norm": 0.55078125, + "learning_rate": 0.00015674822128342996, + "loss": 0.2053, + "step": 11305 + }, + { + "epoch": 1.51, + "grad_norm": 0.4375, + "learning_rate": 0.00015673863255882128, + "loss": 0.3562, + "step": 11306 + }, + { + "epoch": 1.51, + "grad_norm": 0.42578125, + "learning_rate": 0.00015672904306480035, + "loss": 0.3501, + "step": 11307 + }, + { + "epoch": 1.51, + "grad_norm": 0.32421875, + "learning_rate": 0.00015671945280149713, + "loss": 0.2805, + "step": 11308 + }, + { + "epoch": 1.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00015670986176904175, + "loss": 0.4957, + "step": 11309 + }, + { + "epoch": 1.51, + "grad_norm": 0.59765625, + "learning_rate": 0.00015670026996756427, + "loss": 0.5902, + "step": 11310 + }, + { + "epoch": 1.51, + "grad_norm": 0.498046875, + "learning_rate": 0.00015669067739719473, + "loss": 0.4546, + "step": 11311 + }, + { + "epoch": 1.51, + "grad_norm": 0.470703125, + "learning_rate": 0.0001566810840580632, + "loss": 0.4305, + "step": 11312 + }, + { + "epoch": 1.51, + "grad_norm": 1.1796875, + "learning_rate": 0.0001566714899502998, + "loss": 0.2451, + "step": 11313 + }, + { + "epoch": 1.51, + "grad_norm": 0.51953125, + "learning_rate": 0.00015666189507403463, + "loss": 0.366, + "step": 11314 + }, + { + "epoch": 1.51, + "grad_norm": 0.6796875, + "learning_rate": 0.0001566522994293978, + "loss": 0.4086, + "step": 11315 + }, + { + "epoch": 1.51, + "grad_norm": 0.6328125, + "learning_rate": 0.00015664270301651945, + "loss": 0.5794, + "step": 11316 + }, + { + "epoch": 1.51, + "grad_norm": 1.0390625, + "learning_rate": 0.00015663310583552965, + "loss": 0.5637, + "step": 11317 + }, + { + "epoch": 1.51, + "grad_norm": 0.4765625, + "learning_rate": 0.00015662350788655865, + "loss": 0.4737, + "step": 11318 + }, + { + "epoch": 1.51, + "grad_norm": 0.6171875, + "learning_rate": 0.00015661390916973648, + "loss": 0.7296, + "step": 11319 + }, + { + "epoch": 1.51, + "grad_norm": 0.76953125, + "learning_rate": 0.00015660430968519336, + "loss": 0.4817, + "step": 11320 + }, + { + "epoch": 1.51, + "grad_norm": 0.5703125, + "learning_rate": 0.00015659470943305955, + "loss": 0.5077, + "step": 11321 + }, + { + "epoch": 1.51, + "grad_norm": 0.6875, + "learning_rate": 0.00015658510841346507, + "loss": 0.6123, + "step": 11322 + }, + { + "epoch": 1.51, + "grad_norm": 0.4765625, + "learning_rate": 0.00015657550662654025, + "loss": 0.4281, + "step": 11323 + }, + { + "epoch": 1.51, + "grad_norm": 0.41796875, + "learning_rate": 0.00015656590407241527, + "loss": 0.4827, + "step": 11324 + }, + { + "epoch": 1.51, + "grad_norm": 0.498046875, + "learning_rate": 0.00015655630075122028, + "loss": 0.4529, + "step": 11325 + }, + { + "epoch": 1.51, + "grad_norm": 0.51171875, + "learning_rate": 0.0001565466966630856, + "loss": 0.5358, + "step": 11326 + }, + { + "epoch": 1.51, + "grad_norm": 0.53515625, + "learning_rate": 0.00015653709180814138, + "loss": 0.4064, + "step": 11327 + }, + { + "epoch": 1.51, + "grad_norm": 0.384765625, + "learning_rate": 0.00015652748618651794, + "loss": 0.2346, + "step": 11328 + }, + { + "epoch": 1.51, + "grad_norm": 0.337890625, + "learning_rate": 0.00015651787979834548, + "loss": 0.3618, + "step": 11329 + }, + { + "epoch": 1.51, + "grad_norm": 0.6640625, + "learning_rate": 0.0001565082726437543, + "loss": 0.6435, + "step": 11330 + }, + { + "epoch": 1.51, + "grad_norm": 0.443359375, + "learning_rate": 0.0001564986647228747, + "loss": 0.37, + "step": 11331 + }, + { + "epoch": 1.51, + "grad_norm": 0.42578125, + "learning_rate": 0.00015648905603583694, + "loss": 0.3045, + "step": 11332 + }, + { + "epoch": 1.51, + "grad_norm": 0.56640625, + "learning_rate": 0.0001564794465827713, + "loss": 0.2403, + "step": 11333 + }, + { + "epoch": 1.51, + "grad_norm": 0.353515625, + "learning_rate": 0.00015646983636380813, + "loss": 0.2033, + "step": 11334 + }, + { + "epoch": 1.51, + "grad_norm": 0.59375, + "learning_rate": 0.00015646022537907776, + "loss": 0.3353, + "step": 11335 + }, + { + "epoch": 1.51, + "grad_norm": 0.466796875, + "learning_rate": 0.0001564506136287105, + "loss": 0.3013, + "step": 11336 + }, + { + "epoch": 1.51, + "grad_norm": 0.55859375, + "learning_rate": 0.00015644100111283669, + "loss": 0.292, + "step": 11337 + }, + { + "epoch": 1.51, + "grad_norm": 0.412109375, + "learning_rate": 0.00015643138783158662, + "loss": 0.5071, + "step": 11338 + }, + { + "epoch": 1.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00015642177378509078, + "loss": 0.5582, + "step": 11339 + }, + { + "epoch": 1.51, + "grad_norm": 0.59765625, + "learning_rate": 0.00015641215897347946, + "loss": 0.6447, + "step": 11340 + }, + { + "epoch": 1.51, + "grad_norm": 0.64453125, + "learning_rate": 0.00015640254339688303, + "loss": 0.5465, + "step": 11341 + }, + { + "epoch": 1.51, + "grad_norm": 0.609375, + "learning_rate": 0.00015639292705543195, + "loss": 0.3897, + "step": 11342 + }, + { + "epoch": 1.51, + "grad_norm": 0.48046875, + "learning_rate": 0.00015638330994925656, + "loss": 0.2828, + "step": 11343 + }, + { + "epoch": 1.51, + "grad_norm": 0.5625, + "learning_rate": 0.0001563736920784873, + "loss": 0.43, + "step": 11344 + }, + { + "epoch": 1.51, + "grad_norm": 0.58203125, + "learning_rate": 0.0001563640734432546, + "loss": 0.5757, + "step": 11345 + }, + { + "epoch": 1.51, + "grad_norm": 0.3828125, + "learning_rate": 0.00015635445404368892, + "loss": 0.3175, + "step": 11346 + }, + { + "epoch": 1.51, + "grad_norm": 0.447265625, + "learning_rate": 0.00015634483387992062, + "loss": 0.5469, + "step": 11347 + }, + { + "epoch": 1.51, + "grad_norm": 0.578125, + "learning_rate": 0.00015633521295208027, + "loss": 0.5918, + "step": 11348 + }, + { + "epoch": 1.51, + "grad_norm": 0.56640625, + "learning_rate": 0.00015632559126029825, + "loss": 0.341, + "step": 11349 + }, + { + "epoch": 1.51, + "grad_norm": 0.80078125, + "learning_rate": 0.00015631596880470505, + "loss": 0.2837, + "step": 11350 + }, + { + "epoch": 1.51, + "grad_norm": 0.5, + "learning_rate": 0.0001563063455854312, + "loss": 0.28, + "step": 11351 + }, + { + "epoch": 1.51, + "grad_norm": 0.546875, + "learning_rate": 0.00015629672160260714, + "loss": 0.5138, + "step": 11352 + }, + { + "epoch": 1.51, + "grad_norm": 0.73046875, + "learning_rate": 0.00015628709685636341, + "loss": 0.5898, + "step": 11353 + }, + { + "epoch": 1.52, + "grad_norm": 0.466796875, + "learning_rate": 0.0001562774713468305, + "loss": 0.2495, + "step": 11354 + }, + { + "epoch": 1.52, + "grad_norm": 0.63671875, + "learning_rate": 0.000156267845074139, + "loss": 0.5358, + "step": 11355 + }, + { + "epoch": 1.52, + "grad_norm": 0.61328125, + "learning_rate": 0.00015625821803841937, + "loss": 0.656, + "step": 11356 + }, + { + "epoch": 1.52, + "grad_norm": 0.376953125, + "learning_rate": 0.00015624859023980224, + "loss": 0.3115, + "step": 11357 + }, + { + "epoch": 1.52, + "grad_norm": 0.490234375, + "learning_rate": 0.00015623896167841808, + "loss": 0.213, + "step": 11358 + }, + { + "epoch": 1.52, + "grad_norm": 0.484375, + "learning_rate": 0.00015622933235439753, + "loss": 0.5044, + "step": 11359 + }, + { + "epoch": 1.52, + "grad_norm": 0.451171875, + "learning_rate": 0.00015621970226787116, + "loss": 0.2796, + "step": 11360 + }, + { + "epoch": 1.52, + "grad_norm": 0.5546875, + "learning_rate": 0.00015621007141896956, + "loss": 0.3475, + "step": 11361 + }, + { + "epoch": 1.52, + "grad_norm": 0.5078125, + "learning_rate": 0.00015620043980782327, + "loss": 0.2878, + "step": 11362 + }, + { + "epoch": 1.52, + "grad_norm": 0.52734375, + "learning_rate": 0.00015619080743456296, + "loss": 0.337, + "step": 11363 + }, + { + "epoch": 1.52, + "grad_norm": 0.45703125, + "learning_rate": 0.00015618117429931926, + "loss": 0.4859, + "step": 11364 + }, + { + "epoch": 1.52, + "grad_norm": 0.64453125, + "learning_rate": 0.00015617154040222273, + "loss": 0.5161, + "step": 11365 + }, + { + "epoch": 1.52, + "grad_norm": 0.59765625, + "learning_rate": 0.0001561619057434041, + "loss": 0.3104, + "step": 11366 + }, + { + "epoch": 1.52, + "grad_norm": 0.5234375, + "learning_rate": 0.000156152270322994, + "loss": 0.6283, + "step": 11367 + }, + { + "epoch": 1.52, + "grad_norm": 0.447265625, + "learning_rate": 0.00015614263414112303, + "loss": 0.2676, + "step": 11368 + }, + { + "epoch": 1.52, + "grad_norm": 0.4296875, + "learning_rate": 0.00015613299719792196, + "loss": 0.2483, + "step": 11369 + }, + { + "epoch": 1.52, + "grad_norm": 0.423828125, + "learning_rate": 0.00015612335949352136, + "loss": 0.3411, + "step": 11370 + }, + { + "epoch": 1.52, + "grad_norm": 0.42578125, + "learning_rate": 0.00015611372102805203, + "loss": 0.523, + "step": 11371 + }, + { + "epoch": 1.52, + "grad_norm": 0.59765625, + "learning_rate": 0.00015610408180164465, + "loss": 0.5522, + "step": 11372 + }, + { + "epoch": 1.52, + "grad_norm": 0.609375, + "learning_rate": 0.00015609444181442986, + "loss": 0.515, + "step": 11373 + }, + { + "epoch": 1.52, + "grad_norm": 0.5078125, + "learning_rate": 0.00015608480106653848, + "loss": 0.2929, + "step": 11374 + }, + { + "epoch": 1.52, + "grad_norm": 0.50390625, + "learning_rate": 0.00015607515955810117, + "loss": 0.6155, + "step": 11375 + }, + { + "epoch": 1.52, + "grad_norm": 0.515625, + "learning_rate": 0.00015606551728924873, + "loss": 0.2735, + "step": 11376 + }, + { + "epoch": 1.52, + "grad_norm": 0.51953125, + "learning_rate": 0.0001560558742601119, + "loss": 0.266, + "step": 11377 + }, + { + "epoch": 1.52, + "grad_norm": 0.55078125, + "learning_rate": 0.0001560462304708214, + "loss": 0.5537, + "step": 11378 + }, + { + "epoch": 1.52, + "grad_norm": 0.33984375, + "learning_rate": 0.00015603658592150808, + "loss": 0.1968, + "step": 11379 + }, + { + "epoch": 1.52, + "grad_norm": 0.87109375, + "learning_rate": 0.00015602694061230267, + "loss": 0.3534, + "step": 11380 + }, + { + "epoch": 1.52, + "grad_norm": 0.494140625, + "learning_rate": 0.00015601729454333598, + "loss": 0.4179, + "step": 11381 + }, + { + "epoch": 1.52, + "grad_norm": 0.462890625, + "learning_rate": 0.00015600764771473883, + "loss": 0.3946, + "step": 11382 + }, + { + "epoch": 1.52, + "grad_norm": 0.57421875, + "learning_rate": 0.00015599800012664204, + "loss": 0.4494, + "step": 11383 + }, + { + "epoch": 1.52, + "grad_norm": 0.66015625, + "learning_rate": 0.00015598835177917642, + "loss": 0.3908, + "step": 11384 + }, + { + "epoch": 1.52, + "grad_norm": 0.421875, + "learning_rate": 0.00015597870267247286, + "loss": 0.2183, + "step": 11385 + }, + { + "epoch": 1.52, + "grad_norm": 0.53125, + "learning_rate": 0.00015596905280666214, + "loss": 0.3404, + "step": 11386 + }, + { + "epoch": 1.52, + "grad_norm": 0.88671875, + "learning_rate": 0.00015595940218187513, + "loss": 0.2823, + "step": 11387 + }, + { + "epoch": 1.52, + "grad_norm": 0.345703125, + "learning_rate": 0.00015594975079824273, + "loss": 0.2315, + "step": 11388 + }, + { + "epoch": 1.52, + "grad_norm": 0.447265625, + "learning_rate": 0.00015594009865589576, + "loss": 0.3686, + "step": 11389 + }, + { + "epoch": 1.52, + "grad_norm": 0.458984375, + "learning_rate": 0.00015593044575496517, + "loss": 0.3384, + "step": 11390 + }, + { + "epoch": 1.52, + "grad_norm": 0.4609375, + "learning_rate": 0.00015592079209558187, + "loss": 0.5877, + "step": 11391 + }, + { + "epoch": 1.52, + "grad_norm": 0.431640625, + "learning_rate": 0.0001559111376778767, + "loss": 0.4208, + "step": 11392 + }, + { + "epoch": 1.52, + "grad_norm": 0.7578125, + "learning_rate": 0.00015590148250198065, + "loss": 0.5278, + "step": 11393 + }, + { + "epoch": 1.52, + "grad_norm": 0.55078125, + "learning_rate": 0.00015589182656802463, + "loss": 0.4347, + "step": 11394 + }, + { + "epoch": 1.52, + "grad_norm": 0.490234375, + "learning_rate": 0.00015588216987613953, + "loss": 0.4714, + "step": 11395 + }, + { + "epoch": 1.52, + "grad_norm": 0.45703125, + "learning_rate": 0.00015587251242645634, + "loss": 0.346, + "step": 11396 + }, + { + "epoch": 1.52, + "grad_norm": 0.484375, + "learning_rate": 0.00015586285421910606, + "loss": 0.3889, + "step": 11397 + }, + { + "epoch": 1.52, + "grad_norm": 0.515625, + "learning_rate": 0.00015585319525421963, + "loss": 0.4407, + "step": 11398 + }, + { + "epoch": 1.52, + "grad_norm": 0.40234375, + "learning_rate": 0.00015584353553192796, + "loss": 0.2965, + "step": 11399 + }, + { + "epoch": 1.52, + "grad_norm": 0.4140625, + "learning_rate": 0.00015583387505236216, + "loss": 0.192, + "step": 11400 + }, + { + "epoch": 1.52, + "grad_norm": 0.70703125, + "learning_rate": 0.00015582421381565316, + "loss": 0.4108, + "step": 11401 + }, + { + "epoch": 1.52, + "grad_norm": 0.423828125, + "learning_rate": 0.000155814551821932, + "loss": 0.347, + "step": 11402 + }, + { + "epoch": 1.52, + "grad_norm": 0.51953125, + "learning_rate": 0.00015580488907132974, + "loss": 0.4305, + "step": 11403 + }, + { + "epoch": 1.52, + "grad_norm": 0.609375, + "learning_rate": 0.00015579522556397732, + "loss": 0.4947, + "step": 11404 + }, + { + "epoch": 1.52, + "grad_norm": 0.486328125, + "learning_rate": 0.00015578556130000582, + "loss": 0.3922, + "step": 11405 + }, + { + "epoch": 1.52, + "grad_norm": 0.435546875, + "learning_rate": 0.00015577589627954633, + "loss": 0.2943, + "step": 11406 + }, + { + "epoch": 1.52, + "grad_norm": 0.37890625, + "learning_rate": 0.00015576623050272986, + "loss": 0.1922, + "step": 11407 + }, + { + "epoch": 1.52, + "grad_norm": 0.6796875, + "learning_rate": 0.00015575656396968755, + "loss": 0.3568, + "step": 11408 + }, + { + "epoch": 1.52, + "grad_norm": 0.8125, + "learning_rate": 0.00015574689668055046, + "loss": 0.3287, + "step": 11409 + }, + { + "epoch": 1.52, + "grad_norm": 0.54296875, + "learning_rate": 0.00015573722863544966, + "loss": 0.3811, + "step": 11410 + }, + { + "epoch": 1.52, + "grad_norm": 0.431640625, + "learning_rate": 0.00015572755983451626, + "loss": 0.3968, + "step": 11411 + }, + { + "epoch": 1.52, + "grad_norm": 0.765625, + "learning_rate": 0.00015571789027788135, + "loss": 0.5899, + "step": 11412 + }, + { + "epoch": 1.52, + "grad_norm": 0.5859375, + "learning_rate": 0.00015570821996567613, + "loss": 0.3632, + "step": 11413 + }, + { + "epoch": 1.52, + "grad_norm": 0.67578125, + "learning_rate": 0.0001556985488980317, + "loss": 0.683, + "step": 11414 + }, + { + "epoch": 1.52, + "grad_norm": 0.421875, + "learning_rate": 0.0001556888770750792, + "loss": 0.4223, + "step": 11415 + }, + { + "epoch": 1.52, + "grad_norm": 0.59375, + "learning_rate": 0.00015567920449694974, + "loss": 0.3547, + "step": 11416 + }, + { + "epoch": 1.52, + "grad_norm": 0.5703125, + "learning_rate": 0.00015566953116377457, + "loss": 0.5792, + "step": 11417 + }, + { + "epoch": 1.52, + "grad_norm": 0.6015625, + "learning_rate": 0.00015565985707568483, + "loss": 0.8442, + "step": 11418 + }, + { + "epoch": 1.52, + "grad_norm": 0.416015625, + "learning_rate": 0.0001556501822328117, + "loss": 0.2623, + "step": 11419 + }, + { + "epoch": 1.52, + "grad_norm": 0.44140625, + "learning_rate": 0.00015564050663528634, + "loss": 0.1701, + "step": 11420 + }, + { + "epoch": 1.52, + "grad_norm": 0.62109375, + "learning_rate": 0.00015563083028324005, + "loss": 0.6816, + "step": 11421 + }, + { + "epoch": 1.52, + "grad_norm": 0.5625, + "learning_rate": 0.00015562115317680402, + "loss": 0.4041, + "step": 11422 + }, + { + "epoch": 1.52, + "grad_norm": 0.5078125, + "learning_rate": 0.00015561147531610941, + "loss": 0.4125, + "step": 11423 + }, + { + "epoch": 1.52, + "grad_norm": 0.431640625, + "learning_rate": 0.00015560179670128752, + "loss": 0.3235, + "step": 11424 + }, + { + "epoch": 1.52, + "grad_norm": 0.48828125, + "learning_rate": 0.00015559211733246956, + "loss": 0.4734, + "step": 11425 + }, + { + "epoch": 1.52, + "grad_norm": 0.57421875, + "learning_rate": 0.00015558243720978685, + "loss": 0.3409, + "step": 11426 + }, + { + "epoch": 1.52, + "grad_norm": 0.64453125, + "learning_rate": 0.0001555727563333706, + "loss": 0.577, + "step": 11427 + }, + { + "epoch": 1.52, + "grad_norm": 0.404296875, + "learning_rate": 0.0001555630747033521, + "loss": 0.1765, + "step": 11428 + }, + { + "epoch": 1.53, + "grad_norm": 0.5390625, + "learning_rate": 0.00015555339231986267, + "loss": 0.5561, + "step": 11429 + }, + { + "epoch": 1.53, + "grad_norm": 0.6484375, + "learning_rate": 0.00015554370918303354, + "loss": 0.2784, + "step": 11430 + }, + { + "epoch": 1.53, + "grad_norm": 0.54296875, + "learning_rate": 0.0001555340252929961, + "loss": 0.2093, + "step": 11431 + }, + { + "epoch": 1.53, + "grad_norm": 0.51171875, + "learning_rate": 0.00015552434064988163, + "loss": 0.3478, + "step": 11432 + }, + { + "epoch": 1.53, + "grad_norm": 0.515625, + "learning_rate": 0.00015551465525382146, + "loss": 0.3015, + "step": 11433 + }, + { + "epoch": 1.53, + "grad_norm": 0.546875, + "learning_rate": 0.00015550496910494697, + "loss": 0.5646, + "step": 11434 + }, + { + "epoch": 1.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00015549528220338941, + "loss": 0.4808, + "step": 11435 + }, + { + "epoch": 1.53, + "grad_norm": 0.69140625, + "learning_rate": 0.00015548559454928027, + "loss": 0.2309, + "step": 11436 + }, + { + "epoch": 1.53, + "grad_norm": 0.451171875, + "learning_rate": 0.00015547590614275084, + "loss": 0.1843, + "step": 11437 + }, + { + "epoch": 1.53, + "grad_norm": 0.494140625, + "learning_rate": 0.0001554662169839325, + "loss": 0.3106, + "step": 11438 + }, + { + "epoch": 1.53, + "grad_norm": 0.6640625, + "learning_rate": 0.00015545652707295665, + "loss": 0.3971, + "step": 11439 + }, + { + "epoch": 1.53, + "grad_norm": 0.4765625, + "learning_rate": 0.00015544683640995475, + "loss": 0.3542, + "step": 11440 + }, + { + "epoch": 1.53, + "grad_norm": 0.51171875, + "learning_rate": 0.0001554371449950581, + "loss": 0.4249, + "step": 11441 + }, + { + "epoch": 1.53, + "grad_norm": 0.46484375, + "learning_rate": 0.00015542745282839823, + "loss": 0.4844, + "step": 11442 + }, + { + "epoch": 1.53, + "grad_norm": 0.51171875, + "learning_rate": 0.0001554177599101065, + "loss": 0.6115, + "step": 11443 + }, + { + "epoch": 1.53, + "grad_norm": 0.6484375, + "learning_rate": 0.00015540806624031442, + "loss": 0.533, + "step": 11444 + }, + { + "epoch": 1.53, + "grad_norm": 0.484375, + "learning_rate": 0.00015539837181915335, + "loss": 0.4549, + "step": 11445 + }, + { + "epoch": 1.53, + "grad_norm": 0.53515625, + "learning_rate": 0.00015538867664675486, + "loss": 0.4561, + "step": 11446 + }, + { + "epoch": 1.53, + "grad_norm": 0.578125, + "learning_rate": 0.00015537898072325032, + "loss": 0.4338, + "step": 11447 + }, + { + "epoch": 1.53, + "grad_norm": 0.73828125, + "learning_rate": 0.00015536928404877128, + "loss": 0.465, + "step": 11448 + }, + { + "epoch": 1.53, + "grad_norm": 0.62890625, + "learning_rate": 0.00015535958662344919, + "loss": 0.4115, + "step": 11449 + }, + { + "epoch": 1.53, + "grad_norm": 0.4375, + "learning_rate": 0.0001553498884474156, + "loss": 0.2714, + "step": 11450 + }, + { + "epoch": 1.53, + "grad_norm": 0.5625, + "learning_rate": 0.00015534018952080198, + "loss": 0.6449, + "step": 11451 + }, + { + "epoch": 1.53, + "grad_norm": 0.431640625, + "learning_rate": 0.0001553304898437399, + "loss": 0.2748, + "step": 11452 + }, + { + "epoch": 1.53, + "grad_norm": 0.494140625, + "learning_rate": 0.00015532078941636086, + "loss": 0.351, + "step": 11453 + }, + { + "epoch": 1.53, + "grad_norm": 0.52734375, + "learning_rate": 0.0001553110882387964, + "loss": 0.3571, + "step": 11454 + }, + { + "epoch": 1.53, + "grad_norm": 0.5703125, + "learning_rate": 0.00015530138631117808, + "loss": 0.4075, + "step": 11455 + }, + { + "epoch": 1.53, + "grad_norm": 0.55859375, + "learning_rate": 0.00015529168363363747, + "loss": 0.3465, + "step": 11456 + }, + { + "epoch": 1.53, + "grad_norm": 0.56640625, + "learning_rate": 0.00015528198020630613, + "loss": 0.2667, + "step": 11457 + }, + { + "epoch": 1.53, + "grad_norm": 0.51171875, + "learning_rate": 0.00015527227602931572, + "loss": 0.2753, + "step": 11458 + }, + { + "epoch": 1.53, + "grad_norm": 0.58984375, + "learning_rate": 0.0001552625711027977, + "loss": 0.3571, + "step": 11459 + }, + { + "epoch": 1.53, + "grad_norm": 0.4140625, + "learning_rate": 0.00015525286542688379, + "loss": 0.5003, + "step": 11460 + }, + { + "epoch": 1.53, + "grad_norm": 0.5546875, + "learning_rate": 0.00015524315900170555, + "loss": 0.641, + "step": 11461 + }, + { + "epoch": 1.53, + "grad_norm": 0.482421875, + "learning_rate": 0.00015523345182739463, + "loss": 0.2955, + "step": 11462 + }, + { + "epoch": 1.53, + "grad_norm": 0.482421875, + "learning_rate": 0.00015522374390408264, + "loss": 0.1598, + "step": 11463 + }, + { + "epoch": 1.53, + "grad_norm": 0.5546875, + "learning_rate": 0.00015521403523190128, + "loss": 0.613, + "step": 11464 + }, + { + "epoch": 1.53, + "grad_norm": 0.5234375, + "learning_rate": 0.00015520432581098213, + "loss": 0.4893, + "step": 11465 + }, + { + "epoch": 1.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00015519461564145689, + "loss": 0.4055, + "step": 11466 + }, + { + "epoch": 1.53, + "grad_norm": 0.412109375, + "learning_rate": 0.00015518490472345723, + "loss": 0.2679, + "step": 11467 + }, + { + "epoch": 1.53, + "grad_norm": 0.39453125, + "learning_rate": 0.00015517519305711485, + "loss": 0.2955, + "step": 11468 + }, + { + "epoch": 1.53, + "grad_norm": 0.6015625, + "learning_rate": 0.00015516548064256147, + "loss": 0.4611, + "step": 11469 + }, + { + "epoch": 1.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00015515576747992876, + "loss": 0.4602, + "step": 11470 + }, + { + "epoch": 1.53, + "grad_norm": 0.482421875, + "learning_rate": 0.00015514605356934844, + "loss": 0.565, + "step": 11471 + }, + { + "epoch": 1.53, + "grad_norm": 0.54296875, + "learning_rate": 0.00015513633891095223, + "loss": 0.3289, + "step": 11472 + }, + { + "epoch": 1.53, + "grad_norm": 0.486328125, + "learning_rate": 0.00015512662350487187, + "loss": 0.2139, + "step": 11473 + }, + { + "epoch": 1.53, + "grad_norm": 0.490234375, + "learning_rate": 0.0001551169073512391, + "loss": 0.3764, + "step": 11474 + }, + { + "epoch": 1.53, + "grad_norm": 0.5625, + "learning_rate": 0.00015510719045018574, + "loss": 0.2897, + "step": 11475 + }, + { + "epoch": 1.53, + "grad_norm": 0.5234375, + "learning_rate": 0.0001550974728018435, + "loss": 0.5288, + "step": 11476 + }, + { + "epoch": 1.53, + "grad_norm": 0.498046875, + "learning_rate": 0.00015508775440634416, + "loss": 0.3162, + "step": 11477 + }, + { + "epoch": 1.53, + "grad_norm": 0.49609375, + "learning_rate": 0.00015507803526381951, + "loss": 0.3514, + "step": 11478 + }, + { + "epoch": 1.53, + "grad_norm": 0.49609375, + "learning_rate": 0.00015506831537440133, + "loss": 0.4698, + "step": 11479 + }, + { + "epoch": 1.53, + "grad_norm": 0.59375, + "learning_rate": 0.00015505859473822147, + "loss": 0.5777, + "step": 11480 + }, + { + "epoch": 1.53, + "grad_norm": 0.439453125, + "learning_rate": 0.00015504887335541175, + "loss": 0.6188, + "step": 11481 + }, + { + "epoch": 1.53, + "grad_norm": 0.32421875, + "learning_rate": 0.00015503915122610398, + "loss": 0.2702, + "step": 11482 + }, + { + "epoch": 1.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00015502942835042996, + "loss": 0.3926, + "step": 11483 + }, + { + "epoch": 1.53, + "grad_norm": 0.515625, + "learning_rate": 0.0001550197047285216, + "loss": 0.2896, + "step": 11484 + }, + { + "epoch": 1.53, + "grad_norm": 0.427734375, + "learning_rate": 0.00015500998036051074, + "loss": 0.2032, + "step": 11485 + }, + { + "epoch": 1.53, + "grad_norm": 0.57421875, + "learning_rate": 0.0001550002552465292, + "loss": 0.5565, + "step": 11486 + }, + { + "epoch": 1.53, + "grad_norm": 0.498046875, + "learning_rate": 0.00015499052938670893, + "loss": 0.5869, + "step": 11487 + }, + { + "epoch": 1.53, + "grad_norm": 0.45703125, + "learning_rate": 0.0001549808027811818, + "loss": 0.1021, + "step": 11488 + }, + { + "epoch": 1.53, + "grad_norm": 0.498046875, + "learning_rate": 0.00015497107543007967, + "loss": 0.3363, + "step": 11489 + }, + { + "epoch": 1.53, + "grad_norm": 0.376953125, + "learning_rate": 0.0001549613473335345, + "loss": 0.2173, + "step": 11490 + }, + { + "epoch": 1.53, + "grad_norm": 0.412109375, + "learning_rate": 0.00015495161849167816, + "loss": 0.5244, + "step": 11491 + }, + { + "epoch": 1.53, + "grad_norm": 0.63671875, + "learning_rate": 0.00015494188890464262, + "loss": 0.3318, + "step": 11492 + }, + { + "epoch": 1.53, + "grad_norm": 0.55859375, + "learning_rate": 0.00015493215857255983, + "loss": 0.5483, + "step": 11493 + }, + { + "epoch": 1.53, + "grad_norm": 0.470703125, + "learning_rate": 0.00015492242749556167, + "loss": 0.4299, + "step": 11494 + }, + { + "epoch": 1.53, + "grad_norm": 0.451171875, + "learning_rate": 0.00015491269567378016, + "loss": 0.3613, + "step": 11495 + }, + { + "epoch": 1.53, + "grad_norm": 0.59765625, + "learning_rate": 0.00015490296310734727, + "loss": 0.4638, + "step": 11496 + }, + { + "epoch": 1.53, + "grad_norm": 0.48828125, + "learning_rate": 0.00015489322979639496, + "loss": 0.6076, + "step": 11497 + }, + { + "epoch": 1.53, + "grad_norm": 0.6171875, + "learning_rate": 0.0001548834957410552, + "loss": 0.3314, + "step": 11498 + }, + { + "epoch": 1.53, + "grad_norm": 0.453125, + "learning_rate": 0.00015487376094146002, + "loss": 0.3041, + "step": 11499 + }, + { + "epoch": 1.53, + "grad_norm": 0.58203125, + "learning_rate": 0.00015486402539774145, + "loss": 0.3645, + "step": 11500 + }, + { + "epoch": 1.53, + "grad_norm": 0.4609375, + "learning_rate": 0.0001548542891100315, + "loss": 0.2839, + "step": 11501 + }, + { + "epoch": 1.53, + "grad_norm": 0.458984375, + "learning_rate": 0.00015484455207846214, + "loss": 0.4955, + "step": 11502 + }, + { + "epoch": 1.53, + "grad_norm": 0.482421875, + "learning_rate": 0.00015483481430316544, + "loss": 0.3644, + "step": 11503 + }, + { + "epoch": 1.54, + "grad_norm": 0.55859375, + "learning_rate": 0.0001548250757842735, + "loss": 0.4466, + "step": 11504 + }, + { + "epoch": 1.54, + "grad_norm": 0.51953125, + "learning_rate": 0.00015481533652191836, + "loss": 0.5384, + "step": 11505 + }, + { + "epoch": 1.54, + "grad_norm": 0.640625, + "learning_rate": 0.00015480559651623202, + "loss": 0.203, + "step": 11506 + }, + { + "epoch": 1.54, + "grad_norm": 0.57421875, + "learning_rate": 0.00015479585576734669, + "loss": 0.2508, + "step": 11507 + }, + { + "epoch": 1.54, + "grad_norm": 0.6015625, + "learning_rate": 0.00015478611427539434, + "loss": 0.3764, + "step": 11508 + }, + { + "epoch": 1.54, + "grad_norm": 0.48046875, + "learning_rate": 0.0001547763720405071, + "loss": 0.4638, + "step": 11509 + }, + { + "epoch": 1.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00015476662906281714, + "loss": 0.4932, + "step": 11510 + }, + { + "epoch": 1.54, + "grad_norm": 0.5390625, + "learning_rate": 0.00015475688534245653, + "loss": 0.2835, + "step": 11511 + }, + { + "epoch": 1.54, + "grad_norm": 0.890625, + "learning_rate": 0.00015474714087955742, + "loss": 0.3093, + "step": 11512 + }, + { + "epoch": 1.54, + "grad_norm": 0.4765625, + "learning_rate": 0.00015473739567425194, + "loss": 0.3261, + "step": 11513 + }, + { + "epoch": 1.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00015472764972667224, + "loss": 0.4847, + "step": 11514 + }, + { + "epoch": 1.54, + "grad_norm": 0.65234375, + "learning_rate": 0.0001547179030369505, + "loss": 0.8111, + "step": 11515 + }, + { + "epoch": 1.54, + "grad_norm": 0.4765625, + "learning_rate": 0.00015470815560521882, + "loss": 0.2921, + "step": 11516 + }, + { + "epoch": 1.54, + "grad_norm": 0.478515625, + "learning_rate": 0.00015469840743160952, + "loss": 0.5313, + "step": 11517 + }, + { + "epoch": 1.54, + "grad_norm": 0.50390625, + "learning_rate": 0.00015468865851625465, + "loss": 0.4561, + "step": 11518 + }, + { + "epoch": 1.54, + "grad_norm": 0.6015625, + "learning_rate": 0.00015467890885928652, + "loss": 0.4061, + "step": 11519 + }, + { + "epoch": 1.54, + "grad_norm": 0.474609375, + "learning_rate": 0.00015466915846083724, + "loss": 0.5028, + "step": 11520 + }, + { + "epoch": 1.54, + "grad_norm": 0.578125, + "learning_rate": 0.0001546594073210391, + "loss": 0.2686, + "step": 11521 + }, + { + "epoch": 1.54, + "grad_norm": 0.5078125, + "learning_rate": 0.00015464965544002433, + "loss": 0.5568, + "step": 11522 + }, + { + "epoch": 1.54, + "grad_norm": 0.486328125, + "learning_rate": 0.00015463990281792516, + "loss": 0.4916, + "step": 11523 + }, + { + "epoch": 1.54, + "grad_norm": 0.48046875, + "learning_rate": 0.00015463014945487383, + "loss": 0.4257, + "step": 11524 + }, + { + "epoch": 1.54, + "grad_norm": 0.478515625, + "learning_rate": 0.00015462039535100262, + "loss": 0.2903, + "step": 11525 + }, + { + "epoch": 1.54, + "grad_norm": 0.55859375, + "learning_rate": 0.0001546106405064438, + "loss": 0.5705, + "step": 11526 + }, + { + "epoch": 1.54, + "grad_norm": 0.6171875, + "learning_rate": 0.00015460088492132962, + "loss": 0.7577, + "step": 11527 + }, + { + "epoch": 1.54, + "grad_norm": 0.62109375, + "learning_rate": 0.00015459112859579245, + "loss": 0.3272, + "step": 11528 + }, + { + "epoch": 1.54, + "grad_norm": 0.5390625, + "learning_rate": 0.00015458137152996447, + "loss": 0.4573, + "step": 11529 + }, + { + "epoch": 1.54, + "grad_norm": 0.4453125, + "learning_rate": 0.0001545716137239781, + "loss": 0.3426, + "step": 11530 + }, + { + "epoch": 1.54, + "grad_norm": 0.41015625, + "learning_rate": 0.00015456185517796563, + "loss": 0.1757, + "step": 11531 + }, + { + "epoch": 1.54, + "grad_norm": 0.62109375, + "learning_rate": 0.0001545520958920594, + "loss": 0.4947, + "step": 11532 + }, + { + "epoch": 1.54, + "grad_norm": 0.60546875, + "learning_rate": 0.0001545423358663917, + "loss": 0.3735, + "step": 11533 + }, + { + "epoch": 1.54, + "grad_norm": 0.609375, + "learning_rate": 0.00015453257510109492, + "loss": 0.4041, + "step": 11534 + }, + { + "epoch": 1.54, + "grad_norm": 0.62890625, + "learning_rate": 0.00015452281359630145, + "loss": 0.3143, + "step": 11535 + }, + { + "epoch": 1.54, + "grad_norm": 0.67578125, + "learning_rate": 0.0001545130513521436, + "loss": 0.2219, + "step": 11536 + }, + { + "epoch": 1.54, + "grad_norm": 0.5859375, + "learning_rate": 0.00015450328836875382, + "loss": 0.5055, + "step": 11537 + }, + { + "epoch": 1.54, + "grad_norm": 0.494140625, + "learning_rate": 0.0001544935246462645, + "loss": 0.3663, + "step": 11538 + }, + { + "epoch": 1.54, + "grad_norm": 0.5703125, + "learning_rate": 0.00015448376018480797, + "loss": 0.2791, + "step": 11539 + }, + { + "epoch": 1.54, + "grad_norm": 0.6953125, + "learning_rate": 0.00015447399498451668, + "loss": 0.4607, + "step": 11540 + }, + { + "epoch": 1.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00015446422904552307, + "loss": 0.2409, + "step": 11541 + }, + { + "epoch": 1.54, + "grad_norm": 0.59765625, + "learning_rate": 0.00015445446236795953, + "loss": 0.298, + "step": 11542 + }, + { + "epoch": 1.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00015444469495195856, + "loss": 0.3266, + "step": 11543 + }, + { + "epoch": 1.54, + "grad_norm": 0.51953125, + "learning_rate": 0.0001544349267976526, + "loss": 0.6634, + "step": 11544 + }, + { + "epoch": 1.54, + "grad_norm": 0.474609375, + "learning_rate": 0.00015442515790517407, + "loss": 0.4236, + "step": 11545 + }, + { + "epoch": 1.54, + "grad_norm": 0.7265625, + "learning_rate": 0.00015441538827465546, + "loss": 0.3396, + "step": 11546 + }, + { + "epoch": 1.54, + "grad_norm": 0.55859375, + "learning_rate": 0.00015440561790622929, + "loss": 0.2973, + "step": 11547 + }, + { + "epoch": 1.54, + "grad_norm": 0.6484375, + "learning_rate": 0.00015439584680002798, + "loss": 0.3278, + "step": 11548 + }, + { + "epoch": 1.54, + "grad_norm": 0.55078125, + "learning_rate": 0.00015438607495618414, + "loss": 0.6131, + "step": 11549 + }, + { + "epoch": 1.54, + "grad_norm": 0.416015625, + "learning_rate": 0.00015437630237483019, + "loss": 0.2973, + "step": 11550 + }, + { + "epoch": 1.54, + "grad_norm": 0.67578125, + "learning_rate": 0.00015436652905609867, + "loss": 0.261, + "step": 11551 + }, + { + "epoch": 1.54, + "grad_norm": 0.6171875, + "learning_rate": 0.00015435675500012212, + "loss": 0.6305, + "step": 11552 + }, + { + "epoch": 1.54, + "grad_norm": 0.65625, + "learning_rate": 0.0001543469802070331, + "loss": 0.3365, + "step": 11553 + }, + { + "epoch": 1.54, + "grad_norm": 0.490234375, + "learning_rate": 0.00015433720467696412, + "loss": 0.3628, + "step": 11554 + }, + { + "epoch": 1.54, + "grad_norm": 0.515625, + "learning_rate": 0.00015432742841004784, + "loss": 0.2258, + "step": 11555 + }, + { + "epoch": 1.54, + "grad_norm": 0.73046875, + "learning_rate": 0.00015431765140641672, + "loss": 0.4396, + "step": 11556 + }, + { + "epoch": 1.54, + "grad_norm": 0.59765625, + "learning_rate": 0.00015430787366620336, + "loss": 0.3678, + "step": 11557 + }, + { + "epoch": 1.54, + "grad_norm": 0.396484375, + "learning_rate": 0.00015429809518954042, + "loss": 0.2108, + "step": 11558 + }, + { + "epoch": 1.54, + "grad_norm": 0.470703125, + "learning_rate": 0.00015428831597656044, + "loss": 0.3015, + "step": 11559 + }, + { + "epoch": 1.54, + "grad_norm": 0.68359375, + "learning_rate": 0.00015427853602739607, + "loss": 0.4832, + "step": 11560 + }, + { + "epoch": 1.54, + "grad_norm": 0.4765625, + "learning_rate": 0.0001542687553421799, + "loss": 0.6324, + "step": 11561 + }, + { + "epoch": 1.54, + "grad_norm": 0.59765625, + "learning_rate": 0.0001542589739210446, + "loss": 0.5225, + "step": 11562 + }, + { + "epoch": 1.54, + "grad_norm": 0.53125, + "learning_rate": 0.00015424919176412275, + "loss": 0.289, + "step": 11563 + }, + { + "epoch": 1.54, + "grad_norm": 0.5234375, + "learning_rate": 0.00015423940887154708, + "loss": 0.4486, + "step": 11564 + }, + { + "epoch": 1.54, + "grad_norm": 0.455078125, + "learning_rate": 0.0001542296252434502, + "loss": 0.1977, + "step": 11565 + }, + { + "epoch": 1.54, + "grad_norm": 0.55859375, + "learning_rate": 0.0001542198408799648, + "loss": 0.2984, + "step": 11566 + }, + { + "epoch": 1.54, + "grad_norm": 0.46875, + "learning_rate": 0.00015421005578122356, + "loss": 0.4348, + "step": 11567 + }, + { + "epoch": 1.54, + "grad_norm": 0.5546875, + "learning_rate": 0.00015420026994735918, + "loss": 0.6935, + "step": 11568 + }, + { + "epoch": 1.54, + "grad_norm": 0.73828125, + "learning_rate": 0.00015419048337850434, + "loss": 0.4469, + "step": 11569 + }, + { + "epoch": 1.54, + "grad_norm": 0.5625, + "learning_rate": 0.00015418069607479176, + "loss": 0.2876, + "step": 11570 + }, + { + "epoch": 1.54, + "grad_norm": 0.5859375, + "learning_rate": 0.00015417090803635417, + "loss": 0.5815, + "step": 11571 + }, + { + "epoch": 1.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00015416111926332433, + "loss": 0.2957, + "step": 11572 + }, + { + "epoch": 1.54, + "grad_norm": 0.5, + "learning_rate": 0.00015415132975583494, + "loss": 0.2329, + "step": 11573 + }, + { + "epoch": 1.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00015414153951401875, + "loss": 0.2893, + "step": 11574 + }, + { + "epoch": 1.54, + "grad_norm": 0.6484375, + "learning_rate": 0.00015413174853800855, + "loss": 0.2785, + "step": 11575 + }, + { + "epoch": 1.54, + "grad_norm": 0.51171875, + "learning_rate": 0.00015412195682793707, + "loss": 0.4372, + "step": 11576 + }, + { + "epoch": 1.54, + "grad_norm": 0.4921875, + "learning_rate": 0.00015411216438393713, + "loss": 0.2666, + "step": 11577 + }, + { + "epoch": 1.54, + "grad_norm": 0.57421875, + "learning_rate": 0.00015410237120614152, + "loss": 0.4697, + "step": 11578 + }, + { + "epoch": 1.55, + "grad_norm": 0.73828125, + "learning_rate": 0.00015409257729468307, + "loss": 0.2628, + "step": 11579 + }, + { + "epoch": 1.55, + "grad_norm": 0.5390625, + "learning_rate": 0.0001540827826496945, + "loss": 0.2279, + "step": 11580 + }, + { + "epoch": 1.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00015407298727130873, + "loss": 0.5747, + "step": 11581 + }, + { + "epoch": 1.55, + "grad_norm": 0.435546875, + "learning_rate": 0.00015406319115965852, + "loss": 0.2227, + "step": 11582 + }, + { + "epoch": 1.55, + "grad_norm": 0.494140625, + "learning_rate": 0.00015405339431487675, + "loss": 0.3169, + "step": 11583 + }, + { + "epoch": 1.55, + "grad_norm": 1.2109375, + "learning_rate": 0.00015404359673709626, + "loss": 0.4149, + "step": 11584 + }, + { + "epoch": 1.55, + "grad_norm": 0.5078125, + "learning_rate": 0.0001540337984264499, + "loss": 0.385, + "step": 11585 + }, + { + "epoch": 1.55, + "grad_norm": 0.423828125, + "learning_rate": 0.0001540239993830706, + "loss": 0.193, + "step": 11586 + }, + { + "epoch": 1.55, + "grad_norm": 0.431640625, + "learning_rate": 0.00015401419960709114, + "loss": 0.2887, + "step": 11587 + }, + { + "epoch": 1.55, + "grad_norm": 0.421875, + "learning_rate": 0.0001540043990986445, + "loss": 0.2219, + "step": 11588 + }, + { + "epoch": 1.55, + "grad_norm": 0.44921875, + "learning_rate": 0.00015399459785786355, + "loss": 0.3139, + "step": 11589 + }, + { + "epoch": 1.55, + "grad_norm": 0.55859375, + "learning_rate": 0.00015398479588488116, + "loss": 0.4606, + "step": 11590 + }, + { + "epoch": 1.55, + "grad_norm": 0.5390625, + "learning_rate": 0.00015397499317983034, + "loss": 0.4276, + "step": 11591 + }, + { + "epoch": 1.55, + "grad_norm": 0.4609375, + "learning_rate": 0.00015396518974284396, + "loss": 0.3648, + "step": 11592 + }, + { + "epoch": 1.55, + "grad_norm": 0.67578125, + "learning_rate": 0.00015395538557405494, + "loss": 0.4424, + "step": 11593 + }, + { + "epoch": 1.55, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001539455806735963, + "loss": 0.0801, + "step": 11594 + }, + { + "epoch": 1.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00015393577504160095, + "loss": 0.6036, + "step": 11595 + }, + { + "epoch": 1.55, + "grad_norm": 0.62890625, + "learning_rate": 0.00015392596867820187, + "loss": 0.5948, + "step": 11596 + }, + { + "epoch": 1.55, + "grad_norm": 0.71875, + "learning_rate": 0.00015391616158353207, + "loss": 0.2213, + "step": 11597 + }, + { + "epoch": 1.55, + "grad_norm": 0.494140625, + "learning_rate": 0.0001539063537577245, + "loss": 0.6008, + "step": 11598 + }, + { + "epoch": 1.55, + "grad_norm": 0.703125, + "learning_rate": 0.00015389654520091216, + "loss": 0.3802, + "step": 11599 + }, + { + "epoch": 1.55, + "grad_norm": 0.37890625, + "learning_rate": 0.00015388673591322808, + "loss": 0.2149, + "step": 11600 + }, + { + "epoch": 1.55, + "grad_norm": 0.7265625, + "learning_rate": 0.0001538769258948053, + "loss": 0.4361, + "step": 11601 + }, + { + "epoch": 1.55, + "grad_norm": 0.6953125, + "learning_rate": 0.0001538671151457768, + "loss": 0.3786, + "step": 11602 + }, + { + "epoch": 1.55, + "grad_norm": 0.5078125, + "learning_rate": 0.0001538573036662757, + "loss": 0.4063, + "step": 11603 + }, + { + "epoch": 1.55, + "grad_norm": 0.392578125, + "learning_rate": 0.00015384749145643496, + "loss": 0.1741, + "step": 11604 + }, + { + "epoch": 1.55, + "grad_norm": 0.50390625, + "learning_rate": 0.00015383767851638765, + "loss": 0.4526, + "step": 11605 + }, + { + "epoch": 1.55, + "grad_norm": 0.50390625, + "learning_rate": 0.0001538278648462669, + "loss": 0.518, + "step": 11606 + }, + { + "epoch": 1.55, + "grad_norm": 0.6484375, + "learning_rate": 0.0001538180504462057, + "loss": 0.3305, + "step": 11607 + }, + { + "epoch": 1.55, + "grad_norm": 0.5234375, + "learning_rate": 0.00015380823531633729, + "loss": 0.4011, + "step": 11608 + }, + { + "epoch": 1.55, + "grad_norm": 0.48046875, + "learning_rate": 0.0001537984194567946, + "loss": 0.2811, + "step": 11609 + }, + { + "epoch": 1.55, + "grad_norm": 0.60546875, + "learning_rate": 0.00015378860286771086, + "loss": 0.4928, + "step": 11610 + }, + { + "epoch": 1.55, + "grad_norm": 0.53125, + "learning_rate": 0.00015377878554921913, + "loss": 0.3085, + "step": 11611 + }, + { + "epoch": 1.55, + "grad_norm": 0.6015625, + "learning_rate": 0.00015376896750145255, + "loss": 0.5626, + "step": 11612 + }, + { + "epoch": 1.55, + "grad_norm": 0.42578125, + "learning_rate": 0.00015375914872454427, + "loss": 0.3299, + "step": 11613 + }, + { + "epoch": 1.55, + "grad_norm": 0.5, + "learning_rate": 0.00015374932921862738, + "loss": 0.192, + "step": 11614 + }, + { + "epoch": 1.55, + "grad_norm": 0.671875, + "learning_rate": 0.00015373950898383515, + "loss": 0.4947, + "step": 11615 + }, + { + "epoch": 1.55, + "grad_norm": 0.458984375, + "learning_rate": 0.0001537296880203007, + "loss": 0.3496, + "step": 11616 + }, + { + "epoch": 1.55, + "grad_norm": 0.67578125, + "learning_rate": 0.00015371986632815716, + "loss": 0.4438, + "step": 11617 + }, + { + "epoch": 1.55, + "grad_norm": 0.77734375, + "learning_rate": 0.00015371004390753775, + "loss": 0.2539, + "step": 11618 + }, + { + "epoch": 1.55, + "grad_norm": 0.5625, + "learning_rate": 0.0001537002207585757, + "loss": 0.4521, + "step": 11619 + }, + { + "epoch": 1.55, + "grad_norm": 0.302734375, + "learning_rate": 0.00015369039688140415, + "loss": 0.1387, + "step": 11620 + }, + { + "epoch": 1.55, + "grad_norm": 0.51953125, + "learning_rate": 0.0001536805722761564, + "loss": 0.6023, + "step": 11621 + }, + { + "epoch": 1.55, + "grad_norm": 0.53125, + "learning_rate": 0.00015367074694296563, + "loss": 0.2888, + "step": 11622 + }, + { + "epoch": 1.55, + "grad_norm": 0.51953125, + "learning_rate": 0.0001536609208819651, + "loss": 0.2554, + "step": 11623 + }, + { + "epoch": 1.55, + "grad_norm": 0.81640625, + "learning_rate": 0.00015365109409328802, + "loss": 0.4574, + "step": 11624 + }, + { + "epoch": 1.55, + "grad_norm": 0.48046875, + "learning_rate": 0.00015364126657706767, + "loss": 0.3472, + "step": 11625 + }, + { + "epoch": 1.55, + "grad_norm": 0.60546875, + "learning_rate": 0.0001536314383334373, + "loss": 0.4775, + "step": 11626 + }, + { + "epoch": 1.55, + "grad_norm": 0.57421875, + "learning_rate": 0.00015362160936253024, + "loss": 0.4561, + "step": 11627 + }, + { + "epoch": 1.55, + "grad_norm": 0.52734375, + "learning_rate": 0.00015361177966447974, + "loss": 0.3738, + "step": 11628 + }, + { + "epoch": 1.55, + "grad_norm": 0.73046875, + "learning_rate": 0.00015360194923941912, + "loss": 0.1988, + "step": 11629 + }, + { + "epoch": 1.55, + "grad_norm": 0.39453125, + "learning_rate": 0.0001535921180874816, + "loss": 0.2384, + "step": 11630 + }, + { + "epoch": 1.55, + "grad_norm": 0.640625, + "learning_rate": 0.00015358228620880065, + "loss": 0.3572, + "step": 11631 + }, + { + "epoch": 1.55, + "grad_norm": 0.69140625, + "learning_rate": 0.00015357245360350942, + "loss": 0.4849, + "step": 11632 + }, + { + "epoch": 1.55, + "grad_norm": 0.5078125, + "learning_rate": 0.00015356262027174143, + "loss": 0.3771, + "step": 11633 + }, + { + "epoch": 1.55, + "grad_norm": 0.466796875, + "learning_rate": 0.00015355278621362984, + "loss": 0.5248, + "step": 11634 + }, + { + "epoch": 1.55, + "grad_norm": 0.625, + "learning_rate": 0.00015354295142930817, + "loss": 0.3452, + "step": 11635 + }, + { + "epoch": 1.55, + "grad_norm": 0.5546875, + "learning_rate": 0.0001535331159189097, + "loss": 0.5738, + "step": 11636 + }, + { + "epoch": 1.55, + "grad_norm": 0.46875, + "learning_rate": 0.00015352327968256777, + "loss": 0.4606, + "step": 11637 + }, + { + "epoch": 1.55, + "grad_norm": 0.404296875, + "learning_rate": 0.0001535134427204159, + "loss": 0.1243, + "step": 11638 + }, + { + "epoch": 1.55, + "grad_norm": 0.5, + "learning_rate": 0.00015350360503258733, + "loss": 0.2287, + "step": 11639 + }, + { + "epoch": 1.55, + "grad_norm": 0.53125, + "learning_rate": 0.0001534937666192156, + "loss": 0.4511, + "step": 11640 + }, + { + "epoch": 1.55, + "grad_norm": 0.73828125, + "learning_rate": 0.000153483927480434, + "loss": 0.6099, + "step": 11641 + }, + { + "epoch": 1.55, + "grad_norm": 0.4921875, + "learning_rate": 0.00015347408761637603, + "loss": 0.6571, + "step": 11642 + }, + { + "epoch": 1.55, + "grad_norm": 0.4140625, + "learning_rate": 0.00015346424702717513, + "loss": 0.1983, + "step": 11643 + }, + { + "epoch": 1.55, + "grad_norm": 0.6953125, + "learning_rate": 0.00015345440571296471, + "loss": 0.7562, + "step": 11644 + }, + { + "epoch": 1.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00015344456367387827, + "loss": 0.3594, + "step": 11645 + }, + { + "epoch": 1.55, + "grad_norm": 0.48046875, + "learning_rate": 0.00015343472091004925, + "loss": 0.6121, + "step": 11646 + }, + { + "epoch": 1.55, + "grad_norm": 0.5859375, + "learning_rate": 0.0001534248774216111, + "loss": 0.4616, + "step": 11647 + }, + { + "epoch": 1.55, + "grad_norm": 0.5078125, + "learning_rate": 0.00015341503320869733, + "loss": 0.3743, + "step": 11648 + }, + { + "epoch": 1.55, + "grad_norm": 0.412109375, + "learning_rate": 0.00015340518827144145, + "loss": 0.3703, + "step": 11649 + }, + { + "epoch": 1.55, + "grad_norm": 0.431640625, + "learning_rate": 0.00015339534260997687, + "loss": 0.1611, + "step": 11650 + }, + { + "epoch": 1.55, + "grad_norm": 0.6171875, + "learning_rate": 0.00015338549622443724, + "loss": 0.6047, + "step": 11651 + }, + { + "epoch": 1.55, + "grad_norm": 0.5390625, + "learning_rate": 0.00015337564911495601, + "loss": 0.364, + "step": 11652 + }, + { + "epoch": 1.55, + "grad_norm": 0.51953125, + "learning_rate": 0.00015336580128166668, + "loss": 0.3556, + "step": 11653 + }, + { + "epoch": 1.56, + "grad_norm": 0.66796875, + "learning_rate": 0.00015335595272470288, + "loss": 0.3903, + "step": 11654 + }, + { + "epoch": 1.56, + "grad_norm": 0.5546875, + "learning_rate": 0.00015334610344419812, + "loss": 0.3812, + "step": 11655 + }, + { + "epoch": 1.56, + "grad_norm": 0.52734375, + "learning_rate": 0.00015333625344028594, + "loss": 0.3901, + "step": 11656 + }, + { + "epoch": 1.56, + "grad_norm": 0.6015625, + "learning_rate": 0.00015332640271309992, + "loss": 0.2999, + "step": 11657 + }, + { + "epoch": 1.56, + "grad_norm": 0.447265625, + "learning_rate": 0.00015331655126277368, + "loss": 0.3445, + "step": 11658 + }, + { + "epoch": 1.56, + "grad_norm": 0.515625, + "learning_rate": 0.00015330669908944077, + "loss": 0.5016, + "step": 11659 + }, + { + "epoch": 1.56, + "grad_norm": 0.546875, + "learning_rate": 0.00015329684619323484, + "loss": 0.4464, + "step": 11660 + }, + { + "epoch": 1.56, + "grad_norm": 0.3671875, + "learning_rate": 0.00015328699257428943, + "loss": 0.167, + "step": 11661 + }, + { + "epoch": 1.56, + "grad_norm": 0.65234375, + "learning_rate": 0.00015327713823273822, + "loss": 0.6571, + "step": 11662 + }, + { + "epoch": 1.56, + "grad_norm": 0.51953125, + "learning_rate": 0.00015326728316871483, + "loss": 0.6145, + "step": 11663 + }, + { + "epoch": 1.56, + "grad_norm": 0.35546875, + "learning_rate": 0.00015325742738235288, + "loss": 0.1797, + "step": 11664 + }, + { + "epoch": 1.56, + "grad_norm": 0.65234375, + "learning_rate": 0.00015324757087378606, + "loss": 0.1671, + "step": 11665 + }, + { + "epoch": 1.56, + "grad_norm": 0.4453125, + "learning_rate": 0.00015323771364314796, + "loss": 0.2775, + "step": 11666 + }, + { + "epoch": 1.56, + "grad_norm": 0.48828125, + "learning_rate": 0.00015322785569057233, + "loss": 0.3496, + "step": 11667 + }, + { + "epoch": 1.56, + "grad_norm": 0.466796875, + "learning_rate": 0.00015321799701619278, + "loss": 0.3747, + "step": 11668 + }, + { + "epoch": 1.56, + "grad_norm": 0.54296875, + "learning_rate": 0.00015320813762014307, + "loss": 0.2377, + "step": 11669 + }, + { + "epoch": 1.56, + "grad_norm": 0.365234375, + "learning_rate": 0.00015319827750255686, + "loss": 0.1952, + "step": 11670 + }, + { + "epoch": 1.56, + "grad_norm": 0.55859375, + "learning_rate": 0.00015318841666356784, + "loss": 0.5578, + "step": 11671 + }, + { + "epoch": 1.56, + "grad_norm": 0.451171875, + "learning_rate": 0.0001531785551033098, + "loss": 0.371, + "step": 11672 + }, + { + "epoch": 1.56, + "grad_norm": 0.4765625, + "learning_rate": 0.0001531686928219164, + "loss": 0.293, + "step": 11673 + }, + { + "epoch": 1.56, + "grad_norm": 0.48046875, + "learning_rate": 0.0001531588298195214, + "loss": 0.3151, + "step": 11674 + }, + { + "epoch": 1.56, + "grad_norm": 0.4296875, + "learning_rate": 0.00015314896609625853, + "loss": 0.4288, + "step": 11675 + }, + { + "epoch": 1.56, + "grad_norm": 0.76171875, + "learning_rate": 0.00015313910165226159, + "loss": 0.3015, + "step": 11676 + }, + { + "epoch": 1.56, + "grad_norm": 1.015625, + "learning_rate": 0.00015312923648766433, + "loss": 0.3544, + "step": 11677 + }, + { + "epoch": 1.56, + "grad_norm": 0.416015625, + "learning_rate": 0.0001531193706026005, + "loss": 0.2014, + "step": 11678 + }, + { + "epoch": 1.56, + "grad_norm": 0.5859375, + "learning_rate": 0.00015310950399720396, + "loss": 0.5509, + "step": 11679 + }, + { + "epoch": 1.56, + "grad_norm": 0.451171875, + "learning_rate": 0.0001530996366716084, + "loss": 0.4593, + "step": 11680 + }, + { + "epoch": 1.56, + "grad_norm": 0.490234375, + "learning_rate": 0.00015308976862594772, + "loss": 0.3689, + "step": 11681 + }, + { + "epoch": 1.56, + "grad_norm": 0.39453125, + "learning_rate": 0.00015307989986035572, + "loss": 0.3521, + "step": 11682 + }, + { + "epoch": 1.56, + "grad_norm": 0.447265625, + "learning_rate": 0.00015307003037496618, + "loss": 0.4412, + "step": 11683 + }, + { + "epoch": 1.56, + "grad_norm": 0.5078125, + "learning_rate": 0.000153060160169913, + "loss": 0.3504, + "step": 11684 + }, + { + "epoch": 1.56, + "grad_norm": 0.59765625, + "learning_rate": 0.00015305028924532995, + "loss": 0.4052, + "step": 11685 + }, + { + "epoch": 1.56, + "grad_norm": 0.345703125, + "learning_rate": 0.000153040417601351, + "loss": 0.1488, + "step": 11686 + }, + { + "epoch": 1.56, + "grad_norm": 0.45703125, + "learning_rate": 0.00015303054523810986, + "loss": 0.315, + "step": 11687 + }, + { + "epoch": 1.56, + "grad_norm": 0.6015625, + "learning_rate": 0.00015302067215574057, + "loss": 0.4193, + "step": 11688 + }, + { + "epoch": 1.56, + "grad_norm": 0.63671875, + "learning_rate": 0.0001530107983543769, + "loss": 0.4643, + "step": 11689 + }, + { + "epoch": 1.56, + "grad_norm": 0.484375, + "learning_rate": 0.00015300092383415282, + "loss": 0.4574, + "step": 11690 + }, + { + "epoch": 1.56, + "grad_norm": 0.6015625, + "learning_rate": 0.00015299104859520213, + "loss": 0.2387, + "step": 11691 + }, + { + "epoch": 1.56, + "grad_norm": 0.404296875, + "learning_rate": 0.00015298117263765884, + "loss": 0.3583, + "step": 11692 + }, + { + "epoch": 1.56, + "grad_norm": 0.57421875, + "learning_rate": 0.00015297129596165686, + "loss": 0.2253, + "step": 11693 + }, + { + "epoch": 1.56, + "grad_norm": 0.373046875, + "learning_rate": 0.0001529614185673301, + "loss": 0.2726, + "step": 11694 + }, + { + "epoch": 1.56, + "grad_norm": 1.046875, + "learning_rate": 0.00015295154045481252, + "loss": 0.2529, + "step": 11695 + }, + { + "epoch": 1.56, + "grad_norm": 0.365234375, + "learning_rate": 0.00015294166162423806, + "loss": 0.4242, + "step": 11696 + }, + { + "epoch": 1.56, + "grad_norm": 0.419921875, + "learning_rate": 0.0001529317820757407, + "loss": 0.2194, + "step": 11697 + }, + { + "epoch": 1.56, + "grad_norm": 0.53515625, + "learning_rate": 0.00015292190180945438, + "loss": 0.2532, + "step": 11698 + }, + { + "epoch": 1.56, + "grad_norm": 0.671875, + "learning_rate": 0.00015291202082551314, + "loss": 0.4123, + "step": 11699 + }, + { + "epoch": 1.56, + "grad_norm": 0.5625, + "learning_rate": 0.0001529021391240509, + "loss": 0.5877, + "step": 11700 + }, + { + "epoch": 1.56, + "grad_norm": 0.353515625, + "learning_rate": 0.00015289225670520176, + "loss": 0.3041, + "step": 11701 + }, + { + "epoch": 1.56, + "grad_norm": 0.56640625, + "learning_rate": 0.00015288237356909964, + "loss": 0.455, + "step": 11702 + }, + { + "epoch": 1.56, + "grad_norm": 0.515625, + "learning_rate": 0.00015287248971587855, + "loss": 0.7259, + "step": 11703 + }, + { + "epoch": 1.56, + "grad_norm": 0.419921875, + "learning_rate": 0.00015286260514567261, + "loss": 0.3272, + "step": 11704 + }, + { + "epoch": 1.56, + "grad_norm": 0.51953125, + "learning_rate": 0.0001528527198586158, + "loss": 0.4827, + "step": 11705 + }, + { + "epoch": 1.56, + "grad_norm": 0.51171875, + "learning_rate": 0.00015284283385484223, + "loss": 0.3883, + "step": 11706 + }, + { + "epoch": 1.56, + "grad_norm": 0.46484375, + "learning_rate": 0.0001528329471344859, + "loss": 0.3698, + "step": 11707 + }, + { + "epoch": 1.56, + "grad_norm": 0.5703125, + "learning_rate": 0.00015282305969768086, + "loss": 0.2186, + "step": 11708 + }, + { + "epoch": 1.56, + "grad_norm": 0.44921875, + "learning_rate": 0.0001528131715445613, + "loss": 0.2579, + "step": 11709 + }, + { + "epoch": 1.56, + "grad_norm": 0.51171875, + "learning_rate": 0.00015280328267526118, + "loss": 0.3645, + "step": 11710 + }, + { + "epoch": 1.56, + "grad_norm": 0.515625, + "learning_rate": 0.00015279339308991468, + "loss": 0.2488, + "step": 11711 + }, + { + "epoch": 1.56, + "grad_norm": 0.55078125, + "learning_rate": 0.00015278350278865585, + "loss": 0.2271, + "step": 11712 + }, + { + "epoch": 1.56, + "grad_norm": 0.6015625, + "learning_rate": 0.00015277361177161894, + "loss": 0.5308, + "step": 11713 + }, + { + "epoch": 1.56, + "grad_norm": 0.53515625, + "learning_rate": 0.00015276372003893788, + "loss": 0.6352, + "step": 11714 + }, + { + "epoch": 1.56, + "grad_norm": 0.578125, + "learning_rate": 0.00015275382759074696, + "loss": 0.4655, + "step": 11715 + }, + { + "epoch": 1.56, + "grad_norm": 0.47265625, + "learning_rate": 0.0001527439344271803, + "loss": 0.1897, + "step": 11716 + }, + { + "epoch": 1.56, + "grad_norm": 0.36328125, + "learning_rate": 0.00015273404054837197, + "loss": 0.2939, + "step": 11717 + }, + { + "epoch": 1.56, + "grad_norm": 0.66796875, + "learning_rate": 0.00015272414595445625, + "loss": 0.7986, + "step": 11718 + }, + { + "epoch": 1.56, + "grad_norm": 0.53515625, + "learning_rate": 0.0001527142506455673, + "loss": 0.4627, + "step": 11719 + }, + { + "epoch": 1.56, + "grad_norm": 0.5390625, + "learning_rate": 0.00015270435462183928, + "loss": 0.3429, + "step": 11720 + }, + { + "epoch": 1.56, + "grad_norm": 0.50390625, + "learning_rate": 0.00015269445788340633, + "loss": 0.4629, + "step": 11721 + }, + { + "epoch": 1.56, + "grad_norm": 0.78515625, + "learning_rate": 0.00015268456043040276, + "loss": 0.5939, + "step": 11722 + }, + { + "epoch": 1.56, + "grad_norm": 0.498046875, + "learning_rate": 0.00015267466226296267, + "loss": 0.5138, + "step": 11723 + }, + { + "epoch": 1.56, + "grad_norm": 0.77734375, + "learning_rate": 0.00015266476338122043, + "loss": 0.4825, + "step": 11724 + }, + { + "epoch": 1.56, + "grad_norm": 0.71484375, + "learning_rate": 0.00015265486378531018, + "loss": 0.274, + "step": 11725 + }, + { + "epoch": 1.56, + "grad_norm": 0.421875, + "learning_rate": 0.00015264496347536618, + "loss": 0.2087, + "step": 11726 + }, + { + "epoch": 1.56, + "grad_norm": 0.396484375, + "learning_rate": 0.00015263506245152266, + "loss": 0.192, + "step": 11727 + }, + { + "epoch": 1.56, + "grad_norm": 0.59375, + "learning_rate": 0.00015262516071391392, + "loss": 0.4686, + "step": 11728 + }, + { + "epoch": 1.57, + "grad_norm": 0.40234375, + "learning_rate": 0.00015261525826267426, + "loss": 0.4128, + "step": 11729 + }, + { + "epoch": 1.57, + "grad_norm": 0.50390625, + "learning_rate": 0.0001526053550979379, + "loss": 0.4966, + "step": 11730 + }, + { + "epoch": 1.57, + "grad_norm": 0.765625, + "learning_rate": 0.0001525954512198392, + "loss": 0.5397, + "step": 11731 + }, + { + "epoch": 1.57, + "grad_norm": 0.53515625, + "learning_rate": 0.0001525855466285124, + "loss": 0.5296, + "step": 11732 + }, + { + "epoch": 1.57, + "grad_norm": 0.40625, + "learning_rate": 0.00015257564132409182, + "loss": 0.2168, + "step": 11733 + }, + { + "epoch": 1.57, + "grad_norm": 0.41015625, + "learning_rate": 0.0001525657353067118, + "loss": 0.332, + "step": 11734 + }, + { + "epoch": 1.57, + "grad_norm": 0.5078125, + "learning_rate": 0.0001525558285765067, + "loss": 0.5551, + "step": 11735 + }, + { + "epoch": 1.57, + "grad_norm": 0.400390625, + "learning_rate": 0.00015254592113361084, + "loss": 0.26, + "step": 11736 + }, + { + "epoch": 1.57, + "grad_norm": 0.47265625, + "learning_rate": 0.00015253601297815855, + "loss": 0.367, + "step": 11737 + }, + { + "epoch": 1.57, + "grad_norm": 0.421875, + "learning_rate": 0.00015252610411028422, + "loss": 0.3961, + "step": 11738 + }, + { + "epoch": 1.57, + "grad_norm": 0.484375, + "learning_rate": 0.00015251619453012216, + "loss": 0.3114, + "step": 11739 + }, + { + "epoch": 1.57, + "grad_norm": 0.6484375, + "learning_rate": 0.00015250628423780683, + "loss": 0.354, + "step": 11740 + }, + { + "epoch": 1.57, + "grad_norm": 0.50390625, + "learning_rate": 0.0001524963732334726, + "loss": 0.5538, + "step": 11741 + }, + { + "epoch": 1.57, + "grad_norm": 0.388671875, + "learning_rate": 0.00015248646151725382, + "loss": 0.2219, + "step": 11742 + }, + { + "epoch": 1.57, + "grad_norm": 0.47265625, + "learning_rate": 0.000152476549089285, + "loss": 0.2651, + "step": 11743 + }, + { + "epoch": 1.57, + "grad_norm": 0.427734375, + "learning_rate": 0.00015246663594970045, + "loss": 0.4, + "step": 11744 + }, + { + "epoch": 1.57, + "grad_norm": 0.734375, + "learning_rate": 0.00015245672209863466, + "loss": 0.8033, + "step": 11745 + }, + { + "epoch": 1.57, + "grad_norm": 0.486328125, + "learning_rate": 0.000152446807536222, + "loss": 0.3308, + "step": 11746 + }, + { + "epoch": 1.57, + "grad_norm": 0.55859375, + "learning_rate": 0.00015243689226259705, + "loss": 0.3111, + "step": 11747 + }, + { + "epoch": 1.57, + "grad_norm": 0.55078125, + "learning_rate": 0.00015242697627789413, + "loss": 0.4318, + "step": 11748 + }, + { + "epoch": 1.57, + "grad_norm": 0.40234375, + "learning_rate": 0.0001524170595822478, + "loss": 0.2356, + "step": 11749 + }, + { + "epoch": 1.57, + "grad_norm": 0.48046875, + "learning_rate": 0.00015240714217579248, + "loss": 0.4268, + "step": 11750 + }, + { + "epoch": 1.57, + "grad_norm": 0.478515625, + "learning_rate": 0.00015239722405866268, + "loss": 0.2103, + "step": 11751 + }, + { + "epoch": 1.57, + "grad_norm": 0.376953125, + "learning_rate": 0.0001523873052309929, + "loss": 0.1534, + "step": 11752 + }, + { + "epoch": 1.57, + "grad_norm": 0.408203125, + "learning_rate": 0.00015237738569291763, + "loss": 0.2638, + "step": 11753 + }, + { + "epoch": 1.57, + "grad_norm": 0.546875, + "learning_rate": 0.0001523674654445714, + "loss": 0.4402, + "step": 11754 + }, + { + "epoch": 1.57, + "grad_norm": 0.55859375, + "learning_rate": 0.00015235754448608875, + "loss": 0.4661, + "step": 11755 + }, + { + "epoch": 1.57, + "grad_norm": 0.47265625, + "learning_rate": 0.00015234762281760418, + "loss": 0.5037, + "step": 11756 + }, + { + "epoch": 1.57, + "grad_norm": 0.671875, + "learning_rate": 0.00015233770043925223, + "loss": 0.6287, + "step": 11757 + }, + { + "epoch": 1.57, + "grad_norm": 0.50390625, + "learning_rate": 0.00015232777735116747, + "loss": 0.2215, + "step": 11758 + }, + { + "epoch": 1.57, + "grad_norm": 0.4453125, + "learning_rate": 0.0001523178535534845, + "loss": 0.4308, + "step": 11759 + }, + { + "epoch": 1.57, + "grad_norm": 0.703125, + "learning_rate": 0.00015230792904633782, + "loss": 0.4923, + "step": 11760 + }, + { + "epoch": 1.57, + "grad_norm": 0.62890625, + "learning_rate": 0.00015229800382986208, + "loss": 0.3635, + "step": 11761 + }, + { + "epoch": 1.57, + "grad_norm": 0.71875, + "learning_rate": 0.00015228807790419185, + "loss": 0.4096, + "step": 11762 + }, + { + "epoch": 1.57, + "grad_norm": 0.74609375, + "learning_rate": 0.0001522781512694617, + "loss": 0.5937, + "step": 11763 + }, + { + "epoch": 1.57, + "grad_norm": 0.484375, + "learning_rate": 0.00015226822392580623, + "loss": 0.3016, + "step": 11764 + }, + { + "epoch": 1.57, + "grad_norm": 0.53125, + "learning_rate": 0.00015225829587336016, + "loss": 0.2829, + "step": 11765 + }, + { + "epoch": 1.57, + "grad_norm": 0.64453125, + "learning_rate": 0.00015224836711225806, + "loss": 0.459, + "step": 11766 + }, + { + "epoch": 1.57, + "grad_norm": 0.6015625, + "learning_rate": 0.00015223843764263452, + "loss": 0.3156, + "step": 11767 + }, + { + "epoch": 1.57, + "grad_norm": 0.37109375, + "learning_rate": 0.00015222850746462432, + "loss": 0.1464, + "step": 11768 + }, + { + "epoch": 1.57, + "grad_norm": 0.59375, + "learning_rate": 0.00015221857657836196, + "loss": 0.4568, + "step": 11769 + }, + { + "epoch": 1.57, + "grad_norm": 0.5390625, + "learning_rate": 0.00015220864498398222, + "loss": 0.7072, + "step": 11770 + }, + { + "epoch": 1.57, + "grad_norm": 0.462890625, + "learning_rate": 0.00015219871268161973, + "loss": 0.1821, + "step": 11771 + }, + { + "epoch": 1.57, + "grad_norm": 0.609375, + "learning_rate": 0.0001521887796714092, + "loss": 0.3071, + "step": 11772 + }, + { + "epoch": 1.57, + "grad_norm": 0.3828125, + "learning_rate": 0.00015217884595348531, + "loss": 0.212, + "step": 11773 + }, + { + "epoch": 1.57, + "grad_norm": 0.4296875, + "learning_rate": 0.00015216891152798284, + "loss": 0.3332, + "step": 11774 + }, + { + "epoch": 1.57, + "grad_norm": 0.64453125, + "learning_rate": 0.0001521589763950364, + "loss": 0.5137, + "step": 11775 + }, + { + "epoch": 1.57, + "grad_norm": 0.470703125, + "learning_rate": 0.00015214904055478078, + "loss": 0.3548, + "step": 11776 + }, + { + "epoch": 1.57, + "grad_norm": 0.5078125, + "learning_rate": 0.0001521391040073507, + "loss": 0.4944, + "step": 11777 + }, + { + "epoch": 1.57, + "grad_norm": 0.419921875, + "learning_rate": 0.00015212916675288088, + "loss": 0.3418, + "step": 11778 + }, + { + "epoch": 1.57, + "grad_norm": 0.431640625, + "learning_rate": 0.00015211922879150612, + "loss": 0.3382, + "step": 11779 + }, + { + "epoch": 1.57, + "grad_norm": 0.5625, + "learning_rate": 0.00015210929012336122, + "loss": 0.4736, + "step": 11780 + }, + { + "epoch": 1.57, + "grad_norm": 0.51171875, + "learning_rate": 0.00015209935074858084, + "loss": 0.5398, + "step": 11781 + }, + { + "epoch": 1.57, + "grad_norm": 0.54296875, + "learning_rate": 0.00015208941066729984, + "loss": 0.5017, + "step": 11782 + }, + { + "epoch": 1.57, + "grad_norm": 0.498046875, + "learning_rate": 0.000152079469879653, + "loss": 0.3407, + "step": 11783 + }, + { + "epoch": 1.57, + "grad_norm": 0.6796875, + "learning_rate": 0.0001520695283857751, + "loss": 0.5412, + "step": 11784 + }, + { + "epoch": 1.57, + "grad_norm": 0.5390625, + "learning_rate": 0.000152059586185801, + "loss": 0.439, + "step": 11785 + }, + { + "epoch": 1.57, + "grad_norm": 0.337890625, + "learning_rate": 0.00015204964327986554, + "loss": 0.2135, + "step": 11786 + }, + { + "epoch": 1.57, + "grad_norm": 0.46484375, + "learning_rate": 0.00015203969966810346, + "loss": 0.517, + "step": 11787 + }, + { + "epoch": 1.57, + "grad_norm": 0.51171875, + "learning_rate": 0.00015202975535064965, + "loss": 0.361, + "step": 11788 + }, + { + "epoch": 1.57, + "grad_norm": 0.515625, + "learning_rate": 0.00015201981032763898, + "loss": 0.1451, + "step": 11789 + }, + { + "epoch": 1.57, + "grad_norm": 0.5625, + "learning_rate": 0.0001520098645992063, + "loss": 0.3733, + "step": 11790 + }, + { + "epoch": 1.57, + "grad_norm": 0.546875, + "learning_rate": 0.00015199991816548648, + "loss": 0.5718, + "step": 11791 + }, + { + "epoch": 1.57, + "grad_norm": 0.54296875, + "learning_rate": 0.00015198997102661442, + "loss": 0.5699, + "step": 11792 + }, + { + "epoch": 1.57, + "grad_norm": 0.6328125, + "learning_rate": 0.00015198002318272496, + "loss": 0.6359, + "step": 11793 + }, + { + "epoch": 1.57, + "grad_norm": 0.7421875, + "learning_rate": 0.000151970074633953, + "loss": 0.2485, + "step": 11794 + }, + { + "epoch": 1.57, + "grad_norm": 0.50390625, + "learning_rate": 0.00015196012538043352, + "loss": 0.2676, + "step": 11795 + }, + { + "epoch": 1.57, + "grad_norm": 0.53125, + "learning_rate": 0.00015195017542230134, + "loss": 0.3104, + "step": 11796 + }, + { + "epoch": 1.57, + "grad_norm": 0.57421875, + "learning_rate": 0.0001519402247596915, + "loss": 0.6023, + "step": 11797 + }, + { + "epoch": 1.57, + "grad_norm": 0.419921875, + "learning_rate": 0.00015193027339273883, + "loss": 0.2949, + "step": 11798 + }, + { + "epoch": 1.57, + "grad_norm": 0.58203125, + "learning_rate": 0.00015192032132157834, + "loss": 0.4356, + "step": 11799 + }, + { + "epoch": 1.57, + "grad_norm": 0.6953125, + "learning_rate": 0.000151910368546345, + "loss": 0.7847, + "step": 11800 + }, + { + "epoch": 1.57, + "grad_norm": 0.392578125, + "learning_rate": 0.00015190041506717367, + "loss": 0.2054, + "step": 11801 + }, + { + "epoch": 1.57, + "grad_norm": 0.5234375, + "learning_rate": 0.00015189046088419947, + "loss": 0.5126, + "step": 11802 + }, + { + "epoch": 1.57, + "grad_norm": 0.64453125, + "learning_rate": 0.0001518805059975573, + "loss": 0.742, + "step": 11803 + }, + { + "epoch": 1.58, + "grad_norm": 0.4609375, + "learning_rate": 0.00015187055040738216, + "loss": 0.3084, + "step": 11804 + }, + { + "epoch": 1.58, + "grad_norm": 0.37109375, + "learning_rate": 0.0001518605941138091, + "loss": 0.205, + "step": 11805 + }, + { + "epoch": 1.58, + "grad_norm": 0.470703125, + "learning_rate": 0.00015185063711697306, + "loss": 0.3924, + "step": 11806 + }, + { + "epoch": 1.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00015184067941700913, + "loss": 0.3692, + "step": 11807 + }, + { + "epoch": 1.58, + "grad_norm": 0.83984375, + "learning_rate": 0.0001518307210140523, + "loss": 0.5583, + "step": 11808 + }, + { + "epoch": 1.58, + "grad_norm": 0.53515625, + "learning_rate": 0.0001518207619082376, + "loss": 0.5716, + "step": 11809 + }, + { + "epoch": 1.58, + "grad_norm": 0.4296875, + "learning_rate": 0.0001518108020997002, + "loss": 0.3085, + "step": 11810 + }, + { + "epoch": 1.58, + "grad_norm": 0.46875, + "learning_rate": 0.00015180084158857498, + "loss": 0.3582, + "step": 11811 + }, + { + "epoch": 1.58, + "grad_norm": 0.51171875, + "learning_rate": 0.00015179088037499714, + "loss": 0.4193, + "step": 11812 + }, + { + "epoch": 1.58, + "grad_norm": 0.51171875, + "learning_rate": 0.0001517809184591017, + "loss": 0.313, + "step": 11813 + }, + { + "epoch": 1.58, + "grad_norm": 0.46484375, + "learning_rate": 0.0001517709558410238, + "loss": 0.1465, + "step": 11814 + }, + { + "epoch": 1.58, + "grad_norm": 0.54296875, + "learning_rate": 0.0001517609925208985, + "loss": 0.4388, + "step": 11815 + }, + { + "epoch": 1.58, + "grad_norm": 0.66796875, + "learning_rate": 0.00015175102849886095, + "loss": 0.6287, + "step": 11816 + }, + { + "epoch": 1.58, + "grad_norm": 0.94921875, + "learning_rate": 0.00015174106377504622, + "loss": 0.4134, + "step": 11817 + }, + { + "epoch": 1.58, + "grad_norm": 0.427734375, + "learning_rate": 0.00015173109834958946, + "loss": 0.1714, + "step": 11818 + }, + { + "epoch": 1.58, + "grad_norm": 0.8125, + "learning_rate": 0.00015172113222262579, + "loss": 0.8341, + "step": 11819 + }, + { + "epoch": 1.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00015171116539429037, + "loss": 0.3679, + "step": 11820 + }, + { + "epoch": 1.58, + "grad_norm": 0.5390625, + "learning_rate": 0.00015170119786471836, + "loss": 0.3919, + "step": 11821 + }, + { + "epoch": 1.58, + "grad_norm": 0.578125, + "learning_rate": 0.00015169122963404496, + "loss": 0.2074, + "step": 11822 + }, + { + "epoch": 1.58, + "grad_norm": 0.6484375, + "learning_rate": 0.0001516812607024053, + "loss": 0.5687, + "step": 11823 + }, + { + "epoch": 1.58, + "grad_norm": 0.6171875, + "learning_rate": 0.00015167129106993454, + "loss": 0.44, + "step": 11824 + }, + { + "epoch": 1.58, + "grad_norm": 0.439453125, + "learning_rate": 0.00015166132073676797, + "loss": 0.1973, + "step": 11825 + }, + { + "epoch": 1.58, + "grad_norm": 0.455078125, + "learning_rate": 0.00015165134970304066, + "loss": 0.4734, + "step": 11826 + }, + { + "epoch": 1.58, + "grad_norm": 0.58984375, + "learning_rate": 0.00015164137796888795, + "loss": 0.2711, + "step": 11827 + }, + { + "epoch": 1.58, + "grad_norm": 0.46875, + "learning_rate": 0.00015163140553444503, + "loss": 0.4222, + "step": 11828 + }, + { + "epoch": 1.58, + "grad_norm": 0.65625, + "learning_rate": 0.00015162143239984705, + "loss": 0.7059, + "step": 11829 + }, + { + "epoch": 1.58, + "grad_norm": 0.75, + "learning_rate": 0.00015161145856522936, + "loss": 0.3286, + "step": 11830 + }, + { + "epoch": 1.58, + "grad_norm": 0.5234375, + "learning_rate": 0.00015160148403072716, + "loss": 0.7337, + "step": 11831 + }, + { + "epoch": 1.58, + "grad_norm": 0.5078125, + "learning_rate": 0.0001515915087964757, + "loss": 0.606, + "step": 11832 + }, + { + "epoch": 1.58, + "grad_norm": 0.4765625, + "learning_rate": 0.0001515815328626103, + "loss": 0.3021, + "step": 11833 + }, + { + "epoch": 1.58, + "grad_norm": 0.4765625, + "learning_rate": 0.0001515715562292662, + "loss": 0.2691, + "step": 11834 + }, + { + "epoch": 1.58, + "grad_norm": 0.42578125, + "learning_rate": 0.00015156157889657868, + "loss": 0.1673, + "step": 11835 + }, + { + "epoch": 1.58, + "grad_norm": 0.625, + "learning_rate": 0.0001515516008646831, + "loss": 0.4325, + "step": 11836 + }, + { + "epoch": 1.58, + "grad_norm": 0.39453125, + "learning_rate": 0.0001515416221337147, + "loss": 0.2117, + "step": 11837 + }, + { + "epoch": 1.58, + "grad_norm": 0.6171875, + "learning_rate": 0.0001515316427038088, + "loss": 0.3139, + "step": 11838 + }, + { + "epoch": 1.58, + "grad_norm": 0.609375, + "learning_rate": 0.0001515216625751008, + "loss": 0.4794, + "step": 11839 + }, + { + "epoch": 1.58, + "grad_norm": 0.578125, + "learning_rate": 0.00015151168174772596, + "loss": 0.5085, + "step": 11840 + }, + { + "epoch": 1.58, + "grad_norm": 0.60546875, + "learning_rate": 0.00015150170022181967, + "loss": 0.3062, + "step": 11841 + }, + { + "epoch": 1.58, + "grad_norm": 0.40625, + "learning_rate": 0.00015149171799751725, + "loss": 0.3092, + "step": 11842 + }, + { + "epoch": 1.58, + "grad_norm": 0.58203125, + "learning_rate": 0.00015148173507495408, + "loss": 0.3741, + "step": 11843 + }, + { + "epoch": 1.58, + "grad_norm": 0.71875, + "learning_rate": 0.00015147175145426557, + "loss": 0.3207, + "step": 11844 + }, + { + "epoch": 1.58, + "grad_norm": 0.484375, + "learning_rate": 0.00015146176713558707, + "loss": 0.2208, + "step": 11845 + }, + { + "epoch": 1.58, + "grad_norm": 0.478515625, + "learning_rate": 0.000151451782119054, + "loss": 0.2492, + "step": 11846 + }, + { + "epoch": 1.58, + "grad_norm": 0.5234375, + "learning_rate": 0.0001514417964048017, + "loss": 0.4032, + "step": 11847 + }, + { + "epoch": 1.58, + "grad_norm": 0.55078125, + "learning_rate": 0.00015143180999296565, + "loss": 0.3198, + "step": 11848 + }, + { + "epoch": 1.58, + "grad_norm": 0.76953125, + "learning_rate": 0.00015142182288368122, + "loss": 0.3321, + "step": 11849 + }, + { + "epoch": 1.58, + "grad_norm": 0.44921875, + "learning_rate": 0.00015141183507708393, + "loss": 0.3555, + "step": 11850 + }, + { + "epoch": 1.58, + "grad_norm": 0.474609375, + "learning_rate": 0.0001514018465733091, + "loss": 0.2877, + "step": 11851 + }, + { + "epoch": 1.58, + "grad_norm": 0.78515625, + "learning_rate": 0.00015139185737249227, + "loss": 0.682, + "step": 11852 + }, + { + "epoch": 1.58, + "grad_norm": 0.66015625, + "learning_rate": 0.00015138186747476888, + "loss": 0.3859, + "step": 11853 + }, + { + "epoch": 1.58, + "grad_norm": 0.494140625, + "learning_rate": 0.00015137187688027436, + "loss": 0.4204, + "step": 11854 + }, + { + "epoch": 1.58, + "grad_norm": 0.5234375, + "learning_rate": 0.00015136188558914423, + "loss": 0.4806, + "step": 11855 + }, + { + "epoch": 1.58, + "grad_norm": 0.52734375, + "learning_rate": 0.00015135189360151397, + "loss": 0.4346, + "step": 11856 + }, + { + "epoch": 1.58, + "grad_norm": 0.50390625, + "learning_rate": 0.00015134190091751907, + "loss": 0.371, + "step": 11857 + }, + { + "epoch": 1.58, + "grad_norm": 0.6328125, + "learning_rate": 0.00015133190753729506, + "loss": 0.4571, + "step": 11858 + }, + { + "epoch": 1.58, + "grad_norm": 0.4609375, + "learning_rate": 0.00015132191346097743, + "loss": 0.4243, + "step": 11859 + }, + { + "epoch": 1.58, + "grad_norm": 0.400390625, + "learning_rate": 0.00015131191868870172, + "loss": 0.2265, + "step": 11860 + }, + { + "epoch": 1.58, + "grad_norm": 0.58984375, + "learning_rate": 0.00015130192322060343, + "loss": 0.3892, + "step": 11861 + }, + { + "epoch": 1.58, + "grad_norm": 0.59375, + "learning_rate": 0.00015129192705681818, + "loss": 0.3556, + "step": 11862 + }, + { + "epoch": 1.58, + "grad_norm": 0.65625, + "learning_rate": 0.00015128193019748145, + "loss": 0.3689, + "step": 11863 + }, + { + "epoch": 1.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00015127193264272886, + "loss": 0.4186, + "step": 11864 + }, + { + "epoch": 1.58, + "grad_norm": 0.5859375, + "learning_rate": 0.00015126193439269595, + "loss": 0.321, + "step": 11865 + }, + { + "epoch": 1.58, + "grad_norm": 0.4453125, + "learning_rate": 0.00015125193544751833, + "loss": 0.2892, + "step": 11866 + }, + { + "epoch": 1.58, + "grad_norm": 0.57421875, + "learning_rate": 0.0001512419358073315, + "loss": 0.5696, + "step": 11867 + }, + { + "epoch": 1.58, + "grad_norm": 0.5703125, + "learning_rate": 0.0001512319354722712, + "loss": 0.4784, + "step": 11868 + }, + { + "epoch": 1.58, + "grad_norm": 1.1640625, + "learning_rate": 0.00015122193444247294, + "loss": 0.362, + "step": 11869 + }, + { + "epoch": 1.58, + "grad_norm": 0.53515625, + "learning_rate": 0.0001512119327180724, + "loss": 0.1514, + "step": 11870 + }, + { + "epoch": 1.58, + "grad_norm": 0.392578125, + "learning_rate": 0.0001512019302992052, + "loss": 0.3738, + "step": 11871 + }, + { + "epoch": 1.58, + "grad_norm": 0.46484375, + "learning_rate": 0.0001511919271860069, + "loss": 0.2736, + "step": 11872 + }, + { + "epoch": 1.58, + "grad_norm": 0.5234375, + "learning_rate": 0.00015118192337861328, + "loss": 0.2506, + "step": 11873 + }, + { + "epoch": 1.58, + "grad_norm": 0.51171875, + "learning_rate": 0.00015117191887715987, + "loss": 0.5811, + "step": 11874 + }, + { + "epoch": 1.58, + "grad_norm": 0.61328125, + "learning_rate": 0.00015116191368178244, + "loss": 0.3784, + "step": 11875 + }, + { + "epoch": 1.58, + "grad_norm": 0.61328125, + "learning_rate": 0.00015115190779261658, + "loss": 0.4749, + "step": 11876 + }, + { + "epoch": 1.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00015114190120979807, + "loss": 0.3374, + "step": 11877 + }, + { + "epoch": 1.59, + "grad_norm": 0.65234375, + "learning_rate": 0.00015113189393346253, + "loss": 0.3868, + "step": 11878 + }, + { + "epoch": 1.59, + "grad_norm": 0.5390625, + "learning_rate": 0.00015112188596374572, + "loss": 0.382, + "step": 11879 + }, + { + "epoch": 1.59, + "grad_norm": 0.32421875, + "learning_rate": 0.0001511118773007833, + "loss": 0.155, + "step": 11880 + }, + { + "epoch": 1.59, + "grad_norm": 0.54296875, + "learning_rate": 0.00015110186794471103, + "loss": 0.5844, + "step": 11881 + }, + { + "epoch": 1.59, + "grad_norm": 0.48828125, + "learning_rate": 0.00015109185789566464, + "loss": 0.4783, + "step": 11882 + }, + { + "epoch": 1.59, + "grad_norm": 0.50390625, + "learning_rate": 0.00015108184715377986, + "loss": 0.5012, + "step": 11883 + }, + { + "epoch": 1.59, + "grad_norm": 0.486328125, + "learning_rate": 0.0001510718357191925, + "loss": 0.4917, + "step": 11884 + }, + { + "epoch": 1.59, + "grad_norm": 0.69140625, + "learning_rate": 0.00015106182359203818, + "loss": 0.514, + "step": 11885 + }, + { + "epoch": 1.59, + "grad_norm": 0.5, + "learning_rate": 0.00015105181077245278, + "loss": 0.3418, + "step": 11886 + }, + { + "epoch": 1.59, + "grad_norm": 0.4921875, + "learning_rate": 0.00015104179726057207, + "loss": 0.3032, + "step": 11887 + }, + { + "epoch": 1.59, + "grad_norm": 0.60546875, + "learning_rate": 0.00015103178305653185, + "loss": 0.3319, + "step": 11888 + }, + { + "epoch": 1.59, + "grad_norm": 0.63671875, + "learning_rate": 0.00015102176816046792, + "loss": 0.211, + "step": 11889 + }, + { + "epoch": 1.59, + "grad_norm": 0.61328125, + "learning_rate": 0.00015101175257251602, + "loss": 0.3733, + "step": 11890 + }, + { + "epoch": 1.59, + "grad_norm": 0.6328125, + "learning_rate": 0.00015100173629281204, + "loss": 0.4589, + "step": 11891 + }, + { + "epoch": 1.59, + "grad_norm": 0.4140625, + "learning_rate": 0.00015099171932149178, + "loss": 0.1731, + "step": 11892 + }, + { + "epoch": 1.59, + "grad_norm": 0.640625, + "learning_rate": 0.0001509817016586911, + "loss": 0.4994, + "step": 11893 + }, + { + "epoch": 1.59, + "grad_norm": 0.9375, + "learning_rate": 0.00015097168330454577, + "loss": 0.2797, + "step": 11894 + }, + { + "epoch": 1.59, + "grad_norm": 0.62890625, + "learning_rate": 0.00015096166425919175, + "loss": 0.3219, + "step": 11895 + }, + { + "epoch": 1.59, + "grad_norm": 0.46484375, + "learning_rate": 0.00015095164452276486, + "loss": 0.4026, + "step": 11896 + }, + { + "epoch": 1.59, + "grad_norm": 0.515625, + "learning_rate": 0.00015094162409540093, + "loss": 0.3298, + "step": 11897 + }, + { + "epoch": 1.59, + "grad_norm": 0.408203125, + "learning_rate": 0.00015093160297723589, + "loss": 0.2885, + "step": 11898 + }, + { + "epoch": 1.59, + "grad_norm": 0.439453125, + "learning_rate": 0.00015092158116840565, + "loss": 0.2848, + "step": 11899 + }, + { + "epoch": 1.59, + "grad_norm": 0.71484375, + "learning_rate": 0.00015091155866904608, + "loss": 0.8664, + "step": 11900 + }, + { + "epoch": 1.59, + "grad_norm": 0.498046875, + "learning_rate": 0.00015090153547929312, + "loss": 0.6143, + "step": 11901 + }, + { + "epoch": 1.59, + "grad_norm": 0.55859375, + "learning_rate": 0.00015089151159928263, + "loss": 0.3726, + "step": 11902 + }, + { + "epoch": 1.59, + "grad_norm": 0.384765625, + "learning_rate": 0.0001508814870291506, + "loss": 0.3897, + "step": 11903 + }, + { + "epoch": 1.59, + "grad_norm": 0.51953125, + "learning_rate": 0.00015087146176903296, + "loss": 0.3877, + "step": 11904 + }, + { + "epoch": 1.59, + "grad_norm": 0.58203125, + "learning_rate": 0.00015086143581906561, + "loss": 0.536, + "step": 11905 + }, + { + "epoch": 1.59, + "grad_norm": 0.7265625, + "learning_rate": 0.00015085140917938457, + "loss": 0.39, + "step": 11906 + }, + { + "epoch": 1.59, + "grad_norm": 0.34765625, + "learning_rate": 0.00015084138185012585, + "loss": 0.1615, + "step": 11907 + }, + { + "epoch": 1.59, + "grad_norm": 0.6328125, + "learning_rate": 0.00015083135383142527, + "loss": 0.3426, + "step": 11908 + }, + { + "epoch": 1.59, + "grad_norm": 0.59375, + "learning_rate": 0.00015082132512341895, + "loss": 0.5064, + "step": 11909 + }, + { + "epoch": 1.59, + "grad_norm": 0.54296875, + "learning_rate": 0.00015081129572624283, + "loss": 0.4039, + "step": 11910 + }, + { + "epoch": 1.59, + "grad_norm": 0.859375, + "learning_rate": 0.00015080126564003291, + "loss": 0.2939, + "step": 11911 + }, + { + "epoch": 1.59, + "grad_norm": 0.6171875, + "learning_rate": 0.00015079123486492528, + "loss": 0.4417, + "step": 11912 + }, + { + "epoch": 1.59, + "grad_norm": 0.515625, + "learning_rate": 0.00015078120340105588, + "loss": 0.5116, + "step": 11913 + }, + { + "epoch": 1.59, + "grad_norm": 0.578125, + "learning_rate": 0.00015077117124856078, + "loss": 0.4808, + "step": 11914 + }, + { + "epoch": 1.59, + "grad_norm": 0.66015625, + "learning_rate": 0.00015076113840757597, + "loss": 0.3461, + "step": 11915 + }, + { + "epoch": 1.59, + "grad_norm": 0.60546875, + "learning_rate": 0.0001507511048782376, + "loss": 0.6768, + "step": 11916 + }, + { + "epoch": 1.59, + "grad_norm": 0.65234375, + "learning_rate": 0.00015074107066068165, + "loss": 0.4134, + "step": 11917 + }, + { + "epoch": 1.59, + "grad_norm": 0.48046875, + "learning_rate": 0.00015073103575504422, + "loss": 0.3846, + "step": 11918 + }, + { + "epoch": 1.59, + "grad_norm": 0.65234375, + "learning_rate": 0.00015072100016146138, + "loss": 0.3401, + "step": 11919 + }, + { + "epoch": 1.59, + "grad_norm": 0.59375, + "learning_rate": 0.0001507109638800692, + "loss": 0.4454, + "step": 11920 + }, + { + "epoch": 1.59, + "grad_norm": 0.58203125, + "learning_rate": 0.00015070092691100385, + "loss": 0.4572, + "step": 11921 + }, + { + "epoch": 1.59, + "grad_norm": 0.515625, + "learning_rate": 0.00015069088925440138, + "loss": 0.3065, + "step": 11922 + }, + { + "epoch": 1.59, + "grad_norm": 0.46875, + "learning_rate": 0.00015068085091039793, + "loss": 0.3097, + "step": 11923 + }, + { + "epoch": 1.59, + "grad_norm": 0.578125, + "learning_rate": 0.00015067081187912956, + "loss": 0.8754, + "step": 11924 + }, + { + "epoch": 1.59, + "grad_norm": 0.5390625, + "learning_rate": 0.00015066077216073254, + "loss": 0.503, + "step": 11925 + }, + { + "epoch": 1.59, + "grad_norm": 0.6484375, + "learning_rate": 0.0001506507317553429, + "loss": 0.4361, + "step": 11926 + }, + { + "epoch": 1.59, + "grad_norm": 0.443359375, + "learning_rate": 0.0001506406906630968, + "loss": 0.4786, + "step": 11927 + }, + { + "epoch": 1.59, + "grad_norm": 0.48046875, + "learning_rate": 0.00015063064888413047, + "loss": 0.33, + "step": 11928 + }, + { + "epoch": 1.59, + "grad_norm": 0.5703125, + "learning_rate": 0.00015062060641858003, + "loss": 0.4732, + "step": 11929 + }, + { + "epoch": 1.59, + "grad_norm": 0.435546875, + "learning_rate": 0.0001506105632665817, + "loss": 0.3316, + "step": 11930 + }, + { + "epoch": 1.59, + "grad_norm": 1.3984375, + "learning_rate": 0.00015060051942827164, + "loss": 0.4273, + "step": 11931 + }, + { + "epoch": 1.59, + "grad_norm": 0.427734375, + "learning_rate": 0.00015059047490378603, + "loss": 0.2501, + "step": 11932 + }, + { + "epoch": 1.59, + "grad_norm": 0.59765625, + "learning_rate": 0.00015058042969326113, + "loss": 0.4891, + "step": 11933 + }, + { + "epoch": 1.59, + "grad_norm": 0.41015625, + "learning_rate": 0.00015057038379683314, + "loss": 0.3635, + "step": 11934 + }, + { + "epoch": 1.59, + "grad_norm": 0.61328125, + "learning_rate": 0.00015056033721463832, + "loss": 0.3167, + "step": 11935 + }, + { + "epoch": 1.59, + "grad_norm": 0.478515625, + "learning_rate": 0.00015055028994681284, + "loss": 0.4925, + "step": 11936 + }, + { + "epoch": 1.59, + "grad_norm": 0.423828125, + "learning_rate": 0.000150540241993493, + "loss": 0.2029, + "step": 11937 + }, + { + "epoch": 1.59, + "grad_norm": 0.59375, + "learning_rate": 0.00015053019335481505, + "loss": 0.5126, + "step": 11938 + }, + { + "epoch": 1.59, + "grad_norm": 0.462890625, + "learning_rate": 0.00015052014403091528, + "loss": 0.2494, + "step": 11939 + }, + { + "epoch": 1.59, + "grad_norm": 0.625, + "learning_rate": 0.00015051009402192986, + "loss": 0.3598, + "step": 11940 + }, + { + "epoch": 1.59, + "grad_norm": 0.466796875, + "learning_rate": 0.0001505000433279952, + "loss": 0.3432, + "step": 11941 + }, + { + "epoch": 1.59, + "grad_norm": 0.455078125, + "learning_rate": 0.0001504899919492475, + "loss": 0.3067, + "step": 11942 + }, + { + "epoch": 1.59, + "grad_norm": 0.43359375, + "learning_rate": 0.00015047993988582316, + "loss": 0.2402, + "step": 11943 + }, + { + "epoch": 1.59, + "grad_norm": 0.4921875, + "learning_rate": 0.0001504698871378584, + "loss": 0.1969, + "step": 11944 + }, + { + "epoch": 1.59, + "grad_norm": 0.515625, + "learning_rate": 0.00015045983370548958, + "loss": 0.2517, + "step": 11945 + }, + { + "epoch": 1.59, + "grad_norm": 0.5390625, + "learning_rate": 0.00015044977958885304, + "loss": 0.3207, + "step": 11946 + }, + { + "epoch": 1.59, + "grad_norm": 0.375, + "learning_rate": 0.00015043972478808511, + "loss": 0.1764, + "step": 11947 + }, + { + "epoch": 1.59, + "grad_norm": 0.73046875, + "learning_rate": 0.00015042966930332212, + "loss": 0.397, + "step": 11948 + }, + { + "epoch": 1.59, + "grad_norm": 0.498046875, + "learning_rate": 0.00015041961313470048, + "loss": 0.3626, + "step": 11949 + }, + { + "epoch": 1.59, + "grad_norm": 0.5625, + "learning_rate": 0.0001504095562823565, + "loss": 0.3028, + "step": 11950 + }, + { + "epoch": 1.59, + "grad_norm": 0.75, + "learning_rate": 0.0001503994987464266, + "loss": 0.3948, + "step": 11951 + }, + { + "epoch": 1.59, + "grad_norm": 0.45703125, + "learning_rate": 0.00015038944052704712, + "loss": 0.2782, + "step": 11952 + }, + { + "epoch": 1.6, + "grad_norm": 0.56640625, + "learning_rate": 0.00015037938162435453, + "loss": 0.4269, + "step": 11953 + }, + { + "epoch": 1.6, + "grad_norm": 0.349609375, + "learning_rate": 0.0001503693220384852, + "loss": 0.2032, + "step": 11954 + }, + { + "epoch": 1.6, + "grad_norm": 0.4140625, + "learning_rate": 0.0001503592617695755, + "loss": 0.466, + "step": 11955 + }, + { + "epoch": 1.6, + "grad_norm": 0.63671875, + "learning_rate": 0.00015034920081776192, + "loss": 0.3798, + "step": 11956 + }, + { + "epoch": 1.6, + "grad_norm": 0.53515625, + "learning_rate": 0.00015033913918318086, + "loss": 0.516, + "step": 11957 + }, + { + "epoch": 1.6, + "grad_norm": 0.4296875, + "learning_rate": 0.00015032907686596874, + "loss": 0.2486, + "step": 11958 + }, + { + "epoch": 1.6, + "grad_norm": 0.640625, + "learning_rate": 0.000150319013866262, + "loss": 0.4121, + "step": 11959 + }, + { + "epoch": 1.6, + "grad_norm": 0.53125, + "learning_rate": 0.0001503089501841972, + "loss": 0.3704, + "step": 11960 + }, + { + "epoch": 1.6, + "grad_norm": 0.56640625, + "learning_rate": 0.0001502988858199107, + "loss": 0.2845, + "step": 11961 + }, + { + "epoch": 1.6, + "grad_norm": 0.455078125, + "learning_rate": 0.0001502888207735391, + "loss": 0.3098, + "step": 11962 + }, + { + "epoch": 1.6, + "grad_norm": 0.6171875, + "learning_rate": 0.00015027875504521878, + "loss": 0.3582, + "step": 11963 + }, + { + "epoch": 1.6, + "grad_norm": 0.451171875, + "learning_rate": 0.00015026868863508627, + "loss": 0.3794, + "step": 11964 + }, + { + "epoch": 1.6, + "grad_norm": 0.7265625, + "learning_rate": 0.00015025862154327806, + "loss": 0.6697, + "step": 11965 + }, + { + "epoch": 1.6, + "grad_norm": 0.57421875, + "learning_rate": 0.00015024855376993072, + "loss": 0.4346, + "step": 11966 + }, + { + "epoch": 1.6, + "grad_norm": 0.5390625, + "learning_rate": 0.00015023848531518072, + "loss": 0.3922, + "step": 11967 + }, + { + "epoch": 1.6, + "grad_norm": 0.62109375, + "learning_rate": 0.00015022841617916464, + "loss": 0.4345, + "step": 11968 + }, + { + "epoch": 1.6, + "grad_norm": 0.54296875, + "learning_rate": 0.00015021834636201896, + "loss": 0.1527, + "step": 11969 + }, + { + "epoch": 1.6, + "grad_norm": 0.51953125, + "learning_rate": 0.00015020827586388028, + "loss": 0.349, + "step": 11970 + }, + { + "epoch": 1.6, + "grad_norm": 0.404296875, + "learning_rate": 0.00015019820468488519, + "loss": 0.2862, + "step": 11971 + }, + { + "epoch": 1.6, + "grad_norm": 0.46875, + "learning_rate": 0.00015018813282517018, + "loss": 0.3518, + "step": 11972 + }, + { + "epoch": 1.6, + "grad_norm": 0.67578125, + "learning_rate": 0.0001501780602848719, + "loss": 0.3691, + "step": 11973 + }, + { + "epoch": 1.6, + "grad_norm": 0.462890625, + "learning_rate": 0.00015016798706412695, + "loss": 0.3004, + "step": 11974 + }, + { + "epoch": 1.6, + "grad_norm": 0.416015625, + "learning_rate": 0.00015015791316307187, + "loss": 0.3069, + "step": 11975 + }, + { + "epoch": 1.6, + "grad_norm": 0.5703125, + "learning_rate": 0.00015014783858184329, + "loss": 0.465, + "step": 11976 + }, + { + "epoch": 1.6, + "grad_norm": 0.390625, + "learning_rate": 0.00015013776332057786, + "loss": 0.1643, + "step": 11977 + }, + { + "epoch": 1.6, + "grad_norm": 0.953125, + "learning_rate": 0.00015012768737941216, + "loss": 0.6514, + "step": 11978 + }, + { + "epoch": 1.6, + "grad_norm": 0.416015625, + "learning_rate": 0.00015011761075848284, + "loss": 0.3158, + "step": 11979 + }, + { + "epoch": 1.6, + "grad_norm": 0.53515625, + "learning_rate": 0.0001501075334579266, + "loss": 0.3018, + "step": 11980 + }, + { + "epoch": 1.6, + "grad_norm": 0.55078125, + "learning_rate": 0.00015009745547788, + "loss": 0.5524, + "step": 11981 + }, + { + "epoch": 1.6, + "grad_norm": 0.51953125, + "learning_rate": 0.00015008737681847974, + "loss": 0.2791, + "step": 11982 + }, + { + "epoch": 1.6, + "grad_norm": 0.39453125, + "learning_rate": 0.00015007729747986255, + "loss": 0.1407, + "step": 11983 + }, + { + "epoch": 1.6, + "grad_norm": 0.47265625, + "learning_rate": 0.00015006721746216504, + "loss": 0.4517, + "step": 11984 + }, + { + "epoch": 1.6, + "grad_norm": 0.44140625, + "learning_rate": 0.00015005713676552392, + "loss": 0.4811, + "step": 11985 + }, + { + "epoch": 1.6, + "grad_norm": 0.51953125, + "learning_rate": 0.0001500470553900759, + "loss": 0.3159, + "step": 11986 + }, + { + "epoch": 1.6, + "grad_norm": 0.373046875, + "learning_rate": 0.00015003697333595772, + "loss": 0.2688, + "step": 11987 + }, + { + "epoch": 1.6, + "grad_norm": 0.50390625, + "learning_rate": 0.00015002689060330604, + "loss": 0.3115, + "step": 11988 + }, + { + "epoch": 1.6, + "grad_norm": 0.6171875, + "learning_rate": 0.00015001680719225764, + "loss": 0.3623, + "step": 11989 + }, + { + "epoch": 1.6, + "grad_norm": 0.490234375, + "learning_rate": 0.0001500067231029492, + "loss": 0.5173, + "step": 11990 + }, + { + "epoch": 1.6, + "grad_norm": 0.53125, + "learning_rate": 0.00014999663833551753, + "loss": 0.4452, + "step": 11991 + }, + { + "epoch": 1.6, + "grad_norm": 0.3828125, + "learning_rate": 0.00014998655289009936, + "loss": 0.2293, + "step": 11992 + }, + { + "epoch": 1.6, + "grad_norm": 0.48828125, + "learning_rate": 0.00014997646676683145, + "loss": 0.4031, + "step": 11993 + }, + { + "epoch": 1.6, + "grad_norm": 0.5, + "learning_rate": 0.00014996637996585055, + "loss": 0.463, + "step": 11994 + }, + { + "epoch": 1.6, + "grad_norm": 0.53125, + "learning_rate": 0.00014995629248729347, + "loss": 0.2697, + "step": 11995 + }, + { + "epoch": 1.6, + "grad_norm": 0.54296875, + "learning_rate": 0.00014994620433129703, + "loss": 0.2595, + "step": 11996 + }, + { + "epoch": 1.6, + "grad_norm": 0.462890625, + "learning_rate": 0.00014993611549799795, + "loss": 0.2357, + "step": 11997 + }, + { + "epoch": 1.6, + "grad_norm": 0.56640625, + "learning_rate": 0.00014992602598753316, + "loss": 0.5092, + "step": 11998 + }, + { + "epoch": 1.6, + "grad_norm": 0.6171875, + "learning_rate": 0.0001499159358000394, + "loss": 0.5535, + "step": 11999 + }, + { + "epoch": 1.6, + "grad_norm": 0.443359375, + "learning_rate": 0.0001499058449356535, + "loss": 0.2648, + "step": 12000 + }, + { + "epoch": 1.6, + "grad_norm": 0.453125, + "learning_rate": 0.00014989575339451232, + "loss": 0.4572, + "step": 12001 + }, + { + "epoch": 1.6, + "grad_norm": 0.51953125, + "learning_rate": 0.0001498856611767527, + "loss": 0.2636, + "step": 12002 + }, + { + "epoch": 1.6, + "grad_norm": 0.498046875, + "learning_rate": 0.00014987556828251149, + "loss": 0.3494, + "step": 12003 + }, + { + "epoch": 1.6, + "grad_norm": 0.53515625, + "learning_rate": 0.0001498654747119256, + "loss": 0.4843, + "step": 12004 + }, + { + "epoch": 1.6, + "grad_norm": 0.56640625, + "learning_rate": 0.00014985538046513185, + "loss": 0.5336, + "step": 12005 + }, + { + "epoch": 1.6, + "grad_norm": 0.38671875, + "learning_rate": 0.0001498452855422671, + "loss": 0.217, + "step": 12006 + }, + { + "epoch": 1.6, + "grad_norm": 0.6796875, + "learning_rate": 0.0001498351899434683, + "loss": 0.4747, + "step": 12007 + }, + { + "epoch": 1.6, + "grad_norm": 0.6953125, + "learning_rate": 0.00014982509366887235, + "loss": 0.3652, + "step": 12008 + }, + { + "epoch": 1.6, + "grad_norm": 0.5703125, + "learning_rate": 0.00014981499671861618, + "loss": 0.4486, + "step": 12009 + }, + { + "epoch": 1.6, + "grad_norm": 0.470703125, + "learning_rate": 0.00014980489909283668, + "loss": 0.22, + "step": 12010 + }, + { + "epoch": 1.6, + "grad_norm": 0.46484375, + "learning_rate": 0.0001497948007916708, + "loss": 0.3265, + "step": 12011 + }, + { + "epoch": 1.6, + "grad_norm": 0.51171875, + "learning_rate": 0.0001497847018152554, + "loss": 0.2053, + "step": 12012 + }, + { + "epoch": 1.6, + "grad_norm": 0.52734375, + "learning_rate": 0.00014977460216372755, + "loss": 0.5471, + "step": 12013 + }, + { + "epoch": 1.6, + "grad_norm": 0.67578125, + "learning_rate": 0.0001497645018372241, + "loss": 0.8454, + "step": 12014 + }, + { + "epoch": 1.6, + "grad_norm": 0.51953125, + "learning_rate": 0.0001497544008358821, + "loss": 0.6928, + "step": 12015 + }, + { + "epoch": 1.6, + "grad_norm": 0.62890625, + "learning_rate": 0.0001497442991598385, + "loss": 0.2008, + "step": 12016 + }, + { + "epoch": 1.6, + "grad_norm": 0.640625, + "learning_rate": 0.00014973419680923026, + "loss": 0.5146, + "step": 12017 + }, + { + "epoch": 1.6, + "grad_norm": 0.53515625, + "learning_rate": 0.0001497240937841944, + "loss": 0.4637, + "step": 12018 + }, + { + "epoch": 1.6, + "grad_norm": 0.44921875, + "learning_rate": 0.0001497139900848679, + "loss": 0.1749, + "step": 12019 + }, + { + "epoch": 1.6, + "grad_norm": 0.58984375, + "learning_rate": 0.00014970388571138778, + "loss": 0.4681, + "step": 12020 + }, + { + "epoch": 1.6, + "grad_norm": 0.55859375, + "learning_rate": 0.0001496937806638911, + "loss": 0.182, + "step": 12021 + }, + { + "epoch": 1.6, + "grad_norm": 0.484375, + "learning_rate": 0.00014968367494251484, + "loss": 0.2363, + "step": 12022 + }, + { + "epoch": 1.6, + "grad_norm": 0.326171875, + "learning_rate": 0.00014967356854739608, + "loss": 0.3471, + "step": 12023 + }, + { + "epoch": 1.6, + "grad_norm": 0.6640625, + "learning_rate": 0.00014966346147867184, + "loss": 0.6115, + "step": 12024 + }, + { + "epoch": 1.6, + "grad_norm": 0.63671875, + "learning_rate": 0.0001496533537364792, + "loss": 0.63, + "step": 12025 + }, + { + "epoch": 1.6, + "grad_norm": 0.578125, + "learning_rate": 0.0001496432453209552, + "loss": 0.5724, + "step": 12026 + }, + { + "epoch": 1.6, + "grad_norm": 0.62890625, + "learning_rate": 0.00014963313623223693, + "loss": 0.2518, + "step": 12027 + }, + { + "epoch": 1.61, + "grad_norm": 0.6328125, + "learning_rate": 0.0001496230264704615, + "loss": 0.5679, + "step": 12028 + }, + { + "epoch": 1.61, + "grad_norm": 0.6328125, + "learning_rate": 0.00014961291603576596, + "loss": 0.28, + "step": 12029 + }, + { + "epoch": 1.61, + "grad_norm": 0.6171875, + "learning_rate": 0.00014960280492828747, + "loss": 0.5704, + "step": 12030 + }, + { + "epoch": 1.61, + "grad_norm": 0.58984375, + "learning_rate": 0.00014959269314816307, + "loss": 0.5004, + "step": 12031 + }, + { + "epoch": 1.61, + "grad_norm": 0.38671875, + "learning_rate": 0.00014958258069552993, + "loss": 0.2128, + "step": 12032 + }, + { + "epoch": 1.61, + "grad_norm": 0.3984375, + "learning_rate": 0.00014957246757052518, + "loss": 0.2859, + "step": 12033 + }, + { + "epoch": 1.61, + "grad_norm": 0.65625, + "learning_rate": 0.00014956235377328595, + "loss": 0.5394, + "step": 12034 + }, + { + "epoch": 1.61, + "grad_norm": 0.546875, + "learning_rate": 0.00014955223930394944, + "loss": 0.1736, + "step": 12035 + }, + { + "epoch": 1.61, + "grad_norm": 0.51171875, + "learning_rate": 0.00014954212416265272, + "loss": 0.2602, + "step": 12036 + }, + { + "epoch": 1.61, + "grad_norm": 0.78125, + "learning_rate": 0.000149532008349533, + "loss": 0.5522, + "step": 12037 + }, + { + "epoch": 1.61, + "grad_norm": 0.4609375, + "learning_rate": 0.00014952189186472745, + "loss": 0.3236, + "step": 12038 + }, + { + "epoch": 1.61, + "grad_norm": 0.41796875, + "learning_rate": 0.00014951177470837328, + "loss": 0.3253, + "step": 12039 + }, + { + "epoch": 1.61, + "grad_norm": 0.5859375, + "learning_rate": 0.00014950165688060768, + "loss": 0.5529, + "step": 12040 + }, + { + "epoch": 1.61, + "grad_norm": 0.421875, + "learning_rate": 0.0001494915383815678, + "loss": 0.2221, + "step": 12041 + }, + { + "epoch": 1.61, + "grad_norm": 0.69921875, + "learning_rate": 0.00014948141921139094, + "loss": 0.7727, + "step": 12042 + }, + { + "epoch": 1.61, + "grad_norm": 0.67578125, + "learning_rate": 0.00014947129937021425, + "loss": 0.4068, + "step": 12043 + }, + { + "epoch": 1.61, + "grad_norm": 0.609375, + "learning_rate": 0.000149461178858175, + "loss": 0.5032, + "step": 12044 + }, + { + "epoch": 1.61, + "grad_norm": 0.59375, + "learning_rate": 0.00014945105767541042, + "loss": 0.5109, + "step": 12045 + }, + { + "epoch": 1.61, + "grad_norm": 0.55078125, + "learning_rate": 0.0001494409358220578, + "loss": 0.4637, + "step": 12046 + }, + { + "epoch": 1.61, + "grad_norm": 0.40625, + "learning_rate": 0.0001494308132982543, + "loss": 0.2254, + "step": 12047 + }, + { + "epoch": 1.61, + "grad_norm": 0.431640625, + "learning_rate": 0.00014942069010413726, + "loss": 0.3745, + "step": 12048 + }, + { + "epoch": 1.61, + "grad_norm": 0.42578125, + "learning_rate": 0.00014941056623984396, + "loss": 0.3979, + "step": 12049 + }, + { + "epoch": 1.61, + "grad_norm": 0.55859375, + "learning_rate": 0.00014940044170551166, + "loss": 0.2064, + "step": 12050 + }, + { + "epoch": 1.61, + "grad_norm": 0.5078125, + "learning_rate": 0.0001493903165012777, + "loss": 0.3206, + "step": 12051 + }, + { + "epoch": 1.61, + "grad_norm": 0.51171875, + "learning_rate": 0.00014938019062727927, + "loss": 0.3738, + "step": 12052 + }, + { + "epoch": 1.61, + "grad_norm": 0.404296875, + "learning_rate": 0.00014937006408365387, + "loss": 0.1711, + "step": 12053 + }, + { + "epoch": 1.61, + "grad_norm": 0.51171875, + "learning_rate": 0.00014935993687053862, + "loss": 0.37, + "step": 12054 + }, + { + "epoch": 1.61, + "grad_norm": 0.375, + "learning_rate": 0.00014934980898807101, + "loss": 0.2463, + "step": 12055 + }, + { + "epoch": 1.61, + "grad_norm": 0.4375, + "learning_rate": 0.00014933968043638827, + "loss": 0.1953, + "step": 12056 + }, + { + "epoch": 1.61, + "grad_norm": 0.578125, + "learning_rate": 0.00014932955121562782, + "loss": 0.4532, + "step": 12057 + }, + { + "epoch": 1.61, + "grad_norm": 0.59765625, + "learning_rate": 0.00014931942132592697, + "loss": 0.229, + "step": 12058 + }, + { + "epoch": 1.61, + "grad_norm": 0.4765625, + "learning_rate": 0.00014930929076742316, + "loss": 0.2514, + "step": 12059 + }, + { + "epoch": 1.61, + "grad_norm": 0.337890625, + "learning_rate": 0.0001492991595402537, + "loss": 0.1852, + "step": 12060 + }, + { + "epoch": 1.61, + "grad_norm": 0.515625, + "learning_rate": 0.00014928902764455598, + "loss": 0.3849, + "step": 12061 + }, + { + "epoch": 1.61, + "grad_norm": 0.5390625, + "learning_rate": 0.00014927889508046746, + "loss": 0.581, + "step": 12062 + }, + { + "epoch": 1.61, + "grad_norm": 0.53125, + "learning_rate": 0.00014926876184812545, + "loss": 0.4503, + "step": 12063 + }, + { + "epoch": 1.61, + "grad_norm": 0.390625, + "learning_rate": 0.00014925862794766743, + "loss": 0.4369, + "step": 12064 + }, + { + "epoch": 1.61, + "grad_norm": 0.48828125, + "learning_rate": 0.00014924849337923083, + "loss": 0.5125, + "step": 12065 + }, + { + "epoch": 1.61, + "grad_norm": 0.6171875, + "learning_rate": 0.00014923835814295298, + "loss": 0.6761, + "step": 12066 + }, + { + "epoch": 1.61, + "grad_norm": 0.54296875, + "learning_rate": 0.00014922822223897145, + "loss": 0.6704, + "step": 12067 + }, + { + "epoch": 1.61, + "grad_norm": 0.42578125, + "learning_rate": 0.0001492180856674236, + "loss": 0.2721, + "step": 12068 + }, + { + "epoch": 1.61, + "grad_norm": 0.625, + "learning_rate": 0.0001492079484284469, + "loss": 0.3724, + "step": 12069 + }, + { + "epoch": 1.61, + "grad_norm": 0.4453125, + "learning_rate": 0.00014919781052217886, + "loss": 0.4665, + "step": 12070 + }, + { + "epoch": 1.61, + "grad_norm": 0.46484375, + "learning_rate": 0.00014918767194875698, + "loss": 0.3134, + "step": 12071 + }, + { + "epoch": 1.61, + "grad_norm": 0.4453125, + "learning_rate": 0.00014917753270831866, + "loss": 0.3125, + "step": 12072 + }, + { + "epoch": 1.61, + "grad_norm": 0.65625, + "learning_rate": 0.00014916739280100143, + "loss": 0.2698, + "step": 12073 + }, + { + "epoch": 1.61, + "grad_norm": 0.62109375, + "learning_rate": 0.0001491572522269428, + "loss": 0.559, + "step": 12074 + }, + { + "epoch": 1.61, + "grad_norm": 0.3828125, + "learning_rate": 0.00014914711098628028, + "loss": 0.385, + "step": 12075 + }, + { + "epoch": 1.61, + "grad_norm": 0.72265625, + "learning_rate": 0.00014913696907915141, + "loss": 0.3126, + "step": 12076 + }, + { + "epoch": 1.61, + "grad_norm": 0.71875, + "learning_rate": 0.00014912682650569371, + "loss": 0.2958, + "step": 12077 + }, + { + "epoch": 1.61, + "grad_norm": 0.455078125, + "learning_rate": 0.0001491166832660447, + "loss": 0.3939, + "step": 12078 + }, + { + "epoch": 1.61, + "grad_norm": 0.462890625, + "learning_rate": 0.00014910653936034193, + "loss": 0.2483, + "step": 12079 + }, + { + "epoch": 1.61, + "grad_norm": 0.84375, + "learning_rate": 0.00014909639478872296, + "loss": 0.4115, + "step": 12080 + }, + { + "epoch": 1.61, + "grad_norm": 0.431640625, + "learning_rate": 0.0001490862495513254, + "loss": 0.1956, + "step": 12081 + }, + { + "epoch": 1.61, + "grad_norm": 0.55078125, + "learning_rate": 0.00014907610364828678, + "loss": 0.4704, + "step": 12082 + }, + { + "epoch": 1.61, + "grad_norm": 0.375, + "learning_rate": 0.0001490659570797447, + "loss": 0.2712, + "step": 12083 + }, + { + "epoch": 1.61, + "grad_norm": 0.5546875, + "learning_rate": 0.00014905580984583673, + "loss": 0.4321, + "step": 12084 + }, + { + "epoch": 1.61, + "grad_norm": 0.515625, + "learning_rate": 0.00014904566194670054, + "loss": 0.2835, + "step": 12085 + }, + { + "epoch": 1.61, + "grad_norm": 0.4921875, + "learning_rate": 0.00014903551338247366, + "loss": 0.34, + "step": 12086 + }, + { + "epoch": 1.61, + "grad_norm": 0.42578125, + "learning_rate": 0.00014902536415329376, + "loss": 0.2896, + "step": 12087 + }, + { + "epoch": 1.61, + "grad_norm": 0.6484375, + "learning_rate": 0.00014901521425929844, + "loss": 0.5319, + "step": 12088 + }, + { + "epoch": 1.61, + "grad_norm": 0.30859375, + "learning_rate": 0.00014900506370062538, + "loss": 0.0949, + "step": 12089 + }, + { + "epoch": 1.61, + "grad_norm": 0.443359375, + "learning_rate": 0.0001489949124774122, + "loss": 0.331, + "step": 12090 + }, + { + "epoch": 1.61, + "grad_norm": 0.57421875, + "learning_rate": 0.00014898476058979656, + "loss": 0.3583, + "step": 12091 + }, + { + "epoch": 1.61, + "grad_norm": 0.48828125, + "learning_rate": 0.00014897460803791612, + "loss": 0.5104, + "step": 12092 + }, + { + "epoch": 1.61, + "grad_norm": 0.48828125, + "learning_rate": 0.00014896445482190856, + "loss": 0.471, + "step": 12093 + }, + { + "epoch": 1.61, + "grad_norm": 0.486328125, + "learning_rate": 0.00014895430094191158, + "loss": 0.4422, + "step": 12094 + }, + { + "epoch": 1.61, + "grad_norm": 0.60546875, + "learning_rate": 0.00014894414639806286, + "loss": 0.6257, + "step": 12095 + }, + { + "epoch": 1.61, + "grad_norm": 0.58984375, + "learning_rate": 0.0001489339911905001, + "loss": 0.4004, + "step": 12096 + }, + { + "epoch": 1.61, + "grad_norm": 0.4375, + "learning_rate": 0.00014892383531936101, + "loss": 0.3723, + "step": 12097 + }, + { + "epoch": 1.61, + "grad_norm": 0.439453125, + "learning_rate": 0.00014891367878478332, + "loss": 0.2566, + "step": 12098 + }, + { + "epoch": 1.61, + "grad_norm": 0.451171875, + "learning_rate": 0.00014890352158690475, + "loss": 0.4381, + "step": 12099 + }, + { + "epoch": 1.61, + "grad_norm": 0.60546875, + "learning_rate": 0.00014889336372586305, + "loss": 0.467, + "step": 12100 + }, + { + "epoch": 1.61, + "grad_norm": 0.55859375, + "learning_rate": 0.00014888320520179598, + "loss": 0.448, + "step": 12101 + }, + { + "epoch": 1.61, + "grad_norm": 0.408203125, + "learning_rate": 0.0001488730460148412, + "loss": 0.372, + "step": 12102 + }, + { + "epoch": 1.62, + "grad_norm": 0.5234375, + "learning_rate": 0.00014886288616513662, + "loss": 0.2899, + "step": 12103 + }, + { + "epoch": 1.62, + "grad_norm": 0.47265625, + "learning_rate": 0.00014885272565281991, + "loss": 0.3985, + "step": 12104 + }, + { + "epoch": 1.62, + "grad_norm": 0.490234375, + "learning_rate": 0.00014884256447802888, + "loss": 0.4285, + "step": 12105 + }, + { + "epoch": 1.62, + "grad_norm": 0.3828125, + "learning_rate": 0.00014883240264090136, + "loss": 0.2811, + "step": 12106 + }, + { + "epoch": 1.62, + "grad_norm": 0.63671875, + "learning_rate": 0.0001488222401415751, + "loss": 0.4334, + "step": 12107 + }, + { + "epoch": 1.62, + "grad_norm": 0.41796875, + "learning_rate": 0.00014881207698018794, + "loss": 0.2968, + "step": 12108 + }, + { + "epoch": 1.62, + "grad_norm": 0.447265625, + "learning_rate": 0.00014880191315687767, + "loss": 0.3424, + "step": 12109 + }, + { + "epoch": 1.62, + "grad_norm": 0.51171875, + "learning_rate": 0.00014879174867178215, + "loss": 0.2238, + "step": 12110 + }, + { + "epoch": 1.62, + "grad_norm": 0.48828125, + "learning_rate": 0.00014878158352503923, + "loss": 0.4575, + "step": 12111 + }, + { + "epoch": 1.62, + "grad_norm": 0.48046875, + "learning_rate": 0.00014877141771678669, + "loss": 0.3622, + "step": 12112 + }, + { + "epoch": 1.62, + "grad_norm": 0.462890625, + "learning_rate": 0.00014876125124716242, + "loss": 0.5233, + "step": 12113 + }, + { + "epoch": 1.62, + "grad_norm": 0.279296875, + "learning_rate": 0.0001487510841163043, + "loss": 0.2037, + "step": 12114 + }, + { + "epoch": 1.62, + "grad_norm": 0.52734375, + "learning_rate": 0.00014874091632435023, + "loss": 0.6258, + "step": 12115 + }, + { + "epoch": 1.62, + "grad_norm": 0.51171875, + "learning_rate": 0.000148730747871438, + "loss": 0.4696, + "step": 12116 + }, + { + "epoch": 1.62, + "grad_norm": 0.62890625, + "learning_rate": 0.00014872057875770558, + "loss": 0.4452, + "step": 12117 + }, + { + "epoch": 1.62, + "grad_norm": 0.52734375, + "learning_rate": 0.00014871040898329081, + "loss": 0.2107, + "step": 12118 + }, + { + "epoch": 1.62, + "grad_norm": 0.51171875, + "learning_rate": 0.00014870023854833168, + "loss": 0.4508, + "step": 12119 + }, + { + "epoch": 1.62, + "grad_norm": 0.546875, + "learning_rate": 0.00014869006745296605, + "loss": 0.3864, + "step": 12120 + }, + { + "epoch": 1.62, + "grad_norm": 0.58203125, + "learning_rate": 0.00014867989569733182, + "loss": 0.5052, + "step": 12121 + }, + { + "epoch": 1.62, + "grad_norm": 0.51171875, + "learning_rate": 0.000148669723281567, + "loss": 0.5114, + "step": 12122 + }, + { + "epoch": 1.62, + "grad_norm": 0.4296875, + "learning_rate": 0.00014865955020580945, + "loss": 0.403, + "step": 12123 + }, + { + "epoch": 1.62, + "grad_norm": 0.404296875, + "learning_rate": 0.00014864937647019724, + "loss": 0.2589, + "step": 12124 + }, + { + "epoch": 1.62, + "grad_norm": 0.486328125, + "learning_rate": 0.0001486392020748682, + "loss": 0.3753, + "step": 12125 + }, + { + "epoch": 1.62, + "grad_norm": 0.73828125, + "learning_rate": 0.0001486290270199604, + "loss": 0.5681, + "step": 12126 + }, + { + "epoch": 1.62, + "grad_norm": 0.490234375, + "learning_rate": 0.00014861885130561177, + "loss": 0.4509, + "step": 12127 + }, + { + "epoch": 1.62, + "grad_norm": 0.384765625, + "learning_rate": 0.00014860867493196033, + "loss": 0.2181, + "step": 12128 + }, + { + "epoch": 1.62, + "grad_norm": 0.6328125, + "learning_rate": 0.00014859849789914404, + "loss": 0.4568, + "step": 12129 + }, + { + "epoch": 1.62, + "grad_norm": 0.51171875, + "learning_rate": 0.00014858832020730092, + "loss": 0.2326, + "step": 12130 + }, + { + "epoch": 1.62, + "grad_norm": 0.4140625, + "learning_rate": 0.000148578141856569, + "loss": 0.3177, + "step": 12131 + }, + { + "epoch": 1.62, + "grad_norm": 0.609375, + "learning_rate": 0.00014856796284708632, + "loss": 0.2888, + "step": 12132 + }, + { + "epoch": 1.62, + "grad_norm": 0.6015625, + "learning_rate": 0.0001485577831789909, + "loss": 0.6541, + "step": 12133 + }, + { + "epoch": 1.62, + "grad_norm": 0.5, + "learning_rate": 0.00014854760285242072, + "loss": 0.2632, + "step": 12134 + }, + { + "epoch": 1.62, + "grad_norm": 0.38671875, + "learning_rate": 0.00014853742186751395, + "loss": 0.1753, + "step": 12135 + }, + { + "epoch": 1.62, + "grad_norm": 0.3984375, + "learning_rate": 0.00014852724022440853, + "loss": 0.206, + "step": 12136 + }, + { + "epoch": 1.62, + "grad_norm": 0.482421875, + "learning_rate": 0.00014851705792324265, + "loss": 0.3465, + "step": 12137 + }, + { + "epoch": 1.62, + "grad_norm": 0.515625, + "learning_rate": 0.0001485068749641543, + "loss": 0.3771, + "step": 12138 + }, + { + "epoch": 1.62, + "grad_norm": 0.494140625, + "learning_rate": 0.00014849669134728156, + "loss": 0.2324, + "step": 12139 + }, + { + "epoch": 1.62, + "grad_norm": 0.74609375, + "learning_rate": 0.0001484865070727626, + "loss": 0.5783, + "step": 12140 + }, + { + "epoch": 1.62, + "grad_norm": 0.59765625, + "learning_rate": 0.00014847632214073548, + "loss": 0.5342, + "step": 12141 + }, + { + "epoch": 1.62, + "grad_norm": 0.302734375, + "learning_rate": 0.0001484661365513383, + "loss": 0.1088, + "step": 12142 + }, + { + "epoch": 1.62, + "grad_norm": 0.5859375, + "learning_rate": 0.00014845595030470925, + "loss": 0.615, + "step": 12143 + }, + { + "epoch": 1.62, + "grad_norm": 0.41015625, + "learning_rate": 0.00014844576340098637, + "loss": 0.3762, + "step": 12144 + }, + { + "epoch": 1.62, + "grad_norm": 0.6953125, + "learning_rate": 0.00014843557584030788, + "loss": 0.3923, + "step": 12145 + }, + { + "epoch": 1.62, + "grad_norm": 0.578125, + "learning_rate": 0.00014842538762281188, + "loss": 0.6742, + "step": 12146 + }, + { + "epoch": 1.62, + "grad_norm": 0.63671875, + "learning_rate": 0.00014841519874863655, + "loss": 0.3729, + "step": 12147 + }, + { + "epoch": 1.62, + "grad_norm": 0.5859375, + "learning_rate": 0.00014840500921792002, + "loss": 0.4692, + "step": 12148 + }, + { + "epoch": 1.62, + "grad_norm": 0.3828125, + "learning_rate": 0.00014839481903080053, + "loss": 0.2249, + "step": 12149 + }, + { + "epoch": 1.62, + "grad_norm": 0.62890625, + "learning_rate": 0.00014838462818741622, + "loss": 0.2301, + "step": 12150 + }, + { + "epoch": 1.62, + "grad_norm": 0.71484375, + "learning_rate": 0.00014837443668790532, + "loss": 0.3759, + "step": 12151 + }, + { + "epoch": 1.62, + "grad_norm": 0.7109375, + "learning_rate": 0.000148364244532406, + "loss": 0.3807, + "step": 12152 + }, + { + "epoch": 1.62, + "grad_norm": 0.51953125, + "learning_rate": 0.00014835405172105646, + "loss": 0.5541, + "step": 12153 + }, + { + "epoch": 1.62, + "grad_norm": 0.5703125, + "learning_rate": 0.00014834385825399497, + "loss": 0.4396, + "step": 12154 + }, + { + "epoch": 1.62, + "grad_norm": 0.380859375, + "learning_rate": 0.00014833366413135972, + "loss": 0.3912, + "step": 12155 + }, + { + "epoch": 1.62, + "grad_norm": 0.5, + "learning_rate": 0.000148323469353289, + "loss": 0.4351, + "step": 12156 + }, + { + "epoch": 1.62, + "grad_norm": 0.55859375, + "learning_rate": 0.000148313273919921, + "loss": 0.4993, + "step": 12157 + }, + { + "epoch": 1.62, + "grad_norm": 0.578125, + "learning_rate": 0.000148303077831394, + "loss": 0.4571, + "step": 12158 + }, + { + "epoch": 1.62, + "grad_norm": 0.431640625, + "learning_rate": 0.00014829288108784625, + "loss": 0.4312, + "step": 12159 + }, + { + "epoch": 1.62, + "grad_norm": 0.50390625, + "learning_rate": 0.00014828268368941605, + "loss": 0.3641, + "step": 12160 + }, + { + "epoch": 1.62, + "grad_norm": 0.65625, + "learning_rate": 0.00014827248563624166, + "loss": 0.4955, + "step": 12161 + }, + { + "epoch": 1.62, + "grad_norm": 0.396484375, + "learning_rate": 0.0001482622869284614, + "loss": 0.2945, + "step": 12162 + }, + { + "epoch": 1.62, + "grad_norm": 0.5234375, + "learning_rate": 0.00014825208756621353, + "loss": 0.2842, + "step": 12163 + }, + { + "epoch": 1.62, + "grad_norm": 0.341796875, + "learning_rate": 0.00014824188754963643, + "loss": 0.2132, + "step": 12164 + }, + { + "epoch": 1.62, + "grad_norm": 0.490234375, + "learning_rate": 0.00014823168687886832, + "loss": 0.4549, + "step": 12165 + }, + { + "epoch": 1.62, + "grad_norm": 0.38671875, + "learning_rate": 0.00014822148555404763, + "loss": 0.2467, + "step": 12166 + }, + { + "epoch": 1.62, + "grad_norm": 0.4921875, + "learning_rate": 0.00014821128357531263, + "loss": 0.3461, + "step": 12167 + }, + { + "epoch": 1.62, + "grad_norm": 0.61328125, + "learning_rate": 0.00014820108094280168, + "loss": 0.4181, + "step": 12168 + }, + { + "epoch": 1.62, + "grad_norm": 0.462890625, + "learning_rate": 0.0001481908776566531, + "loss": 0.3489, + "step": 12169 + }, + { + "epoch": 1.62, + "grad_norm": 0.5390625, + "learning_rate": 0.00014818067371700534, + "loss": 0.4435, + "step": 12170 + }, + { + "epoch": 1.62, + "grad_norm": 0.578125, + "learning_rate": 0.00014817046912399671, + "loss": 0.6661, + "step": 12171 + }, + { + "epoch": 1.62, + "grad_norm": 0.66796875, + "learning_rate": 0.0001481602638777656, + "loss": 0.6726, + "step": 12172 + }, + { + "epoch": 1.62, + "grad_norm": 0.58984375, + "learning_rate": 0.00014815005797845038, + "loss": 0.5385, + "step": 12173 + }, + { + "epoch": 1.62, + "grad_norm": 0.39453125, + "learning_rate": 0.00014813985142618955, + "loss": 0.1712, + "step": 12174 + }, + { + "epoch": 1.62, + "grad_norm": 0.48046875, + "learning_rate": 0.00014812964422112134, + "loss": 0.339, + "step": 12175 + }, + { + "epoch": 1.62, + "grad_norm": 0.609375, + "learning_rate": 0.00014811943636338433, + "loss": 0.2581, + "step": 12176 + }, + { + "epoch": 1.62, + "grad_norm": 0.515625, + "learning_rate": 0.00014810922785311685, + "loss": 0.2329, + "step": 12177 + }, + { + "epoch": 1.63, + "grad_norm": 0.48828125, + "learning_rate": 0.00014809901869045734, + "loss": 0.4562, + "step": 12178 + }, + { + "epoch": 1.63, + "grad_norm": 0.578125, + "learning_rate": 0.00014808880887554434, + "loss": 0.4355, + "step": 12179 + }, + { + "epoch": 1.63, + "grad_norm": 0.671875, + "learning_rate": 0.00014807859840851618, + "loss": 0.519, + "step": 12180 + }, + { + "epoch": 1.63, + "grad_norm": 0.392578125, + "learning_rate": 0.00014806838728951137, + "loss": 0.2933, + "step": 12181 + }, + { + "epoch": 1.63, + "grad_norm": 0.4609375, + "learning_rate": 0.00014805817551866838, + "loss": 0.4303, + "step": 12182 + }, + { + "epoch": 1.63, + "grad_norm": 0.65625, + "learning_rate": 0.0001480479630961257, + "loss": 0.6449, + "step": 12183 + }, + { + "epoch": 1.63, + "grad_norm": 0.458984375, + "learning_rate": 0.00014803775002202182, + "loss": 0.1573, + "step": 12184 + }, + { + "epoch": 1.63, + "grad_norm": 0.5390625, + "learning_rate": 0.00014802753629649518, + "loss": 0.3294, + "step": 12185 + }, + { + "epoch": 1.63, + "grad_norm": 0.5625, + "learning_rate": 0.00014801732191968434, + "loss": 0.4011, + "step": 12186 + }, + { + "epoch": 1.63, + "grad_norm": 0.431640625, + "learning_rate": 0.00014800710689172778, + "loss": 0.3454, + "step": 12187 + }, + { + "epoch": 1.63, + "grad_norm": 0.5546875, + "learning_rate": 0.00014799689121276405, + "loss": 0.5516, + "step": 12188 + }, + { + "epoch": 1.63, + "grad_norm": 0.58984375, + "learning_rate": 0.00014798667488293167, + "loss": 0.3573, + "step": 12189 + }, + { + "epoch": 1.63, + "grad_norm": 0.51171875, + "learning_rate": 0.0001479764579023692, + "loss": 0.5158, + "step": 12190 + }, + { + "epoch": 1.63, + "grad_norm": 0.47265625, + "learning_rate": 0.00014796624027121514, + "loss": 0.2844, + "step": 12191 + }, + { + "epoch": 1.63, + "grad_norm": 0.431640625, + "learning_rate": 0.0001479560219896081, + "loss": 0.3046, + "step": 12192 + }, + { + "epoch": 1.63, + "grad_norm": 0.62890625, + "learning_rate": 0.00014794580305768664, + "loss": 0.376, + "step": 12193 + }, + { + "epoch": 1.63, + "grad_norm": 0.38671875, + "learning_rate": 0.0001479355834755893, + "loss": 0.1513, + "step": 12194 + }, + { + "epoch": 1.63, + "grad_norm": 0.5546875, + "learning_rate": 0.00014792536324345466, + "loss": 0.4101, + "step": 12195 + }, + { + "epoch": 1.63, + "grad_norm": 0.451171875, + "learning_rate": 0.00014791514236142135, + "loss": 0.2079, + "step": 12196 + }, + { + "epoch": 1.63, + "grad_norm": 0.416015625, + "learning_rate": 0.00014790492082962796, + "loss": 0.1432, + "step": 12197 + }, + { + "epoch": 1.63, + "grad_norm": 0.5078125, + "learning_rate": 0.0001478946986482131, + "loss": 0.4326, + "step": 12198 + }, + { + "epoch": 1.63, + "grad_norm": 0.36328125, + "learning_rate": 0.0001478844758173154, + "loss": 0.1356, + "step": 12199 + }, + { + "epoch": 1.63, + "grad_norm": 0.58984375, + "learning_rate": 0.00014787425233707348, + "loss": 0.2993, + "step": 12200 + }, + { + "epoch": 1.63, + "grad_norm": 0.609375, + "learning_rate": 0.00014786402820762598, + "loss": 0.459, + "step": 12201 + }, + { + "epoch": 1.63, + "grad_norm": 0.5234375, + "learning_rate": 0.00014785380342911147, + "loss": 0.4299, + "step": 12202 + }, + { + "epoch": 1.63, + "grad_norm": 0.51953125, + "learning_rate": 0.00014784357800166876, + "loss": 0.5249, + "step": 12203 + }, + { + "epoch": 1.63, + "grad_norm": 0.52734375, + "learning_rate": 0.00014783335192543638, + "loss": 0.2554, + "step": 12204 + }, + { + "epoch": 1.63, + "grad_norm": 0.50390625, + "learning_rate": 0.00014782312520055305, + "loss": 0.3223, + "step": 12205 + }, + { + "epoch": 1.63, + "grad_norm": 0.55078125, + "learning_rate": 0.00014781289782715747, + "loss": 0.4639, + "step": 12206 + }, + { + "epoch": 1.63, + "grad_norm": 0.396484375, + "learning_rate": 0.00014780266980538828, + "loss": 0.2483, + "step": 12207 + }, + { + "epoch": 1.63, + "grad_norm": 0.5234375, + "learning_rate": 0.00014779244113538424, + "loss": 0.4557, + "step": 12208 + }, + { + "epoch": 1.63, + "grad_norm": 0.447265625, + "learning_rate": 0.00014778221181728398, + "loss": 0.3475, + "step": 12209 + }, + { + "epoch": 1.63, + "grad_norm": 0.52734375, + "learning_rate": 0.0001477719818512263, + "loss": 0.447, + "step": 12210 + }, + { + "epoch": 1.63, + "grad_norm": 0.41015625, + "learning_rate": 0.00014776175123734989, + "loss": 0.2981, + "step": 12211 + }, + { + "epoch": 1.63, + "grad_norm": 0.5234375, + "learning_rate": 0.00014775151997579346, + "loss": 0.2921, + "step": 12212 + }, + { + "epoch": 1.63, + "grad_norm": 0.5390625, + "learning_rate": 0.00014774128806669578, + "loss": 0.4727, + "step": 12213 + }, + { + "epoch": 1.63, + "grad_norm": 0.5859375, + "learning_rate": 0.00014773105551019557, + "loss": 0.6092, + "step": 12214 + }, + { + "epoch": 1.63, + "grad_norm": 0.36328125, + "learning_rate": 0.00014772082230643166, + "loss": 0.3492, + "step": 12215 + }, + { + "epoch": 1.63, + "grad_norm": 0.69921875, + "learning_rate": 0.00014771058845554274, + "loss": 0.4627, + "step": 12216 + }, + { + "epoch": 1.63, + "grad_norm": 0.47265625, + "learning_rate": 0.0001477003539576676, + "loss": 0.3638, + "step": 12217 + }, + { + "epoch": 1.63, + "grad_norm": 0.578125, + "learning_rate": 0.0001476901188129451, + "loss": 0.2703, + "step": 12218 + }, + { + "epoch": 1.63, + "grad_norm": 0.51953125, + "learning_rate": 0.00014767988302151393, + "loss": 0.2858, + "step": 12219 + }, + { + "epoch": 1.63, + "grad_norm": 0.345703125, + "learning_rate": 0.000147669646583513, + "loss": 0.1903, + "step": 12220 + }, + { + "epoch": 1.63, + "grad_norm": 0.55078125, + "learning_rate": 0.00014765940949908104, + "loss": 0.4765, + "step": 12221 + }, + { + "epoch": 1.63, + "grad_norm": 0.5546875, + "learning_rate": 0.00014764917176835688, + "loss": 0.5534, + "step": 12222 + }, + { + "epoch": 1.63, + "grad_norm": 0.423828125, + "learning_rate": 0.0001476389333914794, + "loss": 0.5761, + "step": 12223 + }, + { + "epoch": 1.63, + "grad_norm": 0.63671875, + "learning_rate": 0.0001476286943685874, + "loss": 0.3252, + "step": 12224 + }, + { + "epoch": 1.63, + "grad_norm": 0.60546875, + "learning_rate": 0.00014761845469981973, + "loss": 0.7223, + "step": 12225 + }, + { + "epoch": 1.63, + "grad_norm": 0.490234375, + "learning_rate": 0.00014760821438531525, + "loss": 0.3918, + "step": 12226 + }, + { + "epoch": 1.63, + "grad_norm": 0.4609375, + "learning_rate": 0.00014759797342521285, + "loss": 0.4615, + "step": 12227 + }, + { + "epoch": 1.63, + "grad_norm": 0.52734375, + "learning_rate": 0.00014758773181965134, + "loss": 0.6617, + "step": 12228 + }, + { + "epoch": 1.63, + "grad_norm": 0.4375, + "learning_rate": 0.00014757748956876972, + "loss": 0.4818, + "step": 12229 + }, + { + "epoch": 1.63, + "grad_norm": 0.546875, + "learning_rate": 0.00014756724667270673, + "loss": 0.2945, + "step": 12230 + }, + { + "epoch": 1.63, + "grad_norm": 0.546875, + "learning_rate": 0.00014755700313160138, + "loss": 0.3573, + "step": 12231 + }, + { + "epoch": 1.63, + "grad_norm": 0.6484375, + "learning_rate": 0.00014754675894559257, + "loss": 0.561, + "step": 12232 + }, + { + "epoch": 1.63, + "grad_norm": 0.828125, + "learning_rate": 0.00014753651411481914, + "loss": 0.3228, + "step": 12233 + }, + { + "epoch": 1.63, + "grad_norm": 0.609375, + "learning_rate": 0.0001475262686394201, + "loss": 0.4211, + "step": 12234 + }, + { + "epoch": 1.63, + "grad_norm": 0.5078125, + "learning_rate": 0.00014751602251953435, + "loss": 0.3612, + "step": 12235 + }, + { + "epoch": 1.63, + "grad_norm": 0.48046875, + "learning_rate": 0.00014750577575530085, + "loss": 0.3306, + "step": 12236 + }, + { + "epoch": 1.63, + "grad_norm": 0.40234375, + "learning_rate": 0.00014749552834685852, + "loss": 0.2827, + "step": 12237 + }, + { + "epoch": 1.63, + "grad_norm": 0.330078125, + "learning_rate": 0.00014748528029434637, + "loss": 0.2437, + "step": 12238 + }, + { + "epoch": 1.63, + "grad_norm": 0.5078125, + "learning_rate": 0.0001474750315979033, + "loss": 0.4819, + "step": 12239 + }, + { + "epoch": 1.63, + "grad_norm": 0.57421875, + "learning_rate": 0.00014746478225766837, + "loss": 0.31, + "step": 12240 + }, + { + "epoch": 1.63, + "grad_norm": 0.3828125, + "learning_rate": 0.0001474545322737805, + "loss": 0.3334, + "step": 12241 + }, + { + "epoch": 1.63, + "grad_norm": 0.427734375, + "learning_rate": 0.00014744428164637877, + "loss": 0.1955, + "step": 12242 + }, + { + "epoch": 1.63, + "grad_norm": 0.43359375, + "learning_rate": 0.0001474340303756021, + "loss": 0.303, + "step": 12243 + }, + { + "epoch": 1.63, + "grad_norm": 0.67578125, + "learning_rate": 0.0001474237784615895, + "loss": 0.4813, + "step": 12244 + }, + { + "epoch": 1.63, + "grad_norm": 0.419921875, + "learning_rate": 0.00014741352590448009, + "loss": 0.3465, + "step": 12245 + }, + { + "epoch": 1.63, + "grad_norm": 0.48046875, + "learning_rate": 0.0001474032727044128, + "loss": 0.3278, + "step": 12246 + }, + { + "epoch": 1.63, + "grad_norm": 0.640625, + "learning_rate": 0.0001473930188615267, + "loss": 0.9053, + "step": 12247 + }, + { + "epoch": 1.63, + "grad_norm": 0.462890625, + "learning_rate": 0.00014738276437596088, + "loss": 0.3968, + "step": 12248 + }, + { + "epoch": 1.63, + "grad_norm": 0.515625, + "learning_rate": 0.00014737250924785436, + "loss": 0.6202, + "step": 12249 + }, + { + "epoch": 1.63, + "grad_norm": 0.58203125, + "learning_rate": 0.00014736225347734618, + "loss": 0.2969, + "step": 12250 + }, + { + "epoch": 1.63, + "grad_norm": 0.408203125, + "learning_rate": 0.00014735199706457545, + "loss": 0.2752, + "step": 12251 + }, + { + "epoch": 1.63, + "grad_norm": 0.62109375, + "learning_rate": 0.0001473417400096813, + "loss": 0.6287, + "step": 12252 + }, + { + "epoch": 1.64, + "grad_norm": 0.40625, + "learning_rate": 0.00014733148231280274, + "loss": 0.2711, + "step": 12253 + }, + { + "epoch": 1.64, + "grad_norm": 0.54296875, + "learning_rate": 0.0001473212239740789, + "loss": 0.434, + "step": 12254 + }, + { + "epoch": 1.64, + "grad_norm": 0.60546875, + "learning_rate": 0.00014731096499364887, + "loss": 0.4424, + "step": 12255 + }, + { + "epoch": 1.64, + "grad_norm": 0.5625, + "learning_rate": 0.00014730070537165182, + "loss": 0.6354, + "step": 12256 + }, + { + "epoch": 1.64, + "grad_norm": 0.56640625, + "learning_rate": 0.00014729044510822684, + "loss": 0.4752, + "step": 12257 + }, + { + "epoch": 1.64, + "grad_norm": 0.67578125, + "learning_rate": 0.00014728018420351308, + "loss": 0.4149, + "step": 12258 + }, + { + "epoch": 1.64, + "grad_norm": 0.71484375, + "learning_rate": 0.0001472699226576497, + "loss": 0.5779, + "step": 12259 + }, + { + "epoch": 1.64, + "grad_norm": 0.8359375, + "learning_rate": 0.00014725966047077576, + "loss": 0.4742, + "step": 12260 + }, + { + "epoch": 1.64, + "grad_norm": 0.412109375, + "learning_rate": 0.00014724939764303055, + "loss": 0.2355, + "step": 12261 + }, + { + "epoch": 1.64, + "grad_norm": 0.6171875, + "learning_rate": 0.00014723913417455319, + "loss": 0.3645, + "step": 12262 + }, + { + "epoch": 1.64, + "grad_norm": 0.4453125, + "learning_rate": 0.00014722887006548283, + "loss": 0.2035, + "step": 12263 + }, + { + "epoch": 1.64, + "grad_norm": 0.48828125, + "learning_rate": 0.0001472186053159587, + "loss": 0.2605, + "step": 12264 + }, + { + "epoch": 1.64, + "grad_norm": 0.65625, + "learning_rate": 0.00014720833992611993, + "loss": 0.5865, + "step": 12265 + }, + { + "epoch": 1.64, + "grad_norm": 0.486328125, + "learning_rate": 0.00014719807389610584, + "loss": 0.2186, + "step": 12266 + }, + { + "epoch": 1.64, + "grad_norm": 0.439453125, + "learning_rate": 0.00014718780722605552, + "loss": 0.3479, + "step": 12267 + }, + { + "epoch": 1.64, + "grad_norm": 0.61328125, + "learning_rate": 0.00014717753991610829, + "loss": 0.2819, + "step": 12268 + }, + { + "epoch": 1.64, + "grad_norm": 0.5546875, + "learning_rate": 0.00014716727196640328, + "loss": 0.2722, + "step": 12269 + }, + { + "epoch": 1.64, + "grad_norm": 0.53515625, + "learning_rate": 0.00014715700337707984, + "loss": 0.516, + "step": 12270 + }, + { + "epoch": 1.64, + "grad_norm": 0.4375, + "learning_rate": 0.00014714673414827718, + "loss": 0.422, + "step": 12271 + }, + { + "epoch": 1.64, + "grad_norm": 0.58203125, + "learning_rate": 0.00014713646428013452, + "loss": 0.2364, + "step": 12272 + }, + { + "epoch": 1.64, + "grad_norm": 0.48046875, + "learning_rate": 0.00014712619377279116, + "loss": 0.2903, + "step": 12273 + }, + { + "epoch": 1.64, + "grad_norm": 0.51953125, + "learning_rate": 0.00014711592262638636, + "loss": 0.7713, + "step": 12274 + }, + { + "epoch": 1.64, + "grad_norm": 0.5390625, + "learning_rate": 0.0001471056508410594, + "loss": 0.396, + "step": 12275 + }, + { + "epoch": 1.64, + "grad_norm": 0.45703125, + "learning_rate": 0.0001470953784169496, + "loss": 0.3937, + "step": 12276 + }, + { + "epoch": 1.64, + "grad_norm": 0.40625, + "learning_rate": 0.00014708510535419627, + "loss": 0.3999, + "step": 12277 + }, + { + "epoch": 1.64, + "grad_norm": 0.41015625, + "learning_rate": 0.00014707483165293865, + "loss": 0.2256, + "step": 12278 + }, + { + "epoch": 1.64, + "grad_norm": 0.38671875, + "learning_rate": 0.00014706455731331613, + "loss": 0.2123, + "step": 12279 + }, + { + "epoch": 1.64, + "grad_norm": 0.484375, + "learning_rate": 0.00014705428233546797, + "loss": 0.4838, + "step": 12280 + }, + { + "epoch": 1.64, + "grad_norm": 0.5, + "learning_rate": 0.00014704400671953355, + "loss": 0.2882, + "step": 12281 + }, + { + "epoch": 1.64, + "grad_norm": 0.439453125, + "learning_rate": 0.00014703373046565224, + "loss": 0.3106, + "step": 12282 + }, + { + "epoch": 1.64, + "grad_norm": 0.58203125, + "learning_rate": 0.00014702345357396333, + "loss": 0.2113, + "step": 12283 + }, + { + "epoch": 1.64, + "grad_norm": 0.474609375, + "learning_rate": 0.00014701317604460623, + "loss": 0.684, + "step": 12284 + }, + { + "epoch": 1.64, + "grad_norm": 0.447265625, + "learning_rate": 0.00014700289787772025, + "loss": 0.2399, + "step": 12285 + }, + { + "epoch": 1.64, + "grad_norm": 0.5625, + "learning_rate": 0.00014699261907344486, + "loss": 0.4851, + "step": 12286 + }, + { + "epoch": 1.64, + "grad_norm": 0.490234375, + "learning_rate": 0.00014698233963191936, + "loss": 0.3515, + "step": 12287 + }, + { + "epoch": 1.64, + "grad_norm": 0.671875, + "learning_rate": 0.00014697205955328322, + "loss": 0.6291, + "step": 12288 + }, + { + "epoch": 1.64, + "grad_norm": 0.48046875, + "learning_rate": 0.00014696177883767575, + "loss": 0.4187, + "step": 12289 + }, + { + "epoch": 1.64, + "grad_norm": 0.3671875, + "learning_rate": 0.00014695149748523647, + "loss": 0.3156, + "step": 12290 + }, + { + "epoch": 1.64, + "grad_norm": 0.44140625, + "learning_rate": 0.00014694121549610473, + "loss": 0.2532, + "step": 12291 + }, + { + "epoch": 1.64, + "grad_norm": 0.578125, + "learning_rate": 0.00014693093287041997, + "loss": 0.6594, + "step": 12292 + }, + { + "epoch": 1.64, + "grad_norm": 0.390625, + "learning_rate": 0.00014692064960832165, + "loss": 0.4116, + "step": 12293 + }, + { + "epoch": 1.64, + "grad_norm": 0.4375, + "learning_rate": 0.0001469103657099492, + "loss": 0.3752, + "step": 12294 + }, + { + "epoch": 1.64, + "grad_norm": 0.375, + "learning_rate": 0.0001469000811754421, + "loss": 0.3153, + "step": 12295 + }, + { + "epoch": 1.64, + "grad_norm": 0.55859375, + "learning_rate": 0.0001468897960049398, + "loss": 0.5152, + "step": 12296 + }, + { + "epoch": 1.64, + "grad_norm": 0.69921875, + "learning_rate": 0.00014687951019858177, + "loss": 0.6414, + "step": 12297 + }, + { + "epoch": 1.64, + "grad_norm": 0.296875, + "learning_rate": 0.00014686922375650748, + "loss": 0.1825, + "step": 12298 + }, + { + "epoch": 1.64, + "grad_norm": 0.373046875, + "learning_rate": 0.00014685893667885645, + "loss": 0.2221, + "step": 12299 + }, + { + "epoch": 1.64, + "grad_norm": 0.484375, + "learning_rate": 0.00014684864896576816, + "loss": 0.2979, + "step": 12300 + }, + { + "epoch": 1.64, + "grad_norm": 0.486328125, + "learning_rate": 0.0001468383606173821, + "loss": 0.3934, + "step": 12301 + }, + { + "epoch": 1.64, + "grad_norm": 0.625, + "learning_rate": 0.00014682807163383788, + "loss": 0.4213, + "step": 12302 + }, + { + "epoch": 1.64, + "grad_norm": 0.44921875, + "learning_rate": 0.00014681778201527487, + "loss": 0.2662, + "step": 12303 + }, + { + "epoch": 1.64, + "grad_norm": 0.56640625, + "learning_rate": 0.00014680749176183274, + "loss": 0.2898, + "step": 12304 + }, + { + "epoch": 1.64, + "grad_norm": 0.58203125, + "learning_rate": 0.00014679720087365096, + "loss": 0.4639, + "step": 12305 + }, + { + "epoch": 1.64, + "grad_norm": 0.6328125, + "learning_rate": 0.00014678690935086911, + "loss": 0.3797, + "step": 12306 + }, + { + "epoch": 1.64, + "grad_norm": 0.58203125, + "learning_rate": 0.0001467766171936267, + "loss": 0.2753, + "step": 12307 + }, + { + "epoch": 1.64, + "grad_norm": 0.54296875, + "learning_rate": 0.00014676632440206342, + "loss": 0.4312, + "step": 12308 + }, + { + "epoch": 1.64, + "grad_norm": 0.546875, + "learning_rate": 0.00014675603097631873, + "loss": 0.4322, + "step": 12309 + }, + { + "epoch": 1.64, + "grad_norm": 0.47265625, + "learning_rate": 0.00014674573691653224, + "loss": 0.5486, + "step": 12310 + }, + { + "epoch": 1.64, + "grad_norm": 0.4140625, + "learning_rate": 0.00014673544222284357, + "loss": 0.1717, + "step": 12311 + }, + { + "epoch": 1.64, + "grad_norm": 0.46875, + "learning_rate": 0.00014672514689539228, + "loss": 0.368, + "step": 12312 + }, + { + "epoch": 1.64, + "grad_norm": 0.6640625, + "learning_rate": 0.00014671485093431804, + "loss": 0.5339, + "step": 12313 + }, + { + "epoch": 1.64, + "grad_norm": 0.380859375, + "learning_rate": 0.00014670455433976045, + "loss": 0.2046, + "step": 12314 + }, + { + "epoch": 1.64, + "grad_norm": 0.486328125, + "learning_rate": 0.00014669425711185912, + "loss": 0.2858, + "step": 12315 + }, + { + "epoch": 1.64, + "grad_norm": 0.365234375, + "learning_rate": 0.00014668395925075366, + "loss": 0.1013, + "step": 12316 + }, + { + "epoch": 1.64, + "grad_norm": 0.6875, + "learning_rate": 0.00014667366075658378, + "loss": 0.2683, + "step": 12317 + }, + { + "epoch": 1.64, + "grad_norm": 0.7265625, + "learning_rate": 0.0001466633616294891, + "loss": 0.3458, + "step": 12318 + }, + { + "epoch": 1.64, + "grad_norm": 0.50390625, + "learning_rate": 0.0001466530618696093, + "loss": 0.3944, + "step": 12319 + }, + { + "epoch": 1.64, + "grad_norm": 0.515625, + "learning_rate": 0.00014664276147708405, + "loss": 0.3643, + "step": 12320 + }, + { + "epoch": 1.64, + "grad_norm": 0.57421875, + "learning_rate": 0.00014663246045205298, + "loss": 0.5451, + "step": 12321 + }, + { + "epoch": 1.64, + "grad_norm": 0.578125, + "learning_rate": 0.00014662215879465587, + "loss": 0.6025, + "step": 12322 + }, + { + "epoch": 1.64, + "grad_norm": 0.29296875, + "learning_rate": 0.0001466118565050323, + "loss": 0.2062, + "step": 12323 + }, + { + "epoch": 1.64, + "grad_norm": 0.4375, + "learning_rate": 0.00014660155358332206, + "loss": 0.2436, + "step": 12324 + }, + { + "epoch": 1.64, + "grad_norm": 0.6171875, + "learning_rate": 0.00014659125002966487, + "loss": 0.3828, + "step": 12325 + }, + { + "epoch": 1.64, + "grad_norm": 0.490234375, + "learning_rate": 0.0001465809458442004, + "loss": 0.328, + "step": 12326 + }, + { + "epoch": 1.64, + "grad_norm": 0.423828125, + "learning_rate": 0.00014657064102706845, + "loss": 0.4672, + "step": 12327 + }, + { + "epoch": 1.65, + "grad_norm": 0.6484375, + "learning_rate": 0.00014656033557840868, + "loss": 0.5128, + "step": 12328 + }, + { + "epoch": 1.65, + "grad_norm": 0.5546875, + "learning_rate": 0.0001465500294983609, + "loss": 0.4534, + "step": 12329 + }, + { + "epoch": 1.65, + "grad_norm": 0.6484375, + "learning_rate": 0.00014653972278706482, + "loss": 0.4225, + "step": 12330 + }, + { + "epoch": 1.65, + "grad_norm": 0.60546875, + "learning_rate": 0.00014652941544466026, + "loss": 0.4102, + "step": 12331 + }, + { + "epoch": 1.65, + "grad_norm": 0.66015625, + "learning_rate": 0.00014651910747128695, + "loss": 0.3007, + "step": 12332 + }, + { + "epoch": 1.65, + "grad_norm": 0.54296875, + "learning_rate": 0.00014650879886708468, + "loss": 0.5935, + "step": 12333 + }, + { + "epoch": 1.65, + "grad_norm": 0.6015625, + "learning_rate": 0.00014649848963219325, + "loss": 0.4848, + "step": 12334 + }, + { + "epoch": 1.65, + "grad_norm": 0.48828125, + "learning_rate": 0.00014648817976675247, + "loss": 0.452, + "step": 12335 + }, + { + "epoch": 1.65, + "grad_norm": 0.515625, + "learning_rate": 0.0001464778692709021, + "loss": 0.3192, + "step": 12336 + }, + { + "epoch": 1.65, + "grad_norm": 0.47265625, + "learning_rate": 0.00014646755814478203, + "loss": 0.3733, + "step": 12337 + }, + { + "epoch": 1.65, + "grad_norm": 0.5625, + "learning_rate": 0.00014645724638853204, + "loss": 0.6018, + "step": 12338 + }, + { + "epoch": 1.65, + "grad_norm": 0.484375, + "learning_rate": 0.00014644693400229198, + "loss": 0.5669, + "step": 12339 + }, + { + "epoch": 1.65, + "grad_norm": 0.470703125, + "learning_rate": 0.00014643662098620167, + "loss": 0.2455, + "step": 12340 + }, + { + "epoch": 1.65, + "grad_norm": 0.61328125, + "learning_rate": 0.000146426307340401, + "loss": 0.6228, + "step": 12341 + }, + { + "epoch": 1.65, + "grad_norm": 0.63671875, + "learning_rate": 0.00014641599306502979, + "loss": 0.5808, + "step": 12342 + }, + { + "epoch": 1.65, + "grad_norm": 0.3671875, + "learning_rate": 0.00014640567816022793, + "loss": 0.2169, + "step": 12343 + }, + { + "epoch": 1.65, + "grad_norm": 0.73046875, + "learning_rate": 0.0001463953626261353, + "loss": 0.5061, + "step": 12344 + }, + { + "epoch": 1.65, + "grad_norm": 0.5703125, + "learning_rate": 0.00014638504646289176, + "loss": 0.6486, + "step": 12345 + }, + { + "epoch": 1.65, + "grad_norm": 0.470703125, + "learning_rate": 0.0001463747296706372, + "loss": 0.3288, + "step": 12346 + }, + { + "epoch": 1.65, + "grad_norm": 0.515625, + "learning_rate": 0.00014636441224951158, + "loss": 0.2472, + "step": 12347 + }, + { + "epoch": 1.65, + "grad_norm": 0.57421875, + "learning_rate": 0.00014635409419965477, + "loss": 0.3939, + "step": 12348 + }, + { + "epoch": 1.65, + "grad_norm": 0.62890625, + "learning_rate": 0.00014634377552120667, + "loss": 0.4779, + "step": 12349 + }, + { + "epoch": 1.65, + "grad_norm": 0.44140625, + "learning_rate": 0.00014633345621430728, + "loss": 0.3099, + "step": 12350 + }, + { + "epoch": 1.65, + "grad_norm": 0.5546875, + "learning_rate": 0.00014632313627909642, + "loss": 0.664, + "step": 12351 + }, + { + "epoch": 1.65, + "grad_norm": 0.5, + "learning_rate": 0.00014631281571571414, + "loss": 0.4833, + "step": 12352 + }, + { + "epoch": 1.65, + "grad_norm": 0.486328125, + "learning_rate": 0.00014630249452430034, + "loss": 0.2277, + "step": 12353 + }, + { + "epoch": 1.65, + "grad_norm": 0.486328125, + "learning_rate": 0.00014629217270499501, + "loss": 0.2645, + "step": 12354 + }, + { + "epoch": 1.65, + "grad_norm": 0.46484375, + "learning_rate": 0.0001462818502579381, + "loss": 0.3223, + "step": 12355 + }, + { + "epoch": 1.65, + "grad_norm": 0.4765625, + "learning_rate": 0.0001462715271832696, + "loss": 0.2477, + "step": 12356 + }, + { + "epoch": 1.65, + "grad_norm": 0.44921875, + "learning_rate": 0.0001462612034811295, + "loss": 0.395, + "step": 12357 + }, + { + "epoch": 1.65, + "grad_norm": 0.447265625, + "learning_rate": 0.00014625087915165778, + "loss": 0.2838, + "step": 12358 + }, + { + "epoch": 1.65, + "grad_norm": 0.50390625, + "learning_rate": 0.00014624055419499448, + "loss": 0.3661, + "step": 12359 + }, + { + "epoch": 1.65, + "grad_norm": 0.26953125, + "learning_rate": 0.00014623022861127955, + "loss": 0.1362, + "step": 12360 + }, + { + "epoch": 1.65, + "grad_norm": 0.39453125, + "learning_rate": 0.00014621990240065307, + "loss": 0.341, + "step": 12361 + }, + { + "epoch": 1.65, + "grad_norm": 0.365234375, + "learning_rate": 0.00014620957556325505, + "loss": 0.2178, + "step": 12362 + }, + { + "epoch": 1.65, + "grad_norm": 0.55859375, + "learning_rate": 0.0001461992480992255, + "loss": 0.6087, + "step": 12363 + }, + { + "epoch": 1.65, + "grad_norm": 0.6484375, + "learning_rate": 0.00014618892000870453, + "loss": 0.4584, + "step": 12364 + }, + { + "epoch": 1.65, + "grad_norm": 0.466796875, + "learning_rate": 0.00014617859129183216, + "loss": 0.5804, + "step": 12365 + }, + { + "epoch": 1.65, + "grad_norm": 0.52734375, + "learning_rate": 0.00014616826194874846, + "loss": 0.5216, + "step": 12366 + }, + { + "epoch": 1.65, + "grad_norm": 0.490234375, + "learning_rate": 0.00014615793197959346, + "loss": 0.4257, + "step": 12367 + }, + { + "epoch": 1.65, + "grad_norm": 0.486328125, + "learning_rate": 0.00014614760138450733, + "loss": 0.2961, + "step": 12368 + }, + { + "epoch": 1.65, + "grad_norm": 0.46875, + "learning_rate": 0.0001461372701636301, + "loss": 0.319, + "step": 12369 + }, + { + "epoch": 1.65, + "grad_norm": 0.435546875, + "learning_rate": 0.00014612693831710184, + "loss": 0.2508, + "step": 12370 + }, + { + "epoch": 1.65, + "grad_norm": 0.40625, + "learning_rate": 0.0001461166058450627, + "loss": 0.3166, + "step": 12371 + }, + { + "epoch": 1.65, + "grad_norm": 0.46875, + "learning_rate": 0.0001461062727476528, + "loss": 0.4974, + "step": 12372 + }, + { + "epoch": 1.65, + "grad_norm": 0.49609375, + "learning_rate": 0.00014609593902501224, + "loss": 0.4824, + "step": 12373 + }, + { + "epoch": 1.65, + "grad_norm": 0.6171875, + "learning_rate": 0.00014608560467728115, + "loss": 0.6214, + "step": 12374 + }, + { + "epoch": 1.65, + "grad_norm": 0.4375, + "learning_rate": 0.00014607526970459976, + "loss": 0.2888, + "step": 12375 + }, + { + "epoch": 1.65, + "grad_norm": 0.490234375, + "learning_rate": 0.00014606493410710807, + "loss": 0.2307, + "step": 12376 + }, + { + "epoch": 1.65, + "grad_norm": 0.431640625, + "learning_rate": 0.0001460545978849463, + "loss": 0.3587, + "step": 12377 + }, + { + "epoch": 1.65, + "grad_norm": 0.359375, + "learning_rate": 0.00014604426103825466, + "loss": 0.1179, + "step": 12378 + }, + { + "epoch": 1.65, + "grad_norm": 0.51171875, + "learning_rate": 0.0001460339235671733, + "loss": 0.5016, + "step": 12379 + }, + { + "epoch": 1.65, + "grad_norm": 0.66015625, + "learning_rate": 0.00014602358547184235, + "loss": 0.4561, + "step": 12380 + }, + { + "epoch": 1.65, + "grad_norm": 0.61328125, + "learning_rate": 0.00014601324675240208, + "loss": 0.3452, + "step": 12381 + }, + { + "epoch": 1.65, + "grad_norm": 0.5625, + "learning_rate": 0.00014600290740899262, + "loss": 0.3295, + "step": 12382 + }, + { + "epoch": 1.65, + "grad_norm": 0.68359375, + "learning_rate": 0.00014599256744175424, + "loss": 0.3054, + "step": 12383 + }, + { + "epoch": 1.65, + "grad_norm": 0.65625, + "learning_rate": 0.00014598222685082712, + "loss": 0.5034, + "step": 12384 + }, + { + "epoch": 1.65, + "grad_norm": 0.40234375, + "learning_rate": 0.00014597188563635148, + "loss": 0.4121, + "step": 12385 + }, + { + "epoch": 1.65, + "grad_norm": 0.390625, + "learning_rate": 0.00014596154379846755, + "loss": 0.2494, + "step": 12386 + }, + { + "epoch": 1.65, + "grad_norm": 0.56640625, + "learning_rate": 0.00014595120133731565, + "loss": 0.4963, + "step": 12387 + }, + { + "epoch": 1.65, + "grad_norm": 0.53515625, + "learning_rate": 0.00014594085825303592, + "loss": 0.282, + "step": 12388 + }, + { + "epoch": 1.65, + "grad_norm": 0.51171875, + "learning_rate": 0.00014593051454576868, + "loss": 0.1912, + "step": 12389 + }, + { + "epoch": 1.65, + "grad_norm": 0.6640625, + "learning_rate": 0.00014592017021565417, + "loss": 0.2382, + "step": 12390 + }, + { + "epoch": 1.65, + "grad_norm": 0.58203125, + "learning_rate": 0.0001459098252628327, + "loss": 0.4583, + "step": 12391 + }, + { + "epoch": 1.65, + "grad_norm": 0.5390625, + "learning_rate": 0.00014589947968744453, + "loss": 0.4164, + "step": 12392 + }, + { + "epoch": 1.65, + "grad_norm": 0.419921875, + "learning_rate": 0.00014588913348962998, + "loss": 0.3455, + "step": 12393 + }, + { + "epoch": 1.65, + "grad_norm": 0.640625, + "learning_rate": 0.0001458787866695293, + "loss": 0.4983, + "step": 12394 + }, + { + "epoch": 1.65, + "grad_norm": 0.44140625, + "learning_rate": 0.00014586843922728283, + "loss": 0.3883, + "step": 12395 + }, + { + "epoch": 1.65, + "grad_norm": 0.51171875, + "learning_rate": 0.00014585809116303086, + "loss": 0.4756, + "step": 12396 + }, + { + "epoch": 1.65, + "grad_norm": 0.4921875, + "learning_rate": 0.00014584774247691377, + "loss": 0.4053, + "step": 12397 + }, + { + "epoch": 1.65, + "grad_norm": 0.51171875, + "learning_rate": 0.00014583739316907188, + "loss": 0.3768, + "step": 12398 + }, + { + "epoch": 1.65, + "grad_norm": 0.6015625, + "learning_rate": 0.00014582704323964552, + "loss": 0.244, + "step": 12399 + }, + { + "epoch": 1.65, + "grad_norm": 0.455078125, + "learning_rate": 0.00014581669268877505, + "loss": 0.5803, + "step": 12400 + }, + { + "epoch": 1.65, + "grad_norm": 0.640625, + "learning_rate": 0.00014580634151660079, + "loss": 0.3543, + "step": 12401 + }, + { + "epoch": 1.65, + "grad_norm": 0.447265625, + "learning_rate": 0.00014579598972326317, + "loss": 0.1654, + "step": 12402 + }, + { + "epoch": 1.66, + "grad_norm": 0.498046875, + "learning_rate": 0.00014578563730890248, + "loss": 0.3551, + "step": 12403 + }, + { + "epoch": 1.66, + "grad_norm": 0.478515625, + "learning_rate": 0.00014577528427365922, + "loss": 0.302, + "step": 12404 + }, + { + "epoch": 1.66, + "grad_norm": 0.51953125, + "learning_rate": 0.0001457649306176737, + "loss": 0.4248, + "step": 12405 + }, + { + "epoch": 1.66, + "grad_norm": 0.49609375, + "learning_rate": 0.00014575457634108635, + "loss": 0.3033, + "step": 12406 + }, + { + "epoch": 1.66, + "grad_norm": 0.447265625, + "learning_rate": 0.00014574422144403762, + "loss": 0.198, + "step": 12407 + }, + { + "epoch": 1.66, + "grad_norm": 0.5546875, + "learning_rate": 0.00014573386592666785, + "loss": 0.4108, + "step": 12408 + }, + { + "epoch": 1.66, + "grad_norm": 0.447265625, + "learning_rate": 0.0001457235097891175, + "loss": 0.6034, + "step": 12409 + }, + { + "epoch": 1.66, + "grad_norm": 0.5625, + "learning_rate": 0.00014571315303152704, + "loss": 0.5599, + "step": 12410 + }, + { + "epoch": 1.66, + "grad_norm": 0.4375, + "learning_rate": 0.00014570279565403687, + "loss": 0.2813, + "step": 12411 + }, + { + "epoch": 1.66, + "grad_norm": 0.6796875, + "learning_rate": 0.00014569243765678747, + "loss": 0.7732, + "step": 12412 + }, + { + "epoch": 1.66, + "grad_norm": 0.458984375, + "learning_rate": 0.00014568207903991925, + "loss": 0.3796, + "step": 12413 + }, + { + "epoch": 1.66, + "grad_norm": 0.41015625, + "learning_rate": 0.00014567171980357278, + "loss": 0.1656, + "step": 12414 + }, + { + "epoch": 1.66, + "grad_norm": 0.5234375, + "learning_rate": 0.00014566135994788844, + "loss": 0.2354, + "step": 12415 + }, + { + "epoch": 1.66, + "grad_norm": 0.6015625, + "learning_rate": 0.0001456509994730068, + "loss": 0.568, + "step": 12416 + }, + { + "epoch": 1.66, + "grad_norm": 0.5390625, + "learning_rate": 0.00014564063837906824, + "loss": 0.5034, + "step": 12417 + }, + { + "epoch": 1.66, + "grad_norm": 0.875, + "learning_rate": 0.00014563027666621338, + "loss": 0.3428, + "step": 12418 + }, + { + "epoch": 1.66, + "grad_norm": 0.53125, + "learning_rate": 0.00014561991433458267, + "loss": 0.4435, + "step": 12419 + }, + { + "epoch": 1.66, + "grad_norm": 0.3515625, + "learning_rate": 0.00014560955138431662, + "loss": 0.2087, + "step": 12420 + }, + { + "epoch": 1.66, + "grad_norm": 0.412109375, + "learning_rate": 0.0001455991878155558, + "loss": 0.2611, + "step": 12421 + }, + { + "epoch": 1.66, + "grad_norm": 0.37890625, + "learning_rate": 0.00014558882362844072, + "loss": 0.3867, + "step": 12422 + }, + { + "epoch": 1.66, + "grad_norm": 0.5546875, + "learning_rate": 0.00014557845882311196, + "loss": 0.2778, + "step": 12423 + }, + { + "epoch": 1.66, + "grad_norm": 0.54296875, + "learning_rate": 0.00014556809339971003, + "loss": 0.2987, + "step": 12424 + }, + { + "epoch": 1.66, + "grad_norm": 0.466796875, + "learning_rate": 0.0001455577273583755, + "loss": 0.2303, + "step": 12425 + }, + { + "epoch": 1.66, + "grad_norm": 0.55078125, + "learning_rate": 0.00014554736069924895, + "loss": 0.3584, + "step": 12426 + }, + { + "epoch": 1.66, + "grad_norm": 0.458984375, + "learning_rate": 0.00014553699342247098, + "loss": 0.2683, + "step": 12427 + }, + { + "epoch": 1.66, + "grad_norm": 0.515625, + "learning_rate": 0.0001455266255281821, + "loss": 0.353, + "step": 12428 + }, + { + "epoch": 1.66, + "grad_norm": 0.53515625, + "learning_rate": 0.00014551625701652302, + "loss": 0.4421, + "step": 12429 + }, + { + "epoch": 1.66, + "grad_norm": 0.294921875, + "learning_rate": 0.00014550588788763428, + "loss": 0.1673, + "step": 12430 + }, + { + "epoch": 1.66, + "grad_norm": 0.4453125, + "learning_rate": 0.00014549551814165644, + "loss": 0.3586, + "step": 12431 + }, + { + "epoch": 1.66, + "grad_norm": 0.34375, + "learning_rate": 0.00014548514777873022, + "loss": 0.251, + "step": 12432 + }, + { + "epoch": 1.66, + "grad_norm": 0.474609375, + "learning_rate": 0.00014547477679899618, + "loss": 0.2956, + "step": 12433 + }, + { + "epoch": 1.66, + "grad_norm": 0.455078125, + "learning_rate": 0.000145464405202595, + "loss": 0.4147, + "step": 12434 + }, + { + "epoch": 1.66, + "grad_norm": 0.52734375, + "learning_rate": 0.00014545403298966728, + "loss": 0.4451, + "step": 12435 + }, + { + "epoch": 1.66, + "grad_norm": 0.51953125, + "learning_rate": 0.00014544366016035375, + "loss": 0.5253, + "step": 12436 + }, + { + "epoch": 1.66, + "grad_norm": 0.41015625, + "learning_rate": 0.00014543328671479496, + "loss": 0.2989, + "step": 12437 + }, + { + "epoch": 1.66, + "grad_norm": 0.6328125, + "learning_rate": 0.00014542291265313168, + "loss": 0.4153, + "step": 12438 + }, + { + "epoch": 1.66, + "grad_norm": 0.353515625, + "learning_rate": 0.00014541253797550457, + "loss": 0.2508, + "step": 12439 + }, + { + "epoch": 1.66, + "grad_norm": 0.48828125, + "learning_rate": 0.00014540216268205423, + "loss": 0.3699, + "step": 12440 + }, + { + "epoch": 1.66, + "grad_norm": 0.53515625, + "learning_rate": 0.0001453917867729215, + "loss": 0.5643, + "step": 12441 + }, + { + "epoch": 1.66, + "grad_norm": 0.625, + "learning_rate": 0.00014538141024824698, + "loss": 0.3911, + "step": 12442 + }, + { + "epoch": 1.66, + "grad_norm": 0.5546875, + "learning_rate": 0.0001453710331081714, + "loss": 0.5372, + "step": 12443 + }, + { + "epoch": 1.66, + "grad_norm": 0.85546875, + "learning_rate": 0.0001453606553528355, + "loss": 0.4893, + "step": 12444 + }, + { + "epoch": 1.66, + "grad_norm": 0.52734375, + "learning_rate": 0.00014535027698238002, + "loss": 0.3215, + "step": 12445 + }, + { + "epoch": 1.66, + "grad_norm": 0.5234375, + "learning_rate": 0.00014533989799694568, + "loss": 0.4336, + "step": 12446 + }, + { + "epoch": 1.66, + "grad_norm": 0.47265625, + "learning_rate": 0.0001453295183966732, + "loss": 0.273, + "step": 12447 + }, + { + "epoch": 1.66, + "grad_norm": 0.55859375, + "learning_rate": 0.00014531913818170337, + "loss": 0.4121, + "step": 12448 + }, + { + "epoch": 1.66, + "grad_norm": 0.5234375, + "learning_rate": 0.00014530875735217693, + "loss": 0.2305, + "step": 12449 + }, + { + "epoch": 1.66, + "grad_norm": 0.490234375, + "learning_rate": 0.00014529837590823467, + "loss": 0.3906, + "step": 12450 + }, + { + "epoch": 1.66, + "grad_norm": 0.55859375, + "learning_rate": 0.00014528799385001733, + "loss": 0.4489, + "step": 12451 + }, + { + "epoch": 1.66, + "grad_norm": 0.578125, + "learning_rate": 0.0001452776111776658, + "loss": 0.3629, + "step": 12452 + }, + { + "epoch": 1.66, + "grad_norm": 0.458984375, + "learning_rate": 0.00014526722789132075, + "loss": 0.4101, + "step": 12453 + }, + { + "epoch": 1.66, + "grad_norm": 0.6328125, + "learning_rate": 0.00014525684399112305, + "loss": 0.4339, + "step": 12454 + }, + { + "epoch": 1.66, + "grad_norm": 0.380859375, + "learning_rate": 0.0001452464594772135, + "loss": 0.2406, + "step": 12455 + }, + { + "epoch": 1.66, + "grad_norm": 0.462890625, + "learning_rate": 0.0001452360743497329, + "loss": 0.3031, + "step": 12456 + }, + { + "epoch": 1.66, + "grad_norm": 0.3671875, + "learning_rate": 0.00014522568860882214, + "loss": 0.1986, + "step": 12457 + }, + { + "epoch": 1.66, + "grad_norm": 0.5078125, + "learning_rate": 0.00014521530225462196, + "loss": 0.3052, + "step": 12458 + }, + { + "epoch": 1.66, + "grad_norm": 0.353515625, + "learning_rate": 0.00014520491528727332, + "loss": 0.3255, + "step": 12459 + }, + { + "epoch": 1.66, + "grad_norm": 0.47265625, + "learning_rate": 0.000145194527706917, + "loss": 0.2476, + "step": 12460 + }, + { + "epoch": 1.66, + "grad_norm": 0.58984375, + "learning_rate": 0.00014518413951369386, + "loss": 0.3723, + "step": 12461 + }, + { + "epoch": 1.66, + "grad_norm": 0.443359375, + "learning_rate": 0.00014517375070774482, + "loss": 0.3864, + "step": 12462 + }, + { + "epoch": 1.66, + "grad_norm": 0.451171875, + "learning_rate": 0.0001451633612892107, + "loss": 0.3077, + "step": 12463 + }, + { + "epoch": 1.66, + "grad_norm": 0.4921875, + "learning_rate": 0.0001451529712582324, + "loss": 0.4618, + "step": 12464 + }, + { + "epoch": 1.66, + "grad_norm": 0.51171875, + "learning_rate": 0.00014514258061495085, + "loss": 0.614, + "step": 12465 + }, + { + "epoch": 1.66, + "grad_norm": 0.52734375, + "learning_rate": 0.00014513218935950696, + "loss": 0.3856, + "step": 12466 + }, + { + "epoch": 1.66, + "grad_norm": 0.462890625, + "learning_rate": 0.0001451217974920416, + "loss": 0.2361, + "step": 12467 + }, + { + "epoch": 1.66, + "grad_norm": 0.5390625, + "learning_rate": 0.0001451114050126957, + "loss": 0.3413, + "step": 12468 + }, + { + "epoch": 1.66, + "grad_norm": 0.609375, + "learning_rate": 0.00014510101192161018, + "loss": 0.6724, + "step": 12469 + }, + { + "epoch": 1.66, + "grad_norm": 0.609375, + "learning_rate": 0.000145090618218926, + "loss": 0.3586, + "step": 12470 + }, + { + "epoch": 1.66, + "grad_norm": 0.69140625, + "learning_rate": 0.00014508022390478413, + "loss": 0.564, + "step": 12471 + }, + { + "epoch": 1.66, + "grad_norm": 0.68359375, + "learning_rate": 0.00014506982897932548, + "loss": 0.6655, + "step": 12472 + }, + { + "epoch": 1.66, + "grad_norm": 0.439453125, + "learning_rate": 0.000145059433442691, + "loss": 0.2735, + "step": 12473 + }, + { + "epoch": 1.66, + "grad_norm": 0.5859375, + "learning_rate": 0.0001450490372950217, + "loss": 0.3908, + "step": 12474 + }, + { + "epoch": 1.66, + "grad_norm": 0.484375, + "learning_rate": 0.00014503864053645854, + "loss": 0.4019, + "step": 12475 + }, + { + "epoch": 1.66, + "grad_norm": 0.51171875, + "learning_rate": 0.00014502824316714252, + "loss": 0.3372, + "step": 12476 + }, + { + "epoch": 1.66, + "grad_norm": 0.6015625, + "learning_rate": 0.00014501784518721462, + "loss": 0.4312, + "step": 12477 + }, + { + "epoch": 1.67, + "grad_norm": 0.41015625, + "learning_rate": 0.00014500744659681585, + "loss": 0.2834, + "step": 12478 + }, + { + "epoch": 1.67, + "grad_norm": 0.44921875, + "learning_rate": 0.0001449970473960872, + "loss": 0.3403, + "step": 12479 + }, + { + "epoch": 1.67, + "grad_norm": 0.50390625, + "learning_rate": 0.0001449866475851697, + "loss": 0.5779, + "step": 12480 + }, + { + "epoch": 1.67, + "grad_norm": 0.50390625, + "learning_rate": 0.0001449762471642044, + "loss": 0.5033, + "step": 12481 + }, + { + "epoch": 1.67, + "grad_norm": 0.380859375, + "learning_rate": 0.00014496584613333234, + "loss": 0.1996, + "step": 12482 + }, + { + "epoch": 1.67, + "grad_norm": 0.55859375, + "learning_rate": 0.00014495544449269452, + "loss": 0.3604, + "step": 12483 + }, + { + "epoch": 1.67, + "grad_norm": 0.458984375, + "learning_rate": 0.00014494504224243205, + "loss": 0.3228, + "step": 12484 + }, + { + "epoch": 1.67, + "grad_norm": 0.6015625, + "learning_rate": 0.00014493463938268595, + "loss": 0.5135, + "step": 12485 + }, + { + "epoch": 1.67, + "grad_norm": 0.53125, + "learning_rate": 0.00014492423591359728, + "loss": 0.599, + "step": 12486 + }, + { + "epoch": 1.67, + "grad_norm": 0.59375, + "learning_rate": 0.00014491383183530716, + "loss": 0.4334, + "step": 12487 + }, + { + "epoch": 1.67, + "grad_norm": 0.66015625, + "learning_rate": 0.00014490342714795665, + "loss": 0.3836, + "step": 12488 + }, + { + "epoch": 1.67, + "grad_norm": 0.65625, + "learning_rate": 0.00014489302185168685, + "loss": 0.3137, + "step": 12489 + }, + { + "epoch": 1.67, + "grad_norm": 0.5703125, + "learning_rate": 0.00014488261594663887, + "loss": 0.3741, + "step": 12490 + }, + { + "epoch": 1.67, + "grad_norm": 0.5546875, + "learning_rate": 0.0001448722094329538, + "loss": 0.5164, + "step": 12491 + }, + { + "epoch": 1.67, + "grad_norm": 0.5703125, + "learning_rate": 0.00014486180231077278, + "loss": 0.3952, + "step": 12492 + }, + { + "epoch": 1.67, + "grad_norm": 0.875, + "learning_rate": 0.0001448513945802369, + "loss": 0.442, + "step": 12493 + }, + { + "epoch": 1.67, + "grad_norm": 0.3828125, + "learning_rate": 0.00014484098624148735, + "loss": 0.2674, + "step": 12494 + }, + { + "epoch": 1.67, + "grad_norm": 0.390625, + "learning_rate": 0.00014483057729466526, + "loss": 0.1601, + "step": 12495 + }, + { + "epoch": 1.67, + "grad_norm": 0.44921875, + "learning_rate": 0.00014482016773991176, + "loss": 0.2599, + "step": 12496 + }, + { + "epoch": 1.67, + "grad_norm": 0.55078125, + "learning_rate": 0.000144809757577368, + "loss": 0.482, + "step": 12497 + }, + { + "epoch": 1.67, + "grad_norm": 0.55859375, + "learning_rate": 0.0001447993468071752, + "loss": 0.6178, + "step": 12498 + }, + { + "epoch": 1.67, + "grad_norm": 0.64453125, + "learning_rate": 0.00014478893542947446, + "loss": 0.6764, + "step": 12499 + }, + { + "epoch": 1.67, + "grad_norm": 0.50390625, + "learning_rate": 0.00014477852344440704, + "loss": 0.3567, + "step": 12500 + }, + { + "epoch": 1.67, + "grad_norm": 0.6953125, + "learning_rate": 0.00014476811085211412, + "loss": 0.3699, + "step": 12501 + }, + { + "epoch": 1.67, + "grad_norm": 0.44921875, + "learning_rate": 0.00014475769765273685, + "loss": 0.3658, + "step": 12502 + }, + { + "epoch": 1.67, + "grad_norm": 0.384765625, + "learning_rate": 0.0001447472838464165, + "loss": 0.3064, + "step": 12503 + }, + { + "epoch": 1.67, + "grad_norm": 0.462890625, + "learning_rate": 0.00014473686943329426, + "loss": 0.3521, + "step": 12504 + }, + { + "epoch": 1.67, + "grad_norm": 0.734375, + "learning_rate": 0.00014472645441351135, + "loss": 0.5033, + "step": 12505 + }, + { + "epoch": 1.67, + "grad_norm": 0.5, + "learning_rate": 0.00014471603878720901, + "loss": 0.477, + "step": 12506 + }, + { + "epoch": 1.67, + "grad_norm": 0.478515625, + "learning_rate": 0.0001447056225545285, + "loss": 0.1648, + "step": 12507 + }, + { + "epoch": 1.67, + "grad_norm": 0.4609375, + "learning_rate": 0.00014469520571561107, + "loss": 0.3673, + "step": 12508 + }, + { + "epoch": 1.67, + "grad_norm": 0.671875, + "learning_rate": 0.0001446847882705979, + "loss": 0.3552, + "step": 12509 + }, + { + "epoch": 1.67, + "grad_norm": 0.56640625, + "learning_rate": 0.0001446743702196304, + "loss": 0.4474, + "step": 12510 + }, + { + "epoch": 1.67, + "grad_norm": 0.5546875, + "learning_rate": 0.0001446639515628497, + "loss": 0.4938, + "step": 12511 + }, + { + "epoch": 1.67, + "grad_norm": 0.515625, + "learning_rate": 0.00014465353230039719, + "loss": 0.3851, + "step": 12512 + }, + { + "epoch": 1.67, + "grad_norm": 0.455078125, + "learning_rate": 0.00014464311243241412, + "loss": 0.2333, + "step": 12513 + }, + { + "epoch": 1.67, + "grad_norm": 0.4453125, + "learning_rate": 0.0001446326919590418, + "loss": 0.2029, + "step": 12514 + }, + { + "epoch": 1.67, + "grad_norm": 0.37109375, + "learning_rate": 0.00014462227088042154, + "loss": 0.2827, + "step": 12515 + }, + { + "epoch": 1.67, + "grad_norm": 0.58984375, + "learning_rate": 0.00014461184919669463, + "loss": 0.333, + "step": 12516 + }, + { + "epoch": 1.67, + "grad_norm": 0.609375, + "learning_rate": 0.0001446014269080024, + "loss": 0.4163, + "step": 12517 + }, + { + "epoch": 1.67, + "grad_norm": 0.515625, + "learning_rate": 0.0001445910040144862, + "loss": 0.43, + "step": 12518 + }, + { + "epoch": 1.67, + "grad_norm": 0.5625, + "learning_rate": 0.00014458058051628739, + "loss": 0.4836, + "step": 12519 + }, + { + "epoch": 1.67, + "grad_norm": 0.5, + "learning_rate": 0.0001445701564135473, + "loss": 0.3907, + "step": 12520 + }, + { + "epoch": 1.67, + "grad_norm": 0.39453125, + "learning_rate": 0.00014455973170640727, + "loss": 0.1636, + "step": 12521 + }, + { + "epoch": 1.67, + "grad_norm": 0.55859375, + "learning_rate": 0.00014454930639500868, + "loss": 0.6403, + "step": 12522 + }, + { + "epoch": 1.67, + "grad_norm": 0.341796875, + "learning_rate": 0.00014453888047949288, + "loss": 0.3172, + "step": 12523 + }, + { + "epoch": 1.67, + "grad_norm": 0.71484375, + "learning_rate": 0.0001445284539600013, + "loss": 0.2995, + "step": 12524 + }, + { + "epoch": 1.67, + "grad_norm": 0.408203125, + "learning_rate": 0.00014451802683667533, + "loss": 0.1723, + "step": 12525 + }, + { + "epoch": 1.67, + "grad_norm": 0.5859375, + "learning_rate": 0.00014450759910965633, + "loss": 0.4011, + "step": 12526 + }, + { + "epoch": 1.67, + "grad_norm": 0.322265625, + "learning_rate": 0.0001444971707790857, + "loss": 0.2131, + "step": 12527 + }, + { + "epoch": 1.67, + "grad_norm": 0.46875, + "learning_rate": 0.0001444867418451049, + "loss": 0.4762, + "step": 12528 + }, + { + "epoch": 1.67, + "grad_norm": 0.50390625, + "learning_rate": 0.00014447631230785527, + "loss": 0.3097, + "step": 12529 + }, + { + "epoch": 1.67, + "grad_norm": 0.52734375, + "learning_rate": 0.0001444658821674784, + "loss": 0.374, + "step": 12530 + }, + { + "epoch": 1.67, + "grad_norm": 1.71875, + "learning_rate": 0.00014445545142411555, + "loss": 0.6422, + "step": 12531 + }, + { + "epoch": 1.67, + "grad_norm": 0.494140625, + "learning_rate": 0.00014444502007790827, + "loss": 0.3417, + "step": 12532 + }, + { + "epoch": 1.67, + "grad_norm": 1.1171875, + "learning_rate": 0.00014443458812899798, + "loss": 0.5287, + "step": 12533 + }, + { + "epoch": 1.67, + "grad_norm": 0.4296875, + "learning_rate": 0.0001444241555775262, + "loss": 0.3298, + "step": 12534 + }, + { + "epoch": 1.67, + "grad_norm": 0.50390625, + "learning_rate": 0.00014441372242363434, + "loss": 0.4208, + "step": 12535 + }, + { + "epoch": 1.67, + "grad_norm": 0.53515625, + "learning_rate": 0.00014440328866746385, + "loss": 0.5537, + "step": 12536 + }, + { + "epoch": 1.67, + "grad_norm": 0.41796875, + "learning_rate": 0.00014439285430915635, + "loss": 0.3634, + "step": 12537 + }, + { + "epoch": 1.67, + "grad_norm": 0.5625, + "learning_rate": 0.0001443824193488532, + "loss": 0.6757, + "step": 12538 + }, + { + "epoch": 1.67, + "grad_norm": 0.32421875, + "learning_rate": 0.00014437198378669598, + "loss": 0.2591, + "step": 12539 + }, + { + "epoch": 1.67, + "grad_norm": 0.53125, + "learning_rate": 0.00014436154762282618, + "loss": 0.2352, + "step": 12540 + }, + { + "epoch": 1.67, + "grad_norm": 0.408203125, + "learning_rate": 0.00014435111085738534, + "loss": 0.3636, + "step": 12541 + }, + { + "epoch": 1.67, + "grad_norm": 0.578125, + "learning_rate": 0.00014434067349051492, + "loss": 0.3589, + "step": 12542 + }, + { + "epoch": 1.67, + "grad_norm": 0.59765625, + "learning_rate": 0.00014433023552235655, + "loss": 0.3288, + "step": 12543 + }, + { + "epoch": 1.67, + "grad_norm": 0.421875, + "learning_rate": 0.00014431979695305174, + "loss": 0.168, + "step": 12544 + }, + { + "epoch": 1.67, + "grad_norm": 0.455078125, + "learning_rate": 0.000144309357782742, + "loss": 0.2612, + "step": 12545 + }, + { + "epoch": 1.67, + "grad_norm": 0.42578125, + "learning_rate": 0.00014429891801156898, + "loss": 0.2082, + "step": 12546 + }, + { + "epoch": 1.67, + "grad_norm": 0.53515625, + "learning_rate": 0.00014428847763967416, + "loss": 0.6382, + "step": 12547 + }, + { + "epoch": 1.67, + "grad_norm": 0.306640625, + "learning_rate": 0.0001442780366671992, + "loss": 0.2119, + "step": 12548 + }, + { + "epoch": 1.67, + "grad_norm": 0.640625, + "learning_rate": 0.0001442675950942856, + "loss": 0.4858, + "step": 12549 + }, + { + "epoch": 1.67, + "grad_norm": 0.546875, + "learning_rate": 0.000144257152921075, + "loss": 0.4154, + "step": 12550 + }, + { + "epoch": 1.67, + "grad_norm": 0.4296875, + "learning_rate": 0.00014424671014770906, + "loss": 0.4088, + "step": 12551 + }, + { + "epoch": 1.67, + "grad_norm": 0.51953125, + "learning_rate": 0.00014423626677432928, + "loss": 0.4302, + "step": 12552 + }, + { + "epoch": 1.68, + "grad_norm": 0.5234375, + "learning_rate": 0.00014422582280107738, + "loss": 0.4899, + "step": 12553 + }, + { + "epoch": 1.68, + "grad_norm": 0.5703125, + "learning_rate": 0.00014421537822809487, + "loss": 0.484, + "step": 12554 + }, + { + "epoch": 1.68, + "grad_norm": 0.88671875, + "learning_rate": 0.00014420493305552351, + "loss": 0.3736, + "step": 12555 + }, + { + "epoch": 1.68, + "grad_norm": 0.369140625, + "learning_rate": 0.00014419448728350486, + "loss": 0.2133, + "step": 12556 + }, + { + "epoch": 1.68, + "grad_norm": 0.48046875, + "learning_rate": 0.00014418404091218063, + "loss": 0.3869, + "step": 12557 + }, + { + "epoch": 1.68, + "grad_norm": 0.65625, + "learning_rate": 0.00014417359394169242, + "loss": 0.4508, + "step": 12558 + }, + { + "epoch": 1.68, + "grad_norm": 0.380859375, + "learning_rate": 0.00014416314637218192, + "loss": 0.2133, + "step": 12559 + }, + { + "epoch": 1.68, + "grad_norm": 0.5625, + "learning_rate": 0.00014415269820379083, + "loss": 0.4313, + "step": 12560 + }, + { + "epoch": 1.68, + "grad_norm": 0.4375, + "learning_rate": 0.0001441422494366608, + "loss": 0.3333, + "step": 12561 + }, + { + "epoch": 1.68, + "grad_norm": 0.484375, + "learning_rate": 0.00014413180007093356, + "loss": 0.5681, + "step": 12562 + }, + { + "epoch": 1.68, + "grad_norm": 0.482421875, + "learning_rate": 0.00014412135010675079, + "loss": 0.6527, + "step": 12563 + }, + { + "epoch": 1.68, + "grad_norm": 0.6796875, + "learning_rate": 0.00014411089954425416, + "loss": 0.4793, + "step": 12564 + }, + { + "epoch": 1.68, + "grad_norm": 0.5703125, + "learning_rate": 0.00014410044838358545, + "loss": 0.4027, + "step": 12565 + }, + { + "epoch": 1.68, + "grad_norm": 0.31640625, + "learning_rate": 0.00014408999662488635, + "loss": 0.1649, + "step": 12566 + }, + { + "epoch": 1.68, + "grad_norm": 0.396484375, + "learning_rate": 0.0001440795442682986, + "loss": 0.1903, + "step": 12567 + }, + { + "epoch": 1.68, + "grad_norm": 0.431640625, + "learning_rate": 0.0001440690913139639, + "loss": 0.3042, + "step": 12568 + }, + { + "epoch": 1.68, + "grad_norm": 0.6015625, + "learning_rate": 0.0001440586377620241, + "loss": 0.667, + "step": 12569 + }, + { + "epoch": 1.68, + "grad_norm": 0.546875, + "learning_rate": 0.00014404818361262088, + "loss": 0.537, + "step": 12570 + }, + { + "epoch": 1.68, + "grad_norm": 0.69140625, + "learning_rate": 0.000144037728865896, + "loss": 0.3548, + "step": 12571 + }, + { + "epoch": 1.68, + "grad_norm": 0.59375, + "learning_rate": 0.00014402727352199127, + "loss": 0.3406, + "step": 12572 + }, + { + "epoch": 1.68, + "grad_norm": 0.52734375, + "learning_rate": 0.00014401681758104845, + "loss": 0.3475, + "step": 12573 + }, + { + "epoch": 1.68, + "grad_norm": 0.6875, + "learning_rate": 0.00014400636104320932, + "loss": 0.3561, + "step": 12574 + }, + { + "epoch": 1.68, + "grad_norm": 0.4453125, + "learning_rate": 0.00014399590390861571, + "loss": 0.4598, + "step": 12575 + }, + { + "epoch": 1.68, + "grad_norm": 0.6875, + "learning_rate": 0.0001439854461774094, + "loss": 0.4997, + "step": 12576 + }, + { + "epoch": 1.68, + "grad_norm": 0.7421875, + "learning_rate": 0.0001439749878497322, + "loss": 0.4091, + "step": 12577 + }, + { + "epoch": 1.68, + "grad_norm": 0.478515625, + "learning_rate": 0.00014396452892572595, + "loss": 0.4736, + "step": 12578 + }, + { + "epoch": 1.68, + "grad_norm": 0.53125, + "learning_rate": 0.00014395406940553247, + "loss": 0.2278, + "step": 12579 + }, + { + "epoch": 1.68, + "grad_norm": 0.470703125, + "learning_rate": 0.00014394360928929363, + "loss": 0.2471, + "step": 12580 + }, + { + "epoch": 1.68, + "grad_norm": 0.57421875, + "learning_rate": 0.0001439331485771512, + "loss": 0.4128, + "step": 12581 + }, + { + "epoch": 1.68, + "grad_norm": 0.56640625, + "learning_rate": 0.0001439226872692471, + "loss": 0.7578, + "step": 12582 + }, + { + "epoch": 1.68, + "grad_norm": 0.515625, + "learning_rate": 0.00014391222536572318, + "loss": 0.4285, + "step": 12583 + }, + { + "epoch": 1.68, + "grad_norm": 0.6484375, + "learning_rate": 0.00014390176286672128, + "loss": 0.7257, + "step": 12584 + }, + { + "epoch": 1.68, + "grad_norm": 0.828125, + "learning_rate": 0.00014389129977238332, + "loss": 0.5424, + "step": 12585 + }, + { + "epoch": 1.68, + "grad_norm": 0.439453125, + "learning_rate": 0.00014388083608285113, + "loss": 0.471, + "step": 12586 + }, + { + "epoch": 1.68, + "grad_norm": 0.5234375, + "learning_rate": 0.0001438703717982667, + "loss": 0.3023, + "step": 12587 + }, + { + "epoch": 1.68, + "grad_norm": 0.65625, + "learning_rate": 0.00014385990691877183, + "loss": 0.3101, + "step": 12588 + }, + { + "epoch": 1.68, + "grad_norm": 0.48046875, + "learning_rate": 0.00014384944144450848, + "loss": 0.4834, + "step": 12589 + }, + { + "epoch": 1.68, + "grad_norm": 0.51171875, + "learning_rate": 0.00014383897537561854, + "loss": 0.5176, + "step": 12590 + }, + { + "epoch": 1.68, + "grad_norm": 0.40234375, + "learning_rate": 0.000143828508712244, + "loss": 0.2878, + "step": 12591 + }, + { + "epoch": 1.68, + "grad_norm": 0.447265625, + "learning_rate": 0.0001438180414545267, + "loss": 0.415, + "step": 12592 + }, + { + "epoch": 1.68, + "grad_norm": 0.4296875, + "learning_rate": 0.0001438075736026087, + "loss": 0.4113, + "step": 12593 + }, + { + "epoch": 1.68, + "grad_norm": 0.421875, + "learning_rate": 0.00014379710515663186, + "loss": 0.3257, + "step": 12594 + }, + { + "epoch": 1.68, + "grad_norm": 0.69140625, + "learning_rate": 0.00014378663611673814, + "loss": 0.3724, + "step": 12595 + }, + { + "epoch": 1.68, + "grad_norm": 0.5, + "learning_rate": 0.00014377616648306957, + "loss": 0.3938, + "step": 12596 + }, + { + "epoch": 1.68, + "grad_norm": 0.5390625, + "learning_rate": 0.00014376569625576806, + "loss": 0.3769, + "step": 12597 + }, + { + "epoch": 1.68, + "grad_norm": 0.52734375, + "learning_rate": 0.00014375522543497564, + "loss": 0.4618, + "step": 12598 + }, + { + "epoch": 1.68, + "grad_norm": 0.6796875, + "learning_rate": 0.00014374475402083428, + "loss": 0.5345, + "step": 12599 + }, + { + "epoch": 1.68, + "grad_norm": 0.53125, + "learning_rate": 0.00014373428201348596, + "loss": 0.4126, + "step": 12600 + }, + { + "epoch": 1.68, + "grad_norm": 0.5078125, + "learning_rate": 0.00014372380941307273, + "loss": 0.4995, + "step": 12601 + }, + { + "epoch": 1.68, + "grad_norm": 0.421875, + "learning_rate": 0.00014371333621973655, + "loss": 0.3096, + "step": 12602 + }, + { + "epoch": 1.68, + "grad_norm": 0.447265625, + "learning_rate": 0.0001437028624336195, + "loss": 0.3224, + "step": 12603 + }, + { + "epoch": 1.68, + "grad_norm": 0.48046875, + "learning_rate": 0.00014369238805486358, + "loss": 0.3851, + "step": 12604 + }, + { + "epoch": 1.68, + "grad_norm": 0.5234375, + "learning_rate": 0.00014368191308361086, + "loss": 0.2232, + "step": 12605 + }, + { + "epoch": 1.68, + "grad_norm": 0.59765625, + "learning_rate": 0.00014367143752000333, + "loss": 0.3056, + "step": 12606 + }, + { + "epoch": 1.68, + "grad_norm": 0.43359375, + "learning_rate": 0.0001436609613641831, + "loss": 0.1813, + "step": 12607 + }, + { + "epoch": 1.68, + "grad_norm": 0.51171875, + "learning_rate": 0.0001436504846162922, + "loss": 0.5549, + "step": 12608 + }, + { + "epoch": 1.68, + "grad_norm": 0.4296875, + "learning_rate": 0.00014364000727647272, + "loss": 0.282, + "step": 12609 + }, + { + "epoch": 1.68, + "grad_norm": 0.404296875, + "learning_rate": 0.00014362952934486676, + "loss": 0.3231, + "step": 12610 + }, + { + "epoch": 1.68, + "grad_norm": 0.58203125, + "learning_rate": 0.00014361905082161636, + "loss": 0.2464, + "step": 12611 + }, + { + "epoch": 1.68, + "grad_norm": 0.546875, + "learning_rate": 0.00014360857170686364, + "loss": 0.3834, + "step": 12612 + }, + { + "epoch": 1.68, + "grad_norm": 0.3828125, + "learning_rate": 0.00014359809200075065, + "loss": 0.2842, + "step": 12613 + }, + { + "epoch": 1.68, + "grad_norm": 0.6015625, + "learning_rate": 0.0001435876117034196, + "loss": 0.2243, + "step": 12614 + }, + { + "epoch": 1.68, + "grad_norm": 0.5625, + "learning_rate": 0.00014357713081501256, + "loss": 0.4613, + "step": 12615 + }, + { + "epoch": 1.68, + "grad_norm": 0.625, + "learning_rate": 0.00014356664933567163, + "loss": 0.6206, + "step": 12616 + }, + { + "epoch": 1.68, + "grad_norm": 0.380859375, + "learning_rate": 0.00014355616726553902, + "loss": 0.1528, + "step": 12617 + }, + { + "epoch": 1.68, + "grad_norm": 0.482421875, + "learning_rate": 0.0001435456846047568, + "loss": 0.25, + "step": 12618 + }, + { + "epoch": 1.68, + "grad_norm": 0.5, + "learning_rate": 0.00014353520135346716, + "loss": 0.2713, + "step": 12619 + }, + { + "epoch": 1.68, + "grad_norm": 0.5390625, + "learning_rate": 0.00014352471751181222, + "loss": 0.2221, + "step": 12620 + }, + { + "epoch": 1.68, + "grad_norm": 0.44921875, + "learning_rate": 0.0001435142330799342, + "loss": 0.3603, + "step": 12621 + }, + { + "epoch": 1.68, + "grad_norm": 0.62109375, + "learning_rate": 0.00014350374805797525, + "loss": 0.4703, + "step": 12622 + }, + { + "epoch": 1.68, + "grad_norm": 0.470703125, + "learning_rate": 0.00014349326244607758, + "loss": 0.4283, + "step": 12623 + }, + { + "epoch": 1.68, + "grad_norm": 0.458984375, + "learning_rate": 0.00014348277624438334, + "loss": 0.5627, + "step": 12624 + }, + { + "epoch": 1.68, + "grad_norm": 0.48828125, + "learning_rate": 0.0001434722894530347, + "loss": 0.3523, + "step": 12625 + }, + { + "epoch": 1.68, + "grad_norm": 0.55078125, + "learning_rate": 0.00014346180207217398, + "loss": 0.2644, + "step": 12626 + }, + { + "epoch": 1.68, + "grad_norm": 0.69140625, + "learning_rate": 0.0001434513141019433, + "loss": 0.367, + "step": 12627 + }, + { + "epoch": 1.69, + "grad_norm": 0.640625, + "learning_rate": 0.00014344082554248493, + "loss": 0.3776, + "step": 12628 + }, + { + "epoch": 1.69, + "grad_norm": 0.478515625, + "learning_rate": 0.00014343033639394105, + "loss": 0.3601, + "step": 12629 + }, + { + "epoch": 1.69, + "grad_norm": 0.69140625, + "learning_rate": 0.00014341984665645397, + "loss": 0.394, + "step": 12630 + }, + { + "epoch": 1.69, + "grad_norm": 0.5625, + "learning_rate": 0.0001434093563301659, + "loss": 0.3972, + "step": 12631 + }, + { + "epoch": 1.69, + "grad_norm": 0.796875, + "learning_rate": 0.0001433988654152191, + "loss": 0.614, + "step": 12632 + }, + { + "epoch": 1.69, + "grad_norm": 0.482421875, + "learning_rate": 0.00014338837391175582, + "loss": 0.48, + "step": 12633 + }, + { + "epoch": 1.69, + "grad_norm": 0.60546875, + "learning_rate": 0.00014337788181991833, + "loss": 0.4377, + "step": 12634 + }, + { + "epoch": 1.69, + "grad_norm": 0.400390625, + "learning_rate": 0.00014336738913984896, + "loss": 0.3849, + "step": 12635 + }, + { + "epoch": 1.69, + "grad_norm": 0.56640625, + "learning_rate": 0.00014335689587168996, + "loss": 0.3818, + "step": 12636 + }, + { + "epoch": 1.69, + "grad_norm": 0.5625, + "learning_rate": 0.00014334640201558362, + "loss": 0.5605, + "step": 12637 + }, + { + "epoch": 1.69, + "grad_norm": 0.5546875, + "learning_rate": 0.0001433359075716722, + "loss": 0.2504, + "step": 12638 + }, + { + "epoch": 1.69, + "grad_norm": 0.3515625, + "learning_rate": 0.0001433254125400981, + "loss": 0.1996, + "step": 12639 + }, + { + "epoch": 1.69, + "grad_norm": 0.60546875, + "learning_rate": 0.00014331491692100358, + "loss": 0.2797, + "step": 12640 + }, + { + "epoch": 1.69, + "grad_norm": 0.365234375, + "learning_rate": 0.000143304420714531, + "loss": 0.3197, + "step": 12641 + }, + { + "epoch": 1.69, + "grad_norm": 0.4765625, + "learning_rate": 0.0001432939239208227, + "loss": 0.4498, + "step": 12642 + }, + { + "epoch": 1.69, + "grad_norm": 0.515625, + "learning_rate": 0.00014328342654002099, + "loss": 0.4871, + "step": 12643 + }, + { + "epoch": 1.69, + "grad_norm": 0.48046875, + "learning_rate": 0.00014327292857226824, + "loss": 0.6277, + "step": 12644 + }, + { + "epoch": 1.69, + "grad_norm": 0.66015625, + "learning_rate": 0.00014326243001770678, + "loss": 0.3462, + "step": 12645 + }, + { + "epoch": 1.69, + "grad_norm": 0.34375, + "learning_rate": 0.00014325193087647904, + "loss": 0.2693, + "step": 12646 + }, + { + "epoch": 1.69, + "grad_norm": 0.51953125, + "learning_rate": 0.00014324143114872735, + "loss": 0.3513, + "step": 12647 + }, + { + "epoch": 1.69, + "grad_norm": 0.421875, + "learning_rate": 0.00014323093083459408, + "loss": 0.5602, + "step": 12648 + }, + { + "epoch": 1.69, + "grad_norm": 0.494140625, + "learning_rate": 0.0001432204299342217, + "loss": 0.4562, + "step": 12649 + }, + { + "epoch": 1.69, + "grad_norm": 0.455078125, + "learning_rate": 0.00014320992844775248, + "loss": 0.3542, + "step": 12650 + }, + { + "epoch": 1.69, + "grad_norm": 0.45703125, + "learning_rate": 0.00014319942637532895, + "loss": 0.2992, + "step": 12651 + }, + { + "epoch": 1.69, + "grad_norm": 0.4921875, + "learning_rate": 0.00014318892371709341, + "loss": 0.4347, + "step": 12652 + }, + { + "epoch": 1.69, + "grad_norm": 0.703125, + "learning_rate": 0.0001431784204731884, + "loss": 0.3668, + "step": 12653 + }, + { + "epoch": 1.69, + "grad_norm": 0.48046875, + "learning_rate": 0.0001431679166437563, + "loss": 0.3342, + "step": 12654 + }, + { + "epoch": 1.69, + "grad_norm": 0.54296875, + "learning_rate": 0.00014315741222893953, + "loss": 0.6774, + "step": 12655 + }, + { + "epoch": 1.69, + "grad_norm": 0.44921875, + "learning_rate": 0.00014314690722888056, + "loss": 0.4981, + "step": 12656 + }, + { + "epoch": 1.69, + "grad_norm": 0.470703125, + "learning_rate": 0.00014313640164372181, + "loss": 0.5175, + "step": 12657 + }, + { + "epoch": 1.69, + "grad_norm": 0.57421875, + "learning_rate": 0.0001431258954736058, + "loss": 0.4028, + "step": 12658 + }, + { + "epoch": 1.69, + "grad_norm": 0.48046875, + "learning_rate": 0.00014311538871867493, + "loss": 0.2397, + "step": 12659 + }, + { + "epoch": 1.69, + "grad_norm": 0.51953125, + "learning_rate": 0.0001431048813790718, + "loss": 0.3241, + "step": 12660 + }, + { + "epoch": 1.69, + "grad_norm": 0.69140625, + "learning_rate": 0.00014309437345493874, + "loss": 0.8814, + "step": 12661 + }, + { + "epoch": 1.69, + "grad_norm": 0.5859375, + "learning_rate": 0.00014308386494641835, + "loss": 0.3921, + "step": 12662 + }, + { + "epoch": 1.69, + "grad_norm": 0.400390625, + "learning_rate": 0.00014307335585365307, + "loss": 0.4831, + "step": 12663 + }, + { + "epoch": 1.69, + "grad_norm": 0.44921875, + "learning_rate": 0.0001430628461767854, + "loss": 0.326, + "step": 12664 + }, + { + "epoch": 1.69, + "grad_norm": 0.56640625, + "learning_rate": 0.00014305233591595798, + "loss": 0.4759, + "step": 12665 + }, + { + "epoch": 1.69, + "grad_norm": 0.6328125, + "learning_rate": 0.00014304182507131325, + "loss": 0.4285, + "step": 12666 + }, + { + "epoch": 1.69, + "grad_norm": 0.404296875, + "learning_rate": 0.00014303131364299374, + "loss": 0.2125, + "step": 12667 + }, + { + "epoch": 1.69, + "grad_norm": 0.482421875, + "learning_rate": 0.00014302080163114197, + "loss": 0.2913, + "step": 12668 + }, + { + "epoch": 1.69, + "grad_norm": 0.33203125, + "learning_rate": 0.00014301028903590055, + "loss": 0.1707, + "step": 12669 + }, + { + "epoch": 1.69, + "grad_norm": 1.1328125, + "learning_rate": 0.00014299977585741197, + "loss": 0.4946, + "step": 12670 + }, + { + "epoch": 1.69, + "grad_norm": 0.5546875, + "learning_rate": 0.00014298926209581887, + "loss": 0.3058, + "step": 12671 + }, + { + "epoch": 1.69, + "grad_norm": 0.54296875, + "learning_rate": 0.00014297874775126378, + "loss": 0.4654, + "step": 12672 + }, + { + "epoch": 1.69, + "grad_norm": 0.6171875, + "learning_rate": 0.00014296823282388928, + "loss": 0.3909, + "step": 12673 + }, + { + "epoch": 1.69, + "grad_norm": 0.4375, + "learning_rate": 0.00014295771731383797, + "loss": 0.5111, + "step": 12674 + }, + { + "epoch": 1.69, + "grad_norm": 0.73046875, + "learning_rate": 0.00014294720122125243, + "loss": 0.4466, + "step": 12675 + }, + { + "epoch": 1.69, + "grad_norm": 0.45703125, + "learning_rate": 0.0001429366845462753, + "loss": 0.2542, + "step": 12676 + }, + { + "epoch": 1.69, + "grad_norm": 0.5234375, + "learning_rate": 0.00014292616728904913, + "loss": 0.5123, + "step": 12677 + }, + { + "epoch": 1.69, + "grad_norm": 0.83984375, + "learning_rate": 0.00014291564944971663, + "loss": 0.3231, + "step": 12678 + }, + { + "epoch": 1.69, + "grad_norm": 0.412109375, + "learning_rate": 0.00014290513102842035, + "loss": 0.1843, + "step": 12679 + }, + { + "epoch": 1.69, + "grad_norm": 0.6015625, + "learning_rate": 0.00014289461202530296, + "loss": 0.3574, + "step": 12680 + }, + { + "epoch": 1.69, + "grad_norm": 0.515625, + "learning_rate": 0.00014288409244050712, + "loss": 0.2279, + "step": 12681 + }, + { + "epoch": 1.69, + "grad_norm": 0.55078125, + "learning_rate": 0.00014287357227417543, + "loss": 0.3554, + "step": 12682 + }, + { + "epoch": 1.69, + "grad_norm": 0.58984375, + "learning_rate": 0.0001428630515264506, + "loss": 0.4308, + "step": 12683 + }, + { + "epoch": 1.69, + "grad_norm": 0.4375, + "learning_rate": 0.0001428525301974753, + "loss": 0.441, + "step": 12684 + }, + { + "epoch": 1.69, + "grad_norm": 0.5859375, + "learning_rate": 0.00014284200828739221, + "loss": 0.3452, + "step": 12685 + }, + { + "epoch": 1.69, + "grad_norm": 0.51953125, + "learning_rate": 0.00014283148579634394, + "loss": 0.4026, + "step": 12686 + }, + { + "epoch": 1.69, + "grad_norm": 0.67578125, + "learning_rate": 0.00014282096272447324, + "loss": 0.5341, + "step": 12687 + }, + { + "epoch": 1.69, + "grad_norm": 0.640625, + "learning_rate": 0.0001428104390719228, + "loss": 0.5459, + "step": 12688 + }, + { + "epoch": 1.69, + "grad_norm": 0.609375, + "learning_rate": 0.00014279991483883534, + "loss": 0.376, + "step": 12689 + }, + { + "epoch": 1.69, + "grad_norm": 0.435546875, + "learning_rate": 0.00014278939002535358, + "loss": 0.2319, + "step": 12690 + }, + { + "epoch": 1.69, + "grad_norm": 0.609375, + "learning_rate": 0.00014277886463162021, + "loss": 0.4694, + "step": 12691 + }, + { + "epoch": 1.69, + "grad_norm": 0.408203125, + "learning_rate": 0.00014276833865777802, + "loss": 0.3019, + "step": 12692 + }, + { + "epoch": 1.69, + "grad_norm": 0.55078125, + "learning_rate": 0.00014275781210396966, + "loss": 0.4856, + "step": 12693 + }, + { + "epoch": 1.69, + "grad_norm": 0.81640625, + "learning_rate": 0.00014274728497033797, + "loss": 0.3532, + "step": 12694 + }, + { + "epoch": 1.69, + "grad_norm": 0.431640625, + "learning_rate": 0.0001427367572570256, + "loss": 0.2396, + "step": 12695 + }, + { + "epoch": 1.69, + "grad_norm": 0.51171875, + "learning_rate": 0.00014272622896417542, + "loss": 0.302, + "step": 12696 + }, + { + "epoch": 1.69, + "grad_norm": 0.4765625, + "learning_rate": 0.00014271570009193014, + "loss": 0.35, + "step": 12697 + }, + { + "epoch": 1.69, + "grad_norm": 0.392578125, + "learning_rate": 0.00014270517064043253, + "loss": 0.1489, + "step": 12698 + }, + { + "epoch": 1.69, + "grad_norm": 0.66796875, + "learning_rate": 0.00014269464060982545, + "loss": 0.5053, + "step": 12699 + }, + { + "epoch": 1.69, + "grad_norm": 0.52734375, + "learning_rate": 0.0001426841100002516, + "loss": 0.49, + "step": 12700 + }, + { + "epoch": 1.69, + "grad_norm": 0.5, + "learning_rate": 0.00014267357881185387, + "loss": 0.4299, + "step": 12701 + }, + { + "epoch": 1.69, + "grad_norm": 0.54296875, + "learning_rate": 0.000142663047044775, + "loss": 0.7036, + "step": 12702 + }, + { + "epoch": 1.7, + "grad_norm": 0.50390625, + "learning_rate": 0.00014265251469915778, + "loss": 0.4381, + "step": 12703 + }, + { + "epoch": 1.7, + "grad_norm": 0.41015625, + "learning_rate": 0.00014264198177514517, + "loss": 0.3434, + "step": 12704 + }, + { + "epoch": 1.7, + "grad_norm": 0.57421875, + "learning_rate": 0.00014263144827287983, + "loss": 0.755, + "step": 12705 + }, + { + "epoch": 1.7, + "grad_norm": 0.4609375, + "learning_rate": 0.00014262091419250475, + "loss": 0.4215, + "step": 12706 + }, + { + "epoch": 1.7, + "grad_norm": 0.453125, + "learning_rate": 0.00014261037953416272, + "loss": 0.2599, + "step": 12707 + }, + { + "epoch": 1.7, + "grad_norm": 0.765625, + "learning_rate": 0.0001425998442979966, + "loss": 0.599, + "step": 12708 + }, + { + "epoch": 1.7, + "grad_norm": 0.421875, + "learning_rate": 0.00014258930848414924, + "loss": 0.355, + "step": 12709 + }, + { + "epoch": 1.7, + "grad_norm": 0.56640625, + "learning_rate": 0.0001425787720927635, + "loss": 0.5386, + "step": 12710 + }, + { + "epoch": 1.7, + "grad_norm": 0.71484375, + "learning_rate": 0.00014256823512398233, + "loss": 0.4898, + "step": 12711 + }, + { + "epoch": 1.7, + "grad_norm": 0.443359375, + "learning_rate": 0.00014255769757794855, + "loss": 0.4497, + "step": 12712 + }, + { + "epoch": 1.7, + "grad_norm": 0.7265625, + "learning_rate": 0.00014254715945480505, + "loss": 0.3622, + "step": 12713 + }, + { + "epoch": 1.7, + "grad_norm": 0.419921875, + "learning_rate": 0.00014253662075469476, + "loss": 0.205, + "step": 12714 + }, + { + "epoch": 1.7, + "grad_norm": 0.625, + "learning_rate": 0.00014252608147776065, + "loss": 0.4586, + "step": 12715 + }, + { + "epoch": 1.7, + "grad_norm": 0.90234375, + "learning_rate": 0.00014251554162414553, + "loss": 0.3003, + "step": 12716 + }, + { + "epoch": 1.7, + "grad_norm": 0.439453125, + "learning_rate": 0.0001425050011939924, + "loss": 0.2756, + "step": 12717 + }, + { + "epoch": 1.7, + "grad_norm": 0.41015625, + "learning_rate": 0.0001424944601874442, + "loss": 0.3845, + "step": 12718 + }, + { + "epoch": 1.7, + "grad_norm": 0.58984375, + "learning_rate": 0.00014248391860464382, + "loss": 0.307, + "step": 12719 + }, + { + "epoch": 1.7, + "grad_norm": 0.45703125, + "learning_rate": 0.0001424733764457342, + "loss": 0.4723, + "step": 12720 + }, + { + "epoch": 1.7, + "grad_norm": 0.4453125, + "learning_rate": 0.0001424628337108584, + "loss": 0.3355, + "step": 12721 + }, + { + "epoch": 1.7, + "grad_norm": 0.44140625, + "learning_rate": 0.0001424522904001593, + "loss": 0.3611, + "step": 12722 + }, + { + "epoch": 1.7, + "grad_norm": 0.431640625, + "learning_rate": 0.00014244174651377987, + "loss": 0.2261, + "step": 12723 + }, + { + "epoch": 1.7, + "grad_norm": 0.5234375, + "learning_rate": 0.00014243120205186316, + "loss": 0.2943, + "step": 12724 + }, + { + "epoch": 1.7, + "grad_norm": 0.39453125, + "learning_rate": 0.0001424206570145521, + "loss": 0.1659, + "step": 12725 + }, + { + "epoch": 1.7, + "grad_norm": 0.515625, + "learning_rate": 0.0001424101114019897, + "loss": 0.453, + "step": 12726 + }, + { + "epoch": 1.7, + "grad_norm": 0.58203125, + "learning_rate": 0.00014239956521431898, + "loss": 0.3795, + "step": 12727 + }, + { + "epoch": 1.7, + "grad_norm": 0.400390625, + "learning_rate": 0.00014238901845168293, + "loss": 0.3219, + "step": 12728 + }, + { + "epoch": 1.7, + "grad_norm": 0.63671875, + "learning_rate": 0.0001423784711142246, + "loss": 0.394, + "step": 12729 + }, + { + "epoch": 1.7, + "grad_norm": 0.53125, + "learning_rate": 0.00014236792320208698, + "loss": 0.3608, + "step": 12730 + }, + { + "epoch": 1.7, + "grad_norm": 0.48828125, + "learning_rate": 0.00014235737471541318, + "loss": 0.3988, + "step": 12731 + }, + { + "epoch": 1.7, + "grad_norm": 0.8203125, + "learning_rate": 0.00014234682565434614, + "loss": 0.6389, + "step": 12732 + }, + { + "epoch": 1.7, + "grad_norm": 0.490234375, + "learning_rate": 0.00014233627601902902, + "loss": 0.2634, + "step": 12733 + }, + { + "epoch": 1.7, + "grad_norm": 0.5546875, + "learning_rate": 0.00014232572580960478, + "loss": 0.5624, + "step": 12734 + }, + { + "epoch": 1.7, + "grad_norm": 0.4453125, + "learning_rate": 0.00014231517502621654, + "loss": 0.3358, + "step": 12735 + }, + { + "epoch": 1.7, + "grad_norm": 0.640625, + "learning_rate": 0.00014230462366900736, + "loss": 0.206, + "step": 12736 + }, + { + "epoch": 1.7, + "grad_norm": 0.357421875, + "learning_rate": 0.00014229407173812036, + "loss": 0.2864, + "step": 12737 + }, + { + "epoch": 1.7, + "grad_norm": 0.53125, + "learning_rate": 0.0001422835192336986, + "loss": 0.4297, + "step": 12738 + }, + { + "epoch": 1.7, + "grad_norm": 0.7421875, + "learning_rate": 0.00014227296615588518, + "loss": 0.4508, + "step": 12739 + }, + { + "epoch": 1.7, + "grad_norm": 0.91796875, + "learning_rate": 0.0001422624125048232, + "loss": 0.3984, + "step": 12740 + }, + { + "epoch": 1.7, + "grad_norm": 0.50390625, + "learning_rate": 0.00014225185828065578, + "loss": 0.194, + "step": 12741 + }, + { + "epoch": 1.7, + "grad_norm": 0.44921875, + "learning_rate": 0.00014224130348352605, + "loss": 0.3467, + "step": 12742 + }, + { + "epoch": 1.7, + "grad_norm": 0.447265625, + "learning_rate": 0.0001422307481135771, + "loss": 0.2898, + "step": 12743 + }, + { + "epoch": 1.7, + "grad_norm": 0.87109375, + "learning_rate": 0.00014222019217095214, + "loss": 0.877, + "step": 12744 + }, + { + "epoch": 1.7, + "grad_norm": 0.443359375, + "learning_rate": 0.0001422096356557943, + "loss": 0.3351, + "step": 12745 + }, + { + "epoch": 1.7, + "grad_norm": 0.546875, + "learning_rate": 0.00014219907856824666, + "loss": 0.4379, + "step": 12746 + }, + { + "epoch": 1.7, + "grad_norm": 0.6171875, + "learning_rate": 0.00014218852090845248, + "loss": 0.4629, + "step": 12747 + }, + { + "epoch": 1.7, + "grad_norm": 0.546875, + "learning_rate": 0.00014217796267655482, + "loss": 0.2566, + "step": 12748 + }, + { + "epoch": 1.7, + "grad_norm": 0.486328125, + "learning_rate": 0.00014216740387269696, + "loss": 0.4318, + "step": 12749 + }, + { + "epoch": 1.7, + "grad_norm": 0.5546875, + "learning_rate": 0.00014215684449702203, + "loss": 0.5941, + "step": 12750 + }, + { + "epoch": 1.7, + "grad_norm": 0.5703125, + "learning_rate": 0.00014214628454967322, + "loss": 0.5942, + "step": 12751 + }, + { + "epoch": 1.7, + "grad_norm": 0.58203125, + "learning_rate": 0.00014213572403079378, + "loss": 0.6905, + "step": 12752 + }, + { + "epoch": 1.7, + "grad_norm": 0.47265625, + "learning_rate": 0.00014212516294052685, + "loss": 0.2358, + "step": 12753 + }, + { + "epoch": 1.7, + "grad_norm": 0.671875, + "learning_rate": 0.0001421146012790157, + "loss": 0.7282, + "step": 12754 + }, + { + "epoch": 1.7, + "grad_norm": 1.0234375, + "learning_rate": 0.00014210403904640347, + "loss": 0.349, + "step": 12755 + }, + { + "epoch": 1.7, + "grad_norm": 0.59375, + "learning_rate": 0.0001420934762428335, + "loss": 0.3117, + "step": 12756 + }, + { + "epoch": 1.7, + "grad_norm": 0.408203125, + "learning_rate": 0.00014208291286844896, + "loss": 0.1972, + "step": 12757 + }, + { + "epoch": 1.7, + "grad_norm": 0.48046875, + "learning_rate": 0.00014207234892339314, + "loss": 0.3896, + "step": 12758 + }, + { + "epoch": 1.7, + "grad_norm": 0.51171875, + "learning_rate": 0.00014206178440780924, + "loss": 0.3372, + "step": 12759 + }, + { + "epoch": 1.7, + "grad_norm": 0.53125, + "learning_rate": 0.00014205121932184055, + "loss": 0.487, + "step": 12760 + }, + { + "epoch": 1.7, + "grad_norm": 0.451171875, + "learning_rate": 0.00014204065366563032, + "loss": 0.3611, + "step": 12761 + }, + { + "epoch": 1.7, + "grad_norm": 0.7734375, + "learning_rate": 0.00014203008743932184, + "loss": 0.3485, + "step": 12762 + }, + { + "epoch": 1.7, + "grad_norm": 0.64453125, + "learning_rate": 0.0001420195206430585, + "loss": 0.3019, + "step": 12763 + }, + { + "epoch": 1.7, + "grad_norm": 0.68359375, + "learning_rate": 0.0001420089532769834, + "loss": 0.5585, + "step": 12764 + }, + { + "epoch": 1.7, + "grad_norm": 0.6953125, + "learning_rate": 0.00014199838534123993, + "loss": 0.6107, + "step": 12765 + }, + { + "epoch": 1.7, + "grad_norm": 0.5390625, + "learning_rate": 0.0001419878168359714, + "loss": 0.4392, + "step": 12766 + }, + { + "epoch": 1.7, + "grad_norm": 0.58984375, + "learning_rate": 0.00014197724776132114, + "loss": 0.4196, + "step": 12767 + }, + { + "epoch": 1.7, + "grad_norm": 0.47265625, + "learning_rate": 0.00014196667811743248, + "loss": 0.3519, + "step": 12768 + }, + { + "epoch": 1.7, + "grad_norm": 0.390625, + "learning_rate": 0.00014195610790444872, + "loss": 0.25, + "step": 12769 + }, + { + "epoch": 1.7, + "grad_norm": 0.6484375, + "learning_rate": 0.00014194553712251323, + "loss": 0.4513, + "step": 12770 + }, + { + "epoch": 1.7, + "grad_norm": 0.484375, + "learning_rate": 0.00014193496577176928, + "loss": 0.5024, + "step": 12771 + }, + { + "epoch": 1.7, + "grad_norm": 0.37109375, + "learning_rate": 0.00014192439385236032, + "loss": 0.2171, + "step": 12772 + }, + { + "epoch": 1.7, + "grad_norm": 0.51953125, + "learning_rate": 0.00014191382136442964, + "loss": 0.238, + "step": 12773 + }, + { + "epoch": 1.7, + "grad_norm": 0.462890625, + "learning_rate": 0.00014190324830812067, + "loss": 0.3101, + "step": 12774 + }, + { + "epoch": 1.7, + "grad_norm": 0.40234375, + "learning_rate": 0.00014189267468357676, + "loss": 0.449, + "step": 12775 + }, + { + "epoch": 1.7, + "grad_norm": 0.40625, + "learning_rate": 0.00014188210049094127, + "loss": 0.4405, + "step": 12776 + }, + { + "epoch": 1.7, + "grad_norm": 0.453125, + "learning_rate": 0.00014187152573035767, + "loss": 0.269, + "step": 12777 + }, + { + "epoch": 1.71, + "grad_norm": 0.65234375, + "learning_rate": 0.00014186095040196926, + "loss": 0.2617, + "step": 12778 + }, + { + "epoch": 1.71, + "grad_norm": 0.55078125, + "learning_rate": 0.00014185037450591953, + "loss": 0.4274, + "step": 12779 + }, + { + "epoch": 1.71, + "grad_norm": 0.78125, + "learning_rate": 0.00014183979804235181, + "loss": 0.4805, + "step": 12780 + }, + { + "epoch": 1.71, + "grad_norm": 0.486328125, + "learning_rate": 0.00014182922101140964, + "loss": 0.5341, + "step": 12781 + }, + { + "epoch": 1.71, + "grad_norm": 0.458984375, + "learning_rate": 0.00014181864341323633, + "loss": 0.3988, + "step": 12782 + }, + { + "epoch": 1.71, + "grad_norm": 0.45703125, + "learning_rate": 0.00014180806524797542, + "loss": 0.5145, + "step": 12783 + }, + { + "epoch": 1.71, + "grad_norm": 0.52734375, + "learning_rate": 0.00014179748651577028, + "loss": 0.266, + "step": 12784 + }, + { + "epoch": 1.71, + "grad_norm": 0.53515625, + "learning_rate": 0.00014178690721676444, + "loss": 0.3946, + "step": 12785 + }, + { + "epoch": 1.71, + "grad_norm": 0.515625, + "learning_rate": 0.00014177632735110128, + "loss": 0.2884, + "step": 12786 + }, + { + "epoch": 1.71, + "grad_norm": 0.384765625, + "learning_rate": 0.00014176574691892433, + "loss": 0.4115, + "step": 12787 + }, + { + "epoch": 1.71, + "grad_norm": 0.625, + "learning_rate": 0.00014175516592037705, + "loss": 0.1928, + "step": 12788 + }, + { + "epoch": 1.71, + "grad_norm": 0.46484375, + "learning_rate": 0.0001417445843556029, + "loss": 0.2313, + "step": 12789 + }, + { + "epoch": 1.71, + "grad_norm": 0.5546875, + "learning_rate": 0.00014173400222474542, + "loss": 0.4682, + "step": 12790 + }, + { + "epoch": 1.71, + "grad_norm": 0.484375, + "learning_rate": 0.00014172341952794806, + "loss": 0.4276, + "step": 12791 + }, + { + "epoch": 1.71, + "grad_norm": 0.71875, + "learning_rate": 0.00014171283626535436, + "loss": 0.4265, + "step": 12792 + }, + { + "epoch": 1.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00014170225243710784, + "loss": 0.6018, + "step": 12793 + }, + { + "epoch": 1.71, + "grad_norm": 0.60546875, + "learning_rate": 0.00014169166804335202, + "loss": 0.4708, + "step": 12794 + }, + { + "epoch": 1.71, + "grad_norm": 0.52734375, + "learning_rate": 0.00014168108308423041, + "loss": 0.4362, + "step": 12795 + }, + { + "epoch": 1.71, + "grad_norm": 0.306640625, + "learning_rate": 0.00014167049755988654, + "loss": 0.2498, + "step": 12796 + }, + { + "epoch": 1.71, + "grad_norm": 0.462890625, + "learning_rate": 0.00014165991147046403, + "loss": 0.5273, + "step": 12797 + }, + { + "epoch": 1.71, + "grad_norm": 0.5546875, + "learning_rate": 0.00014164932481610634, + "loss": 0.4572, + "step": 12798 + }, + { + "epoch": 1.71, + "grad_norm": 0.474609375, + "learning_rate": 0.0001416387375969571, + "loss": 0.4241, + "step": 12799 + }, + { + "epoch": 1.71, + "grad_norm": 0.55078125, + "learning_rate": 0.00014162814981315985, + "loss": 0.4073, + "step": 12800 + }, + { + "epoch": 1.71, + "grad_norm": 0.53515625, + "learning_rate": 0.00014161756146485814, + "loss": 0.3691, + "step": 12801 + }, + { + "epoch": 1.71, + "grad_norm": 0.55078125, + "learning_rate": 0.0001416069725521956, + "loss": 0.4863, + "step": 12802 + }, + { + "epoch": 1.71, + "grad_norm": 0.74609375, + "learning_rate": 0.00014159638307531584, + "loss": 0.4106, + "step": 12803 + }, + { + "epoch": 1.71, + "grad_norm": 0.3984375, + "learning_rate": 0.00014158579303436242, + "loss": 0.1431, + "step": 12804 + }, + { + "epoch": 1.71, + "grad_norm": 0.59765625, + "learning_rate": 0.00014157520242947893, + "loss": 0.3534, + "step": 12805 + }, + { + "epoch": 1.71, + "grad_norm": 0.625, + "learning_rate": 0.00014156461126080906, + "loss": 0.346, + "step": 12806 + }, + { + "epoch": 1.71, + "grad_norm": 0.671875, + "learning_rate": 0.00014155401952849634, + "loss": 0.4372, + "step": 12807 + }, + { + "epoch": 1.71, + "grad_norm": 0.5390625, + "learning_rate": 0.00014154342723268445, + "loss": 0.6159, + "step": 12808 + }, + { + "epoch": 1.71, + "grad_norm": 0.59375, + "learning_rate": 0.00014153283437351704, + "loss": 0.3089, + "step": 12809 + }, + { + "epoch": 1.71, + "grad_norm": 0.5625, + "learning_rate": 0.00014152224095113772, + "loss": 0.4791, + "step": 12810 + }, + { + "epoch": 1.71, + "grad_norm": 0.5703125, + "learning_rate": 0.0001415116469656902, + "loss": 0.3564, + "step": 12811 + }, + { + "epoch": 1.71, + "grad_norm": 0.376953125, + "learning_rate": 0.0001415010524173181, + "loss": 0.2011, + "step": 12812 + }, + { + "epoch": 1.71, + "grad_norm": 0.458984375, + "learning_rate": 0.00014149045730616508, + "loss": 0.3353, + "step": 12813 + }, + { + "epoch": 1.71, + "grad_norm": 0.44140625, + "learning_rate": 0.00014147986163237482, + "loss": 0.3591, + "step": 12814 + }, + { + "epoch": 1.71, + "grad_norm": 0.51953125, + "learning_rate": 0.00014146926539609103, + "loss": 0.3964, + "step": 12815 + }, + { + "epoch": 1.71, + "grad_norm": 0.5390625, + "learning_rate": 0.00014145866859745736, + "loss": 0.3806, + "step": 12816 + }, + { + "epoch": 1.71, + "grad_norm": 0.61328125, + "learning_rate": 0.00014144807123661756, + "loss": 0.6903, + "step": 12817 + }, + { + "epoch": 1.71, + "grad_norm": 0.61328125, + "learning_rate": 0.00014143747331371532, + "loss": 0.4012, + "step": 12818 + }, + { + "epoch": 1.71, + "grad_norm": 0.71484375, + "learning_rate": 0.00014142687482889433, + "loss": 0.327, + "step": 12819 + }, + { + "epoch": 1.71, + "grad_norm": 0.498046875, + "learning_rate": 0.00014141627578229834, + "loss": 0.4547, + "step": 12820 + }, + { + "epoch": 1.71, + "grad_norm": 0.375, + "learning_rate": 0.00014140567617407105, + "loss": 0.2319, + "step": 12821 + }, + { + "epoch": 1.71, + "grad_norm": 0.48046875, + "learning_rate": 0.00014139507600435626, + "loss": 0.2693, + "step": 12822 + }, + { + "epoch": 1.71, + "grad_norm": 0.4921875, + "learning_rate": 0.00014138447527329765, + "loss": 0.2733, + "step": 12823 + }, + { + "epoch": 1.71, + "grad_norm": 0.5078125, + "learning_rate": 0.000141373873981039, + "loss": 0.3634, + "step": 12824 + }, + { + "epoch": 1.71, + "grad_norm": 0.62109375, + "learning_rate": 0.00014136327212772404, + "loss": 0.3399, + "step": 12825 + }, + { + "epoch": 1.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00014135266971349657, + "loss": 0.1916, + "step": 12826 + }, + { + "epoch": 1.71, + "grad_norm": 0.6328125, + "learning_rate": 0.00014134206673850036, + "loss": 0.4052, + "step": 12827 + }, + { + "epoch": 1.71, + "grad_norm": 0.546875, + "learning_rate": 0.0001413314632028792, + "loss": 0.3329, + "step": 12828 + }, + { + "epoch": 1.71, + "grad_norm": 0.72265625, + "learning_rate": 0.00014132085910677688, + "loss": 0.5711, + "step": 12829 + }, + { + "epoch": 1.71, + "grad_norm": 0.56640625, + "learning_rate": 0.0001413102544503372, + "loss": 0.2979, + "step": 12830 + }, + { + "epoch": 1.71, + "grad_norm": 0.7265625, + "learning_rate": 0.00014129964923370394, + "loss": 0.7226, + "step": 12831 + }, + { + "epoch": 1.71, + "grad_norm": 0.4375, + "learning_rate": 0.00014128904345702092, + "loss": 0.3172, + "step": 12832 + }, + { + "epoch": 1.71, + "grad_norm": 0.5703125, + "learning_rate": 0.000141278437120432, + "loss": 0.4074, + "step": 12833 + }, + { + "epoch": 1.71, + "grad_norm": 0.408203125, + "learning_rate": 0.00014126783022408093, + "loss": 0.3355, + "step": 12834 + }, + { + "epoch": 1.71, + "grad_norm": 0.6171875, + "learning_rate": 0.00014125722276811165, + "loss": 0.2468, + "step": 12835 + }, + { + "epoch": 1.71, + "grad_norm": 0.5859375, + "learning_rate": 0.00014124661475266794, + "loss": 0.3337, + "step": 12836 + }, + { + "epoch": 1.71, + "grad_norm": 0.466796875, + "learning_rate": 0.00014123600617789363, + "loss": 0.3544, + "step": 12837 + }, + { + "epoch": 1.71, + "grad_norm": 0.5, + "learning_rate": 0.00014122539704393265, + "loss": 0.284, + "step": 12838 + }, + { + "epoch": 1.71, + "grad_norm": 0.5234375, + "learning_rate": 0.0001412147873509288, + "loss": 0.3497, + "step": 12839 + }, + { + "epoch": 1.71, + "grad_norm": 0.455078125, + "learning_rate": 0.000141204177099026, + "loss": 0.3078, + "step": 12840 + }, + { + "epoch": 1.71, + "grad_norm": 0.49609375, + "learning_rate": 0.0001411935662883681, + "loss": 0.4175, + "step": 12841 + }, + { + "epoch": 1.71, + "grad_norm": 0.609375, + "learning_rate": 0.000141182954919099, + "loss": 0.3403, + "step": 12842 + }, + { + "epoch": 1.71, + "grad_norm": 0.36328125, + "learning_rate": 0.00014117234299136264, + "loss": 0.1633, + "step": 12843 + }, + { + "epoch": 1.71, + "grad_norm": 0.45703125, + "learning_rate": 0.00014116173050530284, + "loss": 0.3319, + "step": 12844 + }, + { + "epoch": 1.71, + "grad_norm": 0.546875, + "learning_rate": 0.00014115111746106359, + "loss": 0.4505, + "step": 12845 + }, + { + "epoch": 1.71, + "grad_norm": 0.55078125, + "learning_rate": 0.00014114050385878875, + "loss": 0.5398, + "step": 12846 + }, + { + "epoch": 1.71, + "grad_norm": 0.48828125, + "learning_rate": 0.00014112988969862227, + "loss": 0.2206, + "step": 12847 + }, + { + "epoch": 1.71, + "grad_norm": 0.515625, + "learning_rate": 0.00014111927498070811, + "loss": 0.4963, + "step": 12848 + }, + { + "epoch": 1.71, + "grad_norm": 0.453125, + "learning_rate": 0.0001411086597051902, + "loss": 0.3708, + "step": 12849 + }, + { + "epoch": 1.71, + "grad_norm": 0.4453125, + "learning_rate": 0.00014109804387221243, + "loss": 0.208, + "step": 12850 + }, + { + "epoch": 1.71, + "grad_norm": 0.5703125, + "learning_rate": 0.00014108742748191887, + "loss": 0.4318, + "step": 12851 + }, + { + "epoch": 1.71, + "grad_norm": 0.3984375, + "learning_rate": 0.00014107681053445337, + "loss": 0.2417, + "step": 12852 + }, + { + "epoch": 1.72, + "grad_norm": 0.53515625, + "learning_rate": 0.00014106619302995998, + "loss": 0.2748, + "step": 12853 + }, + { + "epoch": 1.72, + "grad_norm": 0.470703125, + "learning_rate": 0.00014105557496858268, + "loss": 0.2309, + "step": 12854 + }, + { + "epoch": 1.72, + "grad_norm": 0.439453125, + "learning_rate": 0.0001410449563504654, + "loss": 0.4744, + "step": 12855 + }, + { + "epoch": 1.72, + "grad_norm": 0.671875, + "learning_rate": 0.0001410343371757522, + "loss": 0.7078, + "step": 12856 + }, + { + "epoch": 1.72, + "grad_norm": 0.92578125, + "learning_rate": 0.00014102371744458702, + "loss": 0.2075, + "step": 12857 + }, + { + "epoch": 1.72, + "grad_norm": 0.447265625, + "learning_rate": 0.00014101309715711392, + "loss": 0.4192, + "step": 12858 + }, + { + "epoch": 1.72, + "grad_norm": 0.392578125, + "learning_rate": 0.0001410024763134769, + "loss": 0.3127, + "step": 12859 + }, + { + "epoch": 1.72, + "grad_norm": 0.435546875, + "learning_rate": 0.00014099185491381997, + "loss": 0.3031, + "step": 12860 + }, + { + "epoch": 1.72, + "grad_norm": 0.51953125, + "learning_rate": 0.0001409812329582872, + "loss": 0.4718, + "step": 12861 + }, + { + "epoch": 1.72, + "grad_norm": 0.50390625, + "learning_rate": 0.0001409706104470226, + "loss": 0.427, + "step": 12862 + }, + { + "epoch": 1.72, + "grad_norm": 0.3046875, + "learning_rate": 0.00014095998738017024, + "loss": 0.2672, + "step": 12863 + }, + { + "epoch": 1.72, + "grad_norm": 0.50390625, + "learning_rate": 0.00014094936375787413, + "loss": 0.3973, + "step": 12864 + }, + { + "epoch": 1.72, + "grad_norm": 0.404296875, + "learning_rate": 0.0001409387395802784, + "loss": 0.3346, + "step": 12865 + }, + { + "epoch": 1.72, + "grad_norm": 0.50390625, + "learning_rate": 0.0001409281148475271, + "loss": 0.566, + "step": 12866 + }, + { + "epoch": 1.72, + "grad_norm": 0.3359375, + "learning_rate": 0.00014091748955976426, + "loss": 0.1947, + "step": 12867 + }, + { + "epoch": 1.72, + "grad_norm": 0.4375, + "learning_rate": 0.00014090686371713402, + "loss": 0.2044, + "step": 12868 + }, + { + "epoch": 1.72, + "grad_norm": 0.421875, + "learning_rate": 0.00014089623731978042, + "loss": 0.374, + "step": 12869 + }, + { + "epoch": 1.72, + "grad_norm": 0.5078125, + "learning_rate": 0.00014088561036784762, + "loss": 0.467, + "step": 12870 + }, + { + "epoch": 1.72, + "grad_norm": 0.55078125, + "learning_rate": 0.0001408749828614797, + "loss": 0.2712, + "step": 12871 + }, + { + "epoch": 1.72, + "grad_norm": 0.44140625, + "learning_rate": 0.00014086435480082082, + "loss": 0.3006, + "step": 12872 + }, + { + "epoch": 1.72, + "grad_norm": 0.43359375, + "learning_rate": 0.000140853726186015, + "loss": 0.2924, + "step": 12873 + }, + { + "epoch": 1.72, + "grad_norm": 0.427734375, + "learning_rate": 0.00014084309701720645, + "loss": 0.3738, + "step": 12874 + }, + { + "epoch": 1.72, + "grad_norm": 0.5234375, + "learning_rate": 0.00014083246729453932, + "loss": 0.4173, + "step": 12875 + }, + { + "epoch": 1.72, + "grad_norm": 0.57421875, + "learning_rate": 0.00014082183701815768, + "loss": 0.6662, + "step": 12876 + }, + { + "epoch": 1.72, + "grad_norm": 0.482421875, + "learning_rate": 0.00014081120618820577, + "loss": 0.2886, + "step": 12877 + }, + { + "epoch": 1.72, + "grad_norm": 0.48828125, + "learning_rate": 0.0001408005748048277, + "loss": 0.2504, + "step": 12878 + }, + { + "epoch": 1.72, + "grad_norm": 0.5, + "learning_rate": 0.00014078994286816768, + "loss": 0.3865, + "step": 12879 + }, + { + "epoch": 1.72, + "grad_norm": 0.458984375, + "learning_rate": 0.0001407793103783698, + "loss": 0.1821, + "step": 12880 + }, + { + "epoch": 1.72, + "grad_norm": 0.466796875, + "learning_rate": 0.00014076867733557835, + "loss": 0.3328, + "step": 12881 + }, + { + "epoch": 1.72, + "grad_norm": 0.51171875, + "learning_rate": 0.00014075804373993742, + "loss": 0.4374, + "step": 12882 + }, + { + "epoch": 1.72, + "grad_norm": 0.60546875, + "learning_rate": 0.00014074740959159126, + "loss": 0.2262, + "step": 12883 + }, + { + "epoch": 1.72, + "grad_norm": 0.474609375, + "learning_rate": 0.00014073677489068412, + "loss": 0.2707, + "step": 12884 + }, + { + "epoch": 1.72, + "grad_norm": 0.44921875, + "learning_rate": 0.00014072613963736014, + "loss": 0.2729, + "step": 12885 + }, + { + "epoch": 1.72, + "grad_norm": 1.0078125, + "learning_rate": 0.0001407155038317636, + "loss": 0.3814, + "step": 12886 + }, + { + "epoch": 1.72, + "grad_norm": 0.494140625, + "learning_rate": 0.00014070486747403866, + "loss": 0.4127, + "step": 12887 + }, + { + "epoch": 1.72, + "grad_norm": 0.40625, + "learning_rate": 0.0001406942305643296, + "loss": 0.2028, + "step": 12888 + }, + { + "epoch": 1.72, + "grad_norm": 0.4140625, + "learning_rate": 0.00014068359310278066, + "loss": 0.2573, + "step": 12889 + }, + { + "epoch": 1.72, + "grad_norm": 0.408203125, + "learning_rate": 0.00014067295508953608, + "loss": 0.2539, + "step": 12890 + }, + { + "epoch": 1.72, + "grad_norm": 0.49609375, + "learning_rate": 0.00014066231652474015, + "loss": 0.3743, + "step": 12891 + }, + { + "epoch": 1.72, + "grad_norm": 0.404296875, + "learning_rate": 0.00014065167740853708, + "loss": 0.1568, + "step": 12892 + }, + { + "epoch": 1.72, + "grad_norm": 0.44921875, + "learning_rate": 0.0001406410377410712, + "loss": 0.3754, + "step": 12893 + }, + { + "epoch": 1.72, + "grad_norm": 0.53515625, + "learning_rate": 0.00014063039752248676, + "loss": 0.514, + "step": 12894 + }, + { + "epoch": 1.72, + "grad_norm": 0.416015625, + "learning_rate": 0.000140619756752928, + "loss": 0.2692, + "step": 12895 + }, + { + "epoch": 1.72, + "grad_norm": 0.439453125, + "learning_rate": 0.00014060911543253935, + "loss": 0.1938, + "step": 12896 + }, + { + "epoch": 1.72, + "grad_norm": 0.478515625, + "learning_rate": 0.00014059847356146497, + "loss": 0.5642, + "step": 12897 + }, + { + "epoch": 1.72, + "grad_norm": 0.58984375, + "learning_rate": 0.0001405878311398493, + "loss": 0.3981, + "step": 12898 + }, + { + "epoch": 1.72, + "grad_norm": 0.65625, + "learning_rate": 0.0001405771881678365, + "loss": 0.5528, + "step": 12899 + }, + { + "epoch": 1.72, + "grad_norm": 0.453125, + "learning_rate": 0.00014056654464557106, + "loss": 0.33, + "step": 12900 + }, + { + "epoch": 1.72, + "grad_norm": 0.5234375, + "learning_rate": 0.0001405559005731972, + "loss": 0.4676, + "step": 12901 + }, + { + "epoch": 1.72, + "grad_norm": 0.515625, + "learning_rate": 0.00014054525595085933, + "loss": 0.3751, + "step": 12902 + }, + { + "epoch": 1.72, + "grad_norm": 0.5546875, + "learning_rate": 0.00014053461077870178, + "loss": 0.6044, + "step": 12903 + }, + { + "epoch": 1.72, + "grad_norm": 0.45703125, + "learning_rate": 0.00014052396505686887, + "loss": 0.2322, + "step": 12904 + }, + { + "epoch": 1.72, + "grad_norm": 0.478515625, + "learning_rate": 0.00014051331878550495, + "loss": 0.3672, + "step": 12905 + }, + { + "epoch": 1.72, + "grad_norm": 0.578125, + "learning_rate": 0.0001405026719647545, + "loss": 0.4804, + "step": 12906 + }, + { + "epoch": 1.72, + "grad_norm": 0.421875, + "learning_rate": 0.00014049202459476176, + "loss": 0.1633, + "step": 12907 + }, + { + "epoch": 1.72, + "grad_norm": 0.50390625, + "learning_rate": 0.00014048137667567118, + "loss": 0.3599, + "step": 12908 + }, + { + "epoch": 1.72, + "grad_norm": 0.765625, + "learning_rate": 0.0001404707282076272, + "loss": 0.8387, + "step": 12909 + }, + { + "epoch": 1.72, + "grad_norm": 0.466796875, + "learning_rate": 0.00014046007919077413, + "loss": 0.3672, + "step": 12910 + }, + { + "epoch": 1.72, + "grad_norm": 0.54296875, + "learning_rate": 0.00014044942962525644, + "loss": 0.3496, + "step": 12911 + }, + { + "epoch": 1.72, + "grad_norm": 0.56640625, + "learning_rate": 0.0001404387795112185, + "loss": 0.3082, + "step": 12912 + }, + { + "epoch": 1.72, + "grad_norm": 0.455078125, + "learning_rate": 0.00014042812884880479, + "loss": 0.1819, + "step": 12913 + }, + { + "epoch": 1.72, + "grad_norm": 0.65625, + "learning_rate": 0.00014041747763815968, + "loss": 0.691, + "step": 12914 + }, + { + "epoch": 1.72, + "grad_norm": 0.408203125, + "learning_rate": 0.00014040682587942763, + "loss": 0.2713, + "step": 12915 + }, + { + "epoch": 1.72, + "grad_norm": 0.4453125, + "learning_rate": 0.0001403961735727531, + "loss": 0.3799, + "step": 12916 + }, + { + "epoch": 1.72, + "grad_norm": 0.578125, + "learning_rate": 0.00014038552071828054, + "loss": 0.3914, + "step": 12917 + }, + { + "epoch": 1.72, + "grad_norm": 0.65234375, + "learning_rate": 0.00014037486731615438, + "loss": 0.3987, + "step": 12918 + }, + { + "epoch": 1.72, + "grad_norm": 0.54296875, + "learning_rate": 0.00014036421336651911, + "loss": 0.4406, + "step": 12919 + }, + { + "epoch": 1.72, + "grad_norm": 0.47265625, + "learning_rate": 0.00014035355886951923, + "loss": 0.3529, + "step": 12920 + }, + { + "epoch": 1.72, + "grad_norm": 0.4296875, + "learning_rate": 0.00014034290382529917, + "loss": 0.2604, + "step": 12921 + }, + { + "epoch": 1.72, + "grad_norm": 0.53515625, + "learning_rate": 0.00014033224823400343, + "loss": 0.1759, + "step": 12922 + }, + { + "epoch": 1.72, + "grad_norm": 0.68359375, + "learning_rate": 0.00014032159209577654, + "loss": 0.416, + "step": 12923 + }, + { + "epoch": 1.72, + "grad_norm": 0.4765625, + "learning_rate": 0.00014031093541076297, + "loss": 0.4523, + "step": 12924 + }, + { + "epoch": 1.72, + "grad_norm": 0.52734375, + "learning_rate": 0.00014030027817910727, + "loss": 0.406, + "step": 12925 + }, + { + "epoch": 1.72, + "grad_norm": 0.640625, + "learning_rate": 0.0001402896204009539, + "loss": 0.2552, + "step": 12926 + }, + { + "epoch": 1.72, + "grad_norm": 0.5859375, + "learning_rate": 0.00014027896207644745, + "loss": 0.53, + "step": 12927 + }, + { + "epoch": 1.73, + "grad_norm": 0.435546875, + "learning_rate": 0.00014026830320573242, + "loss": 0.3756, + "step": 12928 + }, + { + "epoch": 1.73, + "grad_norm": 0.49609375, + "learning_rate": 0.00014025764378895335, + "loss": 0.503, + "step": 12929 + }, + { + "epoch": 1.73, + "grad_norm": 0.486328125, + "learning_rate": 0.00014024698382625474, + "loss": 0.4605, + "step": 12930 + }, + { + "epoch": 1.73, + "grad_norm": 0.546875, + "learning_rate": 0.00014023632331778128, + "loss": 0.2567, + "step": 12931 + }, + { + "epoch": 1.73, + "grad_norm": 0.59765625, + "learning_rate": 0.0001402256622636774, + "loss": 0.4319, + "step": 12932 + }, + { + "epoch": 1.73, + "grad_norm": 0.55859375, + "learning_rate": 0.00014021500066408772, + "loss": 0.4669, + "step": 12933 + }, + { + "epoch": 1.73, + "grad_norm": 0.50390625, + "learning_rate": 0.00014020433851915685, + "loss": 0.2626, + "step": 12934 + }, + { + "epoch": 1.73, + "grad_norm": 0.55859375, + "learning_rate": 0.00014019367582902934, + "loss": 0.4217, + "step": 12935 + }, + { + "epoch": 1.73, + "grad_norm": 0.498046875, + "learning_rate": 0.0001401830125938498, + "loss": 0.3585, + "step": 12936 + }, + { + "epoch": 1.73, + "grad_norm": 0.87109375, + "learning_rate": 0.00014017234881376277, + "loss": 0.6036, + "step": 12937 + }, + { + "epoch": 1.73, + "grad_norm": 0.55859375, + "learning_rate": 0.00014016168448891295, + "loss": 0.381, + "step": 12938 + }, + { + "epoch": 1.73, + "grad_norm": 0.44921875, + "learning_rate": 0.0001401510196194449, + "loss": 0.2784, + "step": 12939 + }, + { + "epoch": 1.73, + "grad_norm": 0.451171875, + "learning_rate": 0.00014014035420550328, + "loss": 0.4226, + "step": 12940 + }, + { + "epoch": 1.73, + "grad_norm": 0.474609375, + "learning_rate": 0.00014012968824723268, + "loss": 0.3729, + "step": 12941 + }, + { + "epoch": 1.73, + "grad_norm": 0.423828125, + "learning_rate": 0.00014011902174477774, + "loss": 0.6247, + "step": 12942 + }, + { + "epoch": 1.73, + "grad_norm": 0.427734375, + "learning_rate": 0.00014010835469828315, + "loss": 0.4841, + "step": 12943 + }, + { + "epoch": 1.73, + "grad_norm": 0.53515625, + "learning_rate": 0.00014009768710789347, + "loss": 0.4728, + "step": 12944 + }, + { + "epoch": 1.73, + "grad_norm": 0.60546875, + "learning_rate": 0.00014008701897375348, + "loss": 0.3854, + "step": 12945 + }, + { + "epoch": 1.73, + "grad_norm": 0.5625, + "learning_rate": 0.00014007635029600777, + "loss": 0.2243, + "step": 12946 + }, + { + "epoch": 1.73, + "grad_norm": 0.5234375, + "learning_rate": 0.00014006568107480098, + "loss": 0.2415, + "step": 12947 + }, + { + "epoch": 1.73, + "grad_norm": 0.5, + "learning_rate": 0.0001400550113102779, + "loss": 0.6069, + "step": 12948 + }, + { + "epoch": 1.73, + "grad_norm": 0.51953125, + "learning_rate": 0.00014004434100258313, + "loss": 0.232, + "step": 12949 + }, + { + "epoch": 1.73, + "grad_norm": 0.54296875, + "learning_rate": 0.0001400336701518614, + "loss": 0.2188, + "step": 12950 + }, + { + "epoch": 1.73, + "grad_norm": 0.462890625, + "learning_rate": 0.00014002299875825742, + "loss": 0.3206, + "step": 12951 + }, + { + "epoch": 1.73, + "grad_norm": 0.65234375, + "learning_rate": 0.0001400123268219159, + "loss": 0.3999, + "step": 12952 + }, + { + "epoch": 1.73, + "grad_norm": 0.54296875, + "learning_rate": 0.00014000165434298152, + "loss": 0.5485, + "step": 12953 + }, + { + "epoch": 1.73, + "grad_norm": 0.447265625, + "learning_rate": 0.00013999098132159903, + "loss": 0.3836, + "step": 12954 + }, + { + "epoch": 1.73, + "grad_norm": 0.50390625, + "learning_rate": 0.00013998030775791317, + "loss": 0.3726, + "step": 12955 + }, + { + "epoch": 1.73, + "grad_norm": 0.60546875, + "learning_rate": 0.0001399696336520687, + "loss": 0.283, + "step": 12956 + }, + { + "epoch": 1.73, + "grad_norm": 0.427734375, + "learning_rate": 0.00013995895900421033, + "loss": 0.4624, + "step": 12957 + }, + { + "epoch": 1.73, + "grad_norm": 0.55859375, + "learning_rate": 0.00013994828381448286, + "loss": 0.3528, + "step": 12958 + }, + { + "epoch": 1.73, + "grad_norm": 0.546875, + "learning_rate": 0.000139937608083031, + "loss": 0.2832, + "step": 12959 + }, + { + "epoch": 1.73, + "grad_norm": 0.6875, + "learning_rate": 0.0001399269318099995, + "loss": 0.4195, + "step": 12960 + }, + { + "epoch": 1.73, + "grad_norm": 0.63671875, + "learning_rate": 0.00013991625499553325, + "loss": 0.2374, + "step": 12961 + }, + { + "epoch": 1.73, + "grad_norm": 0.396484375, + "learning_rate": 0.00013990557763977695, + "loss": 0.373, + "step": 12962 + }, + { + "epoch": 1.73, + "grad_norm": 0.5859375, + "learning_rate": 0.00013989489974287536, + "loss": 0.5452, + "step": 12963 + }, + { + "epoch": 1.73, + "grad_norm": 0.54296875, + "learning_rate": 0.00013988422130497337, + "loss": 0.4805, + "step": 12964 + }, + { + "epoch": 1.73, + "grad_norm": 0.294921875, + "learning_rate": 0.00013987354232621572, + "loss": 0.1248, + "step": 12965 + }, + { + "epoch": 1.73, + "grad_norm": 0.4921875, + "learning_rate": 0.00013986286280674728, + "loss": 0.3511, + "step": 12966 + }, + { + "epoch": 1.73, + "grad_norm": 0.578125, + "learning_rate": 0.0001398521827467128, + "loss": 0.415, + "step": 12967 + }, + { + "epoch": 1.73, + "grad_norm": 0.474609375, + "learning_rate": 0.00013984150214625714, + "loss": 0.3, + "step": 12968 + }, + { + "epoch": 1.73, + "grad_norm": 0.7109375, + "learning_rate": 0.00013983082100552517, + "loss": 0.6839, + "step": 12969 + }, + { + "epoch": 1.73, + "grad_norm": 0.439453125, + "learning_rate": 0.0001398201393246617, + "loss": 0.3537, + "step": 12970 + }, + { + "epoch": 1.73, + "grad_norm": 0.6171875, + "learning_rate": 0.00013980945710381158, + "loss": 0.4313, + "step": 12971 + }, + { + "epoch": 1.73, + "grad_norm": 0.56640625, + "learning_rate": 0.00013979877434311963, + "loss": 0.3808, + "step": 12972 + }, + { + "epoch": 1.73, + "grad_norm": 0.578125, + "learning_rate": 0.00013978809104273082, + "loss": 0.631, + "step": 12973 + }, + { + "epoch": 1.73, + "grad_norm": 0.380859375, + "learning_rate": 0.0001397774072027899, + "loss": 0.2253, + "step": 12974 + }, + { + "epoch": 1.73, + "grad_norm": 0.578125, + "learning_rate": 0.00013976672282344185, + "loss": 0.6324, + "step": 12975 + }, + { + "epoch": 1.73, + "grad_norm": 0.5078125, + "learning_rate": 0.00013975603790483153, + "loss": 0.4715, + "step": 12976 + }, + { + "epoch": 1.73, + "grad_norm": 0.458984375, + "learning_rate": 0.0001397453524471038, + "loss": 0.5052, + "step": 12977 + }, + { + "epoch": 1.73, + "grad_norm": 0.9140625, + "learning_rate": 0.00013973466645040357, + "loss": 0.5269, + "step": 12978 + }, + { + "epoch": 1.73, + "grad_norm": 0.451171875, + "learning_rate": 0.00013972397991487577, + "loss": 0.376, + "step": 12979 + }, + { + "epoch": 1.73, + "grad_norm": 0.51953125, + "learning_rate": 0.00013971329284066531, + "loss": 0.2792, + "step": 12980 + }, + { + "epoch": 1.73, + "grad_norm": 0.53515625, + "learning_rate": 0.0001397026052279171, + "loss": 0.4394, + "step": 12981 + }, + { + "epoch": 1.73, + "grad_norm": 0.322265625, + "learning_rate": 0.00013969191707677615, + "loss": 0.2253, + "step": 12982 + }, + { + "epoch": 1.73, + "grad_norm": 1.859375, + "learning_rate": 0.00013968122838738724, + "loss": 0.4621, + "step": 12983 + }, + { + "epoch": 1.73, + "grad_norm": 0.51953125, + "learning_rate": 0.00013967053915989546, + "loss": 0.4026, + "step": 12984 + }, + { + "epoch": 1.73, + "grad_norm": 0.490234375, + "learning_rate": 0.0001396598493944457, + "loss": 0.4358, + "step": 12985 + }, + { + "epoch": 1.73, + "grad_norm": 0.56640625, + "learning_rate": 0.0001396491590911829, + "loss": 0.2244, + "step": 12986 + }, + { + "epoch": 1.73, + "grad_norm": 0.484375, + "learning_rate": 0.00013963846825025206, + "loss": 0.3496, + "step": 12987 + }, + { + "epoch": 1.73, + "grad_norm": 0.45703125, + "learning_rate": 0.00013962777687179817, + "loss": 0.3843, + "step": 12988 + }, + { + "epoch": 1.73, + "grad_norm": 0.408203125, + "learning_rate": 0.0001396170849559662, + "loss": 0.4379, + "step": 12989 + }, + { + "epoch": 1.73, + "grad_norm": 0.478515625, + "learning_rate": 0.0001396063925029011, + "loss": 0.4882, + "step": 12990 + }, + { + "epoch": 1.73, + "grad_norm": 0.56640625, + "learning_rate": 0.00013959569951274792, + "loss": 0.3087, + "step": 12991 + }, + { + "epoch": 1.73, + "grad_norm": 0.48046875, + "learning_rate": 0.00013958500598565163, + "loss": 0.4216, + "step": 12992 + }, + { + "epoch": 1.73, + "grad_norm": 0.58203125, + "learning_rate": 0.00013957431192175728, + "loss": 0.4548, + "step": 12993 + }, + { + "epoch": 1.73, + "grad_norm": 0.60546875, + "learning_rate": 0.00013956361732120985, + "loss": 0.4232, + "step": 12994 + }, + { + "epoch": 1.73, + "grad_norm": 0.7890625, + "learning_rate": 0.00013955292218415438, + "loss": 0.4889, + "step": 12995 + }, + { + "epoch": 1.73, + "grad_norm": 0.474609375, + "learning_rate": 0.00013954222651073588, + "loss": 0.4774, + "step": 12996 + }, + { + "epoch": 1.73, + "grad_norm": 0.369140625, + "learning_rate": 0.00013953153030109942, + "loss": 0.2248, + "step": 12997 + }, + { + "epoch": 1.73, + "grad_norm": 0.640625, + "learning_rate": 0.00013952083355539004, + "loss": 0.3505, + "step": 12998 + }, + { + "epoch": 1.73, + "grad_norm": 0.6328125, + "learning_rate": 0.00013951013627375278, + "loss": 0.3394, + "step": 12999 + }, + { + "epoch": 1.73, + "grad_norm": 0.53515625, + "learning_rate": 0.00013949943845633275, + "loss": 0.3459, + "step": 13000 + }, + { + "epoch": 1.73, + "grad_norm": 0.640625, + "learning_rate": 0.00013948874010327495, + "loss": 0.4092, + "step": 13001 + }, + { + "epoch": 1.73, + "grad_norm": 0.48046875, + "learning_rate": 0.0001394780412147245, + "loss": 0.3412, + "step": 13002 + }, + { + "epoch": 1.74, + "grad_norm": 0.51171875, + "learning_rate": 0.0001394673417908265, + "loss": 0.5701, + "step": 13003 + }, + { + "epoch": 1.74, + "grad_norm": 0.55078125, + "learning_rate": 0.00013945664183172597, + "loss": 0.3463, + "step": 13004 + }, + { + "epoch": 1.74, + "grad_norm": 0.60546875, + "learning_rate": 0.00013944594133756807, + "loss": 0.7448, + "step": 13005 + }, + { + "epoch": 1.74, + "grad_norm": 0.5078125, + "learning_rate": 0.00013943524030849794, + "loss": 0.3496, + "step": 13006 + }, + { + "epoch": 1.74, + "grad_norm": 0.515625, + "learning_rate": 0.00013942453874466056, + "loss": 0.2807, + "step": 13007 + }, + { + "epoch": 1.74, + "grad_norm": 0.54296875, + "learning_rate": 0.00013941383664620117, + "loss": 0.3881, + "step": 13008 + }, + { + "epoch": 1.74, + "grad_norm": 0.55078125, + "learning_rate": 0.0001394031340132649, + "loss": 0.3931, + "step": 13009 + }, + { + "epoch": 1.74, + "grad_norm": 0.56640625, + "learning_rate": 0.00013939243084599675, + "loss": 0.4668, + "step": 13010 + }, + { + "epoch": 1.74, + "grad_norm": 0.51953125, + "learning_rate": 0.00013938172714454203, + "loss": 0.4824, + "step": 13011 + }, + { + "epoch": 1.74, + "grad_norm": 0.76171875, + "learning_rate": 0.0001393710229090458, + "loss": 0.2729, + "step": 13012 + }, + { + "epoch": 1.74, + "grad_norm": 0.53515625, + "learning_rate": 0.00013936031813965323, + "loss": 0.2988, + "step": 13013 + }, + { + "epoch": 1.74, + "grad_norm": 0.427734375, + "learning_rate": 0.00013934961283650947, + "loss": 0.4374, + "step": 13014 + }, + { + "epoch": 1.74, + "grad_norm": 0.55859375, + "learning_rate": 0.00013933890699975972, + "loss": 0.3002, + "step": 13015 + }, + { + "epoch": 1.74, + "grad_norm": 0.466796875, + "learning_rate": 0.00013932820062954914, + "loss": 0.3399, + "step": 13016 + }, + { + "epoch": 1.74, + "grad_norm": 0.53125, + "learning_rate": 0.0001393174937260229, + "loss": 0.2855, + "step": 13017 + }, + { + "epoch": 1.74, + "grad_norm": 0.58203125, + "learning_rate": 0.00013930678628932625, + "loss": 0.3117, + "step": 13018 + }, + { + "epoch": 1.74, + "grad_norm": 0.443359375, + "learning_rate": 0.00013929607831960432, + "loss": 0.4353, + "step": 13019 + }, + { + "epoch": 1.74, + "grad_norm": 0.5546875, + "learning_rate": 0.00013928536981700236, + "loss": 0.4835, + "step": 13020 + }, + { + "epoch": 1.74, + "grad_norm": 0.33203125, + "learning_rate": 0.00013927466078166557, + "loss": 0.2232, + "step": 13021 + }, + { + "epoch": 1.74, + "grad_norm": 0.42578125, + "learning_rate": 0.00013926395121373916, + "loss": 0.2928, + "step": 13022 + }, + { + "epoch": 1.74, + "grad_norm": 0.52734375, + "learning_rate": 0.00013925324111336838, + "loss": 0.522, + "step": 13023 + }, + { + "epoch": 1.74, + "grad_norm": 0.31640625, + "learning_rate": 0.0001392425304806985, + "loss": 0.1771, + "step": 13024 + }, + { + "epoch": 1.74, + "grad_norm": 0.5703125, + "learning_rate": 0.0001392318193158747, + "loss": 0.5094, + "step": 13025 + }, + { + "epoch": 1.74, + "grad_norm": 0.61328125, + "learning_rate": 0.0001392211076190422, + "loss": 0.3334, + "step": 13026 + }, + { + "epoch": 1.74, + "grad_norm": 0.5234375, + "learning_rate": 0.00013921039539034635, + "loss": 0.2774, + "step": 13027 + }, + { + "epoch": 1.74, + "grad_norm": 0.40625, + "learning_rate": 0.00013919968262993238, + "loss": 0.219, + "step": 13028 + }, + { + "epoch": 1.74, + "grad_norm": 0.427734375, + "learning_rate": 0.00013918896933794552, + "loss": 0.2296, + "step": 13029 + }, + { + "epoch": 1.74, + "grad_norm": 0.451171875, + "learning_rate": 0.0001391782555145311, + "loss": 0.3377, + "step": 13030 + }, + { + "epoch": 1.74, + "grad_norm": 0.455078125, + "learning_rate": 0.0001391675411598344, + "loss": 0.3987, + "step": 13031 + }, + { + "epoch": 1.74, + "grad_norm": 0.447265625, + "learning_rate": 0.00013915682627400073, + "loss": 0.3424, + "step": 13032 + }, + { + "epoch": 1.74, + "grad_norm": 0.3515625, + "learning_rate": 0.00013914611085717534, + "loss": 0.2574, + "step": 13033 + }, + { + "epoch": 1.74, + "grad_norm": 0.59375, + "learning_rate": 0.00013913539490950356, + "loss": 0.5352, + "step": 13034 + }, + { + "epoch": 1.74, + "grad_norm": 0.396484375, + "learning_rate": 0.00013912467843113068, + "loss": 0.3423, + "step": 13035 + }, + { + "epoch": 1.74, + "grad_norm": 0.443359375, + "learning_rate": 0.0001391139614222021, + "loss": 0.4014, + "step": 13036 + }, + { + "epoch": 1.74, + "grad_norm": 0.70703125, + "learning_rate": 0.00013910324388286308, + "loss": 0.6205, + "step": 13037 + }, + { + "epoch": 1.74, + "grad_norm": 0.318359375, + "learning_rate": 0.00013909252581325898, + "loss": 0.1599, + "step": 13038 + }, + { + "epoch": 1.74, + "grad_norm": 0.55859375, + "learning_rate": 0.00013908180721353516, + "loss": 0.4461, + "step": 13039 + }, + { + "epoch": 1.74, + "grad_norm": 0.443359375, + "learning_rate": 0.00013907108808383688, + "loss": 0.2968, + "step": 13040 + }, + { + "epoch": 1.74, + "grad_norm": 0.41015625, + "learning_rate": 0.00013906036842430963, + "loss": 0.2996, + "step": 13041 + }, + { + "epoch": 1.74, + "grad_norm": 0.4765625, + "learning_rate": 0.0001390496482350987, + "loss": 0.7117, + "step": 13042 + }, + { + "epoch": 1.74, + "grad_norm": 0.4921875, + "learning_rate": 0.00013903892751634947, + "loss": 0.5377, + "step": 13043 + }, + { + "epoch": 1.74, + "grad_norm": 0.66796875, + "learning_rate": 0.00013902820626820737, + "loss": 0.5933, + "step": 13044 + }, + { + "epoch": 1.74, + "grad_norm": 0.53515625, + "learning_rate": 0.00013901748449081768, + "loss": 0.4385, + "step": 13045 + }, + { + "epoch": 1.74, + "grad_norm": 0.5546875, + "learning_rate": 0.0001390067621843259, + "loss": 0.2692, + "step": 13046 + }, + { + "epoch": 1.74, + "grad_norm": 0.54296875, + "learning_rate": 0.00013899603934887738, + "loss": 0.352, + "step": 13047 + }, + { + "epoch": 1.74, + "grad_norm": 0.53515625, + "learning_rate": 0.0001389853159846176, + "loss": 0.5626, + "step": 13048 + }, + { + "epoch": 1.74, + "grad_norm": 0.51953125, + "learning_rate": 0.00013897459209169183, + "loss": 0.4396, + "step": 13049 + }, + { + "epoch": 1.74, + "grad_norm": 0.63671875, + "learning_rate": 0.0001389638676702456, + "loss": 0.6628, + "step": 13050 + }, + { + "epoch": 1.74, + "grad_norm": 0.6328125, + "learning_rate": 0.00013895314272042433, + "loss": 0.4284, + "step": 13051 + }, + { + "epoch": 1.74, + "grad_norm": 0.498046875, + "learning_rate": 0.00013894241724237344, + "loss": 0.5356, + "step": 13052 + }, + { + "epoch": 1.74, + "grad_norm": 0.61328125, + "learning_rate": 0.00013893169123623835, + "loss": 0.6019, + "step": 13053 + }, + { + "epoch": 1.74, + "grad_norm": 0.48828125, + "learning_rate": 0.00013892096470216457, + "loss": 0.3326, + "step": 13054 + }, + { + "epoch": 1.74, + "grad_norm": 0.52734375, + "learning_rate": 0.00013891023764029756, + "loss": 0.3884, + "step": 13055 + }, + { + "epoch": 1.74, + "grad_norm": 0.455078125, + "learning_rate": 0.0001388995100507827, + "loss": 0.4247, + "step": 13056 + }, + { + "epoch": 1.74, + "grad_norm": 0.421875, + "learning_rate": 0.00013888878193376556, + "loss": 0.2899, + "step": 13057 + }, + { + "epoch": 1.74, + "grad_norm": 0.79296875, + "learning_rate": 0.00013887805328939153, + "loss": 0.6375, + "step": 13058 + }, + { + "epoch": 1.74, + "grad_norm": 0.59375, + "learning_rate": 0.00013886732411780617, + "loss": 0.4907, + "step": 13059 + }, + { + "epoch": 1.74, + "grad_norm": 0.482421875, + "learning_rate": 0.00013885659441915496, + "loss": 0.2246, + "step": 13060 + }, + { + "epoch": 1.74, + "grad_norm": 0.54296875, + "learning_rate": 0.00013884586419358338, + "loss": 0.5591, + "step": 13061 + }, + { + "epoch": 1.74, + "grad_norm": 0.6640625, + "learning_rate": 0.00013883513344123698, + "loss": 0.3929, + "step": 13062 + }, + { + "epoch": 1.74, + "grad_norm": 0.58203125, + "learning_rate": 0.0001388244021622612, + "loss": 0.5181, + "step": 13063 + }, + { + "epoch": 1.74, + "grad_norm": 0.62890625, + "learning_rate": 0.00013881367035680164, + "loss": 0.4679, + "step": 13064 + }, + { + "epoch": 1.74, + "grad_norm": 0.37890625, + "learning_rate": 0.00013880293802500377, + "loss": 0.2273, + "step": 13065 + }, + { + "epoch": 1.74, + "grad_norm": 0.396484375, + "learning_rate": 0.0001387922051670132, + "loss": 0.4278, + "step": 13066 + }, + { + "epoch": 1.74, + "grad_norm": 0.6171875, + "learning_rate": 0.0001387814717829754, + "loss": 0.6208, + "step": 13067 + }, + { + "epoch": 1.74, + "grad_norm": 0.51171875, + "learning_rate": 0.00013877073787303595, + "loss": 0.5065, + "step": 13068 + }, + { + "epoch": 1.74, + "grad_norm": 0.53515625, + "learning_rate": 0.00013876000343734045, + "loss": 0.384, + "step": 13069 + }, + { + "epoch": 1.74, + "grad_norm": 0.443359375, + "learning_rate": 0.00013874926847603437, + "loss": 0.4628, + "step": 13070 + }, + { + "epoch": 1.74, + "grad_norm": 0.41015625, + "learning_rate": 0.0001387385329892634, + "loss": 0.1947, + "step": 13071 + }, + { + "epoch": 1.74, + "grad_norm": 0.65625, + "learning_rate": 0.000138727796977173, + "loss": 0.6539, + "step": 13072 + }, + { + "epoch": 1.74, + "grad_norm": 0.41796875, + "learning_rate": 0.0001387170604399089, + "loss": 0.4445, + "step": 13073 + }, + { + "epoch": 1.74, + "grad_norm": 0.51953125, + "learning_rate": 0.00013870632337761656, + "loss": 0.4828, + "step": 13074 + }, + { + "epoch": 1.74, + "grad_norm": 0.53125, + "learning_rate": 0.0001386955857904416, + "loss": 0.4987, + "step": 13075 + }, + { + "epoch": 1.74, + "grad_norm": 0.82421875, + "learning_rate": 0.00013868484767852972, + "loss": 0.4756, + "step": 13076 + }, + { + "epoch": 1.74, + "grad_norm": 0.60546875, + "learning_rate": 0.00013867410904202645, + "loss": 0.3229, + "step": 13077 + }, + { + "epoch": 1.75, + "grad_norm": 0.5, + "learning_rate": 0.00013866336988107745, + "loss": 0.4822, + "step": 13078 + }, + { + "epoch": 1.75, + "grad_norm": 0.6484375, + "learning_rate": 0.00013865263019582832, + "loss": 0.5606, + "step": 13079 + }, + { + "epoch": 1.75, + "grad_norm": 0.62109375, + "learning_rate": 0.00013864188998642476, + "loss": 0.3675, + "step": 13080 + }, + { + "epoch": 1.75, + "grad_norm": 0.52734375, + "learning_rate": 0.00013863114925301233, + "loss": 0.4452, + "step": 13081 + }, + { + "epoch": 1.75, + "grad_norm": 0.48828125, + "learning_rate": 0.00013862040799573675, + "loss": 0.2128, + "step": 13082 + }, + { + "epoch": 1.75, + "grad_norm": 0.5703125, + "learning_rate": 0.0001386096662147436, + "loss": 0.3471, + "step": 13083 + }, + { + "epoch": 1.75, + "grad_norm": 0.55859375, + "learning_rate": 0.00013859892391017865, + "loss": 0.4229, + "step": 13084 + }, + { + "epoch": 1.75, + "grad_norm": 0.41015625, + "learning_rate": 0.00013858818108218753, + "loss": 0.1824, + "step": 13085 + }, + { + "epoch": 1.75, + "grad_norm": 0.498046875, + "learning_rate": 0.00013857743773091588, + "loss": 0.4, + "step": 13086 + }, + { + "epoch": 1.75, + "grad_norm": 0.423828125, + "learning_rate": 0.0001385666938565094, + "loss": 0.2808, + "step": 13087 + }, + { + "epoch": 1.75, + "grad_norm": 0.78515625, + "learning_rate": 0.00013855594945911382, + "loss": 0.4566, + "step": 13088 + }, + { + "epoch": 1.75, + "grad_norm": 0.78515625, + "learning_rate": 0.00013854520453887483, + "loss": 0.3477, + "step": 13089 + }, + { + "epoch": 1.75, + "grad_norm": 0.4609375, + "learning_rate": 0.0001385344590959381, + "loss": 0.4904, + "step": 13090 + }, + { + "epoch": 1.75, + "grad_norm": 0.5234375, + "learning_rate": 0.0001385237131304494, + "loss": 0.2892, + "step": 13091 + }, + { + "epoch": 1.75, + "grad_norm": 0.73046875, + "learning_rate": 0.00013851296664255442, + "loss": 0.5631, + "step": 13092 + }, + { + "epoch": 1.75, + "grad_norm": 0.484375, + "learning_rate": 0.00013850221963239887, + "loss": 0.4432, + "step": 13093 + }, + { + "epoch": 1.75, + "grad_norm": 0.470703125, + "learning_rate": 0.00013849147210012855, + "loss": 0.4135, + "step": 13094 + }, + { + "epoch": 1.75, + "grad_norm": 0.419921875, + "learning_rate": 0.00013848072404588912, + "loss": 0.3103, + "step": 13095 + }, + { + "epoch": 1.75, + "grad_norm": 0.703125, + "learning_rate": 0.00013846997546982642, + "loss": 0.3904, + "step": 13096 + }, + { + "epoch": 1.75, + "grad_norm": 0.498046875, + "learning_rate": 0.00013845922637208615, + "loss": 0.477, + "step": 13097 + }, + { + "epoch": 1.75, + "grad_norm": 0.49609375, + "learning_rate": 0.0001384484767528141, + "loss": 0.2678, + "step": 13098 + }, + { + "epoch": 1.75, + "grad_norm": 0.796875, + "learning_rate": 0.00013843772661215599, + "loss": 0.5261, + "step": 13099 + }, + { + "epoch": 1.75, + "grad_norm": 0.50390625, + "learning_rate": 0.00013842697595025765, + "loss": 0.238, + "step": 13100 + }, + { + "epoch": 1.75, + "grad_norm": 0.498046875, + "learning_rate": 0.00013841622476726487, + "loss": 0.3247, + "step": 13101 + }, + { + "epoch": 1.75, + "grad_norm": 0.71875, + "learning_rate": 0.0001384054730633234, + "loss": 0.4984, + "step": 13102 + }, + { + "epoch": 1.75, + "grad_norm": 0.3984375, + "learning_rate": 0.0001383947208385791, + "loss": 0.2295, + "step": 13103 + }, + { + "epoch": 1.75, + "grad_norm": 0.4140625, + "learning_rate": 0.00013838396809317773, + "loss": 0.2687, + "step": 13104 + }, + { + "epoch": 1.75, + "grad_norm": 0.58203125, + "learning_rate": 0.00013837321482726512, + "loss": 0.287, + "step": 13105 + }, + { + "epoch": 1.75, + "grad_norm": 0.484375, + "learning_rate": 0.0001383624610409871, + "loss": 0.3378, + "step": 13106 + }, + { + "epoch": 1.75, + "grad_norm": 0.39453125, + "learning_rate": 0.00013835170673448947, + "loss": 0.3311, + "step": 13107 + }, + { + "epoch": 1.75, + "grad_norm": 0.458984375, + "learning_rate": 0.00013834095190791808, + "loss": 0.3128, + "step": 13108 + }, + { + "epoch": 1.75, + "grad_norm": 0.5859375, + "learning_rate": 0.0001383301965614188, + "loss": 0.4159, + "step": 13109 + }, + { + "epoch": 1.75, + "grad_norm": 0.421875, + "learning_rate": 0.00013831944069513742, + "loss": 0.4369, + "step": 13110 + }, + { + "epoch": 1.75, + "grad_norm": 0.33203125, + "learning_rate": 0.00013830868430921985, + "loss": 0.1946, + "step": 13111 + }, + { + "epoch": 1.75, + "grad_norm": 0.5859375, + "learning_rate": 0.00013829792740381193, + "loss": 0.2515, + "step": 13112 + }, + { + "epoch": 1.75, + "grad_norm": 0.6875, + "learning_rate": 0.00013828716997905954, + "loss": 0.4327, + "step": 13113 + }, + { + "epoch": 1.75, + "grad_norm": 0.439453125, + "learning_rate": 0.00013827641203510854, + "loss": 0.458, + "step": 13114 + }, + { + "epoch": 1.75, + "grad_norm": 0.61328125, + "learning_rate": 0.00013826565357210485, + "loss": 0.8417, + "step": 13115 + }, + { + "epoch": 1.75, + "grad_norm": 0.5859375, + "learning_rate": 0.00013825489459019433, + "loss": 0.4712, + "step": 13116 + }, + { + "epoch": 1.75, + "grad_norm": 0.470703125, + "learning_rate": 0.0001382441350895229, + "loss": 0.2878, + "step": 13117 + }, + { + "epoch": 1.75, + "grad_norm": 0.5, + "learning_rate": 0.0001382333750702364, + "loss": 0.55, + "step": 13118 + }, + { + "epoch": 1.75, + "grad_norm": 0.59375, + "learning_rate": 0.00013822261453248083, + "loss": 0.3336, + "step": 13119 + }, + { + "epoch": 1.75, + "grad_norm": 0.6640625, + "learning_rate": 0.00013821185347640208, + "loss": 0.2716, + "step": 13120 + }, + { + "epoch": 1.75, + "grad_norm": 0.65234375, + "learning_rate": 0.0001382010919021461, + "loss": 0.8084, + "step": 13121 + }, + { + "epoch": 1.75, + "grad_norm": 0.46875, + "learning_rate": 0.00013819032980985876, + "loss": 0.4765, + "step": 13122 + }, + { + "epoch": 1.75, + "grad_norm": 0.4609375, + "learning_rate": 0.00013817956719968606, + "loss": 0.4295, + "step": 13123 + }, + { + "epoch": 1.75, + "grad_norm": 0.5078125, + "learning_rate": 0.00013816880407177388, + "loss": 0.3051, + "step": 13124 + }, + { + "epoch": 1.75, + "grad_norm": 0.498046875, + "learning_rate": 0.00013815804042626828, + "loss": 0.5712, + "step": 13125 + }, + { + "epoch": 1.75, + "grad_norm": 0.470703125, + "learning_rate": 0.0001381472762633151, + "loss": 0.5073, + "step": 13126 + }, + { + "epoch": 1.75, + "grad_norm": 0.458984375, + "learning_rate": 0.0001381365115830604, + "loss": 0.5288, + "step": 13127 + }, + { + "epoch": 1.75, + "grad_norm": 0.71484375, + "learning_rate": 0.00013812574638565015, + "loss": 0.2529, + "step": 13128 + }, + { + "epoch": 1.75, + "grad_norm": 0.5390625, + "learning_rate": 0.0001381149806712303, + "loss": 0.333, + "step": 13129 + }, + { + "epoch": 1.75, + "grad_norm": 0.609375, + "learning_rate": 0.00013810421443994684, + "loss": 0.3891, + "step": 13130 + }, + { + "epoch": 1.75, + "grad_norm": 0.41015625, + "learning_rate": 0.00013809344769194577, + "loss": 0.2637, + "step": 13131 + }, + { + "epoch": 1.75, + "grad_norm": 0.54296875, + "learning_rate": 0.00013808268042737312, + "loss": 0.362, + "step": 13132 + }, + { + "epoch": 1.75, + "grad_norm": 0.5234375, + "learning_rate": 0.00013807191264637487, + "loss": 0.3218, + "step": 13133 + }, + { + "epoch": 1.75, + "grad_norm": 0.56640625, + "learning_rate": 0.00013806114434909703, + "loss": 0.4572, + "step": 13134 + }, + { + "epoch": 1.75, + "grad_norm": 0.625, + "learning_rate": 0.00013805037553568566, + "loss": 0.4102, + "step": 13135 + }, + { + "epoch": 1.75, + "grad_norm": 0.515625, + "learning_rate": 0.00013803960620628679, + "loss": 0.4652, + "step": 13136 + }, + { + "epoch": 1.75, + "grad_norm": 0.52734375, + "learning_rate": 0.00013802883636104647, + "loss": 0.4875, + "step": 13137 + }, + { + "epoch": 1.75, + "grad_norm": 0.458984375, + "learning_rate": 0.00013801806600011066, + "loss": 0.4029, + "step": 13138 + }, + { + "epoch": 1.75, + "grad_norm": 0.359375, + "learning_rate": 0.0001380072951236255, + "loss": 0.1605, + "step": 13139 + }, + { + "epoch": 1.75, + "grad_norm": 0.416015625, + "learning_rate": 0.00013799652373173705, + "loss": 0.1866, + "step": 13140 + }, + { + "epoch": 1.75, + "grad_norm": 0.55859375, + "learning_rate": 0.00013798575182459133, + "loss": 0.3426, + "step": 13141 + }, + { + "epoch": 1.75, + "grad_norm": 0.53125, + "learning_rate": 0.00013797497940233445, + "loss": 0.4602, + "step": 13142 + }, + { + "epoch": 1.75, + "grad_norm": 0.55859375, + "learning_rate": 0.00013796420646511244, + "loss": 0.5626, + "step": 13143 + }, + { + "epoch": 1.75, + "grad_norm": 0.69140625, + "learning_rate": 0.00013795343301307148, + "loss": 0.4251, + "step": 13144 + }, + { + "epoch": 1.75, + "grad_norm": 0.490234375, + "learning_rate": 0.00013794265904635758, + "loss": 0.5888, + "step": 13145 + }, + { + "epoch": 1.75, + "grad_norm": 0.5234375, + "learning_rate": 0.0001379318845651169, + "loss": 0.2057, + "step": 13146 + }, + { + "epoch": 1.75, + "grad_norm": 0.55859375, + "learning_rate": 0.00013792110956949548, + "loss": 0.2768, + "step": 13147 + }, + { + "epoch": 1.75, + "grad_norm": 0.4140625, + "learning_rate": 0.0001379103340596395, + "loss": 0.2585, + "step": 13148 + }, + { + "epoch": 1.75, + "grad_norm": 0.45703125, + "learning_rate": 0.00013789955803569502, + "loss": 0.3211, + "step": 13149 + }, + { + "epoch": 1.75, + "grad_norm": 0.39453125, + "learning_rate": 0.00013788878149780827, + "loss": 0.3384, + "step": 13150 + }, + { + "epoch": 1.75, + "grad_norm": 0.58984375, + "learning_rate": 0.0001378780044461253, + "loss": 0.5496, + "step": 13151 + }, + { + "epoch": 1.76, + "grad_norm": 0.384765625, + "learning_rate": 0.00013786722688079224, + "loss": 0.2772, + "step": 13152 + }, + { + "epoch": 1.76, + "grad_norm": 0.59765625, + "learning_rate": 0.00013785644880195537, + "loss": 0.3679, + "step": 13153 + }, + { + "epoch": 1.76, + "grad_norm": 0.52734375, + "learning_rate": 0.00013784567020976067, + "loss": 0.2249, + "step": 13154 + }, + { + "epoch": 1.76, + "grad_norm": 0.45703125, + "learning_rate": 0.00013783489110435443, + "loss": 0.2973, + "step": 13155 + }, + { + "epoch": 1.76, + "grad_norm": 0.5703125, + "learning_rate": 0.0001378241114858828, + "loss": 0.7202, + "step": 13156 + }, + { + "epoch": 1.76, + "grad_norm": 0.326171875, + "learning_rate": 0.00013781333135449192, + "loss": 0.1624, + "step": 13157 + }, + { + "epoch": 1.76, + "grad_norm": 0.53515625, + "learning_rate": 0.00013780255071032803, + "loss": 0.5584, + "step": 13158 + }, + { + "epoch": 1.76, + "grad_norm": 0.466796875, + "learning_rate": 0.00013779176955353722, + "loss": 0.3308, + "step": 13159 + }, + { + "epoch": 1.76, + "grad_norm": 0.578125, + "learning_rate": 0.00013778098788426584, + "loss": 0.49, + "step": 13160 + }, + { + "epoch": 1.76, + "grad_norm": 0.55859375, + "learning_rate": 0.00013777020570265996, + "loss": 0.3732, + "step": 13161 + }, + { + "epoch": 1.76, + "grad_norm": 0.390625, + "learning_rate": 0.00013775942300886586, + "loss": 0.1852, + "step": 13162 + }, + { + "epoch": 1.76, + "grad_norm": 0.6015625, + "learning_rate": 0.00013774863980302977, + "loss": 0.2124, + "step": 13163 + }, + { + "epoch": 1.76, + "grad_norm": 0.3671875, + "learning_rate": 0.0001377378560852979, + "loss": 0.1523, + "step": 13164 + }, + { + "epoch": 1.76, + "grad_norm": 0.5390625, + "learning_rate": 0.00013772707185581647, + "loss": 0.2372, + "step": 13165 + }, + { + "epoch": 1.76, + "grad_norm": 0.326171875, + "learning_rate": 0.00013771628711473172, + "loss": 0.2003, + "step": 13166 + }, + { + "epoch": 1.76, + "grad_norm": 0.5703125, + "learning_rate": 0.00013770550186218995, + "loss": 0.335, + "step": 13167 + }, + { + "epoch": 1.76, + "grad_norm": 0.494140625, + "learning_rate": 0.00013769471609833734, + "loss": 0.2247, + "step": 13168 + }, + { + "epoch": 1.76, + "grad_norm": 0.78515625, + "learning_rate": 0.0001376839298233202, + "loss": 0.5006, + "step": 13169 + }, + { + "epoch": 1.76, + "grad_norm": 0.5234375, + "learning_rate": 0.00013767314303728478, + "loss": 0.3835, + "step": 13170 + }, + { + "epoch": 1.76, + "grad_norm": 0.55078125, + "learning_rate": 0.00013766235574037738, + "loss": 0.372, + "step": 13171 + }, + { + "epoch": 1.76, + "grad_norm": 0.59375, + "learning_rate": 0.00013765156793274424, + "loss": 0.5826, + "step": 13172 + }, + { + "epoch": 1.76, + "grad_norm": 1.0546875, + "learning_rate": 0.00013764077961453167, + "loss": 0.3057, + "step": 13173 + }, + { + "epoch": 1.76, + "grad_norm": 0.52734375, + "learning_rate": 0.00013762999078588599, + "loss": 0.405, + "step": 13174 + }, + { + "epoch": 1.76, + "grad_norm": 0.53515625, + "learning_rate": 0.00013761920144695345, + "loss": 0.2922, + "step": 13175 + }, + { + "epoch": 1.76, + "grad_norm": 0.51953125, + "learning_rate": 0.00013760841159788045, + "loss": 0.5391, + "step": 13176 + }, + { + "epoch": 1.76, + "grad_norm": 0.5859375, + "learning_rate": 0.0001375976212388132, + "loss": 0.4201, + "step": 13177 + }, + { + "epoch": 1.76, + "grad_norm": 0.443359375, + "learning_rate": 0.00013758683036989812, + "loss": 0.4742, + "step": 13178 + }, + { + "epoch": 1.76, + "grad_norm": 0.578125, + "learning_rate": 0.0001375760389912815, + "loss": 0.2778, + "step": 13179 + }, + { + "epoch": 1.76, + "grad_norm": 0.59765625, + "learning_rate": 0.00013756524710310965, + "loss": 0.8107, + "step": 13180 + }, + { + "epoch": 1.76, + "grad_norm": 0.458984375, + "learning_rate": 0.00013755445470552892, + "loss": 0.2299, + "step": 13181 + }, + { + "epoch": 1.76, + "grad_norm": 0.55078125, + "learning_rate": 0.0001375436617986857, + "loss": 0.6408, + "step": 13182 + }, + { + "epoch": 1.76, + "grad_norm": 0.53515625, + "learning_rate": 0.00013753286838272637, + "loss": 0.4573, + "step": 13183 + }, + { + "epoch": 1.76, + "grad_norm": 0.4765625, + "learning_rate": 0.0001375220744577972, + "loss": 0.4223, + "step": 13184 + }, + { + "epoch": 1.76, + "grad_norm": 0.44921875, + "learning_rate": 0.00013751128002404466, + "loss": 0.2844, + "step": 13185 + }, + { + "epoch": 1.76, + "grad_norm": 0.58203125, + "learning_rate": 0.00013750048508161505, + "loss": 0.5761, + "step": 13186 + }, + { + "epoch": 1.76, + "grad_norm": 0.4453125, + "learning_rate": 0.00013748968963065483, + "loss": 0.5251, + "step": 13187 + }, + { + "epoch": 1.76, + "grad_norm": 0.61328125, + "learning_rate": 0.00013747889367131033, + "loss": 0.6883, + "step": 13188 + }, + { + "epoch": 1.76, + "grad_norm": 0.484375, + "learning_rate": 0.00013746809720372797, + "loss": 0.413, + "step": 13189 + }, + { + "epoch": 1.76, + "grad_norm": 0.55078125, + "learning_rate": 0.0001374573002280542, + "loss": 0.488, + "step": 13190 + }, + { + "epoch": 1.76, + "grad_norm": 0.50390625, + "learning_rate": 0.00013744650274443538, + "loss": 0.6732, + "step": 13191 + }, + { + "epoch": 1.76, + "grad_norm": 0.53125, + "learning_rate": 0.00013743570475301798, + "loss": 0.3943, + "step": 13192 + }, + { + "epoch": 1.76, + "grad_norm": 0.578125, + "learning_rate": 0.00013742490625394835, + "loss": 0.5983, + "step": 13193 + }, + { + "epoch": 1.76, + "grad_norm": 0.77734375, + "learning_rate": 0.000137414107247373, + "loss": 0.3591, + "step": 13194 + }, + { + "epoch": 1.76, + "grad_norm": 0.52734375, + "learning_rate": 0.0001374033077334383, + "loss": 0.3392, + "step": 13195 + }, + { + "epoch": 1.76, + "grad_norm": 0.5234375, + "learning_rate": 0.00013739250771229082, + "loss": 0.3837, + "step": 13196 + }, + { + "epoch": 1.76, + "grad_norm": 0.453125, + "learning_rate": 0.00013738170718407687, + "loss": 0.284, + "step": 13197 + }, + { + "epoch": 1.76, + "grad_norm": 0.6015625, + "learning_rate": 0.00013737090614894304, + "loss": 0.3126, + "step": 13198 + }, + { + "epoch": 1.76, + "grad_norm": 0.388671875, + "learning_rate": 0.00013736010460703568, + "loss": 0.3297, + "step": 13199 + }, + { + "epoch": 1.76, + "grad_norm": 0.65234375, + "learning_rate": 0.00013734930255850135, + "loss": 0.5281, + "step": 13200 + }, + { + "epoch": 1.76, + "grad_norm": 0.62109375, + "learning_rate": 0.0001373385000034865, + "loss": 0.3783, + "step": 13201 + }, + { + "epoch": 1.76, + "grad_norm": 0.56640625, + "learning_rate": 0.00013732769694213762, + "loss": 0.3709, + "step": 13202 + }, + { + "epoch": 1.76, + "grad_norm": 0.58984375, + "learning_rate": 0.00013731689337460125, + "loss": 0.5184, + "step": 13203 + }, + { + "epoch": 1.76, + "grad_norm": 0.625, + "learning_rate": 0.0001373060893010238, + "loss": 0.684, + "step": 13204 + }, + { + "epoch": 1.76, + "grad_norm": 0.53125, + "learning_rate": 0.0001372952847215519, + "loss": 0.2719, + "step": 13205 + }, + { + "epoch": 1.76, + "grad_norm": 0.4140625, + "learning_rate": 0.00013728447963633195, + "loss": 0.4815, + "step": 13206 + }, + { + "epoch": 1.76, + "grad_norm": 0.5234375, + "learning_rate": 0.00013727367404551055, + "loss": 0.3812, + "step": 13207 + }, + { + "epoch": 1.76, + "grad_norm": 0.5625, + "learning_rate": 0.00013726286794923421, + "loss": 0.4636, + "step": 13208 + }, + { + "epoch": 1.76, + "grad_norm": 0.62890625, + "learning_rate": 0.00013725206134764946, + "loss": 0.358, + "step": 13209 + }, + { + "epoch": 1.76, + "grad_norm": 0.49609375, + "learning_rate": 0.00013724125424090285, + "loss": 0.4329, + "step": 13210 + }, + { + "epoch": 1.76, + "grad_norm": 0.43359375, + "learning_rate": 0.0001372304466291409, + "loss": 0.2058, + "step": 13211 + }, + { + "epoch": 1.76, + "grad_norm": 0.7734375, + "learning_rate": 0.00013721963851251028, + "loss": 0.4873, + "step": 13212 + }, + { + "epoch": 1.76, + "grad_norm": 0.5703125, + "learning_rate": 0.00013720882989115739, + "loss": 0.2805, + "step": 13213 + }, + { + "epoch": 1.76, + "grad_norm": 0.44921875, + "learning_rate": 0.00013719802076522892, + "loss": 0.5363, + "step": 13214 + }, + { + "epoch": 1.76, + "grad_norm": 0.5390625, + "learning_rate": 0.0001371872111348714, + "loss": 0.376, + "step": 13215 + }, + { + "epoch": 1.76, + "grad_norm": 0.6640625, + "learning_rate": 0.00013717640100023142, + "loss": 0.4055, + "step": 13216 + }, + { + "epoch": 1.76, + "grad_norm": 0.625, + "learning_rate": 0.00013716559036145565, + "loss": 0.4858, + "step": 13217 + }, + { + "epoch": 1.76, + "grad_norm": 0.5625, + "learning_rate": 0.00013715477921869058, + "loss": 0.3376, + "step": 13218 + }, + { + "epoch": 1.76, + "grad_norm": 0.64453125, + "learning_rate": 0.00013714396757208286, + "loss": 0.3926, + "step": 13219 + }, + { + "epoch": 1.76, + "grad_norm": 0.5390625, + "learning_rate": 0.0001371331554217791, + "loss": 0.5397, + "step": 13220 + }, + { + "epoch": 1.76, + "grad_norm": 0.5078125, + "learning_rate": 0.00013712234276792593, + "loss": 0.6384, + "step": 13221 + }, + { + "epoch": 1.76, + "grad_norm": 0.609375, + "learning_rate": 0.00013711152961066994, + "loss": 0.2876, + "step": 13222 + }, + { + "epoch": 1.76, + "grad_norm": 0.51171875, + "learning_rate": 0.0001371007159501578, + "loss": 0.263, + "step": 13223 + }, + { + "epoch": 1.76, + "grad_norm": 0.490234375, + "learning_rate": 0.00013708990178653614, + "loss": 0.3615, + "step": 13224 + }, + { + "epoch": 1.76, + "grad_norm": 0.57421875, + "learning_rate": 0.00013707908711995163, + "loss": 0.3199, + "step": 13225 + }, + { + "epoch": 1.76, + "grad_norm": 0.48828125, + "learning_rate": 0.0001370682719505509, + "loss": 0.2685, + "step": 13226 + }, + { + "epoch": 1.77, + "grad_norm": 0.60546875, + "learning_rate": 0.0001370574562784806, + "loss": 0.3385, + "step": 13227 + }, + { + "epoch": 1.77, + "grad_norm": 0.455078125, + "learning_rate": 0.0001370466401038874, + "loss": 0.369, + "step": 13228 + }, + { + "epoch": 1.77, + "grad_norm": 0.484375, + "learning_rate": 0.00013703582342691796, + "loss": 0.4866, + "step": 13229 + }, + { + "epoch": 1.77, + "grad_norm": 0.44140625, + "learning_rate": 0.00013702500624771904, + "loss": 0.2149, + "step": 13230 + }, + { + "epoch": 1.77, + "grad_norm": 0.640625, + "learning_rate": 0.00013701418856643726, + "loss": 0.2565, + "step": 13231 + }, + { + "epoch": 1.77, + "grad_norm": 0.498046875, + "learning_rate": 0.0001370033703832193, + "loss": 0.2738, + "step": 13232 + }, + { + "epoch": 1.77, + "grad_norm": 0.474609375, + "learning_rate": 0.00013699255169821191, + "loss": 0.4213, + "step": 13233 + }, + { + "epoch": 1.77, + "grad_norm": 0.515625, + "learning_rate": 0.00013698173251156176, + "loss": 0.3299, + "step": 13234 + }, + { + "epoch": 1.77, + "grad_norm": 0.6328125, + "learning_rate": 0.0001369709128234156, + "loss": 0.3785, + "step": 13235 + }, + { + "epoch": 1.77, + "grad_norm": 0.4921875, + "learning_rate": 0.00013696009263392014, + "loss": 0.3823, + "step": 13236 + }, + { + "epoch": 1.77, + "grad_norm": 0.458984375, + "learning_rate": 0.00013694927194322212, + "loss": 0.5198, + "step": 13237 + }, + { + "epoch": 1.77, + "grad_norm": 0.4765625, + "learning_rate": 0.0001369384507514682, + "loss": 0.3639, + "step": 13238 + }, + { + "epoch": 1.77, + "grad_norm": 0.78515625, + "learning_rate": 0.0001369276290588052, + "loss": 0.5855, + "step": 13239 + }, + { + "epoch": 1.77, + "grad_norm": 0.5625, + "learning_rate": 0.00013691680686537988, + "loss": 0.3048, + "step": 13240 + }, + { + "epoch": 1.77, + "grad_norm": 0.55859375, + "learning_rate": 0.00013690598417133893, + "loss": 0.3201, + "step": 13241 + }, + { + "epoch": 1.77, + "grad_norm": 0.58203125, + "learning_rate": 0.0001368951609768292, + "loss": 0.4097, + "step": 13242 + }, + { + "epoch": 1.77, + "grad_norm": 0.54296875, + "learning_rate": 0.00013688433728199737, + "loss": 0.4695, + "step": 13243 + }, + { + "epoch": 1.77, + "grad_norm": 0.7265625, + "learning_rate": 0.00013687351308699027, + "loss": 0.3955, + "step": 13244 + }, + { + "epoch": 1.77, + "grad_norm": 0.51171875, + "learning_rate": 0.00013686268839195465, + "loss": 0.2792, + "step": 13245 + }, + { + "epoch": 1.77, + "grad_norm": 0.498046875, + "learning_rate": 0.00013685186319703732, + "loss": 0.5932, + "step": 13246 + }, + { + "epoch": 1.77, + "grad_norm": 0.5546875, + "learning_rate": 0.00013684103750238505, + "loss": 0.5324, + "step": 13247 + }, + { + "epoch": 1.77, + "grad_norm": 0.462890625, + "learning_rate": 0.0001368302113081447, + "loss": 0.3802, + "step": 13248 + }, + { + "epoch": 1.77, + "grad_norm": 0.44921875, + "learning_rate": 0.00013681938461446307, + "loss": 0.2409, + "step": 13249 + }, + { + "epoch": 1.77, + "grad_norm": 0.55078125, + "learning_rate": 0.00013680855742148693, + "loss": 0.4278, + "step": 13250 + }, + { + "epoch": 1.77, + "grad_norm": 0.53515625, + "learning_rate": 0.00013679772972936312, + "loss": 0.2837, + "step": 13251 + }, + { + "epoch": 1.77, + "grad_norm": 0.50390625, + "learning_rate": 0.00013678690153823845, + "loss": 0.2571, + "step": 13252 + }, + { + "epoch": 1.77, + "grad_norm": 0.353515625, + "learning_rate": 0.00013677607284825983, + "loss": 0.3187, + "step": 13253 + }, + { + "epoch": 1.77, + "grad_norm": 0.546875, + "learning_rate": 0.00013676524365957403, + "loss": 0.363, + "step": 13254 + }, + { + "epoch": 1.77, + "grad_norm": 0.52734375, + "learning_rate": 0.000136754413972328, + "loss": 0.4447, + "step": 13255 + }, + { + "epoch": 1.77, + "grad_norm": 0.53125, + "learning_rate": 0.00013674358378666846, + "loss": 0.5995, + "step": 13256 + }, + { + "epoch": 1.77, + "grad_norm": 0.388671875, + "learning_rate": 0.00013673275310274234, + "loss": 0.1823, + "step": 13257 + }, + { + "epoch": 1.77, + "grad_norm": 0.474609375, + "learning_rate": 0.00013672192192069654, + "loss": 0.713, + "step": 13258 + }, + { + "epoch": 1.77, + "grad_norm": 0.53125, + "learning_rate": 0.00013671109024067789, + "loss": 0.2842, + "step": 13259 + }, + { + "epoch": 1.77, + "grad_norm": 0.95703125, + "learning_rate": 0.0001367002580628333, + "loss": 0.534, + "step": 13260 + }, + { + "epoch": 1.77, + "grad_norm": 0.62109375, + "learning_rate": 0.00013668942538730967, + "loss": 0.4207, + "step": 13261 + }, + { + "epoch": 1.77, + "grad_norm": 0.578125, + "learning_rate": 0.00013667859221425387, + "loss": 0.2672, + "step": 13262 + }, + { + "epoch": 1.77, + "grad_norm": 0.443359375, + "learning_rate": 0.00013666775854381284, + "loss": 0.3827, + "step": 13263 + }, + { + "epoch": 1.77, + "grad_norm": 0.5, + "learning_rate": 0.00013665692437613345, + "loss": 0.497, + "step": 13264 + }, + { + "epoch": 1.77, + "grad_norm": 0.5, + "learning_rate": 0.00013664608971136267, + "loss": 0.1792, + "step": 13265 + }, + { + "epoch": 1.77, + "grad_norm": 0.46484375, + "learning_rate": 0.00013663525454964733, + "loss": 0.3161, + "step": 13266 + }, + { + "epoch": 1.77, + "grad_norm": 0.373046875, + "learning_rate": 0.00013662441889113452, + "loss": 0.181, + "step": 13267 + }, + { + "epoch": 1.77, + "grad_norm": 0.55859375, + "learning_rate": 0.000136613582735971, + "loss": 0.3645, + "step": 13268 + }, + { + "epoch": 1.77, + "grad_norm": 0.4296875, + "learning_rate": 0.00013660274608430384, + "loss": 0.2881, + "step": 13269 + }, + { + "epoch": 1.77, + "grad_norm": 0.37109375, + "learning_rate": 0.00013659190893627993, + "loss": 0.2733, + "step": 13270 + }, + { + "epoch": 1.77, + "grad_norm": 0.56640625, + "learning_rate": 0.00013658107129204628, + "loss": 0.5144, + "step": 13271 + }, + { + "epoch": 1.77, + "grad_norm": 0.4609375, + "learning_rate": 0.00013657023315174978, + "loss": 0.2945, + "step": 13272 + }, + { + "epoch": 1.77, + "grad_norm": 0.447265625, + "learning_rate": 0.0001365593945155375, + "loss": 0.4324, + "step": 13273 + }, + { + "epoch": 1.77, + "grad_norm": 0.44140625, + "learning_rate": 0.00013654855538355632, + "loss": 0.3734, + "step": 13274 + }, + { + "epoch": 1.77, + "grad_norm": 0.50390625, + "learning_rate": 0.0001365377157559533, + "loss": 0.431, + "step": 13275 + }, + { + "epoch": 1.77, + "grad_norm": 0.462890625, + "learning_rate": 0.0001365268756328754, + "loss": 0.3644, + "step": 13276 + }, + { + "epoch": 1.77, + "grad_norm": 0.4609375, + "learning_rate": 0.00013651603501446964, + "loss": 0.2028, + "step": 13277 + }, + { + "epoch": 1.77, + "grad_norm": 0.70703125, + "learning_rate": 0.000136505193900883, + "loss": 0.5619, + "step": 13278 + }, + { + "epoch": 1.77, + "grad_norm": 0.91015625, + "learning_rate": 0.00013649435229226247, + "loss": 0.256, + "step": 13279 + }, + { + "epoch": 1.77, + "grad_norm": 0.453125, + "learning_rate": 0.00013648351018875514, + "loss": 0.3052, + "step": 13280 + }, + { + "epoch": 1.77, + "grad_norm": 0.455078125, + "learning_rate": 0.000136472667590508, + "loss": 0.5389, + "step": 13281 + }, + { + "epoch": 1.77, + "grad_norm": 0.5703125, + "learning_rate": 0.00013646182449766805, + "loss": 0.5548, + "step": 13282 + }, + { + "epoch": 1.77, + "grad_norm": 0.54296875, + "learning_rate": 0.00013645098091038236, + "loss": 0.3562, + "step": 13283 + }, + { + "epoch": 1.77, + "grad_norm": 0.400390625, + "learning_rate": 0.00013644013682879798, + "loss": 0.3789, + "step": 13284 + }, + { + "epoch": 1.77, + "grad_norm": 0.54296875, + "learning_rate": 0.00013642929225306198, + "loss": 0.2852, + "step": 13285 + }, + { + "epoch": 1.77, + "grad_norm": 0.5078125, + "learning_rate": 0.00013641844718332137, + "loss": 0.3888, + "step": 13286 + }, + { + "epoch": 1.77, + "grad_norm": 0.33984375, + "learning_rate": 0.00013640760161972323, + "loss": 0.1582, + "step": 13287 + }, + { + "epoch": 1.77, + "grad_norm": 0.482421875, + "learning_rate": 0.0001363967555624147, + "loss": 0.367, + "step": 13288 + }, + { + "epoch": 1.77, + "grad_norm": 0.3984375, + "learning_rate": 0.00013638590901154276, + "loss": 0.2391, + "step": 13289 + }, + { + "epoch": 1.77, + "grad_norm": 0.6328125, + "learning_rate": 0.00013637506196725456, + "loss": 0.3578, + "step": 13290 + }, + { + "epoch": 1.77, + "grad_norm": 0.451171875, + "learning_rate": 0.00013636421442969718, + "loss": 0.2041, + "step": 13291 + }, + { + "epoch": 1.77, + "grad_norm": 0.396484375, + "learning_rate": 0.0001363533663990177, + "loss": 0.2087, + "step": 13292 + }, + { + "epoch": 1.77, + "grad_norm": 0.4765625, + "learning_rate": 0.00013634251787536324, + "loss": 0.1797, + "step": 13293 + }, + { + "epoch": 1.77, + "grad_norm": 0.76953125, + "learning_rate": 0.00013633166885888094, + "loss": 0.3465, + "step": 13294 + }, + { + "epoch": 1.77, + "grad_norm": 0.486328125, + "learning_rate": 0.00013632081934971784, + "loss": 0.544, + "step": 13295 + }, + { + "epoch": 1.77, + "grad_norm": 0.5703125, + "learning_rate": 0.00013630996934802116, + "loss": 0.4098, + "step": 13296 + }, + { + "epoch": 1.77, + "grad_norm": 0.443359375, + "learning_rate": 0.00013629911885393797, + "loss": 0.2982, + "step": 13297 + }, + { + "epoch": 1.77, + "grad_norm": 0.7109375, + "learning_rate": 0.00013628826786761544, + "loss": 0.5494, + "step": 13298 + }, + { + "epoch": 1.77, + "grad_norm": 0.46875, + "learning_rate": 0.00013627741638920073, + "loss": 0.3394, + "step": 13299 + }, + { + "epoch": 1.77, + "grad_norm": 0.5625, + "learning_rate": 0.00013626656441884094, + "loss": 0.4145, + "step": 13300 + }, + { + "epoch": 1.77, + "grad_norm": 0.95703125, + "learning_rate": 0.00013625571195668328, + "loss": 0.7691, + "step": 13301 + }, + { + "epoch": 1.78, + "grad_norm": 0.55078125, + "learning_rate": 0.00013624485900287487, + "loss": 0.47, + "step": 13302 + }, + { + "epoch": 1.78, + "grad_norm": 0.419921875, + "learning_rate": 0.00013623400555756292, + "loss": 0.4783, + "step": 13303 + }, + { + "epoch": 1.78, + "grad_norm": 0.6171875, + "learning_rate": 0.00013622315162089462, + "loss": 0.2884, + "step": 13304 + }, + { + "epoch": 1.78, + "grad_norm": 0.546875, + "learning_rate": 0.00013621229719301713, + "loss": 0.4499, + "step": 13305 + }, + { + "epoch": 1.78, + "grad_norm": 0.45703125, + "learning_rate": 0.00013620144227407765, + "loss": 0.5052, + "step": 13306 + }, + { + "epoch": 1.78, + "grad_norm": 0.5859375, + "learning_rate": 0.00013619058686422338, + "loss": 0.5582, + "step": 13307 + }, + { + "epoch": 1.78, + "grad_norm": 0.4765625, + "learning_rate": 0.00013617973096360152, + "loss": 0.4125, + "step": 13308 + }, + { + "epoch": 1.78, + "grad_norm": 0.431640625, + "learning_rate": 0.00013616887457235926, + "loss": 0.3837, + "step": 13309 + }, + { + "epoch": 1.78, + "grad_norm": 0.578125, + "learning_rate": 0.00013615801769064392, + "loss": 0.3181, + "step": 13310 + }, + { + "epoch": 1.78, + "grad_norm": 0.47265625, + "learning_rate": 0.0001361471603186026, + "loss": 0.3535, + "step": 13311 + }, + { + "epoch": 1.78, + "grad_norm": 0.48828125, + "learning_rate": 0.00013613630245638258, + "loss": 0.3743, + "step": 13312 + }, + { + "epoch": 1.78, + "grad_norm": 0.458984375, + "learning_rate": 0.00013612544410413112, + "loss": 0.3509, + "step": 13313 + }, + { + "epoch": 1.78, + "grad_norm": 0.439453125, + "learning_rate": 0.00013611458526199545, + "loss": 0.3821, + "step": 13314 + }, + { + "epoch": 1.78, + "grad_norm": 0.50390625, + "learning_rate": 0.00013610372593012284, + "loss": 0.4038, + "step": 13315 + }, + { + "epoch": 1.78, + "grad_norm": 0.68359375, + "learning_rate": 0.00013609286610866055, + "loss": 0.7106, + "step": 13316 + }, + { + "epoch": 1.78, + "grad_norm": 0.3671875, + "learning_rate": 0.0001360820057977558, + "loss": 0.2196, + "step": 13317 + }, + { + "epoch": 1.78, + "grad_norm": 0.435546875, + "learning_rate": 0.00013607114499755588, + "loss": 0.219, + "step": 13318 + }, + { + "epoch": 1.78, + "grad_norm": 0.5625, + "learning_rate": 0.00013606028370820813, + "loss": 0.4979, + "step": 13319 + }, + { + "epoch": 1.78, + "grad_norm": 0.578125, + "learning_rate": 0.00013604942192985974, + "loss": 0.4199, + "step": 13320 + }, + { + "epoch": 1.78, + "grad_norm": 0.6796875, + "learning_rate": 0.00013603855966265807, + "loss": 0.4016, + "step": 13321 + }, + { + "epoch": 1.78, + "grad_norm": 0.6015625, + "learning_rate": 0.00013602769690675042, + "loss": 0.3792, + "step": 13322 + }, + { + "epoch": 1.78, + "grad_norm": 0.55859375, + "learning_rate": 0.00013601683366228407, + "loss": 0.4708, + "step": 13323 + }, + { + "epoch": 1.78, + "grad_norm": 0.55078125, + "learning_rate": 0.00013600596992940635, + "loss": 0.4557, + "step": 13324 + }, + { + "epoch": 1.78, + "grad_norm": 0.4765625, + "learning_rate": 0.00013599510570826454, + "loss": 0.2877, + "step": 13325 + }, + { + "epoch": 1.78, + "grad_norm": 0.76953125, + "learning_rate": 0.000135984240999006, + "loss": 0.4936, + "step": 13326 + }, + { + "epoch": 1.78, + "grad_norm": 0.5625, + "learning_rate": 0.00013597337580177808, + "loss": 0.5677, + "step": 13327 + }, + { + "epoch": 1.78, + "grad_norm": 0.41796875, + "learning_rate": 0.0001359625101167281, + "loss": 0.2357, + "step": 13328 + }, + { + "epoch": 1.78, + "grad_norm": 0.67578125, + "learning_rate": 0.00013595164394400339, + "loss": 0.844, + "step": 13329 + }, + { + "epoch": 1.78, + "grad_norm": 0.546875, + "learning_rate": 0.00013594077728375128, + "loss": 0.4916, + "step": 13330 + }, + { + "epoch": 1.78, + "grad_norm": 0.484375, + "learning_rate": 0.00013592991013611919, + "loss": 0.3825, + "step": 13331 + }, + { + "epoch": 1.78, + "grad_norm": 0.6484375, + "learning_rate": 0.00013591904250125447, + "loss": 0.5577, + "step": 13332 + }, + { + "epoch": 1.78, + "grad_norm": 0.58984375, + "learning_rate": 0.00013590817437930447, + "loss": 0.4437, + "step": 13333 + }, + { + "epoch": 1.78, + "grad_norm": 0.40234375, + "learning_rate": 0.0001358973057704166, + "loss": 0.2931, + "step": 13334 + }, + { + "epoch": 1.78, + "grad_norm": 0.51171875, + "learning_rate": 0.0001358864366747382, + "loss": 0.3402, + "step": 13335 + }, + { + "epoch": 1.78, + "grad_norm": 0.69140625, + "learning_rate": 0.0001358755670924167, + "loss": 0.292, + "step": 13336 + }, + { + "epoch": 1.78, + "grad_norm": 0.404296875, + "learning_rate": 0.00013586469702359948, + "loss": 0.3973, + "step": 13337 + }, + { + "epoch": 1.78, + "grad_norm": 0.71875, + "learning_rate": 0.00013585382646843396, + "loss": 0.335, + "step": 13338 + }, + { + "epoch": 1.78, + "grad_norm": 0.419921875, + "learning_rate": 0.00013584295542706752, + "loss": 0.2, + "step": 13339 + }, + { + "epoch": 1.78, + "grad_norm": 0.515625, + "learning_rate": 0.00013583208389964763, + "loss": 0.5756, + "step": 13340 + }, + { + "epoch": 1.78, + "grad_norm": 0.6015625, + "learning_rate": 0.00013582121188632167, + "loss": 0.4578, + "step": 13341 + }, + { + "epoch": 1.78, + "grad_norm": 0.62890625, + "learning_rate": 0.0001358103393872371, + "loss": 0.3257, + "step": 13342 + }, + { + "epoch": 1.78, + "grad_norm": 0.5625, + "learning_rate": 0.00013579946640254132, + "loss": 0.3608, + "step": 13343 + }, + { + "epoch": 1.78, + "grad_norm": 0.478515625, + "learning_rate": 0.00013578859293238182, + "loss": 0.364, + "step": 13344 + }, + { + "epoch": 1.78, + "grad_norm": 0.5625, + "learning_rate": 0.00013577771897690602, + "loss": 0.4055, + "step": 13345 + }, + { + "epoch": 1.78, + "grad_norm": 0.75, + "learning_rate": 0.00013576684453626137, + "loss": 0.5709, + "step": 13346 + }, + { + "epoch": 1.78, + "grad_norm": 0.59765625, + "learning_rate": 0.0001357559696105954, + "loss": 0.5337, + "step": 13347 + }, + { + "epoch": 1.78, + "grad_norm": 0.47265625, + "learning_rate": 0.0001357450942000555, + "loss": 0.2638, + "step": 13348 + }, + { + "epoch": 1.78, + "grad_norm": 0.47265625, + "learning_rate": 0.00013573421830478922, + "loss": 0.2763, + "step": 13349 + }, + { + "epoch": 1.78, + "grad_norm": 0.6328125, + "learning_rate": 0.00013572334192494393, + "loss": 0.3395, + "step": 13350 + }, + { + "epoch": 1.78, + "grad_norm": 0.455078125, + "learning_rate": 0.00013571246506066727, + "loss": 0.2097, + "step": 13351 + }, + { + "epoch": 1.78, + "grad_norm": 0.384765625, + "learning_rate": 0.00013570158771210664, + "loss": 0.3441, + "step": 13352 + }, + { + "epoch": 1.78, + "grad_norm": 0.44921875, + "learning_rate": 0.00013569070987940954, + "loss": 0.3048, + "step": 13353 + }, + { + "epoch": 1.78, + "grad_norm": 0.51171875, + "learning_rate": 0.00013567983156272356, + "loss": 0.3797, + "step": 13354 + }, + { + "epoch": 1.78, + "grad_norm": 0.451171875, + "learning_rate": 0.0001356689527621961, + "loss": 0.258, + "step": 13355 + }, + { + "epoch": 1.78, + "grad_norm": 0.4375, + "learning_rate": 0.00013565807347797482, + "loss": 0.3425, + "step": 13356 + }, + { + "epoch": 1.78, + "grad_norm": 0.48828125, + "learning_rate": 0.0001356471937102071, + "loss": 0.3599, + "step": 13357 + }, + { + "epoch": 1.78, + "grad_norm": 0.6484375, + "learning_rate": 0.00013563631345904064, + "loss": 0.7306, + "step": 13358 + }, + { + "epoch": 1.78, + "grad_norm": 0.4609375, + "learning_rate": 0.00013562543272462285, + "loss": 0.3354, + "step": 13359 + }, + { + "epoch": 1.78, + "grad_norm": 0.515625, + "learning_rate": 0.00013561455150710132, + "loss": 0.4916, + "step": 13360 + }, + { + "epoch": 1.78, + "grad_norm": 0.5390625, + "learning_rate": 0.00013560366980662364, + "loss": 0.3805, + "step": 13361 + }, + { + "epoch": 1.78, + "grad_norm": 0.77734375, + "learning_rate": 0.00013559278762333734, + "loss": 0.3725, + "step": 13362 + }, + { + "epoch": 1.78, + "grad_norm": 0.408203125, + "learning_rate": 0.00013558190495739, + "loss": 0.2154, + "step": 13363 + }, + { + "epoch": 1.78, + "grad_norm": 0.5390625, + "learning_rate": 0.00013557102180892917, + "loss": 0.5385, + "step": 13364 + }, + { + "epoch": 1.78, + "grad_norm": 0.62890625, + "learning_rate": 0.0001355601381781025, + "loss": 0.456, + "step": 13365 + }, + { + "epoch": 1.78, + "grad_norm": 0.453125, + "learning_rate": 0.00013554925406505748, + "loss": 0.2708, + "step": 13366 + }, + { + "epoch": 1.78, + "grad_norm": 0.51171875, + "learning_rate": 0.00013553836946994177, + "loss": 0.4351, + "step": 13367 + }, + { + "epoch": 1.78, + "grad_norm": 0.5078125, + "learning_rate": 0.000135527484392903, + "loss": 0.4552, + "step": 13368 + }, + { + "epoch": 1.78, + "grad_norm": 0.50390625, + "learning_rate": 0.0001355165988340887, + "loss": 0.4257, + "step": 13369 + }, + { + "epoch": 1.78, + "grad_norm": 0.494140625, + "learning_rate": 0.00013550571279364655, + "loss": 0.2467, + "step": 13370 + }, + { + "epoch": 1.78, + "grad_norm": 0.56640625, + "learning_rate": 0.0001354948262717241, + "loss": 0.4878, + "step": 13371 + }, + { + "epoch": 1.78, + "grad_norm": 0.546875, + "learning_rate": 0.0001354839392684691, + "loss": 0.4837, + "step": 13372 + }, + { + "epoch": 1.78, + "grad_norm": 0.38671875, + "learning_rate": 0.00013547305178402907, + "loss": 0.236, + "step": 13373 + }, + { + "epoch": 1.78, + "grad_norm": 0.3828125, + "learning_rate": 0.0001354621638185517, + "loss": 0.2896, + "step": 13374 + }, + { + "epoch": 1.78, + "grad_norm": 0.40625, + "learning_rate": 0.0001354512753721846, + "loss": 0.4897, + "step": 13375 + }, + { + "epoch": 1.78, + "grad_norm": 0.921875, + "learning_rate": 0.00013544038644507547, + "loss": 0.4122, + "step": 13376 + }, + { + "epoch": 1.79, + "grad_norm": 0.5859375, + "learning_rate": 0.000135429497037372, + "loss": 0.495, + "step": 13377 + }, + { + "epoch": 1.79, + "grad_norm": 0.392578125, + "learning_rate": 0.00013541860714922174, + "loss": 0.2104, + "step": 13378 + }, + { + "epoch": 1.79, + "grad_norm": 0.55078125, + "learning_rate": 0.0001354077167807725, + "loss": 0.4128, + "step": 13379 + }, + { + "epoch": 1.79, + "grad_norm": 0.408203125, + "learning_rate": 0.00013539682593217183, + "loss": 0.3705, + "step": 13380 + }, + { + "epoch": 1.79, + "grad_norm": 0.5546875, + "learning_rate": 0.00013538593460356754, + "loss": 0.4909, + "step": 13381 + }, + { + "epoch": 1.79, + "grad_norm": 0.333984375, + "learning_rate": 0.00013537504279510724, + "loss": 0.2747, + "step": 13382 + }, + { + "epoch": 1.79, + "grad_norm": 0.43359375, + "learning_rate": 0.00013536415050693869, + "loss": 0.3693, + "step": 13383 + }, + { + "epoch": 1.79, + "grad_norm": 0.435546875, + "learning_rate": 0.0001353532577392095, + "loss": 0.4058, + "step": 13384 + }, + { + "epoch": 1.79, + "grad_norm": 0.5078125, + "learning_rate": 0.00013534236449206746, + "loss": 0.5925, + "step": 13385 + }, + { + "epoch": 1.79, + "grad_norm": 0.353515625, + "learning_rate": 0.00013533147076566032, + "loss": 0.2974, + "step": 13386 + }, + { + "epoch": 1.79, + "grad_norm": 0.578125, + "learning_rate": 0.0001353205765601357, + "loss": 0.6407, + "step": 13387 + }, + { + "epoch": 1.79, + "grad_norm": 0.515625, + "learning_rate": 0.00013530968187564147, + "loss": 0.3118, + "step": 13388 + }, + { + "epoch": 1.79, + "grad_norm": 0.5, + "learning_rate": 0.00013529878671232523, + "loss": 0.4816, + "step": 13389 + }, + { + "epoch": 1.79, + "grad_norm": 0.5234375, + "learning_rate": 0.00013528789107033484, + "loss": 0.4578, + "step": 13390 + }, + { + "epoch": 1.79, + "grad_norm": 0.671875, + "learning_rate": 0.00013527699494981794, + "loss": 0.4465, + "step": 13391 + }, + { + "epoch": 1.79, + "grad_norm": 0.42578125, + "learning_rate": 0.0001352660983509224, + "loss": 0.2929, + "step": 13392 + }, + { + "epoch": 1.79, + "grad_norm": 0.69921875, + "learning_rate": 0.00013525520127379587, + "loss": 0.5895, + "step": 13393 + }, + { + "epoch": 1.79, + "grad_norm": 0.61328125, + "learning_rate": 0.00013524430371858622, + "loss": 0.2661, + "step": 13394 + }, + { + "epoch": 1.79, + "grad_norm": 0.3984375, + "learning_rate": 0.00013523340568544122, + "loss": 0.2187, + "step": 13395 + }, + { + "epoch": 1.79, + "grad_norm": 0.66015625, + "learning_rate": 0.00013522250717450858, + "loss": 0.3164, + "step": 13396 + }, + { + "epoch": 1.79, + "grad_norm": 0.44140625, + "learning_rate": 0.00013521160818593617, + "loss": 0.337, + "step": 13397 + }, + { + "epoch": 1.79, + "grad_norm": 0.35546875, + "learning_rate": 0.00013520070871987172, + "loss": 0.2023, + "step": 13398 + }, + { + "epoch": 1.79, + "grad_norm": 0.4921875, + "learning_rate": 0.0001351898087764631, + "loss": 0.2991, + "step": 13399 + }, + { + "epoch": 1.79, + "grad_norm": 0.5234375, + "learning_rate": 0.00013517890835585807, + "loss": 0.3515, + "step": 13400 + }, + { + "epoch": 1.79, + "grad_norm": 0.50390625, + "learning_rate": 0.00013516800745820448, + "loss": 0.3522, + "step": 13401 + }, + { + "epoch": 1.79, + "grad_norm": 0.57421875, + "learning_rate": 0.00013515710608365012, + "loss": 0.4678, + "step": 13402 + }, + { + "epoch": 1.79, + "grad_norm": 0.54296875, + "learning_rate": 0.00013514620423234287, + "loss": 0.2076, + "step": 13403 + }, + { + "epoch": 1.79, + "grad_norm": 0.333984375, + "learning_rate": 0.00013513530190443048, + "loss": 0.2203, + "step": 13404 + }, + { + "epoch": 1.79, + "grad_norm": 0.5234375, + "learning_rate": 0.0001351243991000609, + "loss": 0.2496, + "step": 13405 + }, + { + "epoch": 1.79, + "grad_norm": 0.58203125, + "learning_rate": 0.0001351134958193819, + "loss": 0.4353, + "step": 13406 + }, + { + "epoch": 1.79, + "grad_norm": 0.75390625, + "learning_rate": 0.00013510259206254136, + "loss": 0.3751, + "step": 13407 + }, + { + "epoch": 1.79, + "grad_norm": 0.5234375, + "learning_rate": 0.00013509168782968714, + "loss": 0.2384, + "step": 13408 + }, + { + "epoch": 1.79, + "grad_norm": 0.671875, + "learning_rate": 0.00013508078312096715, + "loss": 0.8305, + "step": 13409 + }, + { + "epoch": 1.79, + "grad_norm": 0.482421875, + "learning_rate": 0.00013506987793652917, + "loss": 0.2789, + "step": 13410 + }, + { + "epoch": 1.79, + "grad_norm": 0.54296875, + "learning_rate": 0.00013505897227652117, + "loss": 0.2703, + "step": 13411 + }, + { + "epoch": 1.79, + "grad_norm": 0.44140625, + "learning_rate": 0.00013504806614109098, + "loss": 0.5944, + "step": 13412 + }, + { + "epoch": 1.79, + "grad_norm": 0.4140625, + "learning_rate": 0.00013503715953038658, + "loss": 0.3108, + "step": 13413 + }, + { + "epoch": 1.79, + "grad_norm": 0.68359375, + "learning_rate": 0.00013502625244455577, + "loss": 0.285, + "step": 13414 + }, + { + "epoch": 1.79, + "grad_norm": 0.6171875, + "learning_rate": 0.00013501534488374647, + "loss": 0.4709, + "step": 13415 + }, + { + "epoch": 1.79, + "grad_norm": 0.67578125, + "learning_rate": 0.00013500443684810664, + "loss": 0.569, + "step": 13416 + }, + { + "epoch": 1.79, + "grad_norm": 0.5078125, + "learning_rate": 0.0001349935283377842, + "loss": 0.3231, + "step": 13417 + }, + { + "epoch": 1.79, + "grad_norm": 0.640625, + "learning_rate": 0.000134982619352927, + "loss": 0.2799, + "step": 13418 + }, + { + "epoch": 1.79, + "grad_norm": 0.439453125, + "learning_rate": 0.0001349717098936831, + "loss": 0.4105, + "step": 13419 + }, + { + "epoch": 1.79, + "grad_norm": 0.5625, + "learning_rate": 0.00013496079996020037, + "loss": 0.2268, + "step": 13420 + }, + { + "epoch": 1.79, + "grad_norm": 0.43359375, + "learning_rate": 0.0001349498895526267, + "loss": 0.4409, + "step": 13421 + }, + { + "epoch": 1.79, + "grad_norm": 0.484375, + "learning_rate": 0.00013493897867111014, + "loss": 0.3137, + "step": 13422 + }, + { + "epoch": 1.79, + "grad_norm": 0.59375, + "learning_rate": 0.00013492806731579856, + "loss": 0.3257, + "step": 13423 + }, + { + "epoch": 1.79, + "grad_norm": 0.44921875, + "learning_rate": 0.00013491715548684003, + "loss": 0.2309, + "step": 13424 + }, + { + "epoch": 1.79, + "grad_norm": 0.37109375, + "learning_rate": 0.00013490624318438242, + "loss": 0.2682, + "step": 13425 + }, + { + "epoch": 1.79, + "grad_norm": 0.498046875, + "learning_rate": 0.00013489533040857377, + "loss": 0.2804, + "step": 13426 + }, + { + "epoch": 1.79, + "grad_norm": 0.62109375, + "learning_rate": 0.00013488441715956204, + "loss": 0.6297, + "step": 13427 + }, + { + "epoch": 1.79, + "grad_norm": 0.55078125, + "learning_rate": 0.0001348735034374952, + "loss": 0.3992, + "step": 13428 + }, + { + "epoch": 1.79, + "grad_norm": 0.625, + "learning_rate": 0.0001348625892425213, + "loss": 0.4615, + "step": 13429 + }, + { + "epoch": 1.79, + "grad_norm": 0.443359375, + "learning_rate": 0.00013485167457478832, + "loss": 0.226, + "step": 13430 + }, + { + "epoch": 1.79, + "grad_norm": 0.51953125, + "learning_rate": 0.00013484075943444426, + "loss": 0.2594, + "step": 13431 + }, + { + "epoch": 1.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00013482984382163712, + "loss": 0.6347, + "step": 13432 + }, + { + "epoch": 1.79, + "grad_norm": 0.55078125, + "learning_rate": 0.00013481892773651495, + "loss": 0.5174, + "step": 13433 + }, + { + "epoch": 1.79, + "grad_norm": 0.43359375, + "learning_rate": 0.0001348080111792258, + "loss": 0.4188, + "step": 13434 + }, + { + "epoch": 1.79, + "grad_norm": 0.490234375, + "learning_rate": 0.00013479709414991763, + "loss": 0.3708, + "step": 13435 + }, + { + "epoch": 1.79, + "grad_norm": 0.55078125, + "learning_rate": 0.00013478617664873857, + "loss": 0.2134, + "step": 13436 + }, + { + "epoch": 1.79, + "grad_norm": 0.455078125, + "learning_rate": 0.00013477525867583662, + "loss": 0.3643, + "step": 13437 + }, + { + "epoch": 1.79, + "grad_norm": 0.47265625, + "learning_rate": 0.00013476434023135988, + "loss": 0.3442, + "step": 13438 + }, + { + "epoch": 1.79, + "grad_norm": 0.55078125, + "learning_rate": 0.0001347534213154563, + "loss": 0.3774, + "step": 13439 + }, + { + "epoch": 1.79, + "grad_norm": 0.482421875, + "learning_rate": 0.00013474250192827407, + "loss": 0.3546, + "step": 13440 + }, + { + "epoch": 1.79, + "grad_norm": 0.498046875, + "learning_rate": 0.00013473158206996118, + "loss": 0.3548, + "step": 13441 + }, + { + "epoch": 1.79, + "grad_norm": 0.41015625, + "learning_rate": 0.0001347206617406658, + "loss": 0.3622, + "step": 13442 + }, + { + "epoch": 1.79, + "grad_norm": 0.66015625, + "learning_rate": 0.00013470974094053595, + "loss": 0.5238, + "step": 13443 + }, + { + "epoch": 1.79, + "grad_norm": 0.435546875, + "learning_rate": 0.0001346988196697197, + "loss": 0.3746, + "step": 13444 + }, + { + "epoch": 1.79, + "grad_norm": 0.54296875, + "learning_rate": 0.00013468789792836521, + "loss": 0.3907, + "step": 13445 + }, + { + "epoch": 1.79, + "grad_norm": 0.2734375, + "learning_rate": 0.00013467697571662056, + "loss": 0.1559, + "step": 13446 + }, + { + "epoch": 1.79, + "grad_norm": 0.5625, + "learning_rate": 0.0001346660530346339, + "loss": 0.523, + "step": 13447 + }, + { + "epoch": 1.79, + "grad_norm": 0.51953125, + "learning_rate": 0.00013465512988255324, + "loss": 0.4239, + "step": 13448 + }, + { + "epoch": 1.79, + "grad_norm": 0.41796875, + "learning_rate": 0.00013464420626052684, + "loss": 0.331, + "step": 13449 + }, + { + "epoch": 1.79, + "grad_norm": 0.5625, + "learning_rate": 0.00013463328216870273, + "loss": 0.3984, + "step": 13450 + }, + { + "epoch": 1.79, + "grad_norm": 0.462890625, + "learning_rate": 0.0001346223576072291, + "loss": 0.4613, + "step": 13451 + }, + { + "epoch": 1.8, + "grad_norm": 0.640625, + "learning_rate": 0.0001346114325762541, + "loss": 0.2702, + "step": 13452 + }, + { + "epoch": 1.8, + "grad_norm": 0.4375, + "learning_rate": 0.0001346005070759258, + "loss": 0.4404, + "step": 13453 + }, + { + "epoch": 1.8, + "grad_norm": 0.484375, + "learning_rate": 0.00013458958110639245, + "loss": 0.3988, + "step": 13454 + }, + { + "epoch": 1.8, + "grad_norm": 0.4453125, + "learning_rate": 0.00013457865466780223, + "loss": 0.2685, + "step": 13455 + }, + { + "epoch": 1.8, + "grad_norm": 0.50390625, + "learning_rate": 0.0001345677277603032, + "loss": 0.3632, + "step": 13456 + }, + { + "epoch": 1.8, + "grad_norm": 0.482421875, + "learning_rate": 0.0001345568003840436, + "loss": 0.3352, + "step": 13457 + }, + { + "epoch": 1.8, + "grad_norm": 0.46875, + "learning_rate": 0.00013454587253917163, + "loss": 0.3643, + "step": 13458 + }, + { + "epoch": 1.8, + "grad_norm": 0.46484375, + "learning_rate": 0.00013453494422583547, + "loss": 0.4193, + "step": 13459 + }, + { + "epoch": 1.8, + "grad_norm": 0.443359375, + "learning_rate": 0.00013452401544418325, + "loss": 0.3326, + "step": 13460 + }, + { + "epoch": 1.8, + "grad_norm": 0.66015625, + "learning_rate": 0.00013451308619436328, + "loss": 0.2276, + "step": 13461 + }, + { + "epoch": 1.8, + "grad_norm": 0.546875, + "learning_rate": 0.00013450215647652368, + "loss": 0.2551, + "step": 13462 + }, + { + "epoch": 1.8, + "grad_norm": 0.609375, + "learning_rate": 0.00013449122629081273, + "loss": 0.4083, + "step": 13463 + }, + { + "epoch": 1.8, + "grad_norm": 0.46875, + "learning_rate": 0.00013448029563737856, + "loss": 0.3571, + "step": 13464 + }, + { + "epoch": 1.8, + "grad_norm": 0.52734375, + "learning_rate": 0.0001344693645163695, + "loss": 0.4038, + "step": 13465 + }, + { + "epoch": 1.8, + "grad_norm": 0.671875, + "learning_rate": 0.0001344584329279337, + "loss": 0.3874, + "step": 13466 + }, + { + "epoch": 1.8, + "grad_norm": 0.474609375, + "learning_rate": 0.00013444750087221943, + "loss": 0.5931, + "step": 13467 + }, + { + "epoch": 1.8, + "grad_norm": 0.55078125, + "learning_rate": 0.00013443656834937498, + "loss": 0.5008, + "step": 13468 + }, + { + "epoch": 1.8, + "grad_norm": 0.462890625, + "learning_rate": 0.0001344256353595485, + "loss": 0.271, + "step": 13469 + }, + { + "epoch": 1.8, + "grad_norm": 0.494140625, + "learning_rate": 0.0001344147019028884, + "loss": 0.3091, + "step": 13470 + }, + { + "epoch": 1.8, + "grad_norm": 0.52734375, + "learning_rate": 0.00013440376797954276, + "loss": 0.3392, + "step": 13471 + }, + { + "epoch": 1.8, + "grad_norm": 0.56640625, + "learning_rate": 0.00013439283358966, + "loss": 0.4167, + "step": 13472 + }, + { + "epoch": 1.8, + "grad_norm": 0.5390625, + "learning_rate": 0.00013438189873338834, + "loss": 0.4007, + "step": 13473 + }, + { + "epoch": 1.8, + "grad_norm": 0.5703125, + "learning_rate": 0.00013437096341087606, + "loss": 0.3078, + "step": 13474 + }, + { + "epoch": 1.8, + "grad_norm": 0.58203125, + "learning_rate": 0.00013436002762227145, + "loss": 0.4527, + "step": 13475 + }, + { + "epoch": 1.8, + "grad_norm": 0.412109375, + "learning_rate": 0.00013434909136772282, + "loss": 0.1981, + "step": 13476 + }, + { + "epoch": 1.8, + "grad_norm": 0.54296875, + "learning_rate": 0.00013433815464737847, + "loss": 0.236, + "step": 13477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5, + "learning_rate": 0.00013432721746138668, + "loss": 0.5126, + "step": 13478 + }, + { + "epoch": 1.8, + "grad_norm": 0.56640625, + "learning_rate": 0.00013431627980989584, + "loss": 0.4922, + "step": 13479 + }, + { + "epoch": 1.8, + "grad_norm": 0.482421875, + "learning_rate": 0.0001343053416930542, + "loss": 0.328, + "step": 13480 + }, + { + "epoch": 1.8, + "grad_norm": 0.498046875, + "learning_rate": 0.0001342944031110101, + "loss": 0.2913, + "step": 13481 + }, + { + "epoch": 1.8, + "grad_norm": 0.455078125, + "learning_rate": 0.00013428346406391188, + "loss": 0.3574, + "step": 13482 + }, + { + "epoch": 1.8, + "grad_norm": 0.400390625, + "learning_rate": 0.00013427252455190786, + "loss": 0.3194, + "step": 13483 + }, + { + "epoch": 1.8, + "grad_norm": 0.310546875, + "learning_rate": 0.00013426158457514645, + "loss": 0.363, + "step": 13484 + }, + { + "epoch": 1.8, + "grad_norm": 0.375, + "learning_rate": 0.00013425064413377595, + "loss": 0.2963, + "step": 13485 + }, + { + "epoch": 1.8, + "grad_norm": 0.53125, + "learning_rate": 0.00013423970322794476, + "loss": 0.2595, + "step": 13486 + }, + { + "epoch": 1.8, + "grad_norm": 0.5625, + "learning_rate": 0.0001342287618578012, + "loss": 0.3075, + "step": 13487 + }, + { + "epoch": 1.8, + "grad_norm": 0.6484375, + "learning_rate": 0.00013421782002349367, + "loss": 0.4813, + "step": 13488 + }, + { + "epoch": 1.8, + "grad_norm": 0.431640625, + "learning_rate": 0.0001342068777251705, + "loss": 0.3846, + "step": 13489 + }, + { + "epoch": 1.8, + "grad_norm": 0.6640625, + "learning_rate": 0.00013419593496298018, + "loss": 0.5971, + "step": 13490 + }, + { + "epoch": 1.8, + "grad_norm": 0.5390625, + "learning_rate": 0.000134184991737071, + "loss": 0.3366, + "step": 13491 + }, + { + "epoch": 1.8, + "grad_norm": 0.6171875, + "learning_rate": 0.00013417404804759138, + "loss": 0.3811, + "step": 13492 + }, + { + "epoch": 1.8, + "grad_norm": 0.5390625, + "learning_rate": 0.00013416310389468975, + "loss": 0.3582, + "step": 13493 + }, + { + "epoch": 1.8, + "grad_norm": 0.71484375, + "learning_rate": 0.0001341521592785145, + "loss": 0.2062, + "step": 13494 + }, + { + "epoch": 1.8, + "grad_norm": 0.671875, + "learning_rate": 0.00013414121419921406, + "loss": 0.473, + "step": 13495 + }, + { + "epoch": 1.8, + "grad_norm": 0.412109375, + "learning_rate": 0.00013413026865693683, + "loss": 0.2372, + "step": 13496 + }, + { + "epoch": 1.8, + "grad_norm": 0.451171875, + "learning_rate": 0.00013411932265183126, + "loss": 0.2405, + "step": 13497 + }, + { + "epoch": 1.8, + "grad_norm": 0.392578125, + "learning_rate": 0.0001341083761840458, + "loss": 0.1326, + "step": 13498 + }, + { + "epoch": 1.8, + "grad_norm": 0.3671875, + "learning_rate": 0.00013409742925372882, + "loss": 0.1979, + "step": 13499 + }, + { + "epoch": 1.8, + "grad_norm": 0.609375, + "learning_rate": 0.00013408648186102886, + "loss": 0.4406, + "step": 13500 + }, + { + "epoch": 1.8, + "grad_norm": 0.6015625, + "learning_rate": 0.0001340755340060943, + "loss": 0.2711, + "step": 13501 + }, + { + "epoch": 1.8, + "grad_norm": 0.57421875, + "learning_rate": 0.00013406458568907367, + "loss": 0.607, + "step": 13502 + }, + { + "epoch": 1.8, + "grad_norm": 0.5234375, + "learning_rate": 0.00013405363691011533, + "loss": 0.5055, + "step": 13503 + }, + { + "epoch": 1.8, + "grad_norm": 0.43359375, + "learning_rate": 0.0001340426876693679, + "loss": 0.4348, + "step": 13504 + }, + { + "epoch": 1.8, + "grad_norm": 0.42578125, + "learning_rate": 0.00013403173796697968, + "loss": 0.1644, + "step": 13505 + }, + { + "epoch": 1.8, + "grad_norm": 0.47265625, + "learning_rate": 0.00013402078780309933, + "loss": 0.2183, + "step": 13506 + }, + { + "epoch": 1.8, + "grad_norm": 0.59765625, + "learning_rate": 0.00013400983717787524, + "loss": 0.448, + "step": 13507 + }, + { + "epoch": 1.8, + "grad_norm": 0.58984375, + "learning_rate": 0.0001339988860914559, + "loss": 0.5044, + "step": 13508 + }, + { + "epoch": 1.8, + "grad_norm": 0.66015625, + "learning_rate": 0.00013398793454398987, + "loss": 0.8222, + "step": 13509 + }, + { + "epoch": 1.8, + "grad_norm": 0.73828125, + "learning_rate": 0.00013397698253562565, + "loss": 0.7365, + "step": 13510 + }, + { + "epoch": 1.8, + "grad_norm": 0.50390625, + "learning_rate": 0.00013396603006651174, + "loss": 0.3822, + "step": 13511 + }, + { + "epoch": 1.8, + "grad_norm": 0.466796875, + "learning_rate": 0.00013395507713679665, + "loss": 0.3179, + "step": 13512 + }, + { + "epoch": 1.8, + "grad_norm": 0.498046875, + "learning_rate": 0.00013394412374662893, + "loss": 0.2242, + "step": 13513 + }, + { + "epoch": 1.8, + "grad_norm": 0.32421875, + "learning_rate": 0.00013393316989615707, + "loss": 0.2999, + "step": 13514 + }, + { + "epoch": 1.8, + "grad_norm": 0.43359375, + "learning_rate": 0.00013392221558552968, + "loss": 0.3994, + "step": 13515 + }, + { + "epoch": 1.8, + "grad_norm": 0.7109375, + "learning_rate": 0.0001339112608148953, + "loss": 0.6766, + "step": 13516 + }, + { + "epoch": 1.8, + "grad_norm": 0.6015625, + "learning_rate": 0.0001339003055844024, + "loss": 0.4943, + "step": 13517 + }, + { + "epoch": 1.8, + "grad_norm": 0.33984375, + "learning_rate": 0.00013388934989419962, + "loss": 0.3955, + "step": 13518 + }, + { + "epoch": 1.8, + "grad_norm": 0.482421875, + "learning_rate": 0.0001338783937444355, + "loss": 0.1836, + "step": 13519 + }, + { + "epoch": 1.8, + "grad_norm": 0.44921875, + "learning_rate": 0.00013386743713525868, + "loss": 0.3456, + "step": 13520 + }, + { + "epoch": 1.8, + "grad_norm": 0.5625, + "learning_rate": 0.00013385648006681758, + "loss": 0.4892, + "step": 13521 + }, + { + "epoch": 1.8, + "grad_norm": 0.62109375, + "learning_rate": 0.00013384552253926094, + "loss": 0.5572, + "step": 13522 + }, + { + "epoch": 1.8, + "grad_norm": 0.60546875, + "learning_rate": 0.00013383456455273732, + "loss": 0.3457, + "step": 13523 + }, + { + "epoch": 1.8, + "grad_norm": 0.53125, + "learning_rate": 0.00013382360610739524, + "loss": 0.3993, + "step": 13524 + }, + { + "epoch": 1.8, + "grad_norm": 0.4921875, + "learning_rate": 0.00013381264720338336, + "loss": 0.3889, + "step": 13525 + }, + { + "epoch": 1.8, + "grad_norm": 0.45703125, + "learning_rate": 0.00013380168784085027, + "loss": 0.5196, + "step": 13526 + }, + { + "epoch": 1.81, + "grad_norm": 0.4609375, + "learning_rate": 0.00013379072801994463, + "loss": 0.2854, + "step": 13527 + }, + { + "epoch": 1.81, + "grad_norm": 0.435546875, + "learning_rate": 0.000133779767740815, + "loss": 0.3272, + "step": 13528 + }, + { + "epoch": 1.81, + "grad_norm": 0.609375, + "learning_rate": 0.0001337688070036101, + "loss": 0.8158, + "step": 13529 + }, + { + "epoch": 1.81, + "grad_norm": 0.828125, + "learning_rate": 0.00013375784580847846, + "loss": 0.3152, + "step": 13530 + }, + { + "epoch": 1.81, + "grad_norm": 0.486328125, + "learning_rate": 0.00013374688415556876, + "loss": 0.3599, + "step": 13531 + }, + { + "epoch": 1.81, + "grad_norm": 0.61328125, + "learning_rate": 0.00013373592204502966, + "loss": 0.3634, + "step": 13532 + }, + { + "epoch": 1.81, + "grad_norm": 0.458984375, + "learning_rate": 0.00013372495947700977, + "loss": 0.3036, + "step": 13533 + }, + { + "epoch": 1.81, + "grad_norm": 0.78125, + "learning_rate": 0.00013371399645165788, + "loss": 0.5, + "step": 13534 + }, + { + "epoch": 1.81, + "grad_norm": 0.466796875, + "learning_rate": 0.00013370303296912249, + "loss": 0.5294, + "step": 13535 + }, + { + "epoch": 1.81, + "grad_norm": 0.5234375, + "learning_rate": 0.00013369206902955234, + "loss": 0.4464, + "step": 13536 + }, + { + "epoch": 1.81, + "grad_norm": 0.419921875, + "learning_rate": 0.00013368110463309613, + "loss": 0.3074, + "step": 13537 + }, + { + "epoch": 1.81, + "grad_norm": 0.625, + "learning_rate": 0.00013367013977990254, + "loss": 0.5985, + "step": 13538 + }, + { + "epoch": 1.81, + "grad_norm": 0.4921875, + "learning_rate": 0.00013365917447012018, + "loss": 0.3145, + "step": 13539 + }, + { + "epoch": 1.81, + "grad_norm": 0.609375, + "learning_rate": 0.0001336482087038979, + "loss": 0.6716, + "step": 13540 + }, + { + "epoch": 1.81, + "grad_norm": 0.546875, + "learning_rate": 0.00013363724248138426, + "loss": 0.5129, + "step": 13541 + }, + { + "epoch": 1.81, + "grad_norm": 0.51953125, + "learning_rate": 0.000133626275802728, + "loss": 0.3819, + "step": 13542 + }, + { + "epoch": 1.81, + "grad_norm": 0.482421875, + "learning_rate": 0.00013361530866807793, + "loss": 0.3464, + "step": 13543 + }, + { + "epoch": 1.81, + "grad_norm": 0.482421875, + "learning_rate": 0.00013360434107758263, + "loss": 0.5499, + "step": 13544 + }, + { + "epoch": 1.81, + "grad_norm": 0.50390625, + "learning_rate": 0.00013359337303139098, + "loss": 0.5321, + "step": 13545 + }, + { + "epoch": 1.81, + "grad_norm": 0.546875, + "learning_rate": 0.00013358240452965157, + "loss": 0.4383, + "step": 13546 + }, + { + "epoch": 1.81, + "grad_norm": 0.58984375, + "learning_rate": 0.0001335714355725132, + "loss": 0.3988, + "step": 13547 + }, + { + "epoch": 1.81, + "grad_norm": 0.53125, + "learning_rate": 0.00013356046616012462, + "loss": 0.2671, + "step": 13548 + }, + { + "epoch": 1.81, + "grad_norm": 0.66015625, + "learning_rate": 0.0001335494962926346, + "loss": 0.2841, + "step": 13549 + }, + { + "epoch": 1.81, + "grad_norm": 0.439453125, + "learning_rate": 0.00013353852597019187, + "loss": 0.3988, + "step": 13550 + }, + { + "epoch": 1.81, + "grad_norm": 0.50390625, + "learning_rate": 0.00013352755519294523, + "loss": 0.2665, + "step": 13551 + }, + { + "epoch": 1.81, + "grad_norm": 0.76953125, + "learning_rate": 0.0001335165839610434, + "loss": 0.6579, + "step": 13552 + }, + { + "epoch": 1.81, + "grad_norm": 0.75390625, + "learning_rate": 0.00013350561227463517, + "loss": 0.5111, + "step": 13553 + }, + { + "epoch": 1.81, + "grad_norm": 0.474609375, + "learning_rate": 0.00013349464013386934, + "loss": 0.3347, + "step": 13554 + }, + { + "epoch": 1.81, + "grad_norm": 0.498046875, + "learning_rate": 0.0001334836675388947, + "loss": 0.4553, + "step": 13555 + }, + { + "epoch": 1.81, + "grad_norm": 0.51953125, + "learning_rate": 0.00013347269448986003, + "loss": 0.4842, + "step": 13556 + }, + { + "epoch": 1.81, + "grad_norm": 0.341796875, + "learning_rate": 0.00013346172098691413, + "loss": 0.1317, + "step": 13557 + }, + { + "epoch": 1.81, + "grad_norm": 0.55078125, + "learning_rate": 0.00013345074703020586, + "loss": 0.3691, + "step": 13558 + }, + { + "epoch": 1.81, + "grad_norm": 0.443359375, + "learning_rate": 0.00013343977261988396, + "loss": 0.2788, + "step": 13559 + }, + { + "epoch": 1.81, + "grad_norm": 0.41015625, + "learning_rate": 0.00013342879775609726, + "loss": 0.3157, + "step": 13560 + }, + { + "epoch": 1.81, + "grad_norm": 0.294921875, + "learning_rate": 0.00013341782243899466, + "loss": 0.1573, + "step": 13561 + }, + { + "epoch": 1.81, + "grad_norm": 0.4765625, + "learning_rate": 0.0001334068466687249, + "loss": 0.3956, + "step": 13562 + }, + { + "epoch": 1.81, + "grad_norm": 0.671875, + "learning_rate": 0.00013339587044543686, + "loss": 0.4548, + "step": 13563 + }, + { + "epoch": 1.81, + "grad_norm": 0.40625, + "learning_rate": 0.00013338489376927943, + "loss": 0.2186, + "step": 13564 + }, + { + "epoch": 1.81, + "grad_norm": 0.47265625, + "learning_rate": 0.00013337391664040136, + "loss": 0.2329, + "step": 13565 + }, + { + "epoch": 1.81, + "grad_norm": 0.435546875, + "learning_rate": 0.0001333629390589516, + "loss": 0.3227, + "step": 13566 + }, + { + "epoch": 1.81, + "grad_norm": 0.462890625, + "learning_rate": 0.00013335196102507894, + "loss": 0.4797, + "step": 13567 + }, + { + "epoch": 1.81, + "grad_norm": 0.462890625, + "learning_rate": 0.0001333409825389323, + "loss": 0.436, + "step": 13568 + }, + { + "epoch": 1.81, + "grad_norm": 0.71875, + "learning_rate": 0.0001333300036006605, + "loss": 0.5347, + "step": 13569 + }, + { + "epoch": 1.81, + "grad_norm": 0.421875, + "learning_rate": 0.00013331902421041253, + "loss": 0.5247, + "step": 13570 + }, + { + "epoch": 1.81, + "grad_norm": 0.69140625, + "learning_rate": 0.0001333080443683372, + "loss": 0.5583, + "step": 13571 + }, + { + "epoch": 1.81, + "grad_norm": 0.44921875, + "learning_rate": 0.00013329706407458336, + "loss": 0.5508, + "step": 13572 + }, + { + "epoch": 1.81, + "grad_norm": 0.5390625, + "learning_rate": 0.0001332860833293, + "loss": 0.5662, + "step": 13573 + }, + { + "epoch": 1.81, + "grad_norm": 0.609375, + "learning_rate": 0.00013327510213263595, + "loss": 0.781, + "step": 13574 + }, + { + "epoch": 1.81, + "grad_norm": 0.423828125, + "learning_rate": 0.0001332641204847402, + "loss": 0.1833, + "step": 13575 + }, + { + "epoch": 1.81, + "grad_norm": 0.59375, + "learning_rate": 0.0001332531383857616, + "loss": 0.3116, + "step": 13576 + }, + { + "epoch": 1.81, + "grad_norm": 0.388671875, + "learning_rate": 0.00013324215583584916, + "loss": 0.2679, + "step": 13577 + }, + { + "epoch": 1.81, + "grad_norm": 0.478515625, + "learning_rate": 0.00013323117283515167, + "loss": 0.5498, + "step": 13578 + }, + { + "epoch": 1.81, + "grad_norm": 0.396484375, + "learning_rate": 0.00013322018938381817, + "loss": 0.409, + "step": 13579 + }, + { + "epoch": 1.81, + "grad_norm": 0.640625, + "learning_rate": 0.00013320920548199762, + "loss": 0.4364, + "step": 13580 + }, + { + "epoch": 1.81, + "grad_norm": 0.5859375, + "learning_rate": 0.0001331982211298389, + "loss": 0.3741, + "step": 13581 + }, + { + "epoch": 1.81, + "grad_norm": 0.53515625, + "learning_rate": 0.000133187236327491, + "loss": 0.5088, + "step": 13582 + }, + { + "epoch": 1.81, + "grad_norm": 0.478515625, + "learning_rate": 0.0001331762510751029, + "loss": 0.4026, + "step": 13583 + }, + { + "epoch": 1.81, + "grad_norm": 0.46875, + "learning_rate": 0.00013316526537282352, + "loss": 0.6665, + "step": 13584 + }, + { + "epoch": 1.81, + "grad_norm": 0.486328125, + "learning_rate": 0.00013315427922080183, + "loss": 0.3718, + "step": 13585 + }, + { + "epoch": 1.81, + "grad_norm": 0.59765625, + "learning_rate": 0.00013314329261918687, + "loss": 0.5685, + "step": 13586 + }, + { + "epoch": 1.81, + "grad_norm": 0.8203125, + "learning_rate": 0.0001331323055681276, + "loss": 0.4755, + "step": 13587 + }, + { + "epoch": 1.81, + "grad_norm": 0.427734375, + "learning_rate": 0.00013312131806777298, + "loss": 0.1992, + "step": 13588 + }, + { + "epoch": 1.81, + "grad_norm": 0.40234375, + "learning_rate": 0.00013311033011827202, + "loss": 0.2235, + "step": 13589 + }, + { + "epoch": 1.81, + "grad_norm": 0.59765625, + "learning_rate": 0.00013309934171977376, + "loss": 0.32, + "step": 13590 + }, + { + "epoch": 1.81, + "grad_norm": 0.51171875, + "learning_rate": 0.00013308835287242717, + "loss": 0.2112, + "step": 13591 + }, + { + "epoch": 1.81, + "grad_norm": 0.546875, + "learning_rate": 0.00013307736357638127, + "loss": 0.2745, + "step": 13592 + }, + { + "epoch": 1.81, + "grad_norm": 0.380859375, + "learning_rate": 0.0001330663738317851, + "loss": 0.2556, + "step": 13593 + }, + { + "epoch": 1.81, + "grad_norm": 0.37890625, + "learning_rate": 0.00013305538363878768, + "loss": 0.2012, + "step": 13594 + }, + { + "epoch": 1.81, + "grad_norm": 0.75390625, + "learning_rate": 0.00013304439299753807, + "loss": 0.3615, + "step": 13595 + }, + { + "epoch": 1.81, + "grad_norm": 0.57421875, + "learning_rate": 0.00013303340190818524, + "loss": 0.3179, + "step": 13596 + }, + { + "epoch": 1.81, + "grad_norm": 0.412109375, + "learning_rate": 0.00013302241037087828, + "loss": 0.2604, + "step": 13597 + }, + { + "epoch": 1.81, + "grad_norm": 0.384765625, + "learning_rate": 0.00013301141838576626, + "loss": 0.2923, + "step": 13598 + }, + { + "epoch": 1.81, + "grad_norm": 0.55078125, + "learning_rate": 0.00013300042595299822, + "loss": 0.3796, + "step": 13599 + }, + { + "epoch": 1.81, + "grad_norm": 0.5703125, + "learning_rate": 0.00013298943307272321, + "loss": 0.3623, + "step": 13600 + }, + { + "epoch": 1.81, + "grad_norm": 0.3125, + "learning_rate": 0.00013297843974509037, + "loss": 0.1299, + "step": 13601 + }, + { + "epoch": 1.82, + "grad_norm": 0.37890625, + "learning_rate": 0.0001329674459702487, + "loss": 0.3743, + "step": 13602 + }, + { + "epoch": 1.82, + "grad_norm": 0.53125, + "learning_rate": 0.00013295645174834725, + "loss": 0.4089, + "step": 13603 + }, + { + "epoch": 1.82, + "grad_norm": 0.54296875, + "learning_rate": 0.00013294545707953517, + "loss": 0.3545, + "step": 13604 + }, + { + "epoch": 1.82, + "grad_norm": 0.56640625, + "learning_rate": 0.0001329344619639616, + "loss": 0.4562, + "step": 13605 + }, + { + "epoch": 1.82, + "grad_norm": 0.58203125, + "learning_rate": 0.00013292346640177556, + "loss": 0.264, + "step": 13606 + }, + { + "epoch": 1.82, + "grad_norm": 0.44140625, + "learning_rate": 0.0001329124703931262, + "loss": 0.314, + "step": 13607 + }, + { + "epoch": 1.82, + "grad_norm": 0.62890625, + "learning_rate": 0.0001329014739381626, + "loss": 0.4398, + "step": 13608 + }, + { + "epoch": 1.82, + "grad_norm": 0.380859375, + "learning_rate": 0.00013289047703703394, + "loss": 0.2541, + "step": 13609 + }, + { + "epoch": 1.82, + "grad_norm": 0.4296875, + "learning_rate": 0.00013287947968988924, + "loss": 0.3965, + "step": 13610 + }, + { + "epoch": 1.82, + "grad_norm": 0.57421875, + "learning_rate": 0.00013286848189687772, + "loss": 0.3855, + "step": 13611 + }, + { + "epoch": 1.82, + "grad_norm": 0.546875, + "learning_rate": 0.00013285748365814848, + "loss": 0.4268, + "step": 13612 + }, + { + "epoch": 1.82, + "grad_norm": 0.50390625, + "learning_rate": 0.00013284648497385068, + "loss": 0.341, + "step": 13613 + }, + { + "epoch": 1.82, + "grad_norm": 0.578125, + "learning_rate": 0.0001328354858441335, + "loss": 0.7163, + "step": 13614 + }, + { + "epoch": 1.82, + "grad_norm": 0.5625, + "learning_rate": 0.000132824486269146, + "loss": 0.2124, + "step": 13615 + }, + { + "epoch": 1.82, + "grad_norm": 0.341796875, + "learning_rate": 0.00013281348624903742, + "loss": 0.2061, + "step": 13616 + }, + { + "epoch": 1.82, + "grad_norm": 0.515625, + "learning_rate": 0.0001328024857839569, + "loss": 0.4578, + "step": 13617 + }, + { + "epoch": 1.82, + "grad_norm": 0.3671875, + "learning_rate": 0.00013279148487405366, + "loss": 0.2754, + "step": 13618 + }, + { + "epoch": 1.82, + "grad_norm": 0.59375, + "learning_rate": 0.0001327804835194768, + "loss": 0.348, + "step": 13619 + }, + { + "epoch": 1.82, + "grad_norm": 0.470703125, + "learning_rate": 0.00013276948172037556, + "loss": 0.2854, + "step": 13620 + }, + { + "epoch": 1.82, + "grad_norm": 0.40625, + "learning_rate": 0.00013275847947689912, + "loss": 0.3374, + "step": 13621 + }, + { + "epoch": 1.82, + "grad_norm": 0.423828125, + "learning_rate": 0.00013274747678919665, + "loss": 0.3087, + "step": 13622 + }, + { + "epoch": 1.82, + "grad_norm": 0.640625, + "learning_rate": 0.0001327364736574174, + "loss": 0.466, + "step": 13623 + }, + { + "epoch": 1.82, + "grad_norm": 0.84375, + "learning_rate": 0.00013272547008171055, + "loss": 0.5456, + "step": 13624 + }, + { + "epoch": 1.82, + "grad_norm": 0.328125, + "learning_rate": 0.00013271446606222536, + "loss": 0.2628, + "step": 13625 + }, + { + "epoch": 1.82, + "grad_norm": 0.443359375, + "learning_rate": 0.00013270346159911095, + "loss": 0.18, + "step": 13626 + }, + { + "epoch": 1.82, + "grad_norm": 0.578125, + "learning_rate": 0.00013269245669251663, + "loss": 0.3878, + "step": 13627 + }, + { + "epoch": 1.82, + "grad_norm": 0.66796875, + "learning_rate": 0.00013268145134259164, + "loss": 0.4577, + "step": 13628 + }, + { + "epoch": 1.82, + "grad_norm": 0.4765625, + "learning_rate": 0.00013267044554948518, + "loss": 0.2396, + "step": 13629 + }, + { + "epoch": 1.82, + "grad_norm": 0.474609375, + "learning_rate": 0.00013265943931334653, + "loss": 0.2349, + "step": 13630 + }, + { + "epoch": 1.82, + "grad_norm": 0.47265625, + "learning_rate": 0.00013264843263432493, + "loss": 0.5215, + "step": 13631 + }, + { + "epoch": 1.82, + "grad_norm": 0.51171875, + "learning_rate": 0.00013263742551256958, + "loss": 0.3171, + "step": 13632 + }, + { + "epoch": 1.82, + "grad_norm": 0.5078125, + "learning_rate": 0.00013262641794822982, + "loss": 0.1803, + "step": 13633 + }, + { + "epoch": 1.82, + "grad_norm": 0.55078125, + "learning_rate": 0.0001326154099414549, + "loss": 0.3336, + "step": 13634 + }, + { + "epoch": 1.82, + "grad_norm": 0.5390625, + "learning_rate": 0.00013260440149239405, + "loss": 0.3015, + "step": 13635 + }, + { + "epoch": 1.82, + "grad_norm": 0.5390625, + "learning_rate": 0.00013259339260119664, + "loss": 0.3846, + "step": 13636 + }, + { + "epoch": 1.82, + "grad_norm": 0.515625, + "learning_rate": 0.00013258238326801188, + "loss": 0.3679, + "step": 13637 + }, + { + "epoch": 1.82, + "grad_norm": 0.4375, + "learning_rate": 0.00013257137349298912, + "loss": 0.3067, + "step": 13638 + }, + { + "epoch": 1.82, + "grad_norm": 0.4609375, + "learning_rate": 0.00013256036327627762, + "loss": 0.3415, + "step": 13639 + }, + { + "epoch": 1.82, + "grad_norm": 0.5625, + "learning_rate": 0.00013254935261802666, + "loss": 0.6267, + "step": 13640 + }, + { + "epoch": 1.82, + "grad_norm": 0.41015625, + "learning_rate": 0.00013253834151838562, + "loss": 0.2739, + "step": 13641 + }, + { + "epoch": 1.82, + "grad_norm": 0.50390625, + "learning_rate": 0.00013252732997750378, + "loss": 0.4388, + "step": 13642 + }, + { + "epoch": 1.82, + "grad_norm": 0.80859375, + "learning_rate": 0.0001325163179955305, + "loss": 0.482, + "step": 13643 + }, + { + "epoch": 1.82, + "grad_norm": 0.50390625, + "learning_rate": 0.00013250530557261506, + "loss": 0.4365, + "step": 13644 + }, + { + "epoch": 1.82, + "grad_norm": 0.50390625, + "learning_rate": 0.00013249429270890676, + "loss": 0.4538, + "step": 13645 + }, + { + "epoch": 1.82, + "grad_norm": 0.57421875, + "learning_rate": 0.00013248327940455505, + "loss": 0.4463, + "step": 13646 + }, + { + "epoch": 1.82, + "grad_norm": 0.55078125, + "learning_rate": 0.0001324722656597092, + "loss": 0.5228, + "step": 13647 + }, + { + "epoch": 1.82, + "grad_norm": 0.41796875, + "learning_rate": 0.00013246125147451862, + "loss": 0.2474, + "step": 13648 + }, + { + "epoch": 1.82, + "grad_norm": 0.427734375, + "learning_rate": 0.0001324502368491326, + "loss": 0.1731, + "step": 13649 + }, + { + "epoch": 1.82, + "grad_norm": 0.48828125, + "learning_rate": 0.0001324392217837006, + "loss": 0.3824, + "step": 13650 + }, + { + "epoch": 1.82, + "grad_norm": 0.5703125, + "learning_rate": 0.00013242820627837188, + "loss": 0.572, + "step": 13651 + }, + { + "epoch": 1.82, + "grad_norm": 0.32421875, + "learning_rate": 0.00013241719033329588, + "loss": 0.2978, + "step": 13652 + }, + { + "epoch": 1.82, + "grad_norm": 0.515625, + "learning_rate": 0.00013240617394862198, + "loss": 0.5679, + "step": 13653 + }, + { + "epoch": 1.82, + "grad_norm": 0.57421875, + "learning_rate": 0.00013239515712449955, + "loss": 0.1839, + "step": 13654 + }, + { + "epoch": 1.82, + "grad_norm": 0.7265625, + "learning_rate": 0.00013238413986107803, + "loss": 0.5559, + "step": 13655 + }, + { + "epoch": 1.82, + "grad_norm": 0.458984375, + "learning_rate": 0.00013237312215850678, + "loss": 0.3722, + "step": 13656 + }, + { + "epoch": 1.82, + "grad_norm": 0.44140625, + "learning_rate": 0.00013236210401693522, + "loss": 0.2668, + "step": 13657 + }, + { + "epoch": 1.82, + "grad_norm": 0.53515625, + "learning_rate": 0.00013235108543651272, + "loss": 0.4515, + "step": 13658 + }, + { + "epoch": 1.82, + "grad_norm": 0.55078125, + "learning_rate": 0.00013234006641738876, + "loss": 0.2878, + "step": 13659 + }, + { + "epoch": 1.82, + "grad_norm": 0.47265625, + "learning_rate": 0.00013232904695971274, + "loss": 0.2972, + "step": 13660 + }, + { + "epoch": 1.82, + "grad_norm": 0.69140625, + "learning_rate": 0.00013231802706363413, + "loss": 0.4265, + "step": 13661 + }, + { + "epoch": 1.82, + "grad_norm": 0.62109375, + "learning_rate": 0.0001323070067293023, + "loss": 0.6447, + "step": 13662 + }, + { + "epoch": 1.82, + "grad_norm": 0.462890625, + "learning_rate": 0.0001322959859568667, + "loss": 0.4904, + "step": 13663 + }, + { + "epoch": 1.82, + "grad_norm": 0.478515625, + "learning_rate": 0.00013228496474647686, + "loss": 0.3249, + "step": 13664 + }, + { + "epoch": 1.82, + "grad_norm": 0.439453125, + "learning_rate": 0.00013227394309828214, + "loss": 0.4024, + "step": 13665 + }, + { + "epoch": 1.82, + "grad_norm": 0.546875, + "learning_rate": 0.00013226292101243205, + "loss": 0.2773, + "step": 13666 + }, + { + "epoch": 1.82, + "grad_norm": 0.60546875, + "learning_rate": 0.00013225189848907603, + "loss": 0.5771, + "step": 13667 + }, + { + "epoch": 1.82, + "grad_norm": 0.58984375, + "learning_rate": 0.0001322408755283636, + "loss": 0.4825, + "step": 13668 + }, + { + "epoch": 1.82, + "grad_norm": 0.54296875, + "learning_rate": 0.00013222985213044418, + "loss": 0.2448, + "step": 13669 + }, + { + "epoch": 1.82, + "grad_norm": 0.55859375, + "learning_rate": 0.0001322188282954673, + "loss": 0.3956, + "step": 13670 + }, + { + "epoch": 1.82, + "grad_norm": 0.388671875, + "learning_rate": 0.0001322078040235824, + "loss": 0.2105, + "step": 13671 + }, + { + "epoch": 1.82, + "grad_norm": 0.5390625, + "learning_rate": 0.000132196779314939, + "loss": 0.4734, + "step": 13672 + }, + { + "epoch": 1.82, + "grad_norm": 0.8515625, + "learning_rate": 0.00013218575416968666, + "loss": 0.3134, + "step": 13673 + }, + { + "epoch": 1.82, + "grad_norm": 0.404296875, + "learning_rate": 0.00013217472858797482, + "loss": 0.2649, + "step": 13674 + }, + { + "epoch": 1.82, + "grad_norm": 0.59765625, + "learning_rate": 0.000132163702569953, + "loss": 0.446, + "step": 13675 + }, + { + "epoch": 1.82, + "grad_norm": 0.609375, + "learning_rate": 0.0001321526761157707, + "loss": 0.3657, + "step": 13676 + }, + { + "epoch": 1.83, + "grad_norm": 0.482421875, + "learning_rate": 0.0001321416492255775, + "loss": 0.3005, + "step": 13677 + }, + { + "epoch": 1.83, + "grad_norm": 0.51953125, + "learning_rate": 0.00013213062189952295, + "loss": 0.3489, + "step": 13678 + }, + { + "epoch": 1.83, + "grad_norm": 0.390625, + "learning_rate": 0.00013211959413775648, + "loss": 0.2112, + "step": 13679 + }, + { + "epoch": 1.83, + "grad_norm": 0.53515625, + "learning_rate": 0.00013210856594042774, + "loss": 0.366, + "step": 13680 + }, + { + "epoch": 1.83, + "grad_norm": 0.42578125, + "learning_rate": 0.00013209753730768624, + "loss": 0.2595, + "step": 13681 + }, + { + "epoch": 1.83, + "grad_norm": 0.50390625, + "learning_rate": 0.00013208650823968152, + "loss": 0.3721, + "step": 13682 + }, + { + "epoch": 1.83, + "grad_norm": 0.69921875, + "learning_rate": 0.00013207547873656314, + "loss": 0.6424, + "step": 13683 + }, + { + "epoch": 1.83, + "grad_norm": 0.53515625, + "learning_rate": 0.00013206444879848073, + "loss": 0.5241, + "step": 13684 + }, + { + "epoch": 1.83, + "grad_norm": 0.56640625, + "learning_rate": 0.00013205341842558376, + "loss": 0.3605, + "step": 13685 + }, + { + "epoch": 1.83, + "grad_norm": 0.51171875, + "learning_rate": 0.0001320423876180219, + "loss": 0.6337, + "step": 13686 + }, + { + "epoch": 1.83, + "grad_norm": 0.451171875, + "learning_rate": 0.00013203135637594472, + "loss": 0.1473, + "step": 13687 + }, + { + "epoch": 1.83, + "grad_norm": 0.55859375, + "learning_rate": 0.00013202032469950171, + "loss": 0.693, + "step": 13688 + }, + { + "epoch": 1.83, + "grad_norm": 0.53125, + "learning_rate": 0.00013200929258884263, + "loss": 0.1243, + "step": 13689 + }, + { + "epoch": 1.83, + "grad_norm": 0.392578125, + "learning_rate": 0.00013199826004411694, + "loss": 0.2177, + "step": 13690 + }, + { + "epoch": 1.83, + "grad_norm": 0.50390625, + "learning_rate": 0.00013198722706547436, + "loss": 0.2792, + "step": 13691 + }, + { + "epoch": 1.83, + "grad_norm": 0.482421875, + "learning_rate": 0.0001319761936530644, + "loss": 0.2407, + "step": 13692 + }, + { + "epoch": 1.83, + "grad_norm": 0.64453125, + "learning_rate": 0.00013196515980703677, + "loss": 0.3979, + "step": 13693 + }, + { + "epoch": 1.83, + "grad_norm": 0.53515625, + "learning_rate": 0.00013195412552754102, + "loss": 0.1934, + "step": 13694 + }, + { + "epoch": 1.83, + "grad_norm": 0.6328125, + "learning_rate": 0.00013194309081472687, + "loss": 0.2955, + "step": 13695 + }, + { + "epoch": 1.83, + "grad_norm": 0.69140625, + "learning_rate": 0.00013193205566874386, + "loss": 0.4333, + "step": 13696 + }, + { + "epoch": 1.83, + "grad_norm": 0.60546875, + "learning_rate": 0.0001319210200897417, + "loss": 0.2189, + "step": 13697 + }, + { + "epoch": 1.83, + "grad_norm": 0.73828125, + "learning_rate": 0.00013190998407787004, + "loss": 0.4719, + "step": 13698 + }, + { + "epoch": 1.83, + "grad_norm": 0.486328125, + "learning_rate": 0.0001318989476332785, + "loss": 0.366, + "step": 13699 + }, + { + "epoch": 1.83, + "grad_norm": 0.7265625, + "learning_rate": 0.00013188791075611676, + "loss": 0.3224, + "step": 13700 + }, + { + "epoch": 1.83, + "grad_norm": 0.51953125, + "learning_rate": 0.00013187687344653448, + "loss": 0.3234, + "step": 13701 + }, + { + "epoch": 1.83, + "grad_norm": 0.859375, + "learning_rate": 0.00013186583570468132, + "loss": 0.6413, + "step": 13702 + }, + { + "epoch": 1.83, + "grad_norm": 0.55859375, + "learning_rate": 0.000131854797530707, + "loss": 0.5188, + "step": 13703 + }, + { + "epoch": 1.83, + "grad_norm": 0.44921875, + "learning_rate": 0.0001318437589247612, + "loss": 0.3682, + "step": 13704 + }, + { + "epoch": 1.83, + "grad_norm": 0.83203125, + "learning_rate": 0.00013183271988699357, + "loss": 0.5103, + "step": 13705 + }, + { + "epoch": 1.83, + "grad_norm": 0.5234375, + "learning_rate": 0.00013182168041755383, + "loss": 0.3779, + "step": 13706 + }, + { + "epoch": 1.83, + "grad_norm": 0.490234375, + "learning_rate": 0.00013181064051659167, + "loss": 0.3498, + "step": 13707 + }, + { + "epoch": 1.83, + "grad_norm": 0.54296875, + "learning_rate": 0.0001317996001842568, + "loss": 0.5333, + "step": 13708 + }, + { + "epoch": 1.83, + "grad_norm": 0.58203125, + "learning_rate": 0.00013178855942069895, + "loss": 0.448, + "step": 13709 + }, + { + "epoch": 1.83, + "grad_norm": 0.408203125, + "learning_rate": 0.00013177751822606788, + "loss": 0.253, + "step": 13710 + }, + { + "epoch": 1.83, + "grad_norm": 0.70703125, + "learning_rate": 0.0001317664766005132, + "loss": 0.7645, + "step": 13711 + }, + { + "epoch": 1.83, + "grad_norm": 0.5546875, + "learning_rate": 0.00013175543454418476, + "loss": 0.3651, + "step": 13712 + }, + { + "epoch": 1.83, + "grad_norm": 0.443359375, + "learning_rate": 0.0001317443920572322, + "loss": 0.4101, + "step": 13713 + }, + { + "epoch": 1.83, + "grad_norm": 0.55078125, + "learning_rate": 0.00013173334913980534, + "loss": 0.3989, + "step": 13714 + }, + { + "epoch": 1.83, + "grad_norm": 0.55859375, + "learning_rate": 0.0001317223057920539, + "loss": 0.4346, + "step": 13715 + }, + { + "epoch": 1.83, + "grad_norm": 0.6328125, + "learning_rate": 0.00013171126201412764, + "loss": 0.3452, + "step": 13716 + }, + { + "epoch": 1.83, + "grad_norm": 0.6484375, + "learning_rate": 0.0001317002178061763, + "loss": 0.464, + "step": 13717 + }, + { + "epoch": 1.83, + "grad_norm": 0.5859375, + "learning_rate": 0.00013168917316834966, + "loss": 0.4989, + "step": 13718 + }, + { + "epoch": 1.83, + "grad_norm": 1.0625, + "learning_rate": 0.0001316781281007975, + "loss": 0.4846, + "step": 13719 + }, + { + "epoch": 1.83, + "grad_norm": 0.51953125, + "learning_rate": 0.0001316670826036696, + "loss": 0.3366, + "step": 13720 + }, + { + "epoch": 1.83, + "grad_norm": 0.494140625, + "learning_rate": 0.00013165603667711573, + "loss": 0.3895, + "step": 13721 + }, + { + "epoch": 1.83, + "grad_norm": 0.51953125, + "learning_rate": 0.0001316449903212857, + "loss": 0.2858, + "step": 13722 + }, + { + "epoch": 1.83, + "grad_norm": 0.484375, + "learning_rate": 0.00013163394353632929, + "loss": 0.2618, + "step": 13723 + }, + { + "epoch": 1.83, + "grad_norm": 0.4765625, + "learning_rate": 0.00013162289632239626, + "loss": 0.176, + "step": 13724 + }, + { + "epoch": 1.83, + "grad_norm": 0.58984375, + "learning_rate": 0.00013161184867963648, + "loss": 0.1838, + "step": 13725 + }, + { + "epoch": 1.83, + "grad_norm": 0.83203125, + "learning_rate": 0.00013160080060819977, + "loss": 0.353, + "step": 13726 + }, + { + "epoch": 1.83, + "grad_norm": 0.423828125, + "learning_rate": 0.00013158975210823587, + "loss": 0.4318, + "step": 13727 + }, + { + "epoch": 1.83, + "grad_norm": 0.400390625, + "learning_rate": 0.00013157870317989474, + "loss": 0.3371, + "step": 13728 + }, + { + "epoch": 1.83, + "grad_norm": 0.59375, + "learning_rate": 0.00013156765382332604, + "loss": 0.5275, + "step": 13729 + }, + { + "epoch": 1.83, + "grad_norm": 0.609375, + "learning_rate": 0.00013155660403867976, + "loss": 0.6395, + "step": 13730 + }, + { + "epoch": 1.83, + "grad_norm": 0.62890625, + "learning_rate": 0.00013154555382610564, + "loss": 0.2613, + "step": 13731 + }, + { + "epoch": 1.83, + "grad_norm": 0.447265625, + "learning_rate": 0.0001315345031857536, + "loss": 0.3482, + "step": 13732 + }, + { + "epoch": 1.83, + "grad_norm": 0.54296875, + "learning_rate": 0.0001315234521177734, + "loss": 0.3694, + "step": 13733 + }, + { + "epoch": 1.83, + "grad_norm": 0.59765625, + "learning_rate": 0.000131512400622315, + "loss": 0.2087, + "step": 13734 + }, + { + "epoch": 1.83, + "grad_norm": 0.369140625, + "learning_rate": 0.00013150134869952825, + "loss": 0.2203, + "step": 13735 + }, + { + "epoch": 1.83, + "grad_norm": 0.384765625, + "learning_rate": 0.00013149029634956295, + "loss": 0.3228, + "step": 13736 + }, + { + "epoch": 1.83, + "grad_norm": 0.69921875, + "learning_rate": 0.00013147924357256903, + "loss": 0.4024, + "step": 13737 + }, + { + "epoch": 1.83, + "grad_norm": 0.52734375, + "learning_rate": 0.00013146819036869632, + "loss": 0.2876, + "step": 13738 + }, + { + "epoch": 1.83, + "grad_norm": 0.515625, + "learning_rate": 0.00013145713673809483, + "loss": 0.3365, + "step": 13739 + }, + { + "epoch": 1.83, + "grad_norm": 0.4765625, + "learning_rate": 0.00013144608268091435, + "loss": 0.4501, + "step": 13740 + }, + { + "epoch": 1.83, + "grad_norm": 0.671875, + "learning_rate": 0.00013143502819730477, + "loss": 0.3496, + "step": 13741 + }, + { + "epoch": 1.83, + "grad_norm": 0.4765625, + "learning_rate": 0.00013142397328741608, + "loss": 0.5198, + "step": 13742 + }, + { + "epoch": 1.83, + "grad_norm": 0.365234375, + "learning_rate": 0.00013141291795139813, + "loss": 0.2565, + "step": 13743 + }, + { + "epoch": 1.83, + "grad_norm": 0.431640625, + "learning_rate": 0.00013140186218940086, + "loss": 0.2845, + "step": 13744 + }, + { + "epoch": 1.83, + "grad_norm": 0.51171875, + "learning_rate": 0.00013139080600157413, + "loss": 0.5698, + "step": 13745 + }, + { + "epoch": 1.83, + "grad_norm": 0.345703125, + "learning_rate": 0.000131379749388068, + "loss": 0.413, + "step": 13746 + }, + { + "epoch": 1.83, + "grad_norm": 0.46484375, + "learning_rate": 0.0001313686923490323, + "loss": 0.3564, + "step": 13747 + }, + { + "epoch": 1.83, + "grad_norm": 0.5078125, + "learning_rate": 0.00013135763488461702, + "loss": 0.2258, + "step": 13748 + }, + { + "epoch": 1.83, + "grad_norm": 0.44140625, + "learning_rate": 0.00013134657699497205, + "loss": 0.1708, + "step": 13749 + }, + { + "epoch": 1.83, + "grad_norm": 0.64453125, + "learning_rate": 0.00013133551868024735, + "loss": 0.6069, + "step": 13750 + }, + { + "epoch": 1.83, + "grad_norm": 0.52734375, + "learning_rate": 0.00013132445994059298, + "loss": 0.5664, + "step": 13751 + }, + { + "epoch": 1.84, + "grad_norm": 0.43359375, + "learning_rate": 0.00013131340077615877, + "loss": 0.3364, + "step": 13752 + }, + { + "epoch": 1.84, + "grad_norm": 0.48828125, + "learning_rate": 0.00013130234118709482, + "loss": 0.414, + "step": 13753 + }, + { + "epoch": 1.84, + "grad_norm": 0.5703125, + "learning_rate": 0.00013129128117355095, + "loss": 0.4202, + "step": 13754 + }, + { + "epoch": 1.84, + "grad_norm": 0.388671875, + "learning_rate": 0.00013128022073567727, + "loss": 0.1647, + "step": 13755 + }, + { + "epoch": 1.84, + "grad_norm": 0.4375, + "learning_rate": 0.00013126915987362372, + "loss": 0.2341, + "step": 13756 + }, + { + "epoch": 1.84, + "grad_norm": 0.671875, + "learning_rate": 0.00013125809858754027, + "loss": 0.493, + "step": 13757 + }, + { + "epoch": 1.84, + "grad_norm": 0.5859375, + "learning_rate": 0.00013124703687757695, + "loss": 0.3082, + "step": 13758 + }, + { + "epoch": 1.84, + "grad_norm": 0.455078125, + "learning_rate": 0.00013123597474388378, + "loss": 0.5716, + "step": 13759 + }, + { + "epoch": 1.84, + "grad_norm": 0.58203125, + "learning_rate": 0.00013122491218661074, + "loss": 0.2437, + "step": 13760 + }, + { + "epoch": 1.84, + "grad_norm": 0.55078125, + "learning_rate": 0.00013121384920590786, + "loss": 0.3034, + "step": 13761 + }, + { + "epoch": 1.84, + "grad_norm": 0.390625, + "learning_rate": 0.0001312027858019251, + "loss": 0.2543, + "step": 13762 + }, + { + "epoch": 1.84, + "grad_norm": 0.462890625, + "learning_rate": 0.00013119172197481257, + "loss": 0.2334, + "step": 13763 + }, + { + "epoch": 1.84, + "grad_norm": 0.58203125, + "learning_rate": 0.00013118065772472028, + "loss": 0.339, + "step": 13764 + }, + { + "epoch": 1.84, + "grad_norm": 0.396484375, + "learning_rate": 0.00013116959305179825, + "loss": 0.3952, + "step": 13765 + }, + { + "epoch": 1.84, + "grad_norm": 0.44921875, + "learning_rate": 0.00013115852795619654, + "loss": 0.4485, + "step": 13766 + }, + { + "epoch": 1.84, + "grad_norm": 0.466796875, + "learning_rate": 0.00013114746243806519, + "loss": 0.2305, + "step": 13767 + }, + { + "epoch": 1.84, + "grad_norm": 0.5234375, + "learning_rate": 0.00013113639649755426, + "loss": 0.2769, + "step": 13768 + }, + { + "epoch": 1.84, + "grad_norm": 0.47265625, + "learning_rate": 0.0001311253301348138, + "loss": 0.3675, + "step": 13769 + }, + { + "epoch": 1.84, + "grad_norm": 0.46875, + "learning_rate": 0.0001311142633499939, + "loss": 0.59, + "step": 13770 + }, + { + "epoch": 1.84, + "grad_norm": 0.490234375, + "learning_rate": 0.00013110319614324464, + "loss": 0.3153, + "step": 13771 + }, + { + "epoch": 1.84, + "grad_norm": 0.5859375, + "learning_rate": 0.00013109212851471604, + "loss": 0.4525, + "step": 13772 + }, + { + "epoch": 1.84, + "grad_norm": 0.470703125, + "learning_rate": 0.00013108106046455823, + "loss": 0.4527, + "step": 13773 + }, + { + "epoch": 1.84, + "grad_norm": 0.796875, + "learning_rate": 0.0001310699919929213, + "loss": 0.4915, + "step": 13774 + }, + { + "epoch": 1.84, + "grad_norm": 0.46875, + "learning_rate": 0.00013105892309995534, + "loss": 0.1737, + "step": 13775 + }, + { + "epoch": 1.84, + "grad_norm": 0.55078125, + "learning_rate": 0.00013104785378581046, + "loss": 0.307, + "step": 13776 + }, + { + "epoch": 1.84, + "grad_norm": 0.5546875, + "learning_rate": 0.00013103678405063676, + "loss": 0.3228, + "step": 13777 + }, + { + "epoch": 1.84, + "grad_norm": 0.546875, + "learning_rate": 0.00013102571389458434, + "loss": 0.3625, + "step": 13778 + }, + { + "epoch": 1.84, + "grad_norm": 0.6875, + "learning_rate": 0.0001310146433178033, + "loss": 0.3557, + "step": 13779 + }, + { + "epoch": 1.84, + "grad_norm": 0.64453125, + "learning_rate": 0.0001310035723204438, + "loss": 0.2212, + "step": 13780 + }, + { + "epoch": 1.84, + "grad_norm": 0.66796875, + "learning_rate": 0.000130992500902656, + "loss": 0.2914, + "step": 13781 + }, + { + "epoch": 1.84, + "grad_norm": 0.3671875, + "learning_rate": 0.00013098142906458994, + "loss": 0.122, + "step": 13782 + }, + { + "epoch": 1.84, + "grad_norm": 0.5078125, + "learning_rate": 0.00013097035680639587, + "loss": 0.3771, + "step": 13783 + }, + { + "epoch": 1.84, + "grad_norm": 0.6328125, + "learning_rate": 0.00013095928412822384, + "loss": 0.559, + "step": 13784 + }, + { + "epoch": 1.84, + "grad_norm": 0.4609375, + "learning_rate": 0.00013094821103022406, + "loss": 0.4565, + "step": 13785 + }, + { + "epoch": 1.84, + "grad_norm": 0.51171875, + "learning_rate": 0.00013093713751254668, + "loss": 0.3191, + "step": 13786 + }, + { + "epoch": 1.84, + "grad_norm": 0.6015625, + "learning_rate": 0.0001309260635753419, + "loss": 0.4115, + "step": 13787 + }, + { + "epoch": 1.84, + "grad_norm": 0.298828125, + "learning_rate": 0.0001309149892187598, + "loss": 0.1509, + "step": 13788 + }, + { + "epoch": 1.84, + "grad_norm": 0.69921875, + "learning_rate": 0.00013090391444295062, + "loss": 0.4208, + "step": 13789 + }, + { + "epoch": 1.84, + "grad_norm": 0.64453125, + "learning_rate": 0.00013089283924806454, + "loss": 0.4013, + "step": 13790 + }, + { + "epoch": 1.84, + "grad_norm": 0.466796875, + "learning_rate": 0.0001308817636342517, + "loss": 0.4018, + "step": 13791 + }, + { + "epoch": 1.84, + "grad_norm": 0.328125, + "learning_rate": 0.00013087068760166237, + "loss": 0.2055, + "step": 13792 + }, + { + "epoch": 1.84, + "grad_norm": 0.578125, + "learning_rate": 0.00013085961115044667, + "loss": 0.3101, + "step": 13793 + }, + { + "epoch": 1.84, + "grad_norm": 0.59765625, + "learning_rate": 0.00013084853428075484, + "loss": 0.4436, + "step": 13794 + }, + { + "epoch": 1.84, + "grad_norm": 0.6328125, + "learning_rate": 0.0001308374569927371, + "loss": 0.6157, + "step": 13795 + }, + { + "epoch": 1.84, + "grad_norm": 0.478515625, + "learning_rate": 0.00013082637928654367, + "loss": 0.3029, + "step": 13796 + }, + { + "epoch": 1.84, + "grad_norm": 0.447265625, + "learning_rate": 0.0001308153011623247, + "loss": 0.4594, + "step": 13797 + }, + { + "epoch": 1.84, + "grad_norm": 0.7890625, + "learning_rate": 0.00013080422262023048, + "loss": 0.3879, + "step": 13798 + }, + { + "epoch": 1.84, + "grad_norm": 0.66015625, + "learning_rate": 0.00013079314366041126, + "loss": 0.3746, + "step": 13799 + }, + { + "epoch": 1.84, + "grad_norm": 0.57421875, + "learning_rate": 0.00013078206428301722, + "loss": 0.5135, + "step": 13800 + }, + { + "epoch": 1.84, + "grad_norm": 0.74609375, + "learning_rate": 0.0001307709844881987, + "loss": 0.437, + "step": 13801 + }, + { + "epoch": 1.84, + "grad_norm": 0.703125, + "learning_rate": 0.0001307599042761058, + "loss": 0.3987, + "step": 13802 + }, + { + "epoch": 1.84, + "grad_norm": 0.42578125, + "learning_rate": 0.0001307488236468889, + "loss": 0.3731, + "step": 13803 + }, + { + "epoch": 1.84, + "grad_norm": 0.50390625, + "learning_rate": 0.0001307377426006982, + "loss": 0.5507, + "step": 13804 + }, + { + "epoch": 1.84, + "grad_norm": 0.515625, + "learning_rate": 0.000130726661137684, + "loss": 0.4464, + "step": 13805 + }, + { + "epoch": 1.84, + "grad_norm": 0.5234375, + "learning_rate": 0.00013071557925799652, + "loss": 0.4584, + "step": 13806 + }, + { + "epoch": 1.84, + "grad_norm": 0.447265625, + "learning_rate": 0.0001307044969617861, + "loss": 0.2832, + "step": 13807 + }, + { + "epoch": 1.84, + "grad_norm": 0.39453125, + "learning_rate": 0.000130693414249203, + "loss": 0.2455, + "step": 13808 + }, + { + "epoch": 1.84, + "grad_norm": 0.53125, + "learning_rate": 0.00013068233112039748, + "loss": 0.2287, + "step": 13809 + }, + { + "epoch": 1.84, + "grad_norm": 0.470703125, + "learning_rate": 0.00013067124757551988, + "loss": 0.3186, + "step": 13810 + }, + { + "epoch": 1.84, + "grad_norm": 0.5625, + "learning_rate": 0.00013066016361472043, + "loss": 0.3349, + "step": 13811 + }, + { + "epoch": 1.84, + "grad_norm": 0.5625, + "learning_rate": 0.00013064907923814956, + "loss": 0.3765, + "step": 13812 + }, + { + "epoch": 1.84, + "grad_norm": 0.49609375, + "learning_rate": 0.00013063799444595748, + "loss": 0.3907, + "step": 13813 + }, + { + "epoch": 1.84, + "grad_norm": 0.48828125, + "learning_rate": 0.0001306269092382945, + "loss": 0.2329, + "step": 13814 + }, + { + "epoch": 1.84, + "grad_norm": 0.474609375, + "learning_rate": 0.00013061582361531098, + "loss": 0.3821, + "step": 13815 + }, + { + "epoch": 1.84, + "grad_norm": 0.462890625, + "learning_rate": 0.00013060473757715727, + "loss": 0.2526, + "step": 13816 + }, + { + "epoch": 1.84, + "grad_norm": 0.47265625, + "learning_rate": 0.00013059365112398367, + "loss": 0.4142, + "step": 13817 + }, + { + "epoch": 1.84, + "grad_norm": 0.62109375, + "learning_rate": 0.00013058256425594052, + "loss": 0.175, + "step": 13818 + }, + { + "epoch": 1.84, + "grad_norm": 0.3515625, + "learning_rate": 0.0001305714769731782, + "loss": 0.2109, + "step": 13819 + }, + { + "epoch": 1.84, + "grad_norm": 0.5390625, + "learning_rate": 0.000130560389275847, + "loss": 0.3653, + "step": 13820 + }, + { + "epoch": 1.84, + "grad_norm": 0.66015625, + "learning_rate": 0.00013054930116409728, + "loss": 0.7633, + "step": 13821 + }, + { + "epoch": 1.84, + "grad_norm": 0.53515625, + "learning_rate": 0.00013053821263807946, + "loss": 0.4555, + "step": 13822 + }, + { + "epoch": 1.84, + "grad_norm": 0.68359375, + "learning_rate": 0.00013052712369794386, + "loss": 0.2735, + "step": 13823 + }, + { + "epoch": 1.84, + "grad_norm": 0.470703125, + "learning_rate": 0.00013051603434384093, + "loss": 0.4455, + "step": 13824 + }, + { + "epoch": 1.84, + "grad_norm": 0.6171875, + "learning_rate": 0.0001305049445759209, + "loss": 0.3643, + "step": 13825 + }, + { + "epoch": 1.84, + "grad_norm": 0.66015625, + "learning_rate": 0.00013049385439433433, + "loss": 0.4992, + "step": 13826 + }, + { + "epoch": 1.85, + "grad_norm": 0.482421875, + "learning_rate": 0.00013048276379923148, + "loss": 0.5691, + "step": 13827 + }, + { + "epoch": 1.85, + "grad_norm": 0.62890625, + "learning_rate": 0.0001304716727907628, + "loss": 0.3845, + "step": 13828 + }, + { + "epoch": 1.85, + "grad_norm": 0.388671875, + "learning_rate": 0.00013046058136907866, + "loss": 0.3123, + "step": 13829 + }, + { + "epoch": 1.85, + "grad_norm": 0.61328125, + "learning_rate": 0.0001304494895343295, + "loss": 0.6341, + "step": 13830 + }, + { + "epoch": 1.85, + "grad_norm": 0.494140625, + "learning_rate": 0.00013043839728666572, + "loss": 0.2783, + "step": 13831 + }, + { + "epoch": 1.85, + "grad_norm": 0.423828125, + "learning_rate": 0.00013042730462623774, + "loss": 0.3156, + "step": 13832 + }, + { + "epoch": 1.85, + "grad_norm": 0.625, + "learning_rate": 0.000130416211553196, + "loss": 0.4077, + "step": 13833 + }, + { + "epoch": 1.85, + "grad_norm": 0.4375, + "learning_rate": 0.00013040511806769087, + "loss": 0.6089, + "step": 13834 + }, + { + "epoch": 1.85, + "grad_norm": 0.369140625, + "learning_rate": 0.00013039402416987285, + "loss": 0.1899, + "step": 13835 + }, + { + "epoch": 1.85, + "grad_norm": 0.5625, + "learning_rate": 0.00013038292985989233, + "loss": 0.507, + "step": 13836 + }, + { + "epoch": 1.85, + "grad_norm": 0.47265625, + "learning_rate": 0.00013037183513789984, + "loss": 0.3736, + "step": 13837 + }, + { + "epoch": 1.85, + "grad_norm": 0.75390625, + "learning_rate": 0.00013036074000404572, + "loss": 0.4873, + "step": 13838 + }, + { + "epoch": 1.85, + "grad_norm": 0.671875, + "learning_rate": 0.0001303496444584805, + "loss": 0.343, + "step": 13839 + }, + { + "epoch": 1.85, + "grad_norm": 0.48046875, + "learning_rate": 0.00013033854850135463, + "loss": 0.3397, + "step": 13840 + }, + { + "epoch": 1.85, + "grad_norm": 0.37890625, + "learning_rate": 0.00013032745213281853, + "loss": 0.1662, + "step": 13841 + }, + { + "epoch": 1.85, + "grad_norm": 0.65625, + "learning_rate": 0.00013031635535302275, + "loss": 0.5318, + "step": 13842 + }, + { + "epoch": 1.85, + "grad_norm": 0.5546875, + "learning_rate": 0.0001303052581621177, + "loss": 0.4818, + "step": 13843 + }, + { + "epoch": 1.85, + "grad_norm": 0.42578125, + "learning_rate": 0.00013029416056025399, + "loss": 0.3693, + "step": 13844 + }, + { + "epoch": 1.85, + "grad_norm": 0.5625, + "learning_rate": 0.00013028306254758194, + "loss": 0.3767, + "step": 13845 + }, + { + "epoch": 1.85, + "grad_norm": 0.482421875, + "learning_rate": 0.00013027196412425214, + "loss": 0.3937, + "step": 13846 + }, + { + "epoch": 1.85, + "grad_norm": 0.8046875, + "learning_rate": 0.00013026086529041507, + "loss": 0.603, + "step": 13847 + }, + { + "epoch": 1.85, + "grad_norm": 0.466796875, + "learning_rate": 0.00013024976604622125, + "loss": 0.1871, + "step": 13848 + }, + { + "epoch": 1.85, + "grad_norm": 0.53515625, + "learning_rate": 0.0001302386663918212, + "loss": 0.4934, + "step": 13849 + }, + { + "epoch": 1.85, + "grad_norm": 0.384765625, + "learning_rate": 0.00013022756632736547, + "loss": 0.3656, + "step": 13850 + }, + { + "epoch": 1.85, + "grad_norm": 0.392578125, + "learning_rate": 0.00013021646585300448, + "loss": 0.2159, + "step": 13851 + }, + { + "epoch": 1.85, + "grad_norm": 0.458984375, + "learning_rate": 0.00013020536496888881, + "loss": 0.1779, + "step": 13852 + }, + { + "epoch": 1.85, + "grad_norm": 0.6328125, + "learning_rate": 0.00013019426367516905, + "loss": 0.4953, + "step": 13853 + }, + { + "epoch": 1.85, + "grad_norm": 0.6796875, + "learning_rate": 0.00013018316197199568, + "loss": 0.2548, + "step": 13854 + }, + { + "epoch": 1.85, + "grad_norm": 0.63671875, + "learning_rate": 0.00013017205985951926, + "loss": 0.1979, + "step": 13855 + }, + { + "epoch": 1.85, + "grad_norm": 0.54296875, + "learning_rate": 0.00013016095733789035, + "loss": 0.3162, + "step": 13856 + }, + { + "epoch": 1.85, + "grad_norm": 0.57421875, + "learning_rate": 0.00013014985440725946, + "loss": 0.6308, + "step": 13857 + }, + { + "epoch": 1.85, + "grad_norm": 0.4375, + "learning_rate": 0.00013013875106777725, + "loss": 0.2591, + "step": 13858 + }, + { + "epoch": 1.85, + "grad_norm": 0.55859375, + "learning_rate": 0.0001301276473195942, + "loss": 0.3857, + "step": 13859 + }, + { + "epoch": 1.85, + "grad_norm": 0.462890625, + "learning_rate": 0.00013011654316286093, + "loss": 0.336, + "step": 13860 + }, + { + "epoch": 1.85, + "grad_norm": 0.65234375, + "learning_rate": 0.000130105438597728, + "loss": 0.3835, + "step": 13861 + }, + { + "epoch": 1.85, + "grad_norm": 1.0546875, + "learning_rate": 0.000130094333624346, + "loss": 0.3651, + "step": 13862 + }, + { + "epoch": 1.85, + "grad_norm": 0.4453125, + "learning_rate": 0.00013008322824286555, + "loss": 0.169, + "step": 13863 + }, + { + "epoch": 1.85, + "grad_norm": 0.486328125, + "learning_rate": 0.00013007212245343716, + "loss": 0.2766, + "step": 13864 + }, + { + "epoch": 1.85, + "grad_norm": 0.51171875, + "learning_rate": 0.00013006101625621152, + "loss": 0.4345, + "step": 13865 + }, + { + "epoch": 1.85, + "grad_norm": 0.404296875, + "learning_rate": 0.0001300499096513392, + "loss": 0.2204, + "step": 13866 + }, + { + "epoch": 1.85, + "grad_norm": 0.45703125, + "learning_rate": 0.00013003880263897085, + "loss": 0.3315, + "step": 13867 + }, + { + "epoch": 1.85, + "grad_norm": 0.50390625, + "learning_rate": 0.00013002769521925703, + "loss": 0.2946, + "step": 13868 + }, + { + "epoch": 1.85, + "grad_norm": 0.5703125, + "learning_rate": 0.0001300165873923484, + "loss": 0.3564, + "step": 13869 + }, + { + "epoch": 1.85, + "grad_norm": 0.443359375, + "learning_rate": 0.0001300054791583956, + "loss": 0.4724, + "step": 13870 + }, + { + "epoch": 1.85, + "grad_norm": 0.453125, + "learning_rate": 0.00012999437051754918, + "loss": 0.2891, + "step": 13871 + }, + { + "epoch": 1.85, + "grad_norm": 0.373046875, + "learning_rate": 0.0001299832614699599, + "loss": 0.1951, + "step": 13872 + }, + { + "epoch": 1.85, + "grad_norm": 0.458984375, + "learning_rate": 0.00012997215201577834, + "loss": 0.2386, + "step": 13873 + }, + { + "epoch": 1.85, + "grad_norm": 0.3203125, + "learning_rate": 0.00012996104215515518, + "loss": 0.1261, + "step": 13874 + }, + { + "epoch": 1.85, + "grad_norm": 0.34765625, + "learning_rate": 0.00012994993188824102, + "loss": 0.1805, + "step": 13875 + }, + { + "epoch": 1.85, + "grad_norm": 0.427734375, + "learning_rate": 0.00012993882121518658, + "loss": 0.2452, + "step": 13876 + }, + { + "epoch": 1.85, + "grad_norm": 0.384765625, + "learning_rate": 0.00012992771013614252, + "loss": 0.172, + "step": 13877 + }, + { + "epoch": 1.85, + "grad_norm": 0.52734375, + "learning_rate": 0.00012991659865125953, + "loss": 0.3805, + "step": 13878 + }, + { + "epoch": 1.85, + "grad_norm": 0.4453125, + "learning_rate": 0.0001299054867606882, + "loss": 0.4251, + "step": 13879 + }, + { + "epoch": 1.85, + "grad_norm": 0.392578125, + "learning_rate": 0.0001298943744645793, + "loss": 0.255, + "step": 13880 + }, + { + "epoch": 1.85, + "grad_norm": 0.41796875, + "learning_rate": 0.00012988326176308353, + "loss": 0.2943, + "step": 13881 + }, + { + "epoch": 1.85, + "grad_norm": 0.48046875, + "learning_rate": 0.00012987214865635148, + "loss": 0.3348, + "step": 13882 + }, + { + "epoch": 1.85, + "grad_norm": 0.5078125, + "learning_rate": 0.00012986103514453398, + "loss": 0.2903, + "step": 13883 + }, + { + "epoch": 1.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00012984992122778166, + "loss": 0.5099, + "step": 13884 + }, + { + "epoch": 1.85, + "grad_norm": 0.76171875, + "learning_rate": 0.00012983880690624526, + "loss": 0.7238, + "step": 13885 + }, + { + "epoch": 1.85, + "grad_norm": 0.65234375, + "learning_rate": 0.0001298276921800755, + "loss": 0.5051, + "step": 13886 + }, + { + "epoch": 1.85, + "grad_norm": 0.625, + "learning_rate": 0.00012981657704942305, + "loss": 0.423, + "step": 13887 + }, + { + "epoch": 1.85, + "grad_norm": 0.439453125, + "learning_rate": 0.0001298054615144387, + "loss": 0.2836, + "step": 13888 + }, + { + "epoch": 1.85, + "grad_norm": 0.53125, + "learning_rate": 0.00012979434557527317, + "loss": 0.2526, + "step": 13889 + }, + { + "epoch": 1.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00012978322923207718, + "loss": 0.4809, + "step": 13890 + }, + { + "epoch": 1.85, + "grad_norm": 0.3984375, + "learning_rate": 0.0001297721124850015, + "loss": 0.1397, + "step": 13891 + }, + { + "epoch": 1.85, + "grad_norm": 0.490234375, + "learning_rate": 0.00012976099533419688, + "loss": 0.5347, + "step": 13892 + }, + { + "epoch": 1.85, + "grad_norm": 0.5703125, + "learning_rate": 0.00012974987777981402, + "loss": 0.4572, + "step": 13893 + }, + { + "epoch": 1.85, + "grad_norm": 0.625, + "learning_rate": 0.00012973875982200377, + "loss": 0.4807, + "step": 13894 + }, + { + "epoch": 1.85, + "grad_norm": 0.50390625, + "learning_rate": 0.0001297276414609168, + "loss": 0.4409, + "step": 13895 + }, + { + "epoch": 1.85, + "grad_norm": 0.46875, + "learning_rate": 0.00012971652269670397, + "loss": 0.1884, + "step": 13896 + }, + { + "epoch": 1.85, + "grad_norm": 0.6015625, + "learning_rate": 0.00012970540352951601, + "loss": 0.5306, + "step": 13897 + }, + { + "epoch": 1.85, + "grad_norm": 0.53515625, + "learning_rate": 0.00012969428395950372, + "loss": 0.343, + "step": 13898 + }, + { + "epoch": 1.85, + "grad_norm": 0.373046875, + "learning_rate": 0.00012968316398681787, + "loss": 0.2197, + "step": 13899 + }, + { + "epoch": 1.85, + "grad_norm": 0.419921875, + "learning_rate": 0.00012967204361160928, + "loss": 0.3354, + "step": 13900 + }, + { + "epoch": 1.85, + "grad_norm": 0.46875, + "learning_rate": 0.00012966092283402873, + "loss": 0.2094, + "step": 13901 + }, + { + "epoch": 1.86, + "grad_norm": 0.53515625, + "learning_rate": 0.000129649801654227, + "loss": 0.5545, + "step": 13902 + }, + { + "epoch": 1.86, + "grad_norm": 0.546875, + "learning_rate": 0.000129638680072355, + "loss": 0.3967, + "step": 13903 + }, + { + "epoch": 1.86, + "grad_norm": 0.58984375, + "learning_rate": 0.00012962755808856342, + "loss": 0.6074, + "step": 13904 + }, + { + "epoch": 1.86, + "grad_norm": 0.515625, + "learning_rate": 0.00012961643570300315, + "loss": 0.4155, + "step": 13905 + }, + { + "epoch": 1.86, + "grad_norm": 0.45703125, + "learning_rate": 0.000129605312915825, + "loss": 0.1538, + "step": 13906 + }, + { + "epoch": 1.86, + "grad_norm": 0.5625, + "learning_rate": 0.00012959418972717981, + "loss": 0.7183, + "step": 13907 + }, + { + "epoch": 1.86, + "grad_norm": 0.56640625, + "learning_rate": 0.00012958306613721843, + "loss": 0.3688, + "step": 13908 + }, + { + "epoch": 1.86, + "grad_norm": 0.58984375, + "learning_rate": 0.00012957194214609165, + "loss": 0.5587, + "step": 13909 + }, + { + "epoch": 1.86, + "grad_norm": 0.53515625, + "learning_rate": 0.00012956081775395036, + "loss": 0.5919, + "step": 13910 + }, + { + "epoch": 1.86, + "grad_norm": 0.5859375, + "learning_rate": 0.00012954969296094545, + "loss": 0.4326, + "step": 13911 + }, + { + "epoch": 1.86, + "grad_norm": 0.6328125, + "learning_rate": 0.00012953856776722772, + "loss": 0.4433, + "step": 13912 + }, + { + "epoch": 1.86, + "grad_norm": 0.59765625, + "learning_rate": 0.00012952744217294805, + "loss": 0.2991, + "step": 13913 + }, + { + "epoch": 1.86, + "grad_norm": 0.53515625, + "learning_rate": 0.0001295163161782573, + "loss": 0.3177, + "step": 13914 + }, + { + "epoch": 1.86, + "grad_norm": 0.46484375, + "learning_rate": 0.0001295051897833064, + "loss": 0.4224, + "step": 13915 + }, + { + "epoch": 1.86, + "grad_norm": 0.3828125, + "learning_rate": 0.00012949406298824615, + "loss": 0.1555, + "step": 13916 + }, + { + "epoch": 1.86, + "grad_norm": 0.46484375, + "learning_rate": 0.0001294829357932275, + "loss": 0.3782, + "step": 13917 + }, + { + "epoch": 1.86, + "grad_norm": 0.431640625, + "learning_rate": 0.0001294718081984013, + "loss": 0.2848, + "step": 13918 + }, + { + "epoch": 1.86, + "grad_norm": 0.56640625, + "learning_rate": 0.00012946068020391847, + "loss": 0.1934, + "step": 13919 + }, + { + "epoch": 1.86, + "grad_norm": 0.6171875, + "learning_rate": 0.0001294495518099299, + "loss": 0.275, + "step": 13920 + }, + { + "epoch": 1.86, + "grad_norm": 0.498046875, + "learning_rate": 0.0001294384230165865, + "loss": 0.3235, + "step": 13921 + }, + { + "epoch": 1.86, + "grad_norm": 0.6484375, + "learning_rate": 0.00012942729382403923, + "loss": 0.3566, + "step": 13922 + }, + { + "epoch": 1.86, + "grad_norm": 0.625, + "learning_rate": 0.00012941616423243896, + "loss": 0.4705, + "step": 13923 + }, + { + "epoch": 1.86, + "grad_norm": 0.494140625, + "learning_rate": 0.00012940503424193662, + "loss": 0.2049, + "step": 13924 + }, + { + "epoch": 1.86, + "grad_norm": 0.66015625, + "learning_rate": 0.00012939390385268313, + "loss": 0.7524, + "step": 13925 + }, + { + "epoch": 1.86, + "grad_norm": 0.33203125, + "learning_rate": 0.00012938277306482945, + "loss": 0.1915, + "step": 13926 + }, + { + "epoch": 1.86, + "grad_norm": 0.53125, + "learning_rate": 0.0001293716418785265, + "loss": 0.3941, + "step": 13927 + }, + { + "epoch": 1.86, + "grad_norm": 0.6171875, + "learning_rate": 0.00012936051029392525, + "loss": 0.297, + "step": 13928 + }, + { + "epoch": 1.86, + "grad_norm": 0.5703125, + "learning_rate": 0.0001293493783111766, + "loss": 0.5833, + "step": 13929 + }, + { + "epoch": 1.86, + "grad_norm": 0.38671875, + "learning_rate": 0.0001293382459304316, + "loss": 0.2307, + "step": 13930 + }, + { + "epoch": 1.86, + "grad_norm": 0.60546875, + "learning_rate": 0.0001293271131518411, + "loss": 0.4001, + "step": 13931 + }, + { + "epoch": 1.86, + "grad_norm": 0.498046875, + "learning_rate": 0.00012931597997555615, + "loss": 0.4403, + "step": 13932 + }, + { + "epoch": 1.86, + "grad_norm": 0.54296875, + "learning_rate": 0.0001293048464017277, + "loss": 0.1711, + "step": 13933 + }, + { + "epoch": 1.86, + "grad_norm": 0.703125, + "learning_rate": 0.00012929371243050673, + "loss": 0.4525, + "step": 13934 + }, + { + "epoch": 1.86, + "grad_norm": 0.56640625, + "learning_rate": 0.0001292825780620442, + "loss": 0.2109, + "step": 13935 + }, + { + "epoch": 1.86, + "grad_norm": 0.5625, + "learning_rate": 0.00012927144329649115, + "loss": 0.4977, + "step": 13936 + }, + { + "epoch": 1.86, + "grad_norm": 0.416015625, + "learning_rate": 0.0001292603081339985, + "loss": 0.2507, + "step": 13937 + }, + { + "epoch": 1.86, + "grad_norm": 0.5, + "learning_rate": 0.00012924917257471733, + "loss": 0.2575, + "step": 13938 + }, + { + "epoch": 1.86, + "grad_norm": 0.609375, + "learning_rate": 0.0001292380366187986, + "loss": 0.6405, + "step": 13939 + }, + { + "epoch": 1.86, + "grad_norm": 0.46875, + "learning_rate": 0.0001292269002663933, + "loss": 0.479, + "step": 13940 + }, + { + "epoch": 1.86, + "grad_norm": 0.68359375, + "learning_rate": 0.00012921576351765252, + "loss": 0.3792, + "step": 13941 + }, + { + "epoch": 1.86, + "grad_norm": 0.47265625, + "learning_rate": 0.0001292046263727272, + "loss": 0.2394, + "step": 13942 + }, + { + "epoch": 1.86, + "grad_norm": 0.5546875, + "learning_rate": 0.00012919348883176843, + "loss": 0.4633, + "step": 13943 + }, + { + "epoch": 1.86, + "grad_norm": 0.419921875, + "learning_rate": 0.00012918235089492717, + "loss": 0.5021, + "step": 13944 + }, + { + "epoch": 1.86, + "grad_norm": 0.5703125, + "learning_rate": 0.00012917121256235455, + "loss": 0.4151, + "step": 13945 + }, + { + "epoch": 1.86, + "grad_norm": 0.494140625, + "learning_rate": 0.00012916007383420152, + "loss": 0.1753, + "step": 13946 + }, + { + "epoch": 1.86, + "grad_norm": 0.6484375, + "learning_rate": 0.00012914893471061925, + "loss": 0.4128, + "step": 13947 + }, + { + "epoch": 1.86, + "grad_norm": 0.59375, + "learning_rate": 0.00012913779519175864, + "loss": 0.4116, + "step": 13948 + }, + { + "epoch": 1.86, + "grad_norm": 0.52734375, + "learning_rate": 0.00012912665527777087, + "loss": 0.2733, + "step": 13949 + }, + { + "epoch": 1.86, + "grad_norm": 0.3828125, + "learning_rate": 0.00012911551496880695, + "loss": 0.3113, + "step": 13950 + }, + { + "epoch": 1.86, + "grad_norm": 0.328125, + "learning_rate": 0.00012910437426501796, + "loss": 0.2048, + "step": 13951 + }, + { + "epoch": 1.86, + "grad_norm": 0.6484375, + "learning_rate": 0.00012909323316655496, + "loss": 0.4988, + "step": 13952 + }, + { + "epoch": 1.86, + "grad_norm": 0.46484375, + "learning_rate": 0.00012908209167356905, + "loss": 0.4041, + "step": 13953 + }, + { + "epoch": 1.86, + "grad_norm": 0.50390625, + "learning_rate": 0.00012907094978621136, + "loss": 0.3378, + "step": 13954 + }, + { + "epoch": 1.86, + "grad_norm": 0.625, + "learning_rate": 0.00012905980750463287, + "loss": 0.4213, + "step": 13955 + }, + { + "epoch": 1.86, + "grad_norm": 0.52734375, + "learning_rate": 0.00012904866482898477, + "loss": 0.3788, + "step": 13956 + }, + { + "epoch": 1.86, + "grad_norm": 0.40625, + "learning_rate": 0.00012903752175941812, + "loss": 0.2876, + "step": 13957 + }, + { + "epoch": 1.86, + "grad_norm": 0.3671875, + "learning_rate": 0.00012902637829608407, + "loss": 0.3271, + "step": 13958 + }, + { + "epoch": 1.86, + "grad_norm": 0.53125, + "learning_rate": 0.00012901523443913367, + "loss": 0.6598, + "step": 13959 + }, + { + "epoch": 1.86, + "grad_norm": 0.5625, + "learning_rate": 0.00012900409018871807, + "loss": 0.2967, + "step": 13960 + }, + { + "epoch": 1.86, + "grad_norm": 0.703125, + "learning_rate": 0.0001289929455449884, + "loss": 0.3479, + "step": 13961 + }, + { + "epoch": 1.86, + "grad_norm": 0.431640625, + "learning_rate": 0.00012898180050809578, + "loss": 0.2678, + "step": 13962 + }, + { + "epoch": 1.86, + "grad_norm": 0.6015625, + "learning_rate": 0.00012897065507819135, + "loss": 0.5312, + "step": 13963 + }, + { + "epoch": 1.86, + "grad_norm": 0.40625, + "learning_rate": 0.00012895950925542624, + "loss": 0.2513, + "step": 13964 + }, + { + "epoch": 1.86, + "grad_norm": 0.54296875, + "learning_rate": 0.0001289483630399516, + "loss": 0.3115, + "step": 13965 + }, + { + "epoch": 1.86, + "grad_norm": 0.443359375, + "learning_rate": 0.00012893721643191859, + "loss": 0.1907, + "step": 13966 + }, + { + "epoch": 1.86, + "grad_norm": 0.53125, + "learning_rate": 0.00012892606943147833, + "loss": 0.4428, + "step": 13967 + }, + { + "epoch": 1.86, + "grad_norm": 0.53515625, + "learning_rate": 0.00012891492203878201, + "loss": 0.5316, + "step": 13968 + }, + { + "epoch": 1.86, + "grad_norm": 0.50390625, + "learning_rate": 0.00012890377425398076, + "loss": 0.5373, + "step": 13969 + }, + { + "epoch": 1.86, + "grad_norm": 0.408203125, + "learning_rate": 0.00012889262607722584, + "loss": 0.3582, + "step": 13970 + }, + { + "epoch": 1.86, + "grad_norm": 0.5859375, + "learning_rate": 0.00012888147750866834, + "loss": 0.4603, + "step": 13971 + }, + { + "epoch": 1.86, + "grad_norm": 0.53515625, + "learning_rate": 0.0001288703285484595, + "loss": 0.3742, + "step": 13972 + }, + { + "epoch": 1.86, + "grad_norm": 0.326171875, + "learning_rate": 0.00012885917919675042, + "loss": 0.1517, + "step": 13973 + }, + { + "epoch": 1.86, + "grad_norm": 0.45703125, + "learning_rate": 0.00012884802945369237, + "loss": 0.5543, + "step": 13974 + }, + { + "epoch": 1.86, + "grad_norm": 0.443359375, + "learning_rate": 0.00012883687931943653, + "loss": 0.3196, + "step": 13975 + }, + { + "epoch": 1.86, + "grad_norm": 0.65234375, + "learning_rate": 0.0001288257287941341, + "loss": 0.2852, + "step": 13976 + }, + { + "epoch": 1.87, + "grad_norm": 0.62109375, + "learning_rate": 0.0001288145778779363, + "loss": 0.456, + "step": 13977 + }, + { + "epoch": 1.87, + "grad_norm": 0.70703125, + "learning_rate": 0.0001288034265709943, + "loss": 0.4686, + "step": 13978 + }, + { + "epoch": 1.87, + "grad_norm": 0.4765625, + "learning_rate": 0.00012879227487345938, + "loss": 0.3141, + "step": 13979 + }, + { + "epoch": 1.87, + "grad_norm": 0.52734375, + "learning_rate": 0.00012878112278548272, + "loss": 0.4378, + "step": 13980 + }, + { + "epoch": 1.87, + "grad_norm": 0.5703125, + "learning_rate": 0.00012876997030721556, + "loss": 0.2972, + "step": 13981 + }, + { + "epoch": 1.87, + "grad_norm": 0.439453125, + "learning_rate": 0.00012875881743880914, + "loss": 0.2833, + "step": 13982 + }, + { + "epoch": 1.87, + "grad_norm": 0.42578125, + "learning_rate": 0.0001287476641804147, + "loss": 0.2617, + "step": 13983 + }, + { + "epoch": 1.87, + "grad_norm": 0.49609375, + "learning_rate": 0.00012873651053218349, + "loss": 0.2811, + "step": 13984 + }, + { + "epoch": 1.87, + "grad_norm": 0.486328125, + "learning_rate": 0.00012872535649426672, + "loss": 0.4087, + "step": 13985 + }, + { + "epoch": 1.87, + "grad_norm": 0.6640625, + "learning_rate": 0.00012871420206681571, + "loss": 0.7217, + "step": 13986 + }, + { + "epoch": 1.87, + "grad_norm": 0.498046875, + "learning_rate": 0.0001287030472499817, + "loss": 0.3535, + "step": 13987 + }, + { + "epoch": 1.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00012869189204391595, + "loss": 0.3749, + "step": 13988 + }, + { + "epoch": 1.87, + "grad_norm": 0.474609375, + "learning_rate": 0.00012868073644876972, + "loss": 0.3939, + "step": 13989 + }, + { + "epoch": 1.87, + "grad_norm": 0.53125, + "learning_rate": 0.0001286695804646943, + "loss": 0.7412, + "step": 13990 + }, + { + "epoch": 1.87, + "grad_norm": 0.48046875, + "learning_rate": 0.00012865842409184096, + "loss": 0.3023, + "step": 13991 + }, + { + "epoch": 1.87, + "grad_norm": 0.7421875, + "learning_rate": 0.000128647267330361, + "loss": 0.5262, + "step": 13992 + }, + { + "epoch": 1.87, + "grad_norm": 0.765625, + "learning_rate": 0.0001286361101804057, + "loss": 0.4113, + "step": 13993 + }, + { + "epoch": 1.87, + "grad_norm": 0.4765625, + "learning_rate": 0.00012862495264212638, + "loss": 0.3172, + "step": 13994 + }, + { + "epoch": 1.87, + "grad_norm": 0.71484375, + "learning_rate": 0.00012861379471567434, + "loss": 0.5508, + "step": 13995 + }, + { + "epoch": 1.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00012860263640120085, + "loss": 0.463, + "step": 13996 + }, + { + "epoch": 1.87, + "grad_norm": 0.51171875, + "learning_rate": 0.00012859147769885728, + "loss": 0.2303, + "step": 13997 + }, + { + "epoch": 1.87, + "grad_norm": 0.56640625, + "learning_rate": 0.0001285803186087949, + "loss": 0.3113, + "step": 13998 + }, + { + "epoch": 1.87, + "grad_norm": 0.474609375, + "learning_rate": 0.00012856915913116507, + "loss": 0.5036, + "step": 13999 + }, + { + "epoch": 1.87, + "grad_norm": 0.439453125, + "learning_rate": 0.0001285579992661191, + "loss": 0.4013, + "step": 14000 + }, + { + "epoch": 1.87, + "grad_norm": 0.5546875, + "learning_rate": 0.00012854683901380835, + "loss": 0.2876, + "step": 14001 + }, + { + "epoch": 1.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00012853567837438413, + "loss": 0.68, + "step": 14002 + }, + { + "epoch": 1.87, + "grad_norm": 0.39453125, + "learning_rate": 0.0001285245173479978, + "loss": 0.2938, + "step": 14003 + }, + { + "epoch": 1.87, + "grad_norm": 0.337890625, + "learning_rate": 0.00012851335593480072, + "loss": 0.2375, + "step": 14004 + }, + { + "epoch": 1.87, + "grad_norm": 0.462890625, + "learning_rate": 0.0001285021941349442, + "loss": 0.3439, + "step": 14005 + }, + { + "epoch": 1.87, + "grad_norm": 0.39453125, + "learning_rate": 0.00012849103194857966, + "loss": 0.2569, + "step": 14006 + }, + { + "epoch": 1.87, + "grad_norm": 0.365234375, + "learning_rate": 0.00012847986937585841, + "loss": 0.3808, + "step": 14007 + }, + { + "epoch": 1.87, + "grad_norm": 0.609375, + "learning_rate": 0.00012846870641693189, + "loss": 0.4098, + "step": 14008 + }, + { + "epoch": 1.87, + "grad_norm": 0.6171875, + "learning_rate": 0.00012845754307195143, + "loss": 0.5867, + "step": 14009 + }, + { + "epoch": 1.87, + "grad_norm": 0.65625, + "learning_rate": 0.0001284463793410684, + "loss": 0.3913, + "step": 14010 + }, + { + "epoch": 1.87, + "grad_norm": 0.42578125, + "learning_rate": 0.00012843521522443423, + "loss": 0.2329, + "step": 14011 + }, + { + "epoch": 1.87, + "grad_norm": 0.53125, + "learning_rate": 0.0001284240507222003, + "loss": 0.5625, + "step": 14012 + }, + { + "epoch": 1.87, + "grad_norm": 0.578125, + "learning_rate": 0.000128412885834518, + "loss": 0.3389, + "step": 14013 + }, + { + "epoch": 1.87, + "grad_norm": 0.44921875, + "learning_rate": 0.0001284017205615387, + "loss": 0.3031, + "step": 14014 + }, + { + "epoch": 1.87, + "grad_norm": 0.63671875, + "learning_rate": 0.0001283905549034139, + "loss": 0.4497, + "step": 14015 + }, + { + "epoch": 1.87, + "grad_norm": 0.703125, + "learning_rate": 0.0001283793888602949, + "loss": 0.696, + "step": 14016 + }, + { + "epoch": 1.87, + "grad_norm": 0.5625, + "learning_rate": 0.00012836822243233317, + "loss": 0.5409, + "step": 14017 + }, + { + "epoch": 1.87, + "grad_norm": 0.640625, + "learning_rate": 0.0001283570556196802, + "loss": 0.5762, + "step": 14018 + }, + { + "epoch": 1.87, + "grad_norm": 0.70703125, + "learning_rate": 0.0001283458884224873, + "loss": 0.4654, + "step": 14019 + }, + { + "epoch": 1.87, + "grad_norm": 0.49609375, + "learning_rate": 0.00012833472084090598, + "loss": 0.4411, + "step": 14020 + }, + { + "epoch": 1.87, + "grad_norm": 0.40234375, + "learning_rate": 0.00012832355287508766, + "loss": 0.2329, + "step": 14021 + }, + { + "epoch": 1.87, + "grad_norm": 0.5234375, + "learning_rate": 0.00012831238452518378, + "loss": 0.4964, + "step": 14022 + }, + { + "epoch": 1.87, + "grad_norm": 0.412109375, + "learning_rate": 0.00012830121579134577, + "loss": 0.2524, + "step": 14023 + }, + { + "epoch": 1.87, + "grad_norm": 0.60546875, + "learning_rate": 0.00012829004667372517, + "loss": 0.5947, + "step": 14024 + }, + { + "epoch": 1.87, + "grad_norm": 0.59375, + "learning_rate": 0.00012827887717247334, + "loss": 0.4981, + "step": 14025 + }, + { + "epoch": 1.87, + "grad_norm": 0.54296875, + "learning_rate": 0.0001282677072877418, + "loss": 0.6122, + "step": 14026 + }, + { + "epoch": 1.87, + "grad_norm": 0.365234375, + "learning_rate": 0.000128256537019682, + "loss": 0.2632, + "step": 14027 + }, + { + "epoch": 1.87, + "grad_norm": 0.447265625, + "learning_rate": 0.00012824536636844543, + "loss": 0.3263, + "step": 14028 + }, + { + "epoch": 1.87, + "grad_norm": 0.72265625, + "learning_rate": 0.00012823419533418355, + "loss": 0.5041, + "step": 14029 + }, + { + "epoch": 1.87, + "grad_norm": 0.546875, + "learning_rate": 0.00012822302391704785, + "loss": 0.2966, + "step": 14030 + }, + { + "epoch": 1.87, + "grad_norm": 0.54296875, + "learning_rate": 0.00012821185211718986, + "loss": 0.1922, + "step": 14031 + }, + { + "epoch": 1.87, + "grad_norm": 0.341796875, + "learning_rate": 0.00012820067993476105, + "loss": 0.1539, + "step": 14032 + }, + { + "epoch": 1.87, + "grad_norm": 0.609375, + "learning_rate": 0.00012818950736991292, + "loss": 0.2776, + "step": 14033 + }, + { + "epoch": 1.87, + "grad_norm": 0.76953125, + "learning_rate": 0.00012817833442279697, + "loss": 0.7558, + "step": 14034 + }, + { + "epoch": 1.87, + "grad_norm": 0.50390625, + "learning_rate": 0.0001281671610935647, + "loss": 0.4193, + "step": 14035 + }, + { + "epoch": 1.87, + "grad_norm": 0.3828125, + "learning_rate": 0.00012815598738236768, + "loss": 0.3132, + "step": 14036 + }, + { + "epoch": 1.87, + "grad_norm": 0.3046875, + "learning_rate": 0.00012814481328935737, + "loss": 0.1593, + "step": 14037 + }, + { + "epoch": 1.87, + "grad_norm": 0.353515625, + "learning_rate": 0.0001281336388146854, + "loss": 0.1694, + "step": 14038 + }, + { + "epoch": 1.87, + "grad_norm": 0.40625, + "learning_rate": 0.00012812246395850317, + "loss": 0.3985, + "step": 14039 + }, + { + "epoch": 1.87, + "grad_norm": 0.41015625, + "learning_rate": 0.00012811128872096227, + "loss": 0.3517, + "step": 14040 + }, + { + "epoch": 1.87, + "grad_norm": 0.43359375, + "learning_rate": 0.00012810011310221425, + "loss": 0.2691, + "step": 14041 + }, + { + "epoch": 1.87, + "grad_norm": 0.5078125, + "learning_rate": 0.00012808893710241067, + "loss": 0.3383, + "step": 14042 + }, + { + "epoch": 1.87, + "grad_norm": 0.5078125, + "learning_rate": 0.0001280777607217031, + "loss": 0.4318, + "step": 14043 + }, + { + "epoch": 1.87, + "grad_norm": 0.515625, + "learning_rate": 0.0001280665839602431, + "loss": 0.5242, + "step": 14044 + }, + { + "epoch": 1.87, + "grad_norm": 0.37109375, + "learning_rate": 0.00012805540681818216, + "loss": 0.3145, + "step": 14045 + }, + { + "epoch": 1.87, + "grad_norm": 0.5625, + "learning_rate": 0.0001280442292956719, + "loss": 0.5225, + "step": 14046 + }, + { + "epoch": 1.87, + "grad_norm": 0.478515625, + "learning_rate": 0.0001280330513928639, + "loss": 0.3294, + "step": 14047 + }, + { + "epoch": 1.87, + "grad_norm": 0.478515625, + "learning_rate": 0.00012802187310990973, + "loss": 0.4834, + "step": 14048 + }, + { + "epoch": 1.87, + "grad_norm": 0.7421875, + "learning_rate": 0.000128010694446961, + "loss": 0.5382, + "step": 14049 + }, + { + "epoch": 1.87, + "grad_norm": 0.54296875, + "learning_rate": 0.00012799951540416925, + "loss": 0.2863, + "step": 14050 + }, + { + "epoch": 1.87, + "grad_norm": 0.453125, + "learning_rate": 0.00012798833598168613, + "loss": 0.3587, + "step": 14051 + }, + { + "epoch": 1.88, + "grad_norm": 0.484375, + "learning_rate": 0.00012797715617966318, + "loss": 0.3099, + "step": 14052 + }, + { + "epoch": 1.88, + "grad_norm": 0.4140625, + "learning_rate": 0.000127965975998252, + "loss": 0.2242, + "step": 14053 + }, + { + "epoch": 1.88, + "grad_norm": 0.6640625, + "learning_rate": 0.00012795479543760432, + "loss": 0.4251, + "step": 14054 + }, + { + "epoch": 1.88, + "grad_norm": 0.4921875, + "learning_rate": 0.00012794361449787163, + "loss": 0.3803, + "step": 14055 + }, + { + "epoch": 1.88, + "grad_norm": 0.68359375, + "learning_rate": 0.0001279324331792056, + "loss": 0.2792, + "step": 14056 + }, + { + "epoch": 1.88, + "grad_norm": 0.4296875, + "learning_rate": 0.00012792125148175783, + "loss": 0.1996, + "step": 14057 + }, + { + "epoch": 1.88, + "grad_norm": 0.486328125, + "learning_rate": 0.00012791006940568, + "loss": 0.3207, + "step": 14058 + }, + { + "epoch": 1.88, + "grad_norm": 0.46484375, + "learning_rate": 0.0001278988869511237, + "loss": 0.577, + "step": 14059 + }, + { + "epoch": 1.88, + "grad_norm": 0.466796875, + "learning_rate": 0.00012788770411824058, + "loss": 0.2061, + "step": 14060 + }, + { + "epoch": 1.88, + "grad_norm": 0.375, + "learning_rate": 0.00012787652090718232, + "loss": 0.2395, + "step": 14061 + }, + { + "epoch": 1.88, + "grad_norm": 0.70703125, + "learning_rate": 0.00012786533731810052, + "loss": 0.6444, + "step": 14062 + }, + { + "epoch": 1.88, + "grad_norm": 0.63671875, + "learning_rate": 0.0001278541533511469, + "loss": 0.4216, + "step": 14063 + }, + { + "epoch": 1.88, + "grad_norm": 0.4296875, + "learning_rate": 0.00012784296900647304, + "loss": 0.342, + "step": 14064 + }, + { + "epoch": 1.88, + "grad_norm": 0.435546875, + "learning_rate": 0.00012783178428423065, + "loss": 0.4416, + "step": 14065 + }, + { + "epoch": 1.88, + "grad_norm": 0.51953125, + "learning_rate": 0.00012782059918457143, + "loss": 0.2985, + "step": 14066 + }, + { + "epoch": 1.88, + "grad_norm": 0.6640625, + "learning_rate": 0.00012780941370764702, + "loss": 0.457, + "step": 14067 + }, + { + "epoch": 1.88, + "grad_norm": 0.466796875, + "learning_rate": 0.00012779822785360912, + "loss": 0.3162, + "step": 14068 + }, + { + "epoch": 1.88, + "grad_norm": 0.53125, + "learning_rate": 0.0001277870416226094, + "loss": 0.486, + "step": 14069 + }, + { + "epoch": 1.88, + "grad_norm": 0.482421875, + "learning_rate": 0.00012777585501479958, + "loss": 0.4185, + "step": 14070 + }, + { + "epoch": 1.88, + "grad_norm": 0.62109375, + "learning_rate": 0.00012776466803033132, + "loss": 0.5544, + "step": 14071 + }, + { + "epoch": 1.88, + "grad_norm": 0.419921875, + "learning_rate": 0.00012775348066935635, + "loss": 0.3187, + "step": 14072 + }, + { + "epoch": 1.88, + "grad_norm": 0.431640625, + "learning_rate": 0.00012774229293202634, + "loss": 0.4589, + "step": 14073 + }, + { + "epoch": 1.88, + "grad_norm": 0.51171875, + "learning_rate": 0.0001277311048184931, + "loss": 0.2594, + "step": 14074 + }, + { + "epoch": 1.88, + "grad_norm": 0.71484375, + "learning_rate": 0.00012771991632890822, + "loss": 0.3318, + "step": 14075 + }, + { + "epoch": 1.88, + "grad_norm": 0.49609375, + "learning_rate": 0.0001277087274634235, + "loss": 0.4751, + "step": 14076 + }, + { + "epoch": 1.88, + "grad_norm": 0.58984375, + "learning_rate": 0.00012769753822219066, + "loss": 0.2842, + "step": 14077 + }, + { + "epoch": 1.88, + "grad_norm": 0.80859375, + "learning_rate": 0.00012768634860536139, + "loss": 0.3716, + "step": 14078 + }, + { + "epoch": 1.88, + "grad_norm": 0.51171875, + "learning_rate": 0.0001276751586130875, + "loss": 0.3011, + "step": 14079 + }, + { + "epoch": 1.88, + "grad_norm": 0.515625, + "learning_rate": 0.0001276639682455207, + "loss": 0.3619, + "step": 14080 + }, + { + "epoch": 1.88, + "grad_norm": 0.5, + "learning_rate": 0.00012765277750281272, + "loss": 0.2628, + "step": 14081 + }, + { + "epoch": 1.88, + "grad_norm": 0.50390625, + "learning_rate": 0.00012764158638511536, + "loss": 0.4054, + "step": 14082 + }, + { + "epoch": 1.88, + "grad_norm": 0.45703125, + "learning_rate": 0.00012763039489258033, + "loss": 0.2502, + "step": 14083 + }, + { + "epoch": 1.88, + "grad_norm": 0.4375, + "learning_rate": 0.00012761920302535942, + "loss": 0.3768, + "step": 14084 + }, + { + "epoch": 1.88, + "grad_norm": 0.51171875, + "learning_rate": 0.00012760801078360437, + "loss": 0.2439, + "step": 14085 + }, + { + "epoch": 1.88, + "grad_norm": 0.4765625, + "learning_rate": 0.000127596818167467, + "loss": 0.5762, + "step": 14086 + }, + { + "epoch": 1.88, + "grad_norm": 0.6953125, + "learning_rate": 0.00012758562517709905, + "loss": 0.6254, + "step": 14087 + }, + { + "epoch": 1.88, + "grad_norm": 0.7578125, + "learning_rate": 0.00012757443181265232, + "loss": 0.2475, + "step": 14088 + }, + { + "epoch": 1.88, + "grad_norm": 0.62109375, + "learning_rate": 0.00012756323807427858, + "loss": 0.4681, + "step": 14089 + }, + { + "epoch": 1.88, + "grad_norm": 0.80859375, + "learning_rate": 0.00012755204396212966, + "loss": 0.9681, + "step": 14090 + }, + { + "epoch": 1.88, + "grad_norm": 0.63671875, + "learning_rate": 0.00012754084947635733, + "loss": 0.4733, + "step": 14091 + }, + { + "epoch": 1.88, + "grad_norm": 0.54296875, + "learning_rate": 0.0001275296546171134, + "loss": 0.2727, + "step": 14092 + }, + { + "epoch": 1.88, + "grad_norm": 0.5390625, + "learning_rate": 0.0001275184593845497, + "loss": 0.3971, + "step": 14093 + }, + { + "epoch": 1.88, + "grad_norm": 0.484375, + "learning_rate": 0.00012750726377881803, + "loss": 0.4074, + "step": 14094 + }, + { + "epoch": 1.88, + "grad_norm": 0.546875, + "learning_rate": 0.00012749606780007025, + "loss": 0.256, + "step": 14095 + }, + { + "epoch": 1.88, + "grad_norm": 0.515625, + "learning_rate": 0.00012748487144845807, + "loss": 0.3037, + "step": 14096 + }, + { + "epoch": 1.88, + "grad_norm": 0.58203125, + "learning_rate": 0.00012747367472413343, + "loss": 0.6425, + "step": 14097 + }, + { + "epoch": 1.88, + "grad_norm": 0.54296875, + "learning_rate": 0.0001274624776272481, + "loss": 0.6896, + "step": 14098 + }, + { + "epoch": 1.88, + "grad_norm": 0.4453125, + "learning_rate": 0.000127451280157954, + "loss": 0.217, + "step": 14099 + }, + { + "epoch": 1.88, + "grad_norm": 0.69921875, + "learning_rate": 0.0001274400823164029, + "loss": 0.6071, + "step": 14100 + }, + { + "epoch": 1.88, + "grad_norm": 0.76953125, + "learning_rate": 0.00012742888410274665, + "loss": 0.637, + "step": 14101 + }, + { + "epoch": 1.88, + "grad_norm": 0.57421875, + "learning_rate": 0.00012741768551713715, + "loss": 0.2654, + "step": 14102 + }, + { + "epoch": 1.88, + "grad_norm": 0.5078125, + "learning_rate": 0.00012740648655972623, + "loss": 0.3855, + "step": 14103 + }, + { + "epoch": 1.88, + "grad_norm": 0.51171875, + "learning_rate": 0.00012739528723066575, + "loss": 0.3808, + "step": 14104 + }, + { + "epoch": 1.88, + "grad_norm": 0.55078125, + "learning_rate": 0.00012738408753010762, + "loss": 0.6441, + "step": 14105 + }, + { + "epoch": 1.88, + "grad_norm": 0.3984375, + "learning_rate": 0.00012737288745820369, + "loss": 0.3522, + "step": 14106 + }, + { + "epoch": 1.88, + "grad_norm": 0.51953125, + "learning_rate": 0.0001273616870151058, + "loss": 0.4409, + "step": 14107 + }, + { + "epoch": 1.88, + "grad_norm": 0.54296875, + "learning_rate": 0.00012735048620096591, + "loss": 0.4198, + "step": 14108 + }, + { + "epoch": 1.88, + "grad_norm": 0.3828125, + "learning_rate": 0.00012733928501593587, + "loss": 0.2782, + "step": 14109 + }, + { + "epoch": 1.88, + "grad_norm": 0.65234375, + "learning_rate": 0.00012732808346016755, + "loss": 0.4033, + "step": 14110 + }, + { + "epoch": 1.88, + "grad_norm": 0.75, + "learning_rate": 0.00012731688153381292, + "loss": 0.2453, + "step": 14111 + }, + { + "epoch": 1.88, + "grad_norm": 0.55078125, + "learning_rate": 0.0001273056792370238, + "loss": 0.5375, + "step": 14112 + }, + { + "epoch": 1.88, + "grad_norm": 0.380859375, + "learning_rate": 0.0001272944765699522, + "loss": 0.2792, + "step": 14113 + }, + { + "epoch": 1.88, + "grad_norm": 0.6015625, + "learning_rate": 0.00012728327353274992, + "loss": 0.4548, + "step": 14114 + }, + { + "epoch": 1.88, + "grad_norm": 0.57421875, + "learning_rate": 0.00012727207012556898, + "loss": 0.2118, + "step": 14115 + }, + { + "epoch": 1.88, + "grad_norm": 0.578125, + "learning_rate": 0.00012726086634856125, + "loss": 0.2929, + "step": 14116 + }, + { + "epoch": 1.88, + "grad_norm": 0.5, + "learning_rate": 0.00012724966220187871, + "loss": 0.3581, + "step": 14117 + }, + { + "epoch": 1.88, + "grad_norm": 0.5390625, + "learning_rate": 0.00012723845768567325, + "loss": 0.6399, + "step": 14118 + }, + { + "epoch": 1.88, + "grad_norm": 0.431640625, + "learning_rate": 0.00012722725280009677, + "loss": 0.3251, + "step": 14119 + }, + { + "epoch": 1.88, + "grad_norm": 0.50390625, + "learning_rate": 0.0001272160475453013, + "loss": 0.4405, + "step": 14120 + }, + { + "epoch": 1.88, + "grad_norm": 0.609375, + "learning_rate": 0.00012720484192143874, + "loss": 0.3092, + "step": 14121 + }, + { + "epoch": 1.88, + "grad_norm": 0.51953125, + "learning_rate": 0.0001271936359286611, + "loss": 0.594, + "step": 14122 + }, + { + "epoch": 1.88, + "grad_norm": 0.546875, + "learning_rate": 0.0001271824295671203, + "loss": 0.3123, + "step": 14123 + }, + { + "epoch": 1.88, + "grad_norm": 0.75, + "learning_rate": 0.0001271712228369683, + "loss": 0.4231, + "step": 14124 + }, + { + "epoch": 1.88, + "grad_norm": 0.365234375, + "learning_rate": 0.0001271600157383571, + "loss": 0.2933, + "step": 14125 + }, + { + "epoch": 1.88, + "grad_norm": 0.357421875, + "learning_rate": 0.00012714880827143863, + "loss": 0.2006, + "step": 14126 + }, + { + "epoch": 1.89, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001271376004363649, + "loss": 0.0726, + "step": 14127 + }, + { + "epoch": 1.89, + "grad_norm": 0.419921875, + "learning_rate": 0.00012712639223328787, + "loss": 0.2334, + "step": 14128 + }, + { + "epoch": 1.89, + "grad_norm": 0.49609375, + "learning_rate": 0.0001271151836623596, + "loss": 0.3198, + "step": 14129 + }, + { + "epoch": 1.89, + "grad_norm": 0.62890625, + "learning_rate": 0.000127103974723732, + "loss": 0.5794, + "step": 14130 + }, + { + "epoch": 1.89, + "grad_norm": 0.396484375, + "learning_rate": 0.0001270927654175571, + "loss": 0.2578, + "step": 14131 + }, + { + "epoch": 1.89, + "grad_norm": 0.65625, + "learning_rate": 0.00012708155574398693, + "loss": 0.3392, + "step": 14132 + }, + { + "epoch": 1.89, + "grad_norm": 0.58203125, + "learning_rate": 0.0001270703457031735, + "loss": 0.4245, + "step": 14133 + }, + { + "epoch": 1.89, + "grad_norm": 0.38671875, + "learning_rate": 0.0001270591352952688, + "loss": 0.1751, + "step": 14134 + }, + { + "epoch": 1.89, + "grad_norm": 0.46875, + "learning_rate": 0.00012704792452042485, + "loss": 0.3601, + "step": 14135 + }, + { + "epoch": 1.89, + "grad_norm": 0.427734375, + "learning_rate": 0.0001270367133787937, + "loss": 0.4547, + "step": 14136 + }, + { + "epoch": 1.89, + "grad_norm": 0.62109375, + "learning_rate": 0.00012702550187052733, + "loss": 0.5443, + "step": 14137 + }, + { + "epoch": 1.89, + "grad_norm": 0.482421875, + "learning_rate": 0.00012701428999577783, + "loss": 0.3217, + "step": 14138 + }, + { + "epoch": 1.89, + "grad_norm": 0.66796875, + "learning_rate": 0.00012700307775469723, + "loss": 0.4574, + "step": 14139 + }, + { + "epoch": 1.89, + "grad_norm": 0.62109375, + "learning_rate": 0.00012699186514743754, + "loss": 0.2277, + "step": 14140 + }, + { + "epoch": 1.89, + "grad_norm": 0.490234375, + "learning_rate": 0.00012698065217415087, + "loss": 0.3105, + "step": 14141 + }, + { + "epoch": 1.89, + "grad_norm": 0.359375, + "learning_rate": 0.00012696943883498923, + "loss": 0.2454, + "step": 14142 + }, + { + "epoch": 1.89, + "grad_norm": 0.419921875, + "learning_rate": 0.0001269582251301047, + "loss": 0.2607, + "step": 14143 + }, + { + "epoch": 1.89, + "grad_norm": 0.59765625, + "learning_rate": 0.00012694701105964927, + "loss": 0.4905, + "step": 14144 + }, + { + "epoch": 1.89, + "grad_norm": 0.53515625, + "learning_rate": 0.00012693579662377514, + "loss": 0.3397, + "step": 14145 + }, + { + "epoch": 1.89, + "grad_norm": 0.53125, + "learning_rate": 0.00012692458182263428, + "loss": 0.2451, + "step": 14146 + }, + { + "epoch": 1.89, + "grad_norm": 0.55859375, + "learning_rate": 0.00012691336665637888, + "loss": 0.2601, + "step": 14147 + }, + { + "epoch": 1.89, + "grad_norm": 0.46875, + "learning_rate": 0.00012690215112516091, + "loss": 0.4831, + "step": 14148 + }, + { + "epoch": 1.89, + "grad_norm": 0.62109375, + "learning_rate": 0.0001268909352291325, + "loss": 0.347, + "step": 14149 + }, + { + "epoch": 1.89, + "grad_norm": 0.396484375, + "learning_rate": 0.00012687971896844575, + "loss": 0.2107, + "step": 14150 + }, + { + "epoch": 1.89, + "grad_norm": 0.55859375, + "learning_rate": 0.00012686850234325275, + "loss": 0.4813, + "step": 14151 + }, + { + "epoch": 1.89, + "grad_norm": 0.4296875, + "learning_rate": 0.00012685728535370564, + "loss": 0.2377, + "step": 14152 + }, + { + "epoch": 1.89, + "grad_norm": 0.62109375, + "learning_rate": 0.00012684606799995648, + "loss": 0.5546, + "step": 14153 + }, + { + "epoch": 1.89, + "grad_norm": 0.6171875, + "learning_rate": 0.0001268348502821574, + "loss": 0.4211, + "step": 14154 + }, + { + "epoch": 1.89, + "grad_norm": 0.640625, + "learning_rate": 0.00012682363220046058, + "loss": 0.4645, + "step": 14155 + }, + { + "epoch": 1.89, + "grad_norm": 0.42578125, + "learning_rate": 0.00012681241375501803, + "loss": 0.3971, + "step": 14156 + }, + { + "epoch": 1.89, + "grad_norm": 0.48828125, + "learning_rate": 0.000126801194945982, + "loss": 0.5126, + "step": 14157 + }, + { + "epoch": 1.89, + "grad_norm": 0.55078125, + "learning_rate": 0.0001267899757735045, + "loss": 0.2438, + "step": 14158 + }, + { + "epoch": 1.89, + "grad_norm": 0.5078125, + "learning_rate": 0.00012677875623773782, + "loss": 0.3285, + "step": 14159 + }, + { + "epoch": 1.89, + "grad_norm": 0.474609375, + "learning_rate": 0.00012676753633883396, + "loss": 0.3083, + "step": 14160 + }, + { + "epoch": 1.89, + "grad_norm": 0.5546875, + "learning_rate": 0.00012675631607694517, + "loss": 0.3817, + "step": 14161 + }, + { + "epoch": 1.89, + "grad_norm": 0.59765625, + "learning_rate": 0.00012674509545222348, + "loss": 0.3316, + "step": 14162 + }, + { + "epoch": 1.89, + "grad_norm": 0.396484375, + "learning_rate": 0.00012673387446482116, + "loss": 0.2299, + "step": 14163 + }, + { + "epoch": 1.89, + "grad_norm": 0.42578125, + "learning_rate": 0.0001267226531148904, + "loss": 0.2513, + "step": 14164 + }, + { + "epoch": 1.89, + "grad_norm": 0.39453125, + "learning_rate": 0.00012671143140258328, + "loss": 0.2253, + "step": 14165 + }, + { + "epoch": 1.89, + "grad_norm": 0.455078125, + "learning_rate": 0.00012670020932805203, + "loss": 0.3591, + "step": 14166 + }, + { + "epoch": 1.89, + "grad_norm": 0.5546875, + "learning_rate": 0.00012668898689144873, + "loss": 0.31, + "step": 14167 + }, + { + "epoch": 1.89, + "grad_norm": 0.37890625, + "learning_rate": 0.00012667776409292574, + "loss": 0.202, + "step": 14168 + }, + { + "epoch": 1.89, + "grad_norm": 0.67578125, + "learning_rate": 0.00012666654093263508, + "loss": 0.5979, + "step": 14169 + }, + { + "epoch": 1.89, + "grad_norm": 0.46875, + "learning_rate": 0.00012665531741072903, + "loss": 0.4679, + "step": 14170 + }, + { + "epoch": 1.89, + "grad_norm": 0.416015625, + "learning_rate": 0.00012664409352735974, + "loss": 0.4924, + "step": 14171 + }, + { + "epoch": 1.89, + "grad_norm": 0.625, + "learning_rate": 0.00012663286928267948, + "loss": 0.3676, + "step": 14172 + }, + { + "epoch": 1.89, + "grad_norm": 0.52734375, + "learning_rate": 0.00012662164467684041, + "loss": 0.5088, + "step": 14173 + }, + { + "epoch": 1.89, + "grad_norm": 0.478515625, + "learning_rate": 0.00012661041970999474, + "loss": 0.2937, + "step": 14174 + }, + { + "epoch": 1.89, + "grad_norm": 0.423828125, + "learning_rate": 0.00012659919438229474, + "loss": 0.2991, + "step": 14175 + }, + { + "epoch": 1.89, + "grad_norm": 0.490234375, + "learning_rate": 0.00012658796869389254, + "loss": 0.3902, + "step": 14176 + }, + { + "epoch": 1.89, + "grad_norm": 0.59375, + "learning_rate": 0.00012657674264494046, + "loss": 0.4173, + "step": 14177 + }, + { + "epoch": 1.89, + "grad_norm": 0.62109375, + "learning_rate": 0.00012656551623559068, + "loss": 0.4879, + "step": 14178 + }, + { + "epoch": 1.89, + "grad_norm": 0.82421875, + "learning_rate": 0.00012655428946599544, + "loss": 0.4972, + "step": 14179 + }, + { + "epoch": 1.89, + "grad_norm": 0.57421875, + "learning_rate": 0.000126543062336307, + "loss": 0.2362, + "step": 14180 + }, + { + "epoch": 1.89, + "grad_norm": 0.447265625, + "learning_rate": 0.0001265318348466776, + "loss": 0.2292, + "step": 14181 + }, + { + "epoch": 1.89, + "grad_norm": 0.5703125, + "learning_rate": 0.0001265206069972595, + "loss": 0.4565, + "step": 14182 + }, + { + "epoch": 1.89, + "grad_norm": 0.56640625, + "learning_rate": 0.00012650937878820492, + "loss": 0.4313, + "step": 14183 + }, + { + "epoch": 1.89, + "grad_norm": 0.515625, + "learning_rate": 0.0001264981502196662, + "loss": 0.4237, + "step": 14184 + }, + { + "epoch": 1.89, + "grad_norm": 0.447265625, + "learning_rate": 0.00012648692129179552, + "loss": 0.3679, + "step": 14185 + }, + { + "epoch": 1.89, + "grad_norm": 0.416015625, + "learning_rate": 0.00012647569200474517, + "loss": 0.2097, + "step": 14186 + }, + { + "epoch": 1.89, + "grad_norm": 0.52734375, + "learning_rate": 0.00012646446235866748, + "loss": 0.4383, + "step": 14187 + }, + { + "epoch": 1.89, + "grad_norm": 0.65234375, + "learning_rate": 0.00012645323235371467, + "loss": 0.5175, + "step": 14188 + }, + { + "epoch": 1.89, + "grad_norm": 0.64453125, + "learning_rate": 0.00012644200199003907, + "loss": 0.3508, + "step": 14189 + }, + { + "epoch": 1.89, + "grad_norm": 0.515625, + "learning_rate": 0.00012643077126779298, + "loss": 0.2686, + "step": 14190 + }, + { + "epoch": 1.89, + "grad_norm": 0.52734375, + "learning_rate": 0.00012641954018712863, + "loss": 0.2559, + "step": 14191 + }, + { + "epoch": 1.89, + "grad_norm": 0.58203125, + "learning_rate": 0.00012640830874819834, + "loss": 0.4132, + "step": 14192 + }, + { + "epoch": 1.89, + "grad_norm": 0.56640625, + "learning_rate": 0.00012639707695115448, + "loss": 0.418, + "step": 14193 + }, + { + "epoch": 1.89, + "grad_norm": 0.55078125, + "learning_rate": 0.00012638584479614929, + "loss": 0.4143, + "step": 14194 + }, + { + "epoch": 1.89, + "grad_norm": 0.46484375, + "learning_rate": 0.00012637461228333512, + "loss": 0.5199, + "step": 14195 + }, + { + "epoch": 1.89, + "grad_norm": 0.490234375, + "learning_rate": 0.00012636337941286427, + "loss": 0.4513, + "step": 14196 + }, + { + "epoch": 1.89, + "grad_norm": 0.494140625, + "learning_rate": 0.00012635214618488908, + "loss": 0.38, + "step": 14197 + }, + { + "epoch": 1.89, + "grad_norm": 0.58203125, + "learning_rate": 0.00012634091259956188, + "loss": 0.3984, + "step": 14198 + }, + { + "epoch": 1.89, + "grad_norm": 0.490234375, + "learning_rate": 0.00012632967865703498, + "loss": 0.3924, + "step": 14199 + }, + { + "epoch": 1.89, + "grad_norm": 0.5546875, + "learning_rate": 0.00012631844435746076, + "loss": 0.3724, + "step": 14200 + }, + { + "epoch": 1.89, + "grad_norm": 0.384765625, + "learning_rate": 0.00012630720970099152, + "loss": 0.2387, + "step": 14201 + }, + { + "epoch": 1.9, + "grad_norm": 0.55859375, + "learning_rate": 0.00012629597468777966, + "loss": 0.2353, + "step": 14202 + }, + { + "epoch": 1.9, + "grad_norm": 0.64453125, + "learning_rate": 0.00012628473931797752, + "loss": 0.3597, + "step": 14203 + }, + { + "epoch": 1.9, + "grad_norm": 0.5078125, + "learning_rate": 0.00012627350359173742, + "loss": 0.3752, + "step": 14204 + }, + { + "epoch": 1.9, + "grad_norm": 0.52734375, + "learning_rate": 0.00012626226750921175, + "loss": 0.394, + "step": 14205 + }, + { + "epoch": 1.9, + "grad_norm": 0.70703125, + "learning_rate": 0.00012625103107055286, + "loss": 0.4308, + "step": 14206 + }, + { + "epoch": 1.9, + "grad_norm": 0.423828125, + "learning_rate": 0.00012623979427591316, + "loss": 0.1671, + "step": 14207 + }, + { + "epoch": 1.9, + "grad_norm": 0.38671875, + "learning_rate": 0.00012622855712544505, + "loss": 0.3691, + "step": 14208 + }, + { + "epoch": 1.9, + "grad_norm": 0.640625, + "learning_rate": 0.00012621731961930084, + "loss": 0.3919, + "step": 14209 + }, + { + "epoch": 1.9, + "grad_norm": 0.55859375, + "learning_rate": 0.0001262060817576329, + "loss": 0.4342, + "step": 14210 + }, + { + "epoch": 1.9, + "grad_norm": 0.462890625, + "learning_rate": 0.00012619484354059373, + "loss": 0.3456, + "step": 14211 + }, + { + "epoch": 1.9, + "grad_norm": 0.6796875, + "learning_rate": 0.00012618360496833563, + "loss": 0.493, + "step": 14212 + }, + { + "epoch": 1.9, + "grad_norm": 0.546875, + "learning_rate": 0.00012617236604101106, + "loss": 0.3543, + "step": 14213 + }, + { + "epoch": 1.9, + "grad_norm": 0.5390625, + "learning_rate": 0.00012616112675877246, + "loss": 0.4042, + "step": 14214 + }, + { + "epoch": 1.9, + "grad_norm": 0.396484375, + "learning_rate": 0.00012614988712177213, + "loss": 0.2337, + "step": 14215 + }, + { + "epoch": 1.9, + "grad_norm": 0.5390625, + "learning_rate": 0.00012613864713016256, + "loss": 0.3024, + "step": 14216 + }, + { + "epoch": 1.9, + "grad_norm": 0.455078125, + "learning_rate": 0.00012612740678409615, + "loss": 0.2402, + "step": 14217 + }, + { + "epoch": 1.9, + "grad_norm": 0.46875, + "learning_rate": 0.00012611616608372537, + "loss": 0.3448, + "step": 14218 + }, + { + "epoch": 1.9, + "grad_norm": 0.353515625, + "learning_rate": 0.00012610492502920258, + "loss": 0.3252, + "step": 14219 + }, + { + "epoch": 1.9, + "grad_norm": 0.609375, + "learning_rate": 0.00012609368362068027, + "loss": 0.2987, + "step": 14220 + }, + { + "epoch": 1.9, + "grad_norm": 0.49609375, + "learning_rate": 0.0001260824418583109, + "loss": 0.3076, + "step": 14221 + }, + { + "epoch": 1.9, + "grad_norm": 0.65234375, + "learning_rate": 0.00012607119974224678, + "loss": 0.29, + "step": 14222 + }, + { + "epoch": 1.9, + "grad_norm": 0.5625, + "learning_rate": 0.00012605995727264056, + "loss": 0.444, + "step": 14223 + }, + { + "epoch": 1.9, + "grad_norm": 0.4765625, + "learning_rate": 0.00012604871444964454, + "loss": 0.316, + "step": 14224 + }, + { + "epoch": 1.9, + "grad_norm": 0.484375, + "learning_rate": 0.00012603747127341126, + "loss": 0.5058, + "step": 14225 + }, + { + "epoch": 1.9, + "grad_norm": 0.46875, + "learning_rate": 0.00012602622774409315, + "loss": 0.2759, + "step": 14226 + }, + { + "epoch": 1.9, + "grad_norm": 0.48046875, + "learning_rate": 0.00012601498386184267, + "loss": 0.4839, + "step": 14227 + }, + { + "epoch": 1.9, + "grad_norm": 0.51953125, + "learning_rate": 0.00012600373962681234, + "loss": 0.2644, + "step": 14228 + }, + { + "epoch": 1.9, + "grad_norm": 0.609375, + "learning_rate": 0.0001259924950391546, + "loss": 0.1818, + "step": 14229 + }, + { + "epoch": 1.9, + "grad_norm": 0.546875, + "learning_rate": 0.00012598125009902192, + "loss": 0.5224, + "step": 14230 + }, + { + "epoch": 1.9, + "grad_norm": 0.357421875, + "learning_rate": 0.00012597000480656684, + "loss": 0.2908, + "step": 14231 + }, + { + "epoch": 1.9, + "grad_norm": 0.61328125, + "learning_rate": 0.00012595875916194188, + "loss": 0.5026, + "step": 14232 + }, + { + "epoch": 1.9, + "grad_norm": 0.50390625, + "learning_rate": 0.00012594751316529938, + "loss": 0.2098, + "step": 14233 + }, + { + "epoch": 1.9, + "grad_norm": 0.66015625, + "learning_rate": 0.000125936266816792, + "loss": 0.6222, + "step": 14234 + }, + { + "epoch": 1.9, + "grad_norm": 0.478515625, + "learning_rate": 0.0001259250201165722, + "loss": 0.5372, + "step": 14235 + }, + { + "epoch": 1.9, + "grad_norm": 0.51171875, + "learning_rate": 0.00012591377306479248, + "loss": 0.5492, + "step": 14236 + }, + { + "epoch": 1.9, + "grad_norm": 0.404296875, + "learning_rate": 0.00012590252566160536, + "loss": 0.2403, + "step": 14237 + }, + { + "epoch": 1.9, + "grad_norm": 0.64453125, + "learning_rate": 0.0001258912779071634, + "loss": 0.4745, + "step": 14238 + }, + { + "epoch": 1.9, + "grad_norm": 0.4140625, + "learning_rate": 0.00012588002980161908, + "loss": 0.3518, + "step": 14239 + }, + { + "epoch": 1.9, + "grad_norm": 0.451171875, + "learning_rate": 0.00012586878134512492, + "loss": 0.4063, + "step": 14240 + }, + { + "epoch": 1.9, + "grad_norm": 0.5390625, + "learning_rate": 0.0001258575325378335, + "loss": 0.3655, + "step": 14241 + }, + { + "epoch": 1.9, + "grad_norm": 0.59375, + "learning_rate": 0.00012584628337989733, + "loss": 0.6243, + "step": 14242 + }, + { + "epoch": 1.9, + "grad_norm": 0.48828125, + "learning_rate": 0.00012583503387146903, + "loss": 0.4449, + "step": 14243 + }, + { + "epoch": 1.9, + "grad_norm": 0.54296875, + "learning_rate": 0.00012582378401270104, + "loss": 0.2963, + "step": 14244 + }, + { + "epoch": 1.9, + "grad_norm": 0.44140625, + "learning_rate": 0.00012581253380374596, + "loss": 0.4418, + "step": 14245 + }, + { + "epoch": 1.9, + "grad_norm": 0.45703125, + "learning_rate": 0.00012580128324475638, + "loss": 0.4809, + "step": 14246 + }, + { + "epoch": 1.9, + "grad_norm": 0.484375, + "learning_rate": 0.0001257900323358848, + "loss": 0.2195, + "step": 14247 + }, + { + "epoch": 1.9, + "grad_norm": 0.7578125, + "learning_rate": 0.00012577878107728388, + "loss": 0.5789, + "step": 14248 + }, + { + "epoch": 1.9, + "grad_norm": 0.5625, + "learning_rate": 0.00012576752946910608, + "loss": 0.3907, + "step": 14249 + }, + { + "epoch": 1.9, + "grad_norm": 0.380859375, + "learning_rate": 0.0001257562775115041, + "loss": 0.2329, + "step": 14250 + }, + { + "epoch": 1.9, + "grad_norm": 0.59765625, + "learning_rate": 0.00012574502520463045, + "loss": 0.2015, + "step": 14251 + }, + { + "epoch": 1.9, + "grad_norm": 0.55859375, + "learning_rate": 0.00012573377254863772, + "loss": 0.4349, + "step": 14252 + }, + { + "epoch": 1.9, + "grad_norm": 0.5390625, + "learning_rate": 0.00012572251954367852, + "loss": 0.3722, + "step": 14253 + }, + { + "epoch": 1.9, + "grad_norm": 0.546875, + "learning_rate": 0.00012571126618990546, + "loss": 0.2492, + "step": 14254 + }, + { + "epoch": 1.9, + "grad_norm": 0.43359375, + "learning_rate": 0.00012570001248747115, + "loss": 0.2091, + "step": 14255 + }, + { + "epoch": 1.9, + "grad_norm": 0.66015625, + "learning_rate": 0.00012568875843652812, + "loss": 0.3879, + "step": 14256 + }, + { + "epoch": 1.9, + "grad_norm": 0.734375, + "learning_rate": 0.00012567750403722912, + "loss": 0.3165, + "step": 14257 + }, + { + "epoch": 1.9, + "grad_norm": 0.61328125, + "learning_rate": 0.0001256662492897266, + "loss": 0.3229, + "step": 14258 + }, + { + "epoch": 1.9, + "grad_norm": 0.54296875, + "learning_rate": 0.00012565499419417333, + "loss": 0.3846, + "step": 14259 + }, + { + "epoch": 1.9, + "grad_norm": 0.3828125, + "learning_rate": 0.00012564373875072183, + "loss": 0.3728, + "step": 14260 + }, + { + "epoch": 1.9, + "grad_norm": 0.5625, + "learning_rate": 0.00012563248295952478, + "loss": 0.405, + "step": 14261 + }, + { + "epoch": 1.9, + "grad_norm": 0.74609375, + "learning_rate": 0.00012562122682073482, + "loss": 0.5569, + "step": 14262 + }, + { + "epoch": 1.9, + "grad_norm": 0.62109375, + "learning_rate": 0.0001256099703345046, + "loss": 0.4231, + "step": 14263 + }, + { + "epoch": 1.9, + "grad_norm": 0.486328125, + "learning_rate": 0.00012559871350098674, + "loss": 0.4055, + "step": 14264 + }, + { + "epoch": 1.9, + "grad_norm": 0.490234375, + "learning_rate": 0.0001255874563203339, + "loss": 0.5865, + "step": 14265 + }, + { + "epoch": 1.9, + "grad_norm": 0.62890625, + "learning_rate": 0.0001255761987926987, + "loss": 0.4264, + "step": 14266 + }, + { + "epoch": 1.9, + "grad_norm": 0.7109375, + "learning_rate": 0.00012556494091823384, + "loss": 0.4591, + "step": 14267 + }, + { + "epoch": 1.9, + "grad_norm": 0.4453125, + "learning_rate": 0.00012555368269709198, + "loss": 0.3991, + "step": 14268 + }, + { + "epoch": 1.9, + "grad_norm": 0.447265625, + "learning_rate": 0.00012554242412942578, + "loss": 0.2166, + "step": 14269 + }, + { + "epoch": 1.9, + "grad_norm": 0.5, + "learning_rate": 0.00012553116521538794, + "loss": 0.3987, + "step": 14270 + }, + { + "epoch": 1.9, + "grad_norm": 0.73046875, + "learning_rate": 0.00012551990595513108, + "loss": 0.8512, + "step": 14271 + }, + { + "epoch": 1.9, + "grad_norm": 0.5, + "learning_rate": 0.0001255086463488079, + "loss": 0.2773, + "step": 14272 + }, + { + "epoch": 1.9, + "grad_norm": 0.55859375, + "learning_rate": 0.00012549738639657115, + "loss": 0.5165, + "step": 14273 + }, + { + "epoch": 1.9, + "grad_norm": 0.61328125, + "learning_rate": 0.00012548612609857343, + "loss": 0.3722, + "step": 14274 + }, + { + "epoch": 1.9, + "grad_norm": 0.54296875, + "learning_rate": 0.00012547486545496755, + "loss": 0.3594, + "step": 14275 + }, + { + "epoch": 1.9, + "grad_norm": 0.33203125, + "learning_rate": 0.0001254636044659061, + "loss": 0.2119, + "step": 14276 + }, + { + "epoch": 1.91, + "grad_norm": 0.5234375, + "learning_rate": 0.00012545234313154183, + "loss": 0.2475, + "step": 14277 + }, + { + "epoch": 1.91, + "grad_norm": 0.6328125, + "learning_rate": 0.00012544108145202748, + "loss": 0.4986, + "step": 14278 + }, + { + "epoch": 1.91, + "grad_norm": 0.5390625, + "learning_rate": 0.00012542981942751568, + "loss": 0.504, + "step": 14279 + }, + { + "epoch": 1.91, + "grad_norm": 0.359375, + "learning_rate": 0.00012541855705815926, + "loss": 0.1829, + "step": 14280 + }, + { + "epoch": 1.91, + "grad_norm": 0.458984375, + "learning_rate": 0.0001254072943441109, + "loss": 0.4503, + "step": 14281 + }, + { + "epoch": 1.91, + "grad_norm": 0.48046875, + "learning_rate": 0.0001253960312855233, + "loss": 0.3345, + "step": 14282 + }, + { + "epoch": 1.91, + "grad_norm": 0.4765625, + "learning_rate": 0.0001253847678825492, + "loss": 0.3115, + "step": 14283 + }, + { + "epoch": 1.91, + "grad_norm": 0.5234375, + "learning_rate": 0.00012537350413534135, + "loss": 0.2939, + "step": 14284 + }, + { + "epoch": 1.91, + "grad_norm": 0.66796875, + "learning_rate": 0.00012536224004405252, + "loss": 0.6215, + "step": 14285 + }, + { + "epoch": 1.91, + "grad_norm": 0.404296875, + "learning_rate": 0.00012535097560883543, + "loss": 0.1622, + "step": 14286 + }, + { + "epoch": 1.91, + "grad_norm": 0.46875, + "learning_rate": 0.0001253397108298429, + "loss": 0.2199, + "step": 14287 + }, + { + "epoch": 1.91, + "grad_norm": 0.408203125, + "learning_rate": 0.00012532844570722754, + "loss": 0.1861, + "step": 14288 + }, + { + "epoch": 1.91, + "grad_norm": 0.60546875, + "learning_rate": 0.00012531718024114228, + "loss": 0.6334, + "step": 14289 + }, + { + "epoch": 1.91, + "grad_norm": 0.490234375, + "learning_rate": 0.00012530591443173973, + "loss": 0.2272, + "step": 14290 + }, + { + "epoch": 1.91, + "grad_norm": 0.609375, + "learning_rate": 0.0001252946482791728, + "loss": 0.3847, + "step": 14291 + }, + { + "epoch": 1.91, + "grad_norm": 0.490234375, + "learning_rate": 0.00012528338178359415, + "loss": 0.3546, + "step": 14292 + }, + { + "epoch": 1.91, + "grad_norm": 0.69140625, + "learning_rate": 0.00012527211494515667, + "loss": 0.5663, + "step": 14293 + }, + { + "epoch": 1.91, + "grad_norm": 0.462890625, + "learning_rate": 0.00012526084776401307, + "loss": 0.1956, + "step": 14294 + }, + { + "epoch": 1.91, + "grad_norm": 0.42578125, + "learning_rate": 0.00012524958024031616, + "loss": 0.305, + "step": 14295 + }, + { + "epoch": 1.91, + "grad_norm": 0.62109375, + "learning_rate": 0.00012523831237421873, + "loss": 0.5981, + "step": 14296 + }, + { + "epoch": 1.91, + "grad_norm": 0.53125, + "learning_rate": 0.0001252270441658736, + "loss": 0.3737, + "step": 14297 + }, + { + "epoch": 1.91, + "grad_norm": 0.470703125, + "learning_rate": 0.00012521577561543356, + "loss": 0.4037, + "step": 14298 + }, + { + "epoch": 1.91, + "grad_norm": 0.4296875, + "learning_rate": 0.00012520450672305142, + "loss": 0.3182, + "step": 14299 + }, + { + "epoch": 1.91, + "grad_norm": 0.5859375, + "learning_rate": 0.00012519323748887997, + "loss": 0.3309, + "step": 14300 + }, + { + "epoch": 1.91, + "grad_norm": 0.423828125, + "learning_rate": 0.00012518196791307207, + "loss": 0.1688, + "step": 14301 + }, + { + "epoch": 1.91, + "grad_norm": 0.373046875, + "learning_rate": 0.00012517069799578053, + "loss": 0.2926, + "step": 14302 + }, + { + "epoch": 1.91, + "grad_norm": 0.5546875, + "learning_rate": 0.00012515942773715815, + "loss": 0.4522, + "step": 14303 + }, + { + "epoch": 1.91, + "grad_norm": 0.671875, + "learning_rate": 0.0001251481571373578, + "loss": 0.4494, + "step": 14304 + }, + { + "epoch": 1.91, + "grad_norm": 0.478515625, + "learning_rate": 0.0001251368861965323, + "loss": 0.2397, + "step": 14305 + }, + { + "epoch": 1.91, + "grad_norm": 0.53515625, + "learning_rate": 0.00012512561491483448, + "loss": 0.4971, + "step": 14306 + }, + { + "epoch": 1.91, + "grad_norm": 0.494140625, + "learning_rate": 0.0001251143432924172, + "loss": 0.2926, + "step": 14307 + }, + { + "epoch": 1.91, + "grad_norm": 0.51953125, + "learning_rate": 0.0001251030713294333, + "loss": 0.4237, + "step": 14308 + }, + { + "epoch": 1.91, + "grad_norm": 0.5625, + "learning_rate": 0.00012509179902603563, + "loss": 0.2904, + "step": 14309 + }, + { + "epoch": 1.91, + "grad_norm": 0.58984375, + "learning_rate": 0.00012508052638237707, + "loss": 0.4314, + "step": 14310 + }, + { + "epoch": 1.91, + "grad_norm": 0.53515625, + "learning_rate": 0.0001250692533986105, + "loss": 0.2779, + "step": 14311 + }, + { + "epoch": 1.91, + "grad_norm": 0.478515625, + "learning_rate": 0.0001250579800748888, + "loss": 0.4577, + "step": 14312 + }, + { + "epoch": 1.91, + "grad_norm": 0.4375, + "learning_rate": 0.0001250467064113647, + "loss": 0.3232, + "step": 14313 + }, + { + "epoch": 1.91, + "grad_norm": 0.77734375, + "learning_rate": 0.00012503543240819127, + "loss": 0.4773, + "step": 14314 + }, + { + "epoch": 1.91, + "grad_norm": 0.46875, + "learning_rate": 0.00012502415806552128, + "loss": 0.2932, + "step": 14315 + }, + { + "epoch": 1.91, + "grad_norm": 0.341796875, + "learning_rate": 0.00012501288338350765, + "loss": 0.1838, + "step": 14316 + }, + { + "epoch": 1.91, + "grad_norm": 0.515625, + "learning_rate": 0.00012500160836230326, + "loss": 0.318, + "step": 14317 + }, + { + "epoch": 1.91, + "grad_norm": 0.5234375, + "learning_rate": 0.00012499033300206102, + "loss": 0.5108, + "step": 14318 + }, + { + "epoch": 1.91, + "grad_norm": 0.67578125, + "learning_rate": 0.00012497905730293385, + "loss": 0.3471, + "step": 14319 + }, + { + "epoch": 1.91, + "grad_norm": 0.380859375, + "learning_rate": 0.00012496778126507458, + "loss": 0.4298, + "step": 14320 + }, + { + "epoch": 1.91, + "grad_norm": 0.4140625, + "learning_rate": 0.0001249565048886362, + "loss": 0.3704, + "step": 14321 + }, + { + "epoch": 1.91, + "grad_norm": 0.46875, + "learning_rate": 0.0001249452281737716, + "loss": 0.2181, + "step": 14322 + }, + { + "epoch": 1.91, + "grad_norm": 0.6015625, + "learning_rate": 0.0001249339511206337, + "loss": 0.3409, + "step": 14323 + }, + { + "epoch": 1.91, + "grad_norm": 0.447265625, + "learning_rate": 0.00012492267372937542, + "loss": 0.3967, + "step": 14324 + }, + { + "epoch": 1.91, + "grad_norm": 0.4375, + "learning_rate": 0.00012491139600014967, + "loss": 0.305, + "step": 14325 + }, + { + "epoch": 1.91, + "grad_norm": 0.52734375, + "learning_rate": 0.00012490011793310943, + "loss": 0.3472, + "step": 14326 + }, + { + "epoch": 1.91, + "grad_norm": 0.455078125, + "learning_rate": 0.00012488883952840757, + "loss": 0.3889, + "step": 14327 + }, + { + "epoch": 1.91, + "grad_norm": 0.39453125, + "learning_rate": 0.00012487756078619712, + "loss": 0.1963, + "step": 14328 + }, + { + "epoch": 1.91, + "grad_norm": 0.45703125, + "learning_rate": 0.00012486628170663098, + "loss": 0.2282, + "step": 14329 + }, + { + "epoch": 1.91, + "grad_norm": 0.5703125, + "learning_rate": 0.00012485500228986206, + "loss": 0.3199, + "step": 14330 + }, + { + "epoch": 1.91, + "grad_norm": 0.5078125, + "learning_rate": 0.00012484372253604336, + "loss": 0.2681, + "step": 14331 + }, + { + "epoch": 1.91, + "grad_norm": 0.64453125, + "learning_rate": 0.0001248324424453279, + "loss": 0.4349, + "step": 14332 + }, + { + "epoch": 1.91, + "grad_norm": 0.5234375, + "learning_rate": 0.0001248211620178685, + "loss": 0.5218, + "step": 14333 + }, + { + "epoch": 1.91, + "grad_norm": 0.361328125, + "learning_rate": 0.00012480988125381824, + "loss": 0.2969, + "step": 14334 + }, + { + "epoch": 1.91, + "grad_norm": 0.423828125, + "learning_rate": 0.0001247986001533301, + "loss": 0.3328, + "step": 14335 + }, + { + "epoch": 1.91, + "grad_norm": 0.55859375, + "learning_rate": 0.000124787318716557, + "loss": 0.3824, + "step": 14336 + }, + { + "epoch": 1.91, + "grad_norm": 0.55078125, + "learning_rate": 0.00012477603694365197, + "loss": 0.6037, + "step": 14337 + }, + { + "epoch": 1.91, + "grad_norm": 0.6328125, + "learning_rate": 0.00012476475483476793, + "loss": 0.5778, + "step": 14338 + }, + { + "epoch": 1.91, + "grad_norm": 0.578125, + "learning_rate": 0.00012475347239005797, + "loss": 0.406, + "step": 14339 + }, + { + "epoch": 1.91, + "grad_norm": 0.478515625, + "learning_rate": 0.000124742189609675, + "loss": 0.1797, + "step": 14340 + }, + { + "epoch": 1.91, + "grad_norm": 1.140625, + "learning_rate": 0.0001247309064937721, + "loss": 0.6107, + "step": 14341 + }, + { + "epoch": 1.91, + "grad_norm": 0.5703125, + "learning_rate": 0.0001247196230425022, + "loss": 0.4543, + "step": 14342 + }, + { + "epoch": 1.91, + "grad_norm": 0.5546875, + "learning_rate": 0.00012470833925601834, + "loss": 0.3173, + "step": 14343 + }, + { + "epoch": 1.91, + "grad_norm": 0.55078125, + "learning_rate": 0.0001246970551344736, + "loss": 0.3813, + "step": 14344 + }, + { + "epoch": 1.91, + "grad_norm": 0.46484375, + "learning_rate": 0.00012468577067802086, + "loss": 0.4201, + "step": 14345 + }, + { + "epoch": 1.91, + "grad_norm": 0.361328125, + "learning_rate": 0.0001246744858868133, + "loss": 0.1813, + "step": 14346 + }, + { + "epoch": 1.91, + "grad_norm": 0.26171875, + "learning_rate": 0.00012466320076100383, + "loss": 0.1105, + "step": 14347 + }, + { + "epoch": 1.91, + "grad_norm": 0.69921875, + "learning_rate": 0.00012465191530074555, + "loss": 0.4674, + "step": 14348 + }, + { + "epoch": 1.91, + "grad_norm": 0.361328125, + "learning_rate": 0.00012464062950619148, + "loss": 0.1806, + "step": 14349 + }, + { + "epoch": 1.91, + "grad_norm": 0.412109375, + "learning_rate": 0.00012462934337749464, + "loss": 0.2398, + "step": 14350 + }, + { + "epoch": 1.91, + "grad_norm": 0.546875, + "learning_rate": 0.00012461805691480813, + "loss": 0.356, + "step": 14351 + }, + { + "epoch": 1.92, + "grad_norm": 0.453125, + "learning_rate": 0.00012460677011828493, + "loss": 0.2568, + "step": 14352 + }, + { + "epoch": 1.92, + "grad_norm": 0.58984375, + "learning_rate": 0.00012459548298807816, + "loss": 0.3194, + "step": 14353 + }, + { + "epoch": 1.92, + "grad_norm": 0.5078125, + "learning_rate": 0.00012458419552434084, + "loss": 0.2737, + "step": 14354 + }, + { + "epoch": 1.92, + "grad_norm": 0.4453125, + "learning_rate": 0.00012457290772722608, + "loss": 0.3029, + "step": 14355 + }, + { + "epoch": 1.92, + "grad_norm": 0.427734375, + "learning_rate": 0.00012456161959688687, + "loss": 0.1915, + "step": 14356 + }, + { + "epoch": 1.92, + "grad_norm": 0.423828125, + "learning_rate": 0.00012455033113347637, + "loss": 0.4342, + "step": 14357 + }, + { + "epoch": 1.92, + "grad_norm": 0.6953125, + "learning_rate": 0.0001245390423371476, + "loss": 0.4604, + "step": 14358 + }, + { + "epoch": 1.92, + "grad_norm": 0.74609375, + "learning_rate": 0.0001245277532080537, + "loss": 0.3022, + "step": 14359 + }, + { + "epoch": 1.92, + "grad_norm": 0.498046875, + "learning_rate": 0.00012451646374634775, + "loss": 0.2603, + "step": 14360 + }, + { + "epoch": 1.92, + "grad_norm": 0.58984375, + "learning_rate": 0.00012450517395218275, + "loss": 0.661, + "step": 14361 + }, + { + "epoch": 1.92, + "grad_norm": 0.470703125, + "learning_rate": 0.0001244938838257119, + "loss": 0.3977, + "step": 14362 + }, + { + "epoch": 1.92, + "grad_norm": 0.51171875, + "learning_rate": 0.00012448259336708822, + "loss": 0.6173, + "step": 14363 + }, + { + "epoch": 1.92, + "grad_norm": 0.578125, + "learning_rate": 0.0001244713025764649, + "loss": 0.3063, + "step": 14364 + }, + { + "epoch": 1.92, + "grad_norm": 0.37109375, + "learning_rate": 0.00012446001145399502, + "loss": 0.2073, + "step": 14365 + }, + { + "epoch": 1.92, + "grad_norm": 0.6484375, + "learning_rate": 0.00012444871999983169, + "loss": 0.5009, + "step": 14366 + }, + { + "epoch": 1.92, + "grad_norm": 0.4296875, + "learning_rate": 0.000124437428214128, + "loss": 0.3594, + "step": 14367 + }, + { + "epoch": 1.92, + "grad_norm": 0.486328125, + "learning_rate": 0.00012442613609703711, + "loss": 0.5695, + "step": 14368 + }, + { + "epoch": 1.92, + "grad_norm": 0.609375, + "learning_rate": 0.00012441484364871213, + "loss": 0.5479, + "step": 14369 + }, + { + "epoch": 1.92, + "grad_norm": 0.73828125, + "learning_rate": 0.0001244035508693062, + "loss": 0.5141, + "step": 14370 + }, + { + "epoch": 1.92, + "grad_norm": 0.69140625, + "learning_rate": 0.00012439225775897248, + "loss": 0.5095, + "step": 14371 + }, + { + "epoch": 1.92, + "grad_norm": 0.60546875, + "learning_rate": 0.00012438096431786408, + "loss": 0.5233, + "step": 14372 + }, + { + "epoch": 1.92, + "grad_norm": 0.51171875, + "learning_rate": 0.00012436967054613412, + "loss": 0.5252, + "step": 14373 + }, + { + "epoch": 1.92, + "grad_norm": 0.41015625, + "learning_rate": 0.00012435837644393584, + "loss": 0.3215, + "step": 14374 + }, + { + "epoch": 1.92, + "grad_norm": 0.443359375, + "learning_rate": 0.0001243470820114223, + "loss": 0.2337, + "step": 14375 + }, + { + "epoch": 1.92, + "grad_norm": 0.55859375, + "learning_rate": 0.00012433578724874674, + "loss": 0.496, + "step": 14376 + }, + { + "epoch": 1.92, + "grad_norm": 0.734375, + "learning_rate": 0.00012432449215606225, + "loss": 0.3122, + "step": 14377 + }, + { + "epoch": 1.92, + "grad_norm": 0.55078125, + "learning_rate": 0.00012431319673352208, + "loss": 0.262, + "step": 14378 + }, + { + "epoch": 1.92, + "grad_norm": 0.5, + "learning_rate": 0.00012430190098127933, + "loss": 0.3093, + "step": 14379 + }, + { + "epoch": 1.92, + "grad_norm": 0.484375, + "learning_rate": 0.00012429060489948723, + "loss": 0.3632, + "step": 14380 + }, + { + "epoch": 1.92, + "grad_norm": 0.5, + "learning_rate": 0.0001242793084882989, + "loss": 0.3973, + "step": 14381 + }, + { + "epoch": 1.92, + "grad_norm": 0.5390625, + "learning_rate": 0.00012426801174786754, + "loss": 0.4968, + "step": 14382 + }, + { + "epoch": 1.92, + "grad_norm": 0.5703125, + "learning_rate": 0.00012425671467834642, + "loss": 0.4367, + "step": 14383 + }, + { + "epoch": 1.92, + "grad_norm": 0.419921875, + "learning_rate": 0.0001242454172798887, + "loss": 0.4171, + "step": 14384 + }, + { + "epoch": 1.92, + "grad_norm": 0.4609375, + "learning_rate": 0.00012423411955264752, + "loss": 0.404, + "step": 14385 + }, + { + "epoch": 1.92, + "grad_norm": 0.455078125, + "learning_rate": 0.00012422282149677613, + "loss": 0.2703, + "step": 14386 + }, + { + "epoch": 1.92, + "grad_norm": 0.5546875, + "learning_rate": 0.00012421152311242774, + "loss": 0.6189, + "step": 14387 + }, + { + "epoch": 1.92, + "grad_norm": 0.373046875, + "learning_rate": 0.00012420022439975557, + "loss": 0.1836, + "step": 14388 + }, + { + "epoch": 1.92, + "grad_norm": 0.53125, + "learning_rate": 0.0001241889253589128, + "loss": 0.272, + "step": 14389 + }, + { + "epoch": 1.92, + "grad_norm": 0.4453125, + "learning_rate": 0.00012417762599005268, + "loss": 0.2869, + "step": 14390 + }, + { + "epoch": 1.92, + "grad_norm": 0.376953125, + "learning_rate": 0.00012416632629332845, + "loss": 0.1543, + "step": 14391 + }, + { + "epoch": 1.92, + "grad_norm": 0.578125, + "learning_rate": 0.00012415502626889332, + "loss": 0.64, + "step": 14392 + }, + { + "epoch": 1.92, + "grad_norm": 0.5, + "learning_rate": 0.00012414372591690054, + "loss": 0.3301, + "step": 14393 + }, + { + "epoch": 1.92, + "grad_norm": 0.4609375, + "learning_rate": 0.00012413242523750333, + "loss": 0.5974, + "step": 14394 + }, + { + "epoch": 1.92, + "grad_norm": 0.5703125, + "learning_rate": 0.00012412112423085495, + "loss": 0.2232, + "step": 14395 + }, + { + "epoch": 1.92, + "grad_norm": 0.435546875, + "learning_rate": 0.00012410982289710865, + "loss": 0.2436, + "step": 14396 + }, + { + "epoch": 1.92, + "grad_norm": 0.451171875, + "learning_rate": 0.00012409852123641767, + "loss": 0.3118, + "step": 14397 + }, + { + "epoch": 1.92, + "grad_norm": 0.419921875, + "learning_rate": 0.00012408721924893524, + "loss": 0.2405, + "step": 14398 + }, + { + "epoch": 1.92, + "grad_norm": 0.5546875, + "learning_rate": 0.00012407591693481472, + "loss": 0.6374, + "step": 14399 + }, + { + "epoch": 1.92, + "grad_norm": 0.46875, + "learning_rate": 0.0001240646142942093, + "loss": 0.3479, + "step": 14400 + }, + { + "epoch": 1.92, + "grad_norm": 0.65234375, + "learning_rate": 0.00012405331132727226, + "loss": 0.3352, + "step": 14401 + }, + { + "epoch": 1.92, + "grad_norm": 0.46875, + "learning_rate": 0.00012404200803415685, + "loss": 0.2959, + "step": 14402 + }, + { + "epoch": 1.92, + "grad_norm": 0.68359375, + "learning_rate": 0.00012403070441501643, + "loss": 0.2642, + "step": 14403 + }, + { + "epoch": 1.92, + "grad_norm": 0.55859375, + "learning_rate": 0.0001240194004700042, + "loss": 0.5299, + "step": 14404 + }, + { + "epoch": 1.92, + "grad_norm": 0.4921875, + "learning_rate": 0.0001240080961992735, + "loss": 0.2775, + "step": 14405 + }, + { + "epoch": 1.92, + "grad_norm": 0.515625, + "learning_rate": 0.00012399679160297758, + "loss": 0.24, + "step": 14406 + }, + { + "epoch": 1.92, + "grad_norm": 0.443359375, + "learning_rate": 0.0001239854866812698, + "loss": 0.2355, + "step": 14407 + }, + { + "epoch": 1.92, + "grad_norm": 0.54296875, + "learning_rate": 0.00012397418143430343, + "loss": 0.2894, + "step": 14408 + }, + { + "epoch": 1.92, + "grad_norm": 0.4140625, + "learning_rate": 0.00012396287586223173, + "loss": 0.2652, + "step": 14409 + }, + { + "epoch": 1.92, + "grad_norm": 0.47265625, + "learning_rate": 0.0001239515699652081, + "loss": 0.2145, + "step": 14410 + }, + { + "epoch": 1.92, + "grad_norm": 0.59765625, + "learning_rate": 0.00012394026374338578, + "loss": 0.2992, + "step": 14411 + }, + { + "epoch": 1.92, + "grad_norm": 0.451171875, + "learning_rate": 0.00012392895719691815, + "loss": 0.4178, + "step": 14412 + }, + { + "epoch": 1.92, + "grad_norm": 0.51171875, + "learning_rate": 0.00012391765032595848, + "loss": 0.2912, + "step": 14413 + }, + { + "epoch": 1.92, + "grad_norm": 0.431640625, + "learning_rate": 0.00012390634313066013, + "loss": 0.3101, + "step": 14414 + }, + { + "epoch": 1.92, + "grad_norm": 0.36328125, + "learning_rate": 0.00012389503561117642, + "loss": 0.1891, + "step": 14415 + }, + { + "epoch": 1.92, + "grad_norm": 0.44140625, + "learning_rate": 0.00012388372776766068, + "loss": 0.2987, + "step": 14416 + }, + { + "epoch": 1.92, + "grad_norm": 0.52734375, + "learning_rate": 0.00012387241960026629, + "loss": 0.4001, + "step": 14417 + }, + { + "epoch": 1.92, + "grad_norm": 0.470703125, + "learning_rate": 0.00012386111110914653, + "loss": 0.4114, + "step": 14418 + }, + { + "epoch": 1.92, + "grad_norm": 0.40234375, + "learning_rate": 0.00012384980229445482, + "loss": 0.221, + "step": 14419 + }, + { + "epoch": 1.92, + "grad_norm": 0.59765625, + "learning_rate": 0.00012383849315634449, + "loss": 0.2628, + "step": 14420 + }, + { + "epoch": 1.92, + "grad_norm": 0.470703125, + "learning_rate": 0.00012382718369496887, + "loss": 0.3639, + "step": 14421 + }, + { + "epoch": 1.92, + "grad_norm": 0.375, + "learning_rate": 0.00012381587391048135, + "loss": 0.3163, + "step": 14422 + }, + { + "epoch": 1.92, + "grad_norm": 0.5546875, + "learning_rate": 0.0001238045638030353, + "loss": 0.5612, + "step": 14423 + }, + { + "epoch": 1.92, + "grad_norm": 0.5390625, + "learning_rate": 0.0001237932533727841, + "loss": 0.2766, + "step": 14424 + }, + { + "epoch": 1.92, + "grad_norm": 0.640625, + "learning_rate": 0.00012378194261988108, + "loss": 0.208, + "step": 14425 + }, + { + "epoch": 1.93, + "grad_norm": 0.490234375, + "learning_rate": 0.00012377063154447968, + "loss": 0.1871, + "step": 14426 + }, + { + "epoch": 1.93, + "grad_norm": 0.56640625, + "learning_rate": 0.00012375932014673327, + "loss": 0.3077, + "step": 14427 + }, + { + "epoch": 1.93, + "grad_norm": 0.58203125, + "learning_rate": 0.00012374800842679523, + "loss": 0.5616, + "step": 14428 + }, + { + "epoch": 1.93, + "grad_norm": 0.46875, + "learning_rate": 0.00012373669638481892, + "loss": 0.2387, + "step": 14429 + }, + { + "epoch": 1.93, + "grad_norm": 0.70703125, + "learning_rate": 0.00012372538402095779, + "loss": 0.3587, + "step": 14430 + }, + { + "epoch": 1.93, + "grad_norm": 0.6640625, + "learning_rate": 0.00012371407133536524, + "loss": 0.3534, + "step": 14431 + }, + { + "epoch": 1.93, + "grad_norm": 0.68359375, + "learning_rate": 0.00012370275832819462, + "loss": 0.3204, + "step": 14432 + }, + { + "epoch": 1.93, + "grad_norm": 0.48046875, + "learning_rate": 0.00012369144499959947, + "loss": 0.3157, + "step": 14433 + }, + { + "epoch": 1.93, + "grad_norm": 0.361328125, + "learning_rate": 0.00012368013134973305, + "loss": 0.1798, + "step": 14434 + }, + { + "epoch": 1.93, + "grad_norm": 0.447265625, + "learning_rate": 0.00012366881737874887, + "loss": 0.3471, + "step": 14435 + }, + { + "epoch": 1.93, + "grad_norm": 0.4921875, + "learning_rate": 0.00012365750308680033, + "loss": 0.4304, + "step": 14436 + }, + { + "epoch": 1.93, + "grad_norm": 0.5078125, + "learning_rate": 0.0001236461884740409, + "loss": 0.298, + "step": 14437 + }, + { + "epoch": 1.93, + "grad_norm": 0.396484375, + "learning_rate": 0.0001236348735406239, + "loss": 0.1698, + "step": 14438 + }, + { + "epoch": 1.93, + "grad_norm": 0.5390625, + "learning_rate": 0.00012362355828670292, + "loss": 0.425, + "step": 14439 + }, + { + "epoch": 1.93, + "grad_norm": 0.515625, + "learning_rate": 0.0001236122427124313, + "loss": 0.433, + "step": 14440 + }, + { + "epoch": 1.93, + "grad_norm": 0.5546875, + "learning_rate": 0.00012360092681796251, + "loss": 0.3879, + "step": 14441 + }, + { + "epoch": 1.93, + "grad_norm": 0.47265625, + "learning_rate": 0.00012358961060345003, + "loss": 0.2285, + "step": 14442 + }, + { + "epoch": 1.93, + "grad_norm": 0.62890625, + "learning_rate": 0.00012357829406904727, + "loss": 0.6524, + "step": 14443 + }, + { + "epoch": 1.93, + "grad_norm": 1.2109375, + "learning_rate": 0.0001235669772149077, + "loss": 0.4792, + "step": 14444 + }, + { + "epoch": 1.93, + "grad_norm": 0.55078125, + "learning_rate": 0.0001235556600411848, + "loss": 0.5639, + "step": 14445 + }, + { + "epoch": 1.93, + "grad_norm": 0.404296875, + "learning_rate": 0.00012354434254803205, + "loss": 0.2361, + "step": 14446 + }, + { + "epoch": 1.93, + "grad_norm": 0.70703125, + "learning_rate": 0.0001235330247356029, + "loss": 0.5674, + "step": 14447 + }, + { + "epoch": 1.93, + "grad_norm": 0.47265625, + "learning_rate": 0.0001235217066040508, + "loss": 0.2695, + "step": 14448 + }, + { + "epoch": 1.93, + "grad_norm": 0.68359375, + "learning_rate": 0.0001235103881535293, + "loss": 0.4473, + "step": 14449 + }, + { + "epoch": 1.93, + "grad_norm": 0.353515625, + "learning_rate": 0.0001234990693841918, + "loss": 0.1836, + "step": 14450 + }, + { + "epoch": 1.93, + "grad_norm": 0.51171875, + "learning_rate": 0.00012348775029619193, + "loss": 0.4582, + "step": 14451 + }, + { + "epoch": 1.93, + "grad_norm": 0.6015625, + "learning_rate": 0.000123476430889683, + "loss": 0.3389, + "step": 14452 + }, + { + "epoch": 1.93, + "grad_norm": 0.3984375, + "learning_rate": 0.00012346511116481868, + "loss": 0.2912, + "step": 14453 + }, + { + "epoch": 1.93, + "grad_norm": 0.56640625, + "learning_rate": 0.00012345379112175232, + "loss": 0.2313, + "step": 14454 + }, + { + "epoch": 1.93, + "grad_norm": 0.51171875, + "learning_rate": 0.0001234424707606375, + "loss": 0.3541, + "step": 14455 + }, + { + "epoch": 1.93, + "grad_norm": 0.51953125, + "learning_rate": 0.00012343115008162778, + "loss": 0.6727, + "step": 14456 + }, + { + "epoch": 1.93, + "grad_norm": 0.515625, + "learning_rate": 0.00012341982908487662, + "loss": 0.5958, + "step": 14457 + }, + { + "epoch": 1.93, + "grad_norm": 0.59765625, + "learning_rate": 0.00012340850777053754, + "loss": 0.6126, + "step": 14458 + }, + { + "epoch": 1.93, + "grad_norm": 0.484375, + "learning_rate": 0.00012339718613876404, + "loss": 0.4292, + "step": 14459 + }, + { + "epoch": 1.93, + "grad_norm": 0.5390625, + "learning_rate": 0.00012338586418970973, + "loss": 0.3863, + "step": 14460 + }, + { + "epoch": 1.93, + "grad_norm": 0.39453125, + "learning_rate": 0.00012337454192352804, + "loss": 0.2039, + "step": 14461 + }, + { + "epoch": 1.93, + "grad_norm": 0.6015625, + "learning_rate": 0.00012336321934037258, + "loss": 0.5379, + "step": 14462 + }, + { + "epoch": 1.93, + "grad_norm": 0.55078125, + "learning_rate": 0.00012335189644039688, + "loss": 0.6054, + "step": 14463 + }, + { + "epoch": 1.93, + "grad_norm": 0.458984375, + "learning_rate": 0.00012334057322375448, + "loss": 0.4107, + "step": 14464 + }, + { + "epoch": 1.93, + "grad_norm": 0.5078125, + "learning_rate": 0.00012332924969059894, + "loss": 0.3336, + "step": 14465 + }, + { + "epoch": 1.93, + "grad_norm": 0.640625, + "learning_rate": 0.00012331792584108374, + "loss": 0.5294, + "step": 14466 + }, + { + "epoch": 1.93, + "grad_norm": 0.66015625, + "learning_rate": 0.00012330660167536256, + "loss": 0.5464, + "step": 14467 + }, + { + "epoch": 1.93, + "grad_norm": 0.54296875, + "learning_rate": 0.00012329527719358887, + "loss": 0.3994, + "step": 14468 + }, + { + "epoch": 1.93, + "grad_norm": 0.546875, + "learning_rate": 0.0001232839523959163, + "loss": 0.2941, + "step": 14469 + }, + { + "epoch": 1.93, + "grad_norm": 0.3671875, + "learning_rate": 0.0001232726272824984, + "loss": 0.2431, + "step": 14470 + }, + { + "epoch": 1.93, + "grad_norm": 0.53125, + "learning_rate": 0.0001232613018534887, + "loss": 0.2855, + "step": 14471 + }, + { + "epoch": 1.93, + "grad_norm": 0.50390625, + "learning_rate": 0.00012324997610904084, + "loss": 0.2834, + "step": 14472 + }, + { + "epoch": 1.93, + "grad_norm": 0.6875, + "learning_rate": 0.00012323865004930837, + "loss": 0.3176, + "step": 14473 + }, + { + "epoch": 1.93, + "grad_norm": 0.47265625, + "learning_rate": 0.0001232273236744449, + "loss": 0.3698, + "step": 14474 + }, + { + "epoch": 1.93, + "grad_norm": 0.482421875, + "learning_rate": 0.000123215996984604, + "loss": 0.4565, + "step": 14475 + }, + { + "epoch": 1.93, + "grad_norm": 0.515625, + "learning_rate": 0.0001232046699799393, + "loss": 0.4443, + "step": 14476 + }, + { + "epoch": 1.93, + "grad_norm": 0.53515625, + "learning_rate": 0.00012319334266060438, + "loss": 0.407, + "step": 14477 + }, + { + "epoch": 1.93, + "grad_norm": 0.431640625, + "learning_rate": 0.00012318201502675285, + "loss": 0.4254, + "step": 14478 + }, + { + "epoch": 1.93, + "grad_norm": 0.388671875, + "learning_rate": 0.0001231706870785383, + "loss": 0.2852, + "step": 14479 + }, + { + "epoch": 1.93, + "grad_norm": 0.609375, + "learning_rate": 0.00012315935881611434, + "loss": 0.4913, + "step": 14480 + }, + { + "epoch": 1.93, + "grad_norm": 0.423828125, + "learning_rate": 0.00012314803023963467, + "loss": 0.2596, + "step": 14481 + }, + { + "epoch": 1.93, + "grad_norm": 0.435546875, + "learning_rate": 0.00012313670134925283, + "loss": 0.3706, + "step": 14482 + }, + { + "epoch": 1.93, + "grad_norm": 0.380859375, + "learning_rate": 0.00012312537214512247, + "loss": 0.2218, + "step": 14483 + }, + { + "epoch": 1.93, + "grad_norm": 0.5078125, + "learning_rate": 0.0001231140426273972, + "loss": 0.2113, + "step": 14484 + }, + { + "epoch": 1.93, + "grad_norm": 0.4375, + "learning_rate": 0.00012310271279623072, + "loss": 0.274, + "step": 14485 + }, + { + "epoch": 1.93, + "grad_norm": 0.609375, + "learning_rate": 0.0001230913826517766, + "loss": 0.4126, + "step": 14486 + }, + { + "epoch": 1.93, + "grad_norm": 0.69921875, + "learning_rate": 0.00012308005219418853, + "loss": 0.4258, + "step": 14487 + }, + { + "epoch": 1.93, + "grad_norm": 0.64453125, + "learning_rate": 0.0001230687214236201, + "loss": 0.4442, + "step": 14488 + }, + { + "epoch": 1.93, + "grad_norm": 0.546875, + "learning_rate": 0.00012305739034022502, + "loss": 0.4668, + "step": 14489 + }, + { + "epoch": 1.93, + "grad_norm": 0.462890625, + "learning_rate": 0.00012304605894415698, + "loss": 0.4808, + "step": 14490 + }, + { + "epoch": 1.93, + "grad_norm": 0.48046875, + "learning_rate": 0.00012303472723556954, + "loss": 0.159, + "step": 14491 + }, + { + "epoch": 1.93, + "grad_norm": 0.625, + "learning_rate": 0.00012302339521461642, + "loss": 0.5795, + "step": 14492 + }, + { + "epoch": 1.93, + "grad_norm": 0.462890625, + "learning_rate": 0.00012301206288145127, + "loss": 0.2561, + "step": 14493 + }, + { + "epoch": 1.93, + "grad_norm": 0.486328125, + "learning_rate": 0.0001230007302362278, + "loss": 0.3965, + "step": 14494 + }, + { + "epoch": 1.93, + "grad_norm": 0.56640625, + "learning_rate": 0.00012298939727909967, + "loss": 0.5083, + "step": 14495 + }, + { + "epoch": 1.93, + "grad_norm": 0.486328125, + "learning_rate": 0.00012297806401022053, + "loss": 0.2307, + "step": 14496 + }, + { + "epoch": 1.93, + "grad_norm": 0.50390625, + "learning_rate": 0.0001229667304297441, + "loss": 0.2938, + "step": 14497 + }, + { + "epoch": 1.93, + "grad_norm": 0.43359375, + "learning_rate": 0.0001229553965378241, + "loss": 0.3527, + "step": 14498 + }, + { + "epoch": 1.93, + "grad_norm": 0.46875, + "learning_rate": 0.00012294406233461416, + "loss": 0.193, + "step": 14499 + }, + { + "epoch": 1.93, + "grad_norm": 0.59765625, + "learning_rate": 0.00012293272782026802, + "loss": 0.6077, + "step": 14500 + }, + { + "epoch": 1.94, + "grad_norm": 0.353515625, + "learning_rate": 0.0001229213929949394, + "loss": 0.1316, + "step": 14501 + }, + { + "epoch": 1.94, + "grad_norm": 0.5078125, + "learning_rate": 0.00012291005785878195, + "loss": 0.4442, + "step": 14502 + }, + { + "epoch": 1.94, + "grad_norm": 0.515625, + "learning_rate": 0.00012289872241194938, + "loss": 0.4957, + "step": 14503 + }, + { + "epoch": 1.94, + "grad_norm": 0.408203125, + "learning_rate": 0.00012288738665459546, + "loss": 0.3445, + "step": 14504 + }, + { + "epoch": 1.94, + "grad_norm": 0.625, + "learning_rate": 0.00012287605058687392, + "loss": 0.6767, + "step": 14505 + }, + { + "epoch": 1.94, + "grad_norm": 0.56640625, + "learning_rate": 0.00012286471420893844, + "loss": 0.3653, + "step": 14506 + }, + { + "epoch": 1.94, + "grad_norm": 0.51953125, + "learning_rate": 0.00012285337752094275, + "loss": 0.3491, + "step": 14507 + }, + { + "epoch": 1.94, + "grad_norm": 0.515625, + "learning_rate": 0.0001228420405230406, + "loss": 0.3605, + "step": 14508 + }, + { + "epoch": 1.94, + "grad_norm": 0.44140625, + "learning_rate": 0.0001228307032153857, + "loss": 0.3767, + "step": 14509 + }, + { + "epoch": 1.94, + "grad_norm": 0.53125, + "learning_rate": 0.00012281936559813182, + "loss": 0.267, + "step": 14510 + }, + { + "epoch": 1.94, + "grad_norm": 0.5390625, + "learning_rate": 0.0001228080276714327, + "loss": 0.5335, + "step": 14511 + }, + { + "epoch": 1.94, + "grad_norm": 0.494140625, + "learning_rate": 0.0001227966894354421, + "loss": 0.3684, + "step": 14512 + }, + { + "epoch": 1.94, + "grad_norm": 0.55859375, + "learning_rate": 0.00012278535089031378, + "loss": 0.5591, + "step": 14513 + }, + { + "epoch": 1.94, + "grad_norm": 0.640625, + "learning_rate": 0.00012277401203620144, + "loss": 0.5607, + "step": 14514 + }, + { + "epoch": 1.94, + "grad_norm": 0.56640625, + "learning_rate": 0.00012276267287325888, + "loss": 0.4427, + "step": 14515 + }, + { + "epoch": 1.94, + "grad_norm": 0.396484375, + "learning_rate": 0.00012275133340163986, + "loss": 0.2468, + "step": 14516 + }, + { + "epoch": 1.94, + "grad_norm": 0.42578125, + "learning_rate": 0.0001227399936214982, + "loss": 0.2537, + "step": 14517 + }, + { + "epoch": 1.94, + "grad_norm": 0.66015625, + "learning_rate": 0.0001227286535329876, + "loss": 0.7572, + "step": 14518 + }, + { + "epoch": 1.94, + "grad_norm": 0.765625, + "learning_rate": 0.0001227173131362619, + "loss": 0.4241, + "step": 14519 + }, + { + "epoch": 1.94, + "grad_norm": 0.62890625, + "learning_rate": 0.00012270597243147483, + "loss": 0.5157, + "step": 14520 + }, + { + "epoch": 1.94, + "grad_norm": 0.53515625, + "learning_rate": 0.0001226946314187802, + "loss": 0.4977, + "step": 14521 + }, + { + "epoch": 1.94, + "grad_norm": 0.458984375, + "learning_rate": 0.00012268329009833178, + "loss": 0.3646, + "step": 14522 + }, + { + "epoch": 1.94, + "grad_norm": 0.5390625, + "learning_rate": 0.00012267194847028344, + "loss": 0.4223, + "step": 14523 + }, + { + "epoch": 1.94, + "grad_norm": 0.55078125, + "learning_rate": 0.00012266060653478891, + "loss": 0.3848, + "step": 14524 + }, + { + "epoch": 1.94, + "grad_norm": 0.59375, + "learning_rate": 0.00012264926429200198, + "loss": 0.2513, + "step": 14525 + }, + { + "epoch": 1.94, + "grad_norm": 0.37890625, + "learning_rate": 0.00012263792174207653, + "loss": 0.1572, + "step": 14526 + }, + { + "epoch": 1.94, + "grad_norm": 0.59375, + "learning_rate": 0.0001226265788851663, + "loss": 0.4247, + "step": 14527 + }, + { + "epoch": 1.94, + "grad_norm": 0.57421875, + "learning_rate": 0.00012261523572142514, + "loss": 0.6268, + "step": 14528 + }, + { + "epoch": 1.94, + "grad_norm": 0.423828125, + "learning_rate": 0.0001226038922510069, + "loss": 0.3489, + "step": 14529 + }, + { + "epoch": 1.94, + "grad_norm": 0.546875, + "learning_rate": 0.00012259254847406538, + "loss": 0.2338, + "step": 14530 + }, + { + "epoch": 1.94, + "grad_norm": 0.5703125, + "learning_rate": 0.00012258120439075437, + "loss": 0.5058, + "step": 14531 + }, + { + "epoch": 1.94, + "grad_norm": 0.5390625, + "learning_rate": 0.00012256986000122775, + "loss": 0.4161, + "step": 14532 + }, + { + "epoch": 1.94, + "grad_norm": 0.337890625, + "learning_rate": 0.00012255851530563935, + "loss": 0.1231, + "step": 14533 + }, + { + "epoch": 1.94, + "grad_norm": 0.58203125, + "learning_rate": 0.00012254717030414297, + "loss": 0.4127, + "step": 14534 + }, + { + "epoch": 1.94, + "grad_norm": 0.6796875, + "learning_rate": 0.00012253582499689253, + "loss": 0.5372, + "step": 14535 + }, + { + "epoch": 1.94, + "grad_norm": 0.5703125, + "learning_rate": 0.00012252447938404184, + "loss": 0.2799, + "step": 14536 + }, + { + "epoch": 1.94, + "grad_norm": 0.47265625, + "learning_rate": 0.00012251313346574473, + "loss": 0.4142, + "step": 14537 + }, + { + "epoch": 1.94, + "grad_norm": 0.5625, + "learning_rate": 0.00012250178724215508, + "loss": 0.2773, + "step": 14538 + }, + { + "epoch": 1.94, + "grad_norm": 0.796875, + "learning_rate": 0.00012249044071342677, + "loss": 0.5629, + "step": 14539 + }, + { + "epoch": 1.94, + "grad_norm": 0.66015625, + "learning_rate": 0.00012247909387971365, + "loss": 0.5494, + "step": 14540 + }, + { + "epoch": 1.94, + "grad_norm": 0.5625, + "learning_rate": 0.00012246774674116957, + "loss": 0.4227, + "step": 14541 + }, + { + "epoch": 1.94, + "grad_norm": 0.55078125, + "learning_rate": 0.00012245639929794842, + "loss": 0.3618, + "step": 14542 + }, + { + "epoch": 1.94, + "grad_norm": 0.5625, + "learning_rate": 0.00012244505155020413, + "loss": 0.4442, + "step": 14543 + }, + { + "epoch": 1.94, + "grad_norm": 0.318359375, + "learning_rate": 0.00012243370349809048, + "loss": 0.2253, + "step": 14544 + }, + { + "epoch": 1.94, + "grad_norm": 0.330078125, + "learning_rate": 0.00012242235514176145, + "loss": 0.1166, + "step": 14545 + }, + { + "epoch": 1.94, + "grad_norm": 0.30078125, + "learning_rate": 0.0001224110064813709, + "loss": 0.1372, + "step": 14546 + }, + { + "epoch": 1.94, + "grad_norm": 0.66015625, + "learning_rate": 0.0001223996575170727, + "loss": 0.4164, + "step": 14547 + }, + { + "epoch": 1.94, + "grad_norm": 0.71484375, + "learning_rate": 0.00012238830824902076, + "loss": 0.6616, + "step": 14548 + }, + { + "epoch": 1.94, + "grad_norm": 0.75, + "learning_rate": 0.000122376958677369, + "loss": 0.4485, + "step": 14549 + }, + { + "epoch": 1.94, + "grad_norm": 0.53125, + "learning_rate": 0.00012236560880227133, + "loss": 0.4254, + "step": 14550 + }, + { + "epoch": 1.94, + "grad_norm": 0.6328125, + "learning_rate": 0.00012235425862388167, + "loss": 0.776, + "step": 14551 + }, + { + "epoch": 1.94, + "grad_norm": 0.57421875, + "learning_rate": 0.00012234290814235385, + "loss": 0.2922, + "step": 14552 + }, + { + "epoch": 1.94, + "grad_norm": 0.51953125, + "learning_rate": 0.00012233155735784186, + "loss": 0.3682, + "step": 14553 + }, + { + "epoch": 1.94, + "grad_norm": 0.470703125, + "learning_rate": 0.00012232020627049969, + "loss": 0.3221, + "step": 14554 + }, + { + "epoch": 1.94, + "grad_norm": 0.55078125, + "learning_rate": 0.00012230885488048117, + "loss": 0.5505, + "step": 14555 + }, + { + "epoch": 1.94, + "grad_norm": 0.56640625, + "learning_rate": 0.00012229750318794023, + "loss": 0.2953, + "step": 14556 + }, + { + "epoch": 1.94, + "grad_norm": 0.49609375, + "learning_rate": 0.00012228615119303087, + "loss": 0.341, + "step": 14557 + }, + { + "epoch": 1.94, + "grad_norm": 0.55078125, + "learning_rate": 0.000122274798895907, + "loss": 0.586, + "step": 14558 + }, + { + "epoch": 1.94, + "grad_norm": 0.54296875, + "learning_rate": 0.00012226344629672254, + "loss": 0.51, + "step": 14559 + }, + { + "epoch": 1.94, + "grad_norm": 0.455078125, + "learning_rate": 0.00012225209339563145, + "loss": 0.4849, + "step": 14560 + }, + { + "epoch": 1.94, + "grad_norm": 0.490234375, + "learning_rate": 0.00012224074019278773, + "loss": 0.4691, + "step": 14561 + }, + { + "epoch": 1.94, + "grad_norm": 0.60546875, + "learning_rate": 0.00012222938668834526, + "loss": 0.4736, + "step": 14562 + }, + { + "epoch": 1.94, + "grad_norm": 0.6015625, + "learning_rate": 0.00012221803288245804, + "loss": 0.5031, + "step": 14563 + }, + { + "epoch": 1.94, + "grad_norm": 0.5234375, + "learning_rate": 0.00012220667877528004, + "loss": 0.2905, + "step": 14564 + }, + { + "epoch": 1.94, + "grad_norm": 0.71484375, + "learning_rate": 0.00012219532436696524, + "loss": 0.282, + "step": 14565 + }, + { + "epoch": 1.94, + "grad_norm": 0.53515625, + "learning_rate": 0.0001221839696576676, + "loss": 0.2622, + "step": 14566 + }, + { + "epoch": 1.94, + "grad_norm": 0.6328125, + "learning_rate": 0.00012217261464754107, + "loss": 0.3586, + "step": 14567 + }, + { + "epoch": 1.94, + "grad_norm": 0.451171875, + "learning_rate": 0.0001221612593367397, + "loss": 0.5172, + "step": 14568 + }, + { + "epoch": 1.94, + "grad_norm": 0.466796875, + "learning_rate": 0.00012214990372541736, + "loss": 0.2921, + "step": 14569 + }, + { + "epoch": 1.94, + "grad_norm": 0.359375, + "learning_rate": 0.00012213854781372818, + "loss": 0.1733, + "step": 14570 + }, + { + "epoch": 1.94, + "grad_norm": 0.451171875, + "learning_rate": 0.00012212719160182604, + "loss": 0.3332, + "step": 14571 + }, + { + "epoch": 1.94, + "grad_norm": 0.4375, + "learning_rate": 0.00012211583508986502, + "loss": 0.3099, + "step": 14572 + }, + { + "epoch": 1.94, + "grad_norm": 0.6015625, + "learning_rate": 0.00012210447827799904, + "loss": 0.5383, + "step": 14573 + }, + { + "epoch": 1.94, + "grad_norm": 0.5, + "learning_rate": 0.0001220931211663822, + "loss": 0.399, + "step": 14574 + }, + { + "epoch": 1.94, + "grad_norm": 0.4296875, + "learning_rate": 0.0001220817637551684, + "loss": 0.2739, + "step": 14575 + }, + { + "epoch": 1.95, + "grad_norm": 0.431640625, + "learning_rate": 0.0001220704060445117, + "loss": 0.306, + "step": 14576 + }, + { + "epoch": 1.95, + "grad_norm": 0.515625, + "learning_rate": 0.00012205904803456618, + "loss": 0.4114, + "step": 14577 + }, + { + "epoch": 1.95, + "grad_norm": 0.5546875, + "learning_rate": 0.0001220476897254858, + "loss": 0.2455, + "step": 14578 + }, + { + "epoch": 1.95, + "grad_norm": 0.50390625, + "learning_rate": 0.00012203633111742458, + "loss": 0.2484, + "step": 14579 + }, + { + "epoch": 1.95, + "grad_norm": 0.42578125, + "learning_rate": 0.00012202497221053657, + "loss": 0.3176, + "step": 14580 + }, + { + "epoch": 1.95, + "grad_norm": 0.51953125, + "learning_rate": 0.00012201361300497583, + "loss": 0.5071, + "step": 14581 + }, + { + "epoch": 1.95, + "grad_norm": 0.6171875, + "learning_rate": 0.00012200225350089633, + "loss": 0.5758, + "step": 14582 + }, + { + "epoch": 1.95, + "grad_norm": 0.484375, + "learning_rate": 0.00012199089369845219, + "loss": 0.3874, + "step": 14583 + }, + { + "epoch": 1.95, + "grad_norm": 0.466796875, + "learning_rate": 0.0001219795335977974, + "loss": 0.4717, + "step": 14584 + }, + { + "epoch": 1.95, + "grad_norm": 0.404296875, + "learning_rate": 0.00012196817319908602, + "loss": 0.4317, + "step": 14585 + }, + { + "epoch": 1.95, + "grad_norm": 0.53125, + "learning_rate": 0.00012195681250247213, + "loss": 0.3879, + "step": 14586 + }, + { + "epoch": 1.95, + "grad_norm": 0.431640625, + "learning_rate": 0.00012194545150810975, + "loss": 0.3925, + "step": 14587 + }, + { + "epoch": 1.95, + "grad_norm": 0.3828125, + "learning_rate": 0.00012193409021615296, + "loss": 0.2742, + "step": 14588 + }, + { + "epoch": 1.95, + "grad_norm": 0.71875, + "learning_rate": 0.00012192272862675584, + "loss": 0.5083, + "step": 14589 + }, + { + "epoch": 1.95, + "grad_norm": 0.578125, + "learning_rate": 0.00012191136674007247, + "loss": 0.7648, + "step": 14590 + }, + { + "epoch": 1.95, + "grad_norm": 0.4609375, + "learning_rate": 0.00012190000455625687, + "loss": 0.4745, + "step": 14591 + }, + { + "epoch": 1.95, + "grad_norm": 0.625, + "learning_rate": 0.00012188864207546316, + "loss": 0.4715, + "step": 14592 + }, + { + "epoch": 1.95, + "grad_norm": 0.37890625, + "learning_rate": 0.00012187727929784544, + "loss": 0.1829, + "step": 14593 + }, + { + "epoch": 1.95, + "grad_norm": 0.5546875, + "learning_rate": 0.00012186591622355774, + "loss": 0.3651, + "step": 14594 + }, + { + "epoch": 1.95, + "grad_norm": 0.6015625, + "learning_rate": 0.0001218545528527542, + "loss": 0.4533, + "step": 14595 + }, + { + "epoch": 1.95, + "grad_norm": 0.53125, + "learning_rate": 0.00012184318918558893, + "loss": 0.3616, + "step": 14596 + }, + { + "epoch": 1.95, + "grad_norm": 0.400390625, + "learning_rate": 0.00012183182522221597, + "loss": 0.1598, + "step": 14597 + }, + { + "epoch": 1.95, + "grad_norm": 0.462890625, + "learning_rate": 0.0001218204609627894, + "loss": 0.4605, + "step": 14598 + }, + { + "epoch": 1.95, + "grad_norm": 0.55859375, + "learning_rate": 0.00012180909640746343, + "loss": 0.5507, + "step": 14599 + }, + { + "epoch": 1.95, + "grad_norm": 0.6953125, + "learning_rate": 0.0001217977315563921, + "loss": 0.6257, + "step": 14600 + }, + { + "epoch": 1.95, + "grad_norm": 0.421875, + "learning_rate": 0.00012178636640972953, + "loss": 0.1993, + "step": 14601 + }, + { + "epoch": 1.95, + "grad_norm": 0.421875, + "learning_rate": 0.00012177500096762986, + "loss": 0.1594, + "step": 14602 + }, + { + "epoch": 1.95, + "grad_norm": 0.490234375, + "learning_rate": 0.00012176363523024721, + "loss": 0.4002, + "step": 14603 + }, + { + "epoch": 1.95, + "grad_norm": 0.36328125, + "learning_rate": 0.0001217522691977357, + "loss": 0.3109, + "step": 14604 + }, + { + "epoch": 1.95, + "grad_norm": 0.72265625, + "learning_rate": 0.00012174090287024944, + "loss": 0.4777, + "step": 14605 + }, + { + "epoch": 1.95, + "grad_norm": 0.76171875, + "learning_rate": 0.00012172953624794262, + "loss": 0.4137, + "step": 14606 + }, + { + "epoch": 1.95, + "grad_norm": 0.373046875, + "learning_rate": 0.00012171816933096929, + "loss": 0.2639, + "step": 14607 + }, + { + "epoch": 1.95, + "grad_norm": 0.44921875, + "learning_rate": 0.00012170680211948368, + "loss": 0.2357, + "step": 14608 + }, + { + "epoch": 1.95, + "grad_norm": 0.47265625, + "learning_rate": 0.0001216954346136399, + "loss": 0.4389, + "step": 14609 + }, + { + "epoch": 1.95, + "grad_norm": 0.38671875, + "learning_rate": 0.00012168406681359208, + "loss": 0.4302, + "step": 14610 + }, + { + "epoch": 1.95, + "grad_norm": 0.51171875, + "learning_rate": 0.00012167269871949442, + "loss": 0.3061, + "step": 14611 + }, + { + "epoch": 1.95, + "grad_norm": 0.4375, + "learning_rate": 0.00012166133033150104, + "loss": 0.1596, + "step": 14612 + }, + { + "epoch": 1.95, + "grad_norm": 0.74609375, + "learning_rate": 0.00012164996164976614, + "loss": 0.7435, + "step": 14613 + }, + { + "epoch": 1.95, + "grad_norm": 0.546875, + "learning_rate": 0.00012163859267444384, + "loss": 0.35, + "step": 14614 + }, + { + "epoch": 1.95, + "grad_norm": 0.384765625, + "learning_rate": 0.00012162722340568835, + "loss": 0.2403, + "step": 14615 + }, + { + "epoch": 1.95, + "grad_norm": 0.57421875, + "learning_rate": 0.00012161585384365384, + "loss": 0.6066, + "step": 14616 + }, + { + "epoch": 1.95, + "grad_norm": 0.71875, + "learning_rate": 0.00012160448398849445, + "loss": 0.4355, + "step": 14617 + }, + { + "epoch": 1.95, + "grad_norm": 0.54296875, + "learning_rate": 0.00012159311384036442, + "loss": 0.3281, + "step": 14618 + }, + { + "epoch": 1.95, + "grad_norm": 0.54296875, + "learning_rate": 0.00012158174339941788, + "loss": 0.4137, + "step": 14619 + }, + { + "epoch": 1.95, + "grad_norm": 0.5, + "learning_rate": 0.00012157037266580909, + "loss": 0.4509, + "step": 14620 + }, + { + "epoch": 1.95, + "grad_norm": 0.4765625, + "learning_rate": 0.00012155900163969219, + "loss": 0.2892, + "step": 14621 + }, + { + "epoch": 1.95, + "grad_norm": 0.5390625, + "learning_rate": 0.00012154763032122139, + "loss": 0.2562, + "step": 14622 + }, + { + "epoch": 1.95, + "grad_norm": 0.4921875, + "learning_rate": 0.00012153625871055085, + "loss": 0.3986, + "step": 14623 + }, + { + "epoch": 1.95, + "grad_norm": 0.50390625, + "learning_rate": 0.00012152488680783487, + "loss": 0.2516, + "step": 14624 + }, + { + "epoch": 1.95, + "grad_norm": 0.5859375, + "learning_rate": 0.00012151351461322757, + "loss": 0.4982, + "step": 14625 + }, + { + "epoch": 1.95, + "grad_norm": 0.447265625, + "learning_rate": 0.00012150214212688322, + "loss": 0.2794, + "step": 14626 + }, + { + "epoch": 1.95, + "grad_norm": 0.50390625, + "learning_rate": 0.00012149076934895607, + "loss": 0.4189, + "step": 14627 + }, + { + "epoch": 1.95, + "grad_norm": 0.4765625, + "learning_rate": 0.00012147939627960023, + "loss": 0.266, + "step": 14628 + }, + { + "epoch": 1.95, + "grad_norm": 0.4453125, + "learning_rate": 0.00012146802291897004, + "loss": 0.195, + "step": 14629 + }, + { + "epoch": 1.95, + "grad_norm": 0.546875, + "learning_rate": 0.00012145664926721964, + "loss": 0.4268, + "step": 14630 + }, + { + "epoch": 1.95, + "grad_norm": 0.59765625, + "learning_rate": 0.00012144527532450334, + "loss": 0.4945, + "step": 14631 + }, + { + "epoch": 1.95, + "grad_norm": 0.39453125, + "learning_rate": 0.00012143390109097532, + "loss": 0.2873, + "step": 14632 + }, + { + "epoch": 1.95, + "grad_norm": 0.55859375, + "learning_rate": 0.00012142252656678985, + "loss": 0.3562, + "step": 14633 + }, + { + "epoch": 1.95, + "grad_norm": 0.6640625, + "learning_rate": 0.00012141115175210118, + "loss": 0.6865, + "step": 14634 + }, + { + "epoch": 1.95, + "grad_norm": 0.490234375, + "learning_rate": 0.00012139977664706355, + "loss": 0.2554, + "step": 14635 + }, + { + "epoch": 1.95, + "grad_norm": 0.392578125, + "learning_rate": 0.0001213884012518312, + "loss": 0.2705, + "step": 14636 + }, + { + "epoch": 1.95, + "grad_norm": 0.6875, + "learning_rate": 0.00012137702556655839, + "loss": 0.8292, + "step": 14637 + }, + { + "epoch": 1.95, + "grad_norm": 0.447265625, + "learning_rate": 0.00012136564959139942, + "loss": 0.3892, + "step": 14638 + }, + { + "epoch": 1.95, + "grad_norm": 0.70703125, + "learning_rate": 0.00012135427332650855, + "loss": 0.2819, + "step": 14639 + }, + { + "epoch": 1.95, + "grad_norm": 0.578125, + "learning_rate": 0.00012134289677203999, + "loss": 0.3421, + "step": 14640 + }, + { + "epoch": 1.95, + "grad_norm": 0.65625, + "learning_rate": 0.00012133151992814807, + "loss": 0.5114, + "step": 14641 + }, + { + "epoch": 1.95, + "grad_norm": 0.390625, + "learning_rate": 0.00012132014279498703, + "loss": 0.3185, + "step": 14642 + }, + { + "epoch": 1.95, + "grad_norm": 0.44921875, + "learning_rate": 0.00012130876537271118, + "loss": 0.3379, + "step": 14643 + }, + { + "epoch": 1.95, + "grad_norm": 0.58984375, + "learning_rate": 0.00012129738766147478, + "loss": 0.4732, + "step": 14644 + }, + { + "epoch": 1.95, + "grad_norm": 0.390625, + "learning_rate": 0.0001212860096614322, + "loss": 0.2503, + "step": 14645 + }, + { + "epoch": 1.95, + "grad_norm": 0.470703125, + "learning_rate": 0.0001212746313727376, + "loss": 0.5049, + "step": 14646 + }, + { + "epoch": 1.95, + "grad_norm": 0.412109375, + "learning_rate": 0.00012126325279554538, + "loss": 0.3208, + "step": 14647 + }, + { + "epoch": 1.95, + "grad_norm": 0.482421875, + "learning_rate": 0.00012125187393000976, + "loss": 0.3681, + "step": 14648 + }, + { + "epoch": 1.95, + "grad_norm": 0.59765625, + "learning_rate": 0.00012124049477628511, + "loss": 0.3707, + "step": 14649 + }, + { + "epoch": 1.95, + "grad_norm": 0.6640625, + "learning_rate": 0.00012122911533452574, + "loss": 0.3633, + "step": 14650 + }, + { + "epoch": 1.96, + "grad_norm": 0.484375, + "learning_rate": 0.00012121773560488594, + "loss": 0.3392, + "step": 14651 + }, + { + "epoch": 1.96, + "grad_norm": 0.490234375, + "learning_rate": 0.00012120635558752003, + "loss": 0.4318, + "step": 14652 + }, + { + "epoch": 1.96, + "grad_norm": 0.466796875, + "learning_rate": 0.00012119497528258232, + "loss": 0.3507, + "step": 14653 + }, + { + "epoch": 1.96, + "grad_norm": 0.46484375, + "learning_rate": 0.00012118359469022712, + "loss": 0.2481, + "step": 14654 + }, + { + "epoch": 1.96, + "grad_norm": 0.443359375, + "learning_rate": 0.00012117221381060881, + "loss": 0.3311, + "step": 14655 + }, + { + "epoch": 1.96, + "grad_norm": 0.5234375, + "learning_rate": 0.0001211608326438817, + "loss": 0.336, + "step": 14656 + }, + { + "epoch": 1.96, + "grad_norm": 0.330078125, + "learning_rate": 0.0001211494511902001, + "loss": 0.3037, + "step": 14657 + }, + { + "epoch": 1.96, + "grad_norm": 0.71875, + "learning_rate": 0.00012113806944971835, + "loss": 0.6369, + "step": 14658 + }, + { + "epoch": 1.96, + "grad_norm": 0.515625, + "learning_rate": 0.00012112668742259083, + "loss": 0.3727, + "step": 14659 + }, + { + "epoch": 1.96, + "grad_norm": 0.466796875, + "learning_rate": 0.00012111530510897187, + "loss": 0.4296, + "step": 14660 + }, + { + "epoch": 1.96, + "grad_norm": 0.466796875, + "learning_rate": 0.00012110392250901583, + "loss": 0.1791, + "step": 14661 + }, + { + "epoch": 1.96, + "grad_norm": 0.46875, + "learning_rate": 0.00012109253962287703, + "loss": 0.3856, + "step": 14662 + }, + { + "epoch": 1.96, + "grad_norm": 0.455078125, + "learning_rate": 0.00012108115645070989, + "loss": 0.3606, + "step": 14663 + }, + { + "epoch": 1.96, + "grad_norm": 0.609375, + "learning_rate": 0.00012106977299266872, + "loss": 0.4133, + "step": 14664 + }, + { + "epoch": 1.96, + "grad_norm": 0.6875, + "learning_rate": 0.0001210583892489079, + "loss": 0.5257, + "step": 14665 + }, + { + "epoch": 1.96, + "grad_norm": 0.609375, + "learning_rate": 0.00012104700521958183, + "loss": 0.2257, + "step": 14666 + }, + { + "epoch": 1.96, + "grad_norm": 0.515625, + "learning_rate": 0.00012103562090484484, + "loss": 0.3584, + "step": 14667 + }, + { + "epoch": 1.96, + "grad_norm": 0.58203125, + "learning_rate": 0.00012102423630485134, + "loss": 0.4862, + "step": 14668 + }, + { + "epoch": 1.96, + "grad_norm": 0.498046875, + "learning_rate": 0.00012101285141975572, + "loss": 0.4822, + "step": 14669 + }, + { + "epoch": 1.96, + "grad_norm": 0.5859375, + "learning_rate": 0.00012100146624971232, + "loss": 0.6309, + "step": 14670 + }, + { + "epoch": 1.96, + "grad_norm": 0.447265625, + "learning_rate": 0.00012099008079487554, + "loss": 0.36, + "step": 14671 + }, + { + "epoch": 1.96, + "grad_norm": 0.49609375, + "learning_rate": 0.00012097869505539985, + "loss": 0.2857, + "step": 14672 + }, + { + "epoch": 1.96, + "grad_norm": 0.478515625, + "learning_rate": 0.00012096730903143954, + "loss": 0.3225, + "step": 14673 + }, + { + "epoch": 1.96, + "grad_norm": 0.64453125, + "learning_rate": 0.00012095592272314906, + "loss": 0.4256, + "step": 14674 + }, + { + "epoch": 1.96, + "grad_norm": 0.470703125, + "learning_rate": 0.00012094453613068288, + "loss": 0.3797, + "step": 14675 + }, + { + "epoch": 1.96, + "grad_norm": 0.55859375, + "learning_rate": 0.0001209331492541953, + "loss": 0.59, + "step": 14676 + }, + { + "epoch": 1.96, + "grad_norm": 0.6015625, + "learning_rate": 0.00012092176209384081, + "loss": 0.4133, + "step": 14677 + }, + { + "epoch": 1.96, + "grad_norm": 0.5234375, + "learning_rate": 0.00012091037464977375, + "loss": 0.4999, + "step": 14678 + }, + { + "epoch": 1.96, + "grad_norm": 0.59765625, + "learning_rate": 0.00012089898692214863, + "loss": 0.3538, + "step": 14679 + }, + { + "epoch": 1.96, + "grad_norm": 0.609375, + "learning_rate": 0.0001208875989111198, + "loss": 0.5111, + "step": 14680 + }, + { + "epoch": 1.96, + "grad_norm": 0.455078125, + "learning_rate": 0.00012087621061684178, + "loss": 0.3515, + "step": 14681 + }, + { + "epoch": 1.96, + "grad_norm": 0.46875, + "learning_rate": 0.00012086482203946891, + "loss": 0.3056, + "step": 14682 + }, + { + "epoch": 1.96, + "grad_norm": 0.734375, + "learning_rate": 0.00012085343317915565, + "loss": 0.2928, + "step": 14683 + }, + { + "epoch": 1.96, + "grad_norm": 0.55859375, + "learning_rate": 0.00012084204403605648, + "loss": 0.3987, + "step": 14684 + }, + { + "epoch": 1.96, + "grad_norm": 0.396484375, + "learning_rate": 0.00012083065461032579, + "loss": 0.2522, + "step": 14685 + }, + { + "epoch": 1.96, + "grad_norm": 0.306640625, + "learning_rate": 0.00012081926490211807, + "loss": 0.233, + "step": 14686 + }, + { + "epoch": 1.96, + "grad_norm": 0.462890625, + "learning_rate": 0.00012080787491158774, + "loss": 0.4041, + "step": 14687 + }, + { + "epoch": 1.96, + "grad_norm": 0.482421875, + "learning_rate": 0.00012079648463888931, + "loss": 0.3041, + "step": 14688 + }, + { + "epoch": 1.96, + "grad_norm": 0.43359375, + "learning_rate": 0.00012078509408417718, + "loss": 0.245, + "step": 14689 + }, + { + "epoch": 1.96, + "grad_norm": 0.52734375, + "learning_rate": 0.00012077370324760582, + "loss": 0.4224, + "step": 14690 + }, + { + "epoch": 1.96, + "grad_norm": 0.54296875, + "learning_rate": 0.00012076231212932971, + "loss": 0.4352, + "step": 14691 + }, + { + "epoch": 1.96, + "grad_norm": 0.4453125, + "learning_rate": 0.00012075092072950334, + "loss": 0.2759, + "step": 14692 + }, + { + "epoch": 1.96, + "grad_norm": 0.328125, + "learning_rate": 0.00012073952904828116, + "loss": 0.3001, + "step": 14693 + }, + { + "epoch": 1.96, + "grad_norm": 0.474609375, + "learning_rate": 0.00012072813708581767, + "loss": 0.2834, + "step": 14694 + }, + { + "epoch": 1.96, + "grad_norm": 0.462890625, + "learning_rate": 0.00012071674484226733, + "loss": 0.3479, + "step": 14695 + }, + { + "epoch": 1.96, + "grad_norm": 0.5390625, + "learning_rate": 0.00012070535231778461, + "loss": 0.3365, + "step": 14696 + }, + { + "epoch": 1.96, + "grad_norm": 0.45703125, + "learning_rate": 0.00012069395951252407, + "loss": 0.1693, + "step": 14697 + }, + { + "epoch": 1.96, + "grad_norm": 0.53125, + "learning_rate": 0.00012068256642664011, + "loss": 0.3575, + "step": 14698 + }, + { + "epoch": 1.96, + "grad_norm": 0.384765625, + "learning_rate": 0.0001206711730602873, + "loss": 0.2347, + "step": 14699 + }, + { + "epoch": 1.96, + "grad_norm": 0.466796875, + "learning_rate": 0.00012065977941362015, + "loss": 0.3571, + "step": 14700 + }, + { + "epoch": 1.96, + "grad_norm": 0.62890625, + "learning_rate": 0.00012064838548679307, + "loss": 0.3302, + "step": 14701 + }, + { + "epoch": 1.96, + "grad_norm": 0.58203125, + "learning_rate": 0.00012063699127996066, + "loss": 0.3494, + "step": 14702 + }, + { + "epoch": 1.96, + "grad_norm": 0.58984375, + "learning_rate": 0.00012062559679327739, + "loss": 0.3215, + "step": 14703 + }, + { + "epoch": 1.96, + "grad_norm": 0.5078125, + "learning_rate": 0.00012061420202689781, + "loss": 0.3503, + "step": 14704 + }, + { + "epoch": 1.96, + "grad_norm": 0.53125, + "learning_rate": 0.0001206028069809764, + "loss": 0.5697, + "step": 14705 + }, + { + "epoch": 1.96, + "grad_norm": 0.53125, + "learning_rate": 0.00012059141165566772, + "loss": 0.4143, + "step": 14706 + }, + { + "epoch": 1.96, + "grad_norm": 0.58203125, + "learning_rate": 0.00012058001605112628, + "loss": 0.4009, + "step": 14707 + }, + { + "epoch": 1.96, + "grad_norm": 0.375, + "learning_rate": 0.00012056862016750661, + "loss": 0.2783, + "step": 14708 + }, + { + "epoch": 1.96, + "grad_norm": 0.62109375, + "learning_rate": 0.00012055722400496325, + "loss": 0.4192, + "step": 14709 + }, + { + "epoch": 1.96, + "grad_norm": 0.392578125, + "learning_rate": 0.00012054582756365074, + "loss": 0.2847, + "step": 14710 + }, + { + "epoch": 1.96, + "grad_norm": 0.57421875, + "learning_rate": 0.00012053443084372362, + "loss": 0.4879, + "step": 14711 + }, + { + "epoch": 1.96, + "grad_norm": 0.53515625, + "learning_rate": 0.00012052303384533646, + "loss": 0.3155, + "step": 14712 + }, + { + "epoch": 1.96, + "grad_norm": 0.52734375, + "learning_rate": 0.00012051163656864374, + "loss": 0.5418, + "step": 14713 + }, + { + "epoch": 1.96, + "grad_norm": 0.4453125, + "learning_rate": 0.0001205002390138001, + "loss": 0.4641, + "step": 14714 + }, + { + "epoch": 1.96, + "grad_norm": 0.466796875, + "learning_rate": 0.00012048884118096007, + "loss": 0.3842, + "step": 14715 + }, + { + "epoch": 1.96, + "grad_norm": 0.80078125, + "learning_rate": 0.00012047744307027817, + "loss": 0.1997, + "step": 14716 + }, + { + "epoch": 1.96, + "grad_norm": 0.421875, + "learning_rate": 0.00012046604468190903, + "loss": 0.3067, + "step": 14717 + }, + { + "epoch": 1.96, + "grad_norm": 0.48828125, + "learning_rate": 0.0001204546460160072, + "loss": 0.348, + "step": 14718 + }, + { + "epoch": 1.96, + "grad_norm": 0.5546875, + "learning_rate": 0.00012044324707272721, + "loss": 0.3551, + "step": 14719 + }, + { + "epoch": 1.96, + "grad_norm": 0.55859375, + "learning_rate": 0.00012043184785222368, + "loss": 0.491, + "step": 14720 + }, + { + "epoch": 1.96, + "grad_norm": 0.462890625, + "learning_rate": 0.00012042044835465116, + "loss": 0.2777, + "step": 14721 + }, + { + "epoch": 1.96, + "grad_norm": 0.5390625, + "learning_rate": 0.00012040904858016428, + "loss": 0.4286, + "step": 14722 + }, + { + "epoch": 1.96, + "grad_norm": 0.498046875, + "learning_rate": 0.0001203976485289176, + "loss": 0.2288, + "step": 14723 + }, + { + "epoch": 1.96, + "grad_norm": 0.470703125, + "learning_rate": 0.00012038624820106572, + "loss": 0.6278, + "step": 14724 + }, + { + "epoch": 1.96, + "grad_norm": 0.671875, + "learning_rate": 0.00012037484759676323, + "loss": 0.4005, + "step": 14725 + }, + { + "epoch": 1.97, + "grad_norm": 0.42578125, + "learning_rate": 0.00012036344671616471, + "loss": 0.3997, + "step": 14726 + }, + { + "epoch": 1.97, + "grad_norm": 0.640625, + "learning_rate": 0.0001203520455594248, + "loss": 0.5321, + "step": 14727 + }, + { + "epoch": 1.97, + "grad_norm": 0.486328125, + "learning_rate": 0.00012034064412669808, + "loss": 0.5302, + "step": 14728 + }, + { + "epoch": 1.97, + "grad_norm": 0.5078125, + "learning_rate": 0.00012032924241813918, + "loss": 0.3168, + "step": 14729 + }, + { + "epoch": 1.97, + "grad_norm": 0.5078125, + "learning_rate": 0.00012031784043390274, + "loss": 0.4878, + "step": 14730 + }, + { + "epoch": 1.97, + "grad_norm": 0.61328125, + "learning_rate": 0.00012030643817414329, + "loss": 0.6161, + "step": 14731 + }, + { + "epoch": 1.97, + "grad_norm": 0.80078125, + "learning_rate": 0.00012029503563901551, + "loss": 0.4175, + "step": 14732 + }, + { + "epoch": 1.97, + "grad_norm": 0.671875, + "learning_rate": 0.00012028363282867403, + "loss": 0.2391, + "step": 14733 + }, + { + "epoch": 1.97, + "grad_norm": 0.4296875, + "learning_rate": 0.00012027222974327348, + "loss": 0.2129, + "step": 14734 + }, + { + "epoch": 1.97, + "grad_norm": 0.48828125, + "learning_rate": 0.00012026082638296847, + "loss": 0.4877, + "step": 14735 + }, + { + "epoch": 1.97, + "grad_norm": 0.60546875, + "learning_rate": 0.00012024942274791367, + "loss": 0.6043, + "step": 14736 + }, + { + "epoch": 1.97, + "grad_norm": 0.4453125, + "learning_rate": 0.00012023801883826371, + "loss": 0.3556, + "step": 14737 + }, + { + "epoch": 1.97, + "grad_norm": 0.490234375, + "learning_rate": 0.00012022661465417319, + "loss": 0.3216, + "step": 14738 + }, + { + "epoch": 1.97, + "grad_norm": 0.52734375, + "learning_rate": 0.00012021521019579682, + "loss": 0.3705, + "step": 14739 + }, + { + "epoch": 1.97, + "grad_norm": 0.63671875, + "learning_rate": 0.0001202038054632892, + "loss": 0.622, + "step": 14740 + }, + { + "epoch": 1.97, + "grad_norm": 0.7890625, + "learning_rate": 0.00012019240045680505, + "loss": 0.4339, + "step": 14741 + }, + { + "epoch": 1.97, + "grad_norm": 0.5859375, + "learning_rate": 0.00012018099517649897, + "loss": 0.5545, + "step": 14742 + }, + { + "epoch": 1.97, + "grad_norm": 0.59375, + "learning_rate": 0.00012016958962252564, + "loss": 0.2368, + "step": 14743 + }, + { + "epoch": 1.97, + "grad_norm": 0.640625, + "learning_rate": 0.00012015818379503973, + "loss": 0.7703, + "step": 14744 + }, + { + "epoch": 1.97, + "grad_norm": 0.318359375, + "learning_rate": 0.00012014677769419593, + "loss": 0.1498, + "step": 14745 + }, + { + "epoch": 1.97, + "grad_norm": 0.59765625, + "learning_rate": 0.00012013537132014886, + "loss": 0.5101, + "step": 14746 + }, + { + "epoch": 1.97, + "grad_norm": 0.56640625, + "learning_rate": 0.00012012396467305325, + "loss": 0.3361, + "step": 14747 + }, + { + "epoch": 1.97, + "grad_norm": 0.46484375, + "learning_rate": 0.00012011255775306378, + "loss": 0.3979, + "step": 14748 + }, + { + "epoch": 1.97, + "grad_norm": 0.65234375, + "learning_rate": 0.00012010115056033508, + "loss": 0.5978, + "step": 14749 + }, + { + "epoch": 1.97, + "grad_norm": 0.5703125, + "learning_rate": 0.00012008974309502193, + "loss": 0.19, + "step": 14750 + }, + { + "epoch": 1.97, + "grad_norm": 0.53515625, + "learning_rate": 0.0001200783353572789, + "loss": 0.3986, + "step": 14751 + }, + { + "epoch": 1.97, + "grad_norm": 0.54296875, + "learning_rate": 0.00012006692734726082, + "loss": 0.4835, + "step": 14752 + }, + { + "epoch": 1.97, + "grad_norm": 0.56640625, + "learning_rate": 0.00012005551906512229, + "loss": 0.5422, + "step": 14753 + }, + { + "epoch": 1.97, + "grad_norm": 0.48046875, + "learning_rate": 0.00012004411051101807, + "loss": 0.3992, + "step": 14754 + }, + { + "epoch": 1.97, + "grad_norm": 0.458984375, + "learning_rate": 0.00012003270168510284, + "loss": 0.339, + "step": 14755 + }, + { + "epoch": 1.97, + "grad_norm": 0.66796875, + "learning_rate": 0.00012002129258753129, + "loss": 0.4199, + "step": 14756 + }, + { + "epoch": 1.97, + "grad_norm": 0.57421875, + "learning_rate": 0.00012000988321845823, + "loss": 0.4492, + "step": 14757 + }, + { + "epoch": 1.97, + "grad_norm": 0.515625, + "learning_rate": 0.00011999847357803823, + "loss": 0.3919, + "step": 14758 + }, + { + "epoch": 1.97, + "grad_norm": 0.421875, + "learning_rate": 0.00011998706366642616, + "loss": 0.3163, + "step": 14759 + }, + { + "epoch": 1.97, + "grad_norm": 0.5625, + "learning_rate": 0.00011997565348377666, + "loss": 0.4399, + "step": 14760 + }, + { + "epoch": 1.97, + "grad_norm": 0.6484375, + "learning_rate": 0.00011996424303024447, + "loss": 0.4856, + "step": 14761 + }, + { + "epoch": 1.97, + "grad_norm": 0.578125, + "learning_rate": 0.00011995283230598433, + "loss": 0.5225, + "step": 14762 + }, + { + "epoch": 1.97, + "grad_norm": 0.515625, + "learning_rate": 0.00011994142131115099, + "loss": 0.4219, + "step": 14763 + }, + { + "epoch": 1.97, + "grad_norm": 0.53125, + "learning_rate": 0.00011993001004589917, + "loss": 0.3898, + "step": 14764 + }, + { + "epoch": 1.97, + "grad_norm": 0.60546875, + "learning_rate": 0.0001199185985103836, + "loss": 0.681, + "step": 14765 + }, + { + "epoch": 1.97, + "grad_norm": 0.66015625, + "learning_rate": 0.00011990718670475912, + "loss": 0.3652, + "step": 14766 + }, + { + "epoch": 1.97, + "grad_norm": 0.5234375, + "learning_rate": 0.00011989577462918036, + "loss": 0.4059, + "step": 14767 + }, + { + "epoch": 1.97, + "grad_norm": 0.4921875, + "learning_rate": 0.00011988436228380213, + "loss": 0.4113, + "step": 14768 + }, + { + "epoch": 1.97, + "grad_norm": 0.4921875, + "learning_rate": 0.00011987294966877918, + "loss": 0.4542, + "step": 14769 + }, + { + "epoch": 1.97, + "grad_norm": 0.671875, + "learning_rate": 0.00011986153678426628, + "loss": 0.77, + "step": 14770 + }, + { + "epoch": 1.97, + "grad_norm": 0.56640625, + "learning_rate": 0.00011985012363041819, + "loss": 0.3765, + "step": 14771 + }, + { + "epoch": 1.97, + "grad_norm": 0.7109375, + "learning_rate": 0.00011983871020738968, + "loss": 0.6641, + "step": 14772 + }, + { + "epoch": 1.97, + "grad_norm": 0.73046875, + "learning_rate": 0.00011982729651533558, + "loss": 0.2643, + "step": 14773 + }, + { + "epoch": 1.97, + "grad_norm": 0.6875, + "learning_rate": 0.00011981588255441057, + "loss": 0.4476, + "step": 14774 + }, + { + "epoch": 1.97, + "grad_norm": 0.47265625, + "learning_rate": 0.00011980446832476947, + "loss": 0.555, + "step": 14775 + }, + { + "epoch": 1.97, + "grad_norm": 0.5625, + "learning_rate": 0.00011979305382656709, + "loss": 0.4594, + "step": 14776 + }, + { + "epoch": 1.97, + "grad_norm": 0.4921875, + "learning_rate": 0.00011978163905995819, + "loss": 0.3911, + "step": 14777 + }, + { + "epoch": 1.97, + "grad_norm": 0.35546875, + "learning_rate": 0.00011977022402509757, + "loss": 0.1974, + "step": 14778 + }, + { + "epoch": 1.97, + "grad_norm": 0.515625, + "learning_rate": 0.00011975880872214, + "loss": 0.2783, + "step": 14779 + }, + { + "epoch": 1.97, + "grad_norm": 0.5625, + "learning_rate": 0.00011974739315124035, + "loss": 0.4708, + "step": 14780 + }, + { + "epoch": 1.97, + "grad_norm": 0.64453125, + "learning_rate": 0.00011973597731255333, + "loss": 0.7546, + "step": 14781 + }, + { + "epoch": 1.97, + "grad_norm": 0.69140625, + "learning_rate": 0.00011972456120623381, + "loss": 0.373, + "step": 14782 + }, + { + "epoch": 1.97, + "grad_norm": 1.421875, + "learning_rate": 0.00011971314483243658, + "loss": 0.2214, + "step": 14783 + }, + { + "epoch": 1.97, + "grad_norm": 0.6328125, + "learning_rate": 0.00011970172819131646, + "loss": 0.7018, + "step": 14784 + }, + { + "epoch": 1.97, + "grad_norm": 0.5625, + "learning_rate": 0.00011969031128302826, + "loss": 0.3281, + "step": 14785 + }, + { + "epoch": 1.97, + "grad_norm": 0.48828125, + "learning_rate": 0.0001196788941077268, + "loss": 0.3107, + "step": 14786 + }, + { + "epoch": 1.97, + "grad_norm": 0.43359375, + "learning_rate": 0.0001196674766655669, + "loss": 0.187, + "step": 14787 + }, + { + "epoch": 1.97, + "grad_norm": 0.6484375, + "learning_rate": 0.00011965605895670338, + "loss": 0.3697, + "step": 14788 + }, + { + "epoch": 1.97, + "grad_norm": 0.41015625, + "learning_rate": 0.00011964464098129111, + "loss": 0.2781, + "step": 14789 + }, + { + "epoch": 1.97, + "grad_norm": 0.4921875, + "learning_rate": 0.00011963322273948487, + "loss": 0.5, + "step": 14790 + }, + { + "epoch": 1.97, + "grad_norm": 0.6171875, + "learning_rate": 0.00011962180423143961, + "loss": 0.509, + "step": 14791 + }, + { + "epoch": 1.97, + "grad_norm": 0.5859375, + "learning_rate": 0.00011961038545731, + "loss": 0.4233, + "step": 14792 + }, + { + "epoch": 1.97, + "grad_norm": 0.515625, + "learning_rate": 0.00011959896641725104, + "loss": 0.4095, + "step": 14793 + }, + { + "epoch": 1.97, + "grad_norm": 0.57421875, + "learning_rate": 0.00011958754711141747, + "loss": 0.3418, + "step": 14794 + }, + { + "epoch": 1.97, + "grad_norm": 0.75, + "learning_rate": 0.0001195761275399642, + "loss": 0.7148, + "step": 14795 + }, + { + "epoch": 1.97, + "grad_norm": 0.515625, + "learning_rate": 0.00011956470770304609, + "loss": 0.2774, + "step": 14796 + }, + { + "epoch": 1.97, + "grad_norm": 0.5390625, + "learning_rate": 0.00011955328760081798, + "loss": 0.2877, + "step": 14797 + }, + { + "epoch": 1.97, + "grad_norm": 0.400390625, + "learning_rate": 0.00011954186723343475, + "loss": 0.2946, + "step": 14798 + }, + { + "epoch": 1.97, + "grad_norm": 0.4375, + "learning_rate": 0.00011953044660105124, + "loss": 0.3355, + "step": 14799 + }, + { + "epoch": 1.97, + "grad_norm": 0.46875, + "learning_rate": 0.00011951902570382236, + "loss": 0.4956, + "step": 14800 + }, + { + "epoch": 1.98, + "grad_norm": 0.51171875, + "learning_rate": 0.00011950760454190293, + "loss": 0.4122, + "step": 14801 + }, + { + "epoch": 1.98, + "grad_norm": 0.5546875, + "learning_rate": 0.0001194961831154479, + "loss": 0.3207, + "step": 14802 + }, + { + "epoch": 1.98, + "grad_norm": 0.53515625, + "learning_rate": 0.00011948476142461212, + "loss": 0.3229, + "step": 14803 + }, + { + "epoch": 1.98, + "grad_norm": 0.625, + "learning_rate": 0.00011947333946955044, + "loss": 0.1829, + "step": 14804 + }, + { + "epoch": 1.98, + "grad_norm": 0.439453125, + "learning_rate": 0.0001194619172504178, + "loss": 0.3541, + "step": 14805 + }, + { + "epoch": 1.98, + "grad_norm": 0.427734375, + "learning_rate": 0.00011945049476736905, + "loss": 0.2412, + "step": 14806 + }, + { + "epoch": 1.98, + "grad_norm": 0.39453125, + "learning_rate": 0.0001194390720205591, + "loss": 0.2719, + "step": 14807 + }, + { + "epoch": 1.98, + "grad_norm": 0.5390625, + "learning_rate": 0.00011942764901014288, + "loss": 0.7398, + "step": 14808 + }, + { + "epoch": 1.98, + "grad_norm": 0.58203125, + "learning_rate": 0.00011941622573627526, + "loss": 0.3256, + "step": 14809 + }, + { + "epoch": 1.98, + "grad_norm": 0.416015625, + "learning_rate": 0.00011940480219911119, + "loss": 0.5126, + "step": 14810 + }, + { + "epoch": 1.98, + "grad_norm": 0.6875, + "learning_rate": 0.00011939337839880551, + "loss": 0.1944, + "step": 14811 + }, + { + "epoch": 1.98, + "grad_norm": 0.57421875, + "learning_rate": 0.0001193819543355132, + "loss": 0.3319, + "step": 14812 + }, + { + "epoch": 1.98, + "grad_norm": 0.48828125, + "learning_rate": 0.00011937053000938911, + "loss": 0.3538, + "step": 14813 + }, + { + "epoch": 1.98, + "grad_norm": 0.54296875, + "learning_rate": 0.00011935910542058824, + "loss": 0.4006, + "step": 14814 + }, + { + "epoch": 1.98, + "grad_norm": 0.53125, + "learning_rate": 0.00011934768056926549, + "loss": 0.4392, + "step": 14815 + }, + { + "epoch": 1.98, + "grad_norm": 0.443359375, + "learning_rate": 0.00011933625545557574, + "loss": 0.2266, + "step": 14816 + }, + { + "epoch": 1.98, + "grad_norm": 0.50390625, + "learning_rate": 0.00011932483007967394, + "loss": 0.2768, + "step": 14817 + }, + { + "epoch": 1.98, + "grad_norm": 0.5390625, + "learning_rate": 0.00011931340444171508, + "loss": 0.4015, + "step": 14818 + }, + { + "epoch": 1.98, + "grad_norm": 0.58984375, + "learning_rate": 0.00011930197854185403, + "loss": 0.465, + "step": 14819 + }, + { + "epoch": 1.98, + "grad_norm": 0.5390625, + "learning_rate": 0.00011929055238024578, + "loss": 0.5723, + "step": 14820 + }, + { + "epoch": 1.98, + "grad_norm": 0.462890625, + "learning_rate": 0.00011927912595704528, + "loss": 0.2641, + "step": 14821 + }, + { + "epoch": 1.98, + "grad_norm": 0.498046875, + "learning_rate": 0.00011926769927240742, + "loss": 0.4838, + "step": 14822 + }, + { + "epoch": 1.98, + "grad_norm": 0.55859375, + "learning_rate": 0.00011925627232648722, + "loss": 0.551, + "step": 14823 + }, + { + "epoch": 1.98, + "grad_norm": 0.60546875, + "learning_rate": 0.00011924484511943958, + "loss": 0.3872, + "step": 14824 + }, + { + "epoch": 1.98, + "grad_norm": 0.5234375, + "learning_rate": 0.00011923341765141951, + "loss": 0.4345, + "step": 14825 + }, + { + "epoch": 1.98, + "grad_norm": 0.48828125, + "learning_rate": 0.00011922198992258194, + "loss": 0.3272, + "step": 14826 + }, + { + "epoch": 1.98, + "grad_norm": 0.458984375, + "learning_rate": 0.00011921056193308187, + "loss": 0.2494, + "step": 14827 + }, + { + "epoch": 1.98, + "grad_norm": 0.4765625, + "learning_rate": 0.00011919913368307425, + "loss": 0.3315, + "step": 14828 + }, + { + "epoch": 1.98, + "grad_norm": 0.478515625, + "learning_rate": 0.00011918770517271403, + "loss": 0.5437, + "step": 14829 + }, + { + "epoch": 1.98, + "grad_norm": 0.59375, + "learning_rate": 0.00011917627640215623, + "loss": 0.5117, + "step": 14830 + }, + { + "epoch": 1.98, + "grad_norm": 0.478515625, + "learning_rate": 0.00011916484737155582, + "loss": 0.3999, + "step": 14831 + }, + { + "epoch": 1.98, + "grad_norm": 0.59375, + "learning_rate": 0.00011915341808106777, + "loss": 0.312, + "step": 14832 + }, + { + "epoch": 1.98, + "grad_norm": 0.51171875, + "learning_rate": 0.00011914198853084708, + "loss": 0.3249, + "step": 14833 + }, + { + "epoch": 1.98, + "grad_norm": 0.455078125, + "learning_rate": 0.00011913055872104875, + "loss": 0.3916, + "step": 14834 + }, + { + "epoch": 1.98, + "grad_norm": 0.62890625, + "learning_rate": 0.00011911912865182775, + "loss": 0.2956, + "step": 14835 + }, + { + "epoch": 1.98, + "grad_norm": 0.61328125, + "learning_rate": 0.0001191076983233391, + "loss": 0.5731, + "step": 14836 + }, + { + "epoch": 1.98, + "grad_norm": 0.6328125, + "learning_rate": 0.00011909626773573781, + "loss": 0.4202, + "step": 14837 + }, + { + "epoch": 1.98, + "grad_norm": 0.5859375, + "learning_rate": 0.00011908483688917885, + "loss": 0.3295, + "step": 14838 + }, + { + "epoch": 1.98, + "grad_norm": 0.3671875, + "learning_rate": 0.00011907340578381729, + "loss": 0.1841, + "step": 14839 + }, + { + "epoch": 1.98, + "grad_norm": 0.5546875, + "learning_rate": 0.00011906197441980807, + "loss": 0.6539, + "step": 14840 + }, + { + "epoch": 1.98, + "grad_norm": 0.546875, + "learning_rate": 0.00011905054279730625, + "loss": 0.3153, + "step": 14841 + }, + { + "epoch": 1.98, + "grad_norm": 0.5234375, + "learning_rate": 0.00011903911091646684, + "loss": 0.623, + "step": 14842 + }, + { + "epoch": 1.98, + "grad_norm": 0.404296875, + "learning_rate": 0.00011902767877744489, + "loss": 0.4061, + "step": 14843 + }, + { + "epoch": 1.98, + "grad_norm": 0.51953125, + "learning_rate": 0.00011901624638039537, + "loss": 0.3729, + "step": 14844 + }, + { + "epoch": 1.98, + "grad_norm": 0.494140625, + "learning_rate": 0.00011900481372547338, + "loss": 0.2549, + "step": 14845 + }, + { + "epoch": 1.98, + "grad_norm": 0.5390625, + "learning_rate": 0.00011899338081283391, + "loss": 0.5756, + "step": 14846 + }, + { + "epoch": 1.98, + "grad_norm": 0.67578125, + "learning_rate": 0.00011898194764263197, + "loss": 0.4899, + "step": 14847 + }, + { + "epoch": 1.98, + "grad_norm": 0.46484375, + "learning_rate": 0.0001189705142150227, + "loss": 0.2369, + "step": 14848 + }, + { + "epoch": 1.98, + "grad_norm": 0.515625, + "learning_rate": 0.00011895908053016103, + "loss": 0.4164, + "step": 14849 + }, + { + "epoch": 1.98, + "grad_norm": 0.56640625, + "learning_rate": 0.00011894764658820207, + "loss": 0.2069, + "step": 14850 + }, + { + "epoch": 1.98, + "grad_norm": 0.671875, + "learning_rate": 0.00011893621238930088, + "loss": 0.4247, + "step": 14851 + }, + { + "epoch": 1.98, + "grad_norm": 0.66015625, + "learning_rate": 0.0001189247779336125, + "loss": 0.5254, + "step": 14852 + }, + { + "epoch": 1.98, + "grad_norm": 0.6015625, + "learning_rate": 0.00011891334322129197, + "loss": 0.2925, + "step": 14853 + }, + { + "epoch": 1.98, + "grad_norm": 0.396484375, + "learning_rate": 0.00011890190825249434, + "loss": 0.2929, + "step": 14854 + }, + { + "epoch": 1.98, + "grad_norm": 0.59375, + "learning_rate": 0.00011889047302737475, + "loss": 0.4243, + "step": 14855 + }, + { + "epoch": 1.98, + "grad_norm": 0.453125, + "learning_rate": 0.0001188790375460882, + "loss": 0.3377, + "step": 14856 + }, + { + "epoch": 1.98, + "grad_norm": 0.61328125, + "learning_rate": 0.0001188676018087898, + "loss": 0.3803, + "step": 14857 + }, + { + "epoch": 1.98, + "grad_norm": 0.546875, + "learning_rate": 0.00011885616581563461, + "loss": 0.389, + "step": 14858 + }, + { + "epoch": 1.98, + "grad_norm": 0.6328125, + "learning_rate": 0.00011884472956677768, + "loss": 0.4751, + "step": 14859 + }, + { + "epoch": 1.98, + "grad_norm": 0.44921875, + "learning_rate": 0.00011883329306237416, + "loss": 0.2883, + "step": 14860 + }, + { + "epoch": 1.98, + "grad_norm": 0.7109375, + "learning_rate": 0.00011882185630257905, + "loss": 0.5113, + "step": 14861 + }, + { + "epoch": 1.98, + "grad_norm": 0.59375, + "learning_rate": 0.00011881041928754752, + "loss": 0.5295, + "step": 14862 + }, + { + "epoch": 1.98, + "grad_norm": 0.36328125, + "learning_rate": 0.00011879898201743462, + "loss": 0.202, + "step": 14863 + }, + { + "epoch": 1.98, + "grad_norm": 0.5625, + "learning_rate": 0.0001187875444923955, + "loss": 0.328, + "step": 14864 + }, + { + "epoch": 1.98, + "grad_norm": 0.5390625, + "learning_rate": 0.0001187761067125852, + "loss": 0.2253, + "step": 14865 + }, + { + "epoch": 1.98, + "grad_norm": 0.482421875, + "learning_rate": 0.00011876466867815883, + "loss": 0.5311, + "step": 14866 + }, + { + "epoch": 1.98, + "grad_norm": 0.4375, + "learning_rate": 0.00011875323038927151, + "loss": 0.3402, + "step": 14867 + }, + { + "epoch": 1.98, + "grad_norm": 0.56640625, + "learning_rate": 0.00011874179184607832, + "loss": 0.3168, + "step": 14868 + }, + { + "epoch": 1.98, + "grad_norm": 0.734375, + "learning_rate": 0.00011873035304873445, + "loss": 0.6806, + "step": 14869 + }, + { + "epoch": 1.98, + "grad_norm": 0.5, + "learning_rate": 0.00011871891399739496, + "loss": 0.3774, + "step": 14870 + }, + { + "epoch": 1.98, + "grad_norm": 0.5546875, + "learning_rate": 0.00011870747469221499, + "loss": 0.548, + "step": 14871 + }, + { + "epoch": 1.98, + "grad_norm": 0.447265625, + "learning_rate": 0.00011869603513334964, + "loss": 0.4677, + "step": 14872 + }, + { + "epoch": 1.98, + "grad_norm": 0.59375, + "learning_rate": 0.00011868459532095406, + "loss": 0.7963, + "step": 14873 + }, + { + "epoch": 1.98, + "grad_norm": 0.53515625, + "learning_rate": 0.00011867315525518336, + "loss": 0.7818, + "step": 14874 + }, + { + "epoch": 1.98, + "grad_norm": 0.478515625, + "learning_rate": 0.00011866171493619273, + "loss": 0.3012, + "step": 14875 + }, + { + "epoch": 1.99, + "grad_norm": 0.67578125, + "learning_rate": 0.00011865027436413724, + "loss": 0.7917, + "step": 14876 + }, + { + "epoch": 1.99, + "grad_norm": 0.59375, + "learning_rate": 0.00011863883353917206, + "loss": 0.542, + "step": 14877 + }, + { + "epoch": 1.99, + "grad_norm": 0.72265625, + "learning_rate": 0.00011862739246145236, + "loss": 0.6103, + "step": 14878 + }, + { + "epoch": 1.99, + "grad_norm": 0.4140625, + "learning_rate": 0.00011861595113113322, + "loss": 0.1714, + "step": 14879 + }, + { + "epoch": 1.99, + "grad_norm": 0.78125, + "learning_rate": 0.00011860450954836988, + "loss": 0.3444, + "step": 14880 + }, + { + "epoch": 1.99, + "grad_norm": 0.462890625, + "learning_rate": 0.00011859306771331742, + "loss": 0.2861, + "step": 14881 + }, + { + "epoch": 1.99, + "grad_norm": 0.5078125, + "learning_rate": 0.00011858162562613104, + "loss": 0.3524, + "step": 14882 + }, + { + "epoch": 1.99, + "grad_norm": 0.578125, + "learning_rate": 0.0001185701832869659, + "loss": 0.5238, + "step": 14883 + }, + { + "epoch": 1.99, + "grad_norm": 0.400390625, + "learning_rate": 0.00011855874069597713, + "loss": 0.3338, + "step": 14884 + }, + { + "epoch": 1.99, + "grad_norm": 0.3515625, + "learning_rate": 0.00011854729785331995, + "loss": 0.3149, + "step": 14885 + }, + { + "epoch": 1.99, + "grad_norm": 0.4765625, + "learning_rate": 0.00011853585475914948, + "loss": 0.259, + "step": 14886 + }, + { + "epoch": 1.99, + "grad_norm": 0.51171875, + "learning_rate": 0.00011852441141362094, + "loss": 0.3318, + "step": 14887 + }, + { + "epoch": 1.99, + "grad_norm": 0.62890625, + "learning_rate": 0.00011851296781688952, + "loss": 0.4579, + "step": 14888 + }, + { + "epoch": 1.99, + "grad_norm": 0.6953125, + "learning_rate": 0.00011850152396911034, + "loss": 0.3794, + "step": 14889 + }, + { + "epoch": 1.99, + "grad_norm": 0.447265625, + "learning_rate": 0.00011849007987043863, + "loss": 0.339, + "step": 14890 + }, + { + "epoch": 1.99, + "grad_norm": 0.4453125, + "learning_rate": 0.00011847863552102957, + "loss": 0.3588, + "step": 14891 + }, + { + "epoch": 1.99, + "grad_norm": 0.5703125, + "learning_rate": 0.00011846719092103835, + "loss": 0.3821, + "step": 14892 + }, + { + "epoch": 1.99, + "grad_norm": 0.39453125, + "learning_rate": 0.00011845574607062015, + "loss": 0.3922, + "step": 14893 + }, + { + "epoch": 1.99, + "grad_norm": 0.427734375, + "learning_rate": 0.00011844430096993025, + "loss": 0.3135, + "step": 14894 + }, + { + "epoch": 1.99, + "grad_norm": 0.5390625, + "learning_rate": 0.00011843285561912374, + "loss": 0.4282, + "step": 14895 + }, + { + "epoch": 1.99, + "grad_norm": 0.91015625, + "learning_rate": 0.0001184214100183559, + "loss": 0.4317, + "step": 14896 + }, + { + "epoch": 1.99, + "grad_norm": 0.625, + "learning_rate": 0.00011840996416778187, + "loss": 0.4395, + "step": 14897 + }, + { + "epoch": 1.99, + "grad_norm": 0.375, + "learning_rate": 0.00011839851806755695, + "loss": 0.1778, + "step": 14898 + }, + { + "epoch": 1.99, + "grad_norm": 0.4765625, + "learning_rate": 0.00011838707171783632, + "loss": 0.3719, + "step": 14899 + }, + { + "epoch": 1.99, + "grad_norm": 0.5234375, + "learning_rate": 0.00011837562511877518, + "loss": 0.2053, + "step": 14900 + }, + { + "epoch": 1.99, + "grad_norm": 0.41015625, + "learning_rate": 0.00011836417827052879, + "loss": 0.3498, + "step": 14901 + }, + { + "epoch": 1.99, + "grad_norm": 0.796875, + "learning_rate": 0.00011835273117325234, + "loss": 0.1711, + "step": 14902 + }, + { + "epoch": 1.99, + "grad_norm": 0.466796875, + "learning_rate": 0.0001183412838271011, + "loss": 0.5137, + "step": 14903 + }, + { + "epoch": 1.99, + "grad_norm": 0.69921875, + "learning_rate": 0.00011832983623223024, + "loss": 0.3431, + "step": 14904 + }, + { + "epoch": 1.99, + "grad_norm": 0.51171875, + "learning_rate": 0.00011831838838879507, + "loss": 0.7699, + "step": 14905 + }, + { + "epoch": 1.99, + "grad_norm": 0.53515625, + "learning_rate": 0.00011830694029695079, + "loss": 0.3593, + "step": 14906 + }, + { + "epoch": 1.99, + "grad_norm": 0.455078125, + "learning_rate": 0.00011829549195685263, + "loss": 0.3424, + "step": 14907 + }, + { + "epoch": 1.99, + "grad_norm": 0.328125, + "learning_rate": 0.00011828404336865591, + "loss": 0.1492, + "step": 14908 + }, + { + "epoch": 1.99, + "grad_norm": 0.8515625, + "learning_rate": 0.00011827259453251576, + "loss": 0.6897, + "step": 14909 + }, + { + "epoch": 1.99, + "grad_norm": 0.447265625, + "learning_rate": 0.00011826114544858755, + "loss": 0.4256, + "step": 14910 + }, + { + "epoch": 1.99, + "grad_norm": 0.65625, + "learning_rate": 0.00011824969611702645, + "loss": 0.6376, + "step": 14911 + }, + { + "epoch": 1.99, + "grad_norm": 0.322265625, + "learning_rate": 0.00011823824653798781, + "loss": 0.1626, + "step": 14912 + }, + { + "epoch": 1.99, + "grad_norm": 0.58203125, + "learning_rate": 0.0001182267967116268, + "loss": 0.4098, + "step": 14913 + }, + { + "epoch": 1.99, + "grad_norm": 0.5703125, + "learning_rate": 0.00011821534663809875, + "loss": 0.2766, + "step": 14914 + }, + { + "epoch": 1.99, + "grad_norm": 0.50390625, + "learning_rate": 0.00011820389631755888, + "loss": 0.3625, + "step": 14915 + }, + { + "epoch": 1.99, + "grad_norm": 0.50390625, + "learning_rate": 0.00011819244575016253, + "loss": 0.582, + "step": 14916 + }, + { + "epoch": 1.99, + "grad_norm": 0.75390625, + "learning_rate": 0.00011818099493606489, + "loss": 0.8586, + "step": 14917 + }, + { + "epoch": 1.99, + "grad_norm": 0.5546875, + "learning_rate": 0.00011816954387542134, + "loss": 0.4175, + "step": 14918 + }, + { + "epoch": 1.99, + "grad_norm": 0.71875, + "learning_rate": 0.00011815809256838711, + "loss": 0.4242, + "step": 14919 + }, + { + "epoch": 1.99, + "grad_norm": 0.302734375, + "learning_rate": 0.00011814664101511745, + "loss": 0.1263, + "step": 14920 + }, + { + "epoch": 1.99, + "grad_norm": 0.326171875, + "learning_rate": 0.00011813518921576772, + "loss": 0.2261, + "step": 14921 + }, + { + "epoch": 1.99, + "grad_norm": 0.396484375, + "learning_rate": 0.00011812373717049317, + "loss": 0.156, + "step": 14922 + }, + { + "epoch": 1.99, + "grad_norm": 0.462890625, + "learning_rate": 0.00011811228487944914, + "loss": 0.4127, + "step": 14923 + }, + { + "epoch": 1.99, + "grad_norm": 0.359375, + "learning_rate": 0.00011810083234279088, + "loss": 0.1596, + "step": 14924 + }, + { + "epoch": 1.99, + "grad_norm": 0.64453125, + "learning_rate": 0.00011808937956067372, + "loss": 0.2785, + "step": 14925 + }, + { + "epoch": 1.99, + "grad_norm": 0.439453125, + "learning_rate": 0.00011807792653325298, + "loss": 0.273, + "step": 14926 + }, + { + "epoch": 1.99, + "grad_norm": 0.578125, + "learning_rate": 0.00011806647326068392, + "loss": 0.2352, + "step": 14927 + }, + { + "epoch": 1.99, + "grad_norm": 0.53125, + "learning_rate": 0.00011805501974312193, + "loss": 0.5344, + "step": 14928 + }, + { + "epoch": 1.99, + "grad_norm": 0.4296875, + "learning_rate": 0.00011804356598072223, + "loss": 0.215, + "step": 14929 + }, + { + "epoch": 1.99, + "grad_norm": 0.546875, + "learning_rate": 0.00011803211197364024, + "loss": 0.305, + "step": 14930 + }, + { + "epoch": 1.99, + "grad_norm": 0.466796875, + "learning_rate": 0.00011802065772203124, + "loss": 0.3529, + "step": 14931 + }, + { + "epoch": 1.99, + "grad_norm": 0.5546875, + "learning_rate": 0.00011800920322605053, + "loss": 0.4477, + "step": 14932 + }, + { + "epoch": 1.99, + "grad_norm": 0.4296875, + "learning_rate": 0.00011799774848585347, + "loss": 0.3209, + "step": 14933 + }, + { + "epoch": 1.99, + "grad_norm": 0.6484375, + "learning_rate": 0.0001179862935015954, + "loss": 0.4631, + "step": 14934 + }, + { + "epoch": 1.99, + "grad_norm": 0.5625, + "learning_rate": 0.00011797483827343166, + "loss": 0.428, + "step": 14935 + }, + { + "epoch": 1.99, + "grad_norm": 0.458984375, + "learning_rate": 0.00011796338280151756, + "loss": 0.2008, + "step": 14936 + }, + { + "epoch": 1.99, + "grad_norm": 0.51953125, + "learning_rate": 0.00011795192708600846, + "loss": 0.252, + "step": 14937 + }, + { + "epoch": 1.99, + "grad_norm": 0.6640625, + "learning_rate": 0.0001179404711270597, + "loss": 0.6566, + "step": 14938 + }, + { + "epoch": 1.99, + "grad_norm": 0.39453125, + "learning_rate": 0.00011792901492482666, + "loss": 0.3794, + "step": 14939 + }, + { + "epoch": 1.99, + "grad_norm": 0.44921875, + "learning_rate": 0.00011791755847946464, + "loss": 0.3842, + "step": 14940 + }, + { + "epoch": 1.99, + "grad_norm": 0.54296875, + "learning_rate": 0.00011790610179112904, + "loss": 0.3435, + "step": 14941 + }, + { + "epoch": 1.99, + "grad_norm": 0.47265625, + "learning_rate": 0.00011789464485997524, + "loss": 0.2461, + "step": 14942 + }, + { + "epoch": 1.99, + "grad_norm": 0.45703125, + "learning_rate": 0.00011788318768615854, + "loss": 0.3221, + "step": 14943 + }, + { + "epoch": 1.99, + "grad_norm": 0.64453125, + "learning_rate": 0.00011787173026983433, + "loss": 0.4689, + "step": 14944 + }, + { + "epoch": 1.99, + "grad_norm": 0.4453125, + "learning_rate": 0.000117860272611158, + "loss": 0.2518, + "step": 14945 + }, + { + "epoch": 1.99, + "grad_norm": 0.46875, + "learning_rate": 0.00011784881471028493, + "loss": 0.4221, + "step": 14946 + }, + { + "epoch": 1.99, + "grad_norm": 0.703125, + "learning_rate": 0.00011783735656737043, + "loss": 0.6904, + "step": 14947 + }, + { + "epoch": 1.99, + "grad_norm": 0.62109375, + "learning_rate": 0.00011782589818256998, + "loss": 0.2606, + "step": 14948 + }, + { + "epoch": 1.99, + "grad_norm": 0.5078125, + "learning_rate": 0.00011781443955603889, + "loss": 0.2525, + "step": 14949 + }, + { + "epoch": 1.99, + "grad_norm": 0.57421875, + "learning_rate": 0.00011780298068793254, + "loss": 0.2822, + "step": 14950 + }, + { + "epoch": 2.0, + "grad_norm": 0.462890625, + "learning_rate": 0.00011779152157840636, + "loss": 0.194, + "step": 14951 + }, + { + "epoch": 2.0, + "grad_norm": 0.44921875, + "learning_rate": 0.00011778006222761573, + "loss": 0.3072, + "step": 14952 + }, + { + "epoch": 2.0, + "grad_norm": 0.54296875, + "learning_rate": 0.00011776860263571606, + "loss": 0.3413, + "step": 14953 + }, + { + "epoch": 2.0, + "grad_norm": 0.5234375, + "learning_rate": 0.00011775714280286272, + "loss": 0.1695, + "step": 14954 + }, + { + "epoch": 2.0, + "grad_norm": 0.427734375, + "learning_rate": 0.00011774568272921112, + "loss": 0.3778, + "step": 14955 + }, + { + "epoch": 2.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00011773422241491669, + "loss": 0.3211, + "step": 14956 + }, + { + "epoch": 2.0, + "grad_norm": 0.50390625, + "learning_rate": 0.00011772276186013482, + "loss": 0.6597, + "step": 14957 + }, + { + "epoch": 2.0, + "grad_norm": 0.84765625, + "learning_rate": 0.00011771130106502093, + "loss": 0.3633, + "step": 14958 + }, + { + "epoch": 2.0, + "grad_norm": 0.66015625, + "learning_rate": 0.0001176998400297304, + "loss": 0.5367, + "step": 14959 + }, + { + "epoch": 2.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00011768837875441871, + "loss": 0.2527, + "step": 14960 + }, + { + "epoch": 2.0, + "grad_norm": 0.443359375, + "learning_rate": 0.00011767691723924126, + "loss": 0.2567, + "step": 14961 + }, + { + "epoch": 2.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00011766545548435346, + "loss": 0.2365, + "step": 14962 + }, + { + "epoch": 2.0, + "grad_norm": 0.484375, + "learning_rate": 0.00011765399348991071, + "loss": 0.2089, + "step": 14963 + }, + { + "epoch": 2.0, + "grad_norm": 0.341796875, + "learning_rate": 0.00011764253125606852, + "loss": 0.2413, + "step": 14964 + }, + { + "epoch": 2.0, + "grad_norm": 0.498046875, + "learning_rate": 0.00011763106878298225, + "loss": 0.2369, + "step": 14965 + }, + { + "epoch": 2.0, + "grad_norm": 0.65234375, + "learning_rate": 0.00011761960607080738, + "loss": 0.2201, + "step": 14966 + }, + { + "epoch": 2.0, + "grad_norm": 0.7109375, + "learning_rate": 0.00011760814311969939, + "loss": 0.3282, + "step": 14967 + }, + { + "epoch": 2.0, + "grad_norm": 0.515625, + "learning_rate": 0.00011759667992981363, + "loss": 0.3469, + "step": 14968 + }, + { + "epoch": 2.0, + "grad_norm": 0.44921875, + "learning_rate": 0.0001175852165013056, + "loss": 0.2275, + "step": 14969 + }, + { + "epoch": 2.0, + "grad_norm": 0.48046875, + "learning_rate": 0.00011757375283433076, + "loss": 0.3024, + "step": 14970 + }, + { + "epoch": 2.0, + "grad_norm": 0.482421875, + "learning_rate": 0.00011756228892904455, + "loss": 0.4856, + "step": 14971 + }, + { + "epoch": 2.0, + "grad_norm": 0.48046875, + "learning_rate": 0.00011755082478560242, + "loss": 0.2495, + "step": 14972 + }, + { + "epoch": 2.0, + "grad_norm": 0.4609375, + "learning_rate": 0.00011753936040415987, + "loss": 0.4234, + "step": 14973 + }, + { + "epoch": 2.0, + "grad_norm": 0.52734375, + "learning_rate": 0.0001175278957848723, + "loss": 0.2417, + "step": 14974 + }, + { + "epoch": 2.0, + "grad_norm": 0.54296875, + "learning_rate": 0.00011751643092789523, + "loss": 0.2205, + "step": 14975 + }, + { + "epoch": 2.0, + "grad_norm": 0.59765625, + "learning_rate": 0.00011750496583338413, + "loss": 0.5844, + "step": 14976 + }, + { + "epoch": 2.0, + "grad_norm": 0.59765625, + "learning_rate": 0.00011749350050149444, + "loss": 0.5803, + "step": 14977 + }, + { + "epoch": 2.0, + "grad_norm": 0.43359375, + "learning_rate": 0.00011748203493238165, + "loss": 0.2905, + "step": 14978 + }, + { + "epoch": 2.0, + "grad_norm": 0.5390625, + "learning_rate": 0.0001174705691262013, + "loss": 0.53, + "step": 14979 + }, + { + "epoch": 2.0, + "grad_norm": 0.47265625, + "learning_rate": 0.00011745910308310874, + "loss": 0.2355, + "step": 14980 + }, + { + "epoch": 2.0, + "grad_norm": 0.59765625, + "learning_rate": 0.00011744763680325958, + "loss": 0.404, + "step": 14981 + }, + { + "epoch": 2.0, + "grad_norm": 0.490234375, + "learning_rate": 0.00011743617028680926, + "loss": 0.3228, + "step": 14982 + }, + { + "epoch": 2.0, + "grad_norm": 0.57421875, + "learning_rate": 0.00011742470353391329, + "loss": 0.2485, + "step": 14983 + }, + { + "epoch": 2.0, + "grad_norm": 0.482421875, + "learning_rate": 0.00011741323654472713, + "loss": 0.2395, + "step": 14984 + }, + { + "epoch": 2.0, + "grad_norm": 0.73828125, + "learning_rate": 0.00011740176931940637, + "loss": 0.9104, + "step": 14985 + }, + { + "epoch": 2.0, + "grad_norm": 0.4609375, + "learning_rate": 0.00011739030185810638, + "loss": 0.2303, + "step": 14986 + }, + { + "epoch": 2.0, + "grad_norm": 0.59375, + "learning_rate": 0.00011737883416098276, + "loss": 0.3065, + "step": 14987 + }, + { + "epoch": 2.0, + "grad_norm": 0.609375, + "learning_rate": 0.000117367366228191, + "loss": 0.3393, + "step": 14988 + }, + { + "epoch": 2.0, + "grad_norm": 0.455078125, + "learning_rate": 0.0001173558980598866, + "loss": 0.3889, + "step": 14989 + }, + { + "epoch": 2.0, + "grad_norm": 0.55078125, + "learning_rate": 0.00011734442965622507, + "loss": 0.3195, + "step": 14990 + }, + { + "epoch": 2.0, + "grad_norm": 0.4609375, + "learning_rate": 0.00011733296101736197, + "loss": 0.3359, + "step": 14991 + }, + { + "epoch": 2.0, + "grad_norm": 0.5, + "learning_rate": 0.0001173214921434528, + "loss": 0.3953, + "step": 14992 + }, + { + "epoch": 2.0, + "grad_norm": 0.6171875, + "learning_rate": 0.00011731002303465306, + "loss": 0.4045, + "step": 14993 + }, + { + "epoch": 2.0, + "grad_norm": 0.328125, + "learning_rate": 0.0001172985536911183, + "loss": 0.203, + "step": 14994 + }, + { + "epoch": 2.0, + "grad_norm": 0.58203125, + "learning_rate": 0.00011728708411300404, + "loss": 0.5638, + "step": 14995 + }, + { + "epoch": 2.0, + "grad_norm": 0.423828125, + "learning_rate": 0.00011727561430046585, + "loss": 0.2174, + "step": 14996 + }, + { + "epoch": 2.0, + "grad_norm": 0.58984375, + "learning_rate": 0.00011726414425365924, + "loss": 0.3748, + "step": 14997 + }, + { + "epoch": 2.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00011725267397273974, + "loss": 0.2428, + "step": 14998 + }, + { + "epoch": 2.0, + "grad_norm": 0.462890625, + "learning_rate": 0.00011724120345786291, + "loss": 0.1819, + "step": 14999 + }, + { + "epoch": 2.0, + "grad_norm": 0.41796875, + "learning_rate": 0.00011722973270918429, + "loss": 0.2024, + "step": 15000 + }, + { + "epoch": 2.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00011721826172685944, + "loss": 0.3794, + "step": 15001 + }, + { + "epoch": 2.0, + "grad_norm": 0.515625, + "learning_rate": 0.0001172067905110439, + "loss": 0.2565, + "step": 15002 + }, + { + "epoch": 2.0, + "grad_norm": 0.478515625, + "learning_rate": 0.00011719531906189327, + "loss": 0.3958, + "step": 15003 + }, + { + "epoch": 2.0, + "grad_norm": 0.490234375, + "learning_rate": 0.00011718384737956308, + "loss": 0.2135, + "step": 15004 + }, + { + "epoch": 2.0, + "grad_norm": 0.482421875, + "learning_rate": 0.00011717237546420885, + "loss": 0.2082, + "step": 15005 + }, + { + "epoch": 2.0, + "grad_norm": 0.453125, + "learning_rate": 0.00011716090331598622, + "loss": 0.332, + "step": 15006 + }, + { + "epoch": 2.0, + "grad_norm": 0.337890625, + "learning_rate": 0.00011714943093505069, + "loss": 0.147, + "step": 15007 + }, + { + "epoch": 2.0, + "grad_norm": 0.337890625, + "learning_rate": 0.00011713795832155789, + "loss": 0.1774, + "step": 15008 + }, + { + "epoch": 2.0, + "grad_norm": 0.578125, + "learning_rate": 0.0001171264854756634, + "loss": 0.3288, + "step": 15009 + }, + { + "epoch": 2.0, + "grad_norm": 0.50390625, + "learning_rate": 0.00011711501239752277, + "loss": 0.4265, + "step": 15010 + }, + { + "epoch": 2.0, + "grad_norm": 0.625, + "learning_rate": 0.00011710353908729156, + "loss": 0.4932, + "step": 15011 + }, + { + "epoch": 2.0, + "grad_norm": 0.455078125, + "learning_rate": 0.0001170920655451254, + "loss": 0.2537, + "step": 15012 + }, + { + "epoch": 2.0, + "grad_norm": 0.337890625, + "learning_rate": 0.00011708059177117985, + "loss": 0.1772, + "step": 15013 + }, + { + "epoch": 2.0, + "grad_norm": 0.384765625, + "learning_rate": 0.00011706911776561051, + "loss": 0.2429, + "step": 15014 + }, + { + "epoch": 2.0, + "grad_norm": 0.6640625, + "learning_rate": 0.00011705764352857303, + "loss": 0.5988, + "step": 15015 + }, + { + "epoch": 2.0, + "grad_norm": 0.392578125, + "learning_rate": 0.0001170461690602229, + "loss": 0.2139, + "step": 15016 + }, + { + "epoch": 2.0, + "grad_norm": 0.412109375, + "learning_rate": 0.0001170346943607158, + "loss": 0.1912, + "step": 15017 + }, + { + "epoch": 2.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00011702321943020729, + "loss": 0.1775, + "step": 15018 + }, + { + "epoch": 2.0, + "grad_norm": 0.60546875, + "learning_rate": 0.00011701174426885302, + "loss": 0.403, + "step": 15019 + }, + { + "epoch": 2.0, + "grad_norm": 0.37109375, + "learning_rate": 0.00011700026887680855, + "loss": 0.276, + "step": 15020 + }, + { + "epoch": 2.0, + "grad_norm": 0.57421875, + "learning_rate": 0.00011698879325422955, + "loss": 0.3845, + "step": 15021 + }, + { + "epoch": 2.0, + "grad_norm": 0.34375, + "learning_rate": 0.00011697731740127162, + "loss": 0.2272, + "step": 15022 + }, + { + "epoch": 2.0, + "grad_norm": 0.498046875, + "learning_rate": 0.00011696584131809037, + "loss": 0.1725, + "step": 15023 + }, + { + "epoch": 2.0, + "grad_norm": 0.75, + "learning_rate": 0.00011695436500484141, + "loss": 0.3979, + "step": 15024 + }, + { + "epoch": 2.0, + "grad_norm": 0.76171875, + "learning_rate": 0.00011694288846168036, + "loss": 0.6306, + "step": 15025 + }, + { + "epoch": 2.01, + "grad_norm": 0.5625, + "learning_rate": 0.0001169314116887629, + "loss": 0.2976, + "step": 15026 + }, + { + "epoch": 2.01, + "grad_norm": 0.392578125, + "learning_rate": 0.0001169199346862446, + "loss": 0.2783, + "step": 15027 + }, + { + "epoch": 2.01, + "grad_norm": 0.48046875, + "learning_rate": 0.00011690845745428116, + "loss": 0.1672, + "step": 15028 + }, + { + "epoch": 2.01, + "grad_norm": 0.5390625, + "learning_rate": 0.00011689697999302817, + "loss": 0.3615, + "step": 15029 + }, + { + "epoch": 2.01, + "grad_norm": 0.75390625, + "learning_rate": 0.00011688550230264128, + "loss": 0.2946, + "step": 15030 + }, + { + "epoch": 2.01, + "grad_norm": 0.5390625, + "learning_rate": 0.00011687402438327617, + "loss": 0.5316, + "step": 15031 + }, + { + "epoch": 2.01, + "grad_norm": 0.462890625, + "learning_rate": 0.00011686254623508843, + "loss": 0.5237, + "step": 15032 + }, + { + "epoch": 2.01, + "grad_norm": 0.4765625, + "learning_rate": 0.00011685106785823374, + "loss": 0.5077, + "step": 15033 + }, + { + "epoch": 2.01, + "grad_norm": 0.56640625, + "learning_rate": 0.00011683958925286777, + "loss": 0.3268, + "step": 15034 + }, + { + "epoch": 2.01, + "grad_norm": 0.625, + "learning_rate": 0.00011682811041914618, + "loss": 0.5932, + "step": 15035 + }, + { + "epoch": 2.01, + "grad_norm": 0.408203125, + "learning_rate": 0.00011681663135722459, + "loss": 0.2454, + "step": 15036 + }, + { + "epoch": 2.01, + "grad_norm": 0.51953125, + "learning_rate": 0.00011680515206725868, + "loss": 0.5948, + "step": 15037 + }, + { + "epoch": 2.01, + "grad_norm": 0.63671875, + "learning_rate": 0.00011679367254940413, + "loss": 0.438, + "step": 15038 + }, + { + "epoch": 2.01, + "grad_norm": 0.76953125, + "learning_rate": 0.00011678219280381657, + "loss": 0.4452, + "step": 15039 + }, + { + "epoch": 2.01, + "grad_norm": 0.40625, + "learning_rate": 0.00011677071283065177, + "loss": 0.23, + "step": 15040 + }, + { + "epoch": 2.01, + "grad_norm": 0.5078125, + "learning_rate": 0.0001167592326300653, + "loss": 0.3776, + "step": 15041 + }, + { + "epoch": 2.01, + "grad_norm": 0.6953125, + "learning_rate": 0.00011674775220221288, + "loss": 0.3034, + "step": 15042 + }, + { + "epoch": 2.01, + "grad_norm": 0.58203125, + "learning_rate": 0.0001167362715472502, + "loss": 0.4106, + "step": 15043 + }, + { + "epoch": 2.01, + "grad_norm": 0.53125, + "learning_rate": 0.00011672479066533293, + "loss": 0.3085, + "step": 15044 + }, + { + "epoch": 2.01, + "grad_norm": 0.546875, + "learning_rate": 0.00011671330955661677, + "loss": 0.4423, + "step": 15045 + }, + { + "epoch": 2.01, + "grad_norm": 0.322265625, + "learning_rate": 0.0001167018282212574, + "loss": 0.1235, + "step": 15046 + }, + { + "epoch": 2.01, + "grad_norm": 0.435546875, + "learning_rate": 0.00011669034665941052, + "loss": 0.3422, + "step": 15047 + }, + { + "epoch": 2.01, + "grad_norm": 0.546875, + "learning_rate": 0.00011667886487123182, + "loss": 0.2995, + "step": 15048 + }, + { + "epoch": 2.01, + "grad_norm": 0.58984375, + "learning_rate": 0.00011666738285687703, + "loss": 0.4199, + "step": 15049 + }, + { + "epoch": 2.01, + "grad_norm": 0.5, + "learning_rate": 0.00011665590061650183, + "loss": 0.4456, + "step": 15050 + }, + { + "epoch": 2.01, + "grad_norm": 0.4921875, + "learning_rate": 0.00011664441815026192, + "loss": 0.2287, + "step": 15051 + }, + { + "epoch": 2.01, + "grad_norm": 0.45703125, + "learning_rate": 0.00011663293545831302, + "loss": 0.1751, + "step": 15052 + }, + { + "epoch": 2.01, + "grad_norm": 0.5625, + "learning_rate": 0.00011662145254081082, + "loss": 0.3597, + "step": 15053 + }, + { + "epoch": 2.01, + "grad_norm": 0.58984375, + "learning_rate": 0.00011660996939791109, + "loss": 0.2726, + "step": 15054 + }, + { + "epoch": 2.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00011659848602976949, + "loss": 0.2817, + "step": 15055 + }, + { + "epoch": 2.01, + "grad_norm": 0.58203125, + "learning_rate": 0.00011658700243654177, + "loss": 0.4301, + "step": 15056 + }, + { + "epoch": 2.01, + "grad_norm": 0.408203125, + "learning_rate": 0.00011657551861838365, + "loss": 0.1896, + "step": 15057 + }, + { + "epoch": 2.01, + "grad_norm": 0.48046875, + "learning_rate": 0.00011656403457545089, + "loss": 0.3975, + "step": 15058 + }, + { + "epoch": 2.01, + "grad_norm": 0.435546875, + "learning_rate": 0.00011655255030789915, + "loss": 0.1839, + "step": 15059 + }, + { + "epoch": 2.01, + "grad_norm": 0.51171875, + "learning_rate": 0.00011654106581588422, + "loss": 0.2077, + "step": 15060 + }, + { + "epoch": 2.01, + "grad_norm": 0.64453125, + "learning_rate": 0.00011652958109956183, + "loss": 0.3952, + "step": 15061 + }, + { + "epoch": 2.01, + "grad_norm": 0.35546875, + "learning_rate": 0.0001165180961590877, + "loss": 0.1375, + "step": 15062 + }, + { + "epoch": 2.01, + "grad_norm": 0.431640625, + "learning_rate": 0.00011650661099461757, + "loss": 0.3795, + "step": 15063 + }, + { + "epoch": 2.01, + "grad_norm": 0.404296875, + "learning_rate": 0.00011649512560630724, + "loss": 0.2417, + "step": 15064 + }, + { + "epoch": 2.01, + "grad_norm": 0.55078125, + "learning_rate": 0.0001164836399943124, + "loss": 0.3292, + "step": 15065 + }, + { + "epoch": 2.01, + "grad_norm": 0.46484375, + "learning_rate": 0.00011647215415878881, + "loss": 0.3073, + "step": 15066 + }, + { + "epoch": 2.01, + "grad_norm": 0.51171875, + "learning_rate": 0.00011646066809989227, + "loss": 0.4545, + "step": 15067 + }, + { + "epoch": 2.01, + "grad_norm": 0.55078125, + "learning_rate": 0.00011644918181777847, + "loss": 0.4361, + "step": 15068 + }, + { + "epoch": 2.01, + "grad_norm": 0.72265625, + "learning_rate": 0.00011643769531260323, + "loss": 0.305, + "step": 15069 + }, + { + "epoch": 2.01, + "grad_norm": 0.419921875, + "learning_rate": 0.0001164262085845223, + "loss": 0.1635, + "step": 15070 + }, + { + "epoch": 2.01, + "grad_norm": 0.58203125, + "learning_rate": 0.0001164147216336914, + "loss": 0.3245, + "step": 15071 + }, + { + "epoch": 2.01, + "grad_norm": 0.455078125, + "learning_rate": 0.00011640323446026638, + "loss": 0.3776, + "step": 15072 + }, + { + "epoch": 2.01, + "grad_norm": 0.6484375, + "learning_rate": 0.00011639174706440295, + "loss": 0.4963, + "step": 15073 + }, + { + "epoch": 2.01, + "grad_norm": 0.455078125, + "learning_rate": 0.00011638025944625693, + "loss": 0.3214, + "step": 15074 + }, + { + "epoch": 2.01, + "grad_norm": 0.43359375, + "learning_rate": 0.00011636877160598403, + "loss": 0.3166, + "step": 15075 + }, + { + "epoch": 2.01, + "grad_norm": 0.52734375, + "learning_rate": 0.00011635728354374013, + "loss": 0.3185, + "step": 15076 + }, + { + "epoch": 2.01, + "grad_norm": 0.6015625, + "learning_rate": 0.00011634579525968097, + "loss": 0.3072, + "step": 15077 + }, + { + "epoch": 2.01, + "grad_norm": 0.5078125, + "learning_rate": 0.00011633430675396231, + "loss": 0.6415, + "step": 15078 + }, + { + "epoch": 2.01, + "grad_norm": 0.62890625, + "learning_rate": 0.00011632281802673999, + "loss": 0.5076, + "step": 15079 + }, + { + "epoch": 2.01, + "grad_norm": 0.408203125, + "learning_rate": 0.00011631132907816975, + "loss": 0.318, + "step": 15080 + }, + { + "epoch": 2.01, + "grad_norm": 0.39453125, + "learning_rate": 0.00011629983990840745, + "loss": 0.1481, + "step": 15081 + }, + { + "epoch": 2.01, + "grad_norm": 0.5546875, + "learning_rate": 0.00011628835051760884, + "loss": 0.3464, + "step": 15082 + }, + { + "epoch": 2.01, + "grad_norm": 0.546875, + "learning_rate": 0.00011627686090592978, + "loss": 0.2553, + "step": 15083 + }, + { + "epoch": 2.01, + "grad_norm": 0.423828125, + "learning_rate": 0.00011626537107352599, + "loss": 0.161, + "step": 15084 + }, + { + "epoch": 2.01, + "grad_norm": 0.48828125, + "learning_rate": 0.00011625388102055337, + "loss": 0.289, + "step": 15085 + }, + { + "epoch": 2.01, + "grad_norm": 0.54296875, + "learning_rate": 0.00011624239074716766, + "loss": 0.1857, + "step": 15086 + }, + { + "epoch": 2.01, + "grad_norm": 0.7109375, + "learning_rate": 0.0001162309002535247, + "loss": 0.5381, + "step": 15087 + }, + { + "epoch": 2.01, + "grad_norm": 0.609375, + "learning_rate": 0.00011621940953978036, + "loss": 0.6288, + "step": 15088 + }, + { + "epoch": 2.01, + "grad_norm": 0.47265625, + "learning_rate": 0.0001162079186060904, + "loss": 0.2746, + "step": 15089 + }, + { + "epoch": 2.01, + "grad_norm": 0.439453125, + "learning_rate": 0.00011619642745261066, + "loss": 0.2202, + "step": 15090 + }, + { + "epoch": 2.01, + "grad_norm": 0.49609375, + "learning_rate": 0.00011618493607949694, + "loss": 0.4882, + "step": 15091 + }, + { + "epoch": 2.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00011617344448690513, + "loss": 0.5736, + "step": 15092 + }, + { + "epoch": 2.01, + "grad_norm": 0.5078125, + "learning_rate": 0.00011616195267499102, + "loss": 0.2727, + "step": 15093 + }, + { + "epoch": 2.01, + "grad_norm": 0.45703125, + "learning_rate": 0.00011615046064391046, + "loss": 0.1783, + "step": 15094 + }, + { + "epoch": 2.01, + "grad_norm": 0.80078125, + "learning_rate": 0.00011613896839381931, + "loss": 0.3498, + "step": 15095 + }, + { + "epoch": 2.01, + "grad_norm": 0.375, + "learning_rate": 0.00011612747592487334, + "loss": 0.24, + "step": 15096 + }, + { + "epoch": 2.01, + "grad_norm": 0.470703125, + "learning_rate": 0.0001161159832372285, + "loss": 0.2005, + "step": 15097 + }, + { + "epoch": 2.01, + "grad_norm": 0.68359375, + "learning_rate": 0.00011610449033104054, + "loss": 0.5482, + "step": 15098 + }, + { + "epoch": 2.01, + "grad_norm": 0.5234375, + "learning_rate": 0.0001160929972064654, + "loss": 0.4128, + "step": 15099 + }, + { + "epoch": 2.01, + "grad_norm": 0.63671875, + "learning_rate": 0.00011608150386365886, + "loss": 0.5856, + "step": 15100 + }, + { + "epoch": 2.02, + "grad_norm": 0.373046875, + "learning_rate": 0.0001160700103027768, + "loss": 0.1288, + "step": 15101 + }, + { + "epoch": 2.02, + "grad_norm": 0.431640625, + "learning_rate": 0.00011605851652397513, + "loss": 0.1826, + "step": 15102 + }, + { + "epoch": 2.02, + "grad_norm": 0.52734375, + "learning_rate": 0.00011604702252740962, + "loss": 0.323, + "step": 15103 + }, + { + "epoch": 2.02, + "grad_norm": 0.63671875, + "learning_rate": 0.00011603552831323621, + "loss": 0.5584, + "step": 15104 + }, + { + "epoch": 2.02, + "grad_norm": 0.6171875, + "learning_rate": 0.00011602403388161072, + "loss": 0.6313, + "step": 15105 + }, + { + "epoch": 2.02, + "grad_norm": 0.453125, + "learning_rate": 0.00011601253923268911, + "loss": 0.2071, + "step": 15106 + }, + { + "epoch": 2.02, + "grad_norm": 0.498046875, + "learning_rate": 0.00011600104436662713, + "loss": 0.3587, + "step": 15107 + }, + { + "epoch": 2.02, + "grad_norm": 0.52734375, + "learning_rate": 0.00011598954928358074, + "loss": 0.3554, + "step": 15108 + }, + { + "epoch": 2.02, + "grad_norm": 0.5546875, + "learning_rate": 0.00011597805398370578, + "loss": 0.2959, + "step": 15109 + }, + { + "epoch": 2.02, + "grad_norm": 0.4609375, + "learning_rate": 0.00011596655846715819, + "loss": 0.4316, + "step": 15110 + }, + { + "epoch": 2.02, + "grad_norm": 0.55859375, + "learning_rate": 0.00011595506273409378, + "loss": 0.4059, + "step": 15111 + }, + { + "epoch": 2.02, + "grad_norm": 0.455078125, + "learning_rate": 0.00011594356678466851, + "loss": 0.2975, + "step": 15112 + }, + { + "epoch": 2.02, + "grad_norm": 0.435546875, + "learning_rate": 0.00011593207061903823, + "loss": 0.245, + "step": 15113 + }, + { + "epoch": 2.02, + "grad_norm": 0.56640625, + "learning_rate": 0.00011592057423735885, + "loss": 0.3271, + "step": 15114 + }, + { + "epoch": 2.02, + "grad_norm": 0.470703125, + "learning_rate": 0.00011590907763978627, + "loss": 0.2625, + "step": 15115 + }, + { + "epoch": 2.02, + "grad_norm": 0.51953125, + "learning_rate": 0.00011589758082647637, + "loss": 0.3804, + "step": 15116 + }, + { + "epoch": 2.02, + "grad_norm": 0.5078125, + "learning_rate": 0.0001158860837975851, + "loss": 0.4221, + "step": 15117 + }, + { + "epoch": 2.02, + "grad_norm": 0.43359375, + "learning_rate": 0.0001158745865532683, + "loss": 0.2066, + "step": 15118 + }, + { + "epoch": 2.02, + "grad_norm": 0.486328125, + "learning_rate": 0.00011586308909368195, + "loss": 0.2111, + "step": 15119 + }, + { + "epoch": 2.02, + "grad_norm": 0.6328125, + "learning_rate": 0.00011585159141898192, + "loss": 0.3128, + "step": 15120 + }, + { + "epoch": 2.02, + "grad_norm": 0.498046875, + "learning_rate": 0.00011584009352932413, + "loss": 0.2372, + "step": 15121 + }, + { + "epoch": 2.02, + "grad_norm": 0.515625, + "learning_rate": 0.00011582859542486453, + "loss": 0.2372, + "step": 15122 + }, + { + "epoch": 2.02, + "grad_norm": 0.65625, + "learning_rate": 0.00011581709710575896, + "loss": 0.2662, + "step": 15123 + }, + { + "epoch": 2.02, + "grad_norm": 0.435546875, + "learning_rate": 0.00011580559857216347, + "loss": 0.454, + "step": 15124 + }, + { + "epoch": 2.02, + "grad_norm": 0.59375, + "learning_rate": 0.0001157940998242339, + "loss": 0.583, + "step": 15125 + }, + { + "epoch": 2.02, + "grad_norm": 0.44921875, + "learning_rate": 0.00011578260086212618, + "loss": 0.278, + "step": 15126 + }, + { + "epoch": 2.02, + "grad_norm": 0.5625, + "learning_rate": 0.00011577110168599628, + "loss": 0.2987, + "step": 15127 + }, + { + "epoch": 2.02, + "grad_norm": 0.58203125, + "learning_rate": 0.00011575960229600009, + "loss": 0.4338, + "step": 15128 + }, + { + "epoch": 2.02, + "grad_norm": 0.439453125, + "learning_rate": 0.00011574810269229361, + "loss": 0.2012, + "step": 15129 + }, + { + "epoch": 2.02, + "grad_norm": 0.59375, + "learning_rate": 0.00011573660287503272, + "loss": 0.5355, + "step": 15130 + }, + { + "epoch": 2.02, + "grad_norm": 0.640625, + "learning_rate": 0.00011572510284437346, + "loss": 0.4721, + "step": 15131 + }, + { + "epoch": 2.02, + "grad_norm": 0.54296875, + "learning_rate": 0.00011571360260047166, + "loss": 0.4903, + "step": 15132 + }, + { + "epoch": 2.02, + "grad_norm": 0.62109375, + "learning_rate": 0.00011570210214348334, + "loss": 0.4664, + "step": 15133 + }, + { + "epoch": 2.02, + "grad_norm": 0.49609375, + "learning_rate": 0.00011569060147356441, + "loss": 0.3361, + "step": 15134 + }, + { + "epoch": 2.02, + "grad_norm": 0.5859375, + "learning_rate": 0.00011567910059087088, + "loss": 0.4007, + "step": 15135 + }, + { + "epoch": 2.02, + "grad_norm": 0.6875, + "learning_rate": 0.00011566759949555868, + "loss": 0.3175, + "step": 15136 + }, + { + "epoch": 2.02, + "grad_norm": 0.447265625, + "learning_rate": 0.00011565609818778378, + "loss": 0.2047, + "step": 15137 + }, + { + "epoch": 2.02, + "grad_norm": 0.61328125, + "learning_rate": 0.00011564459666770214, + "loss": 0.5159, + "step": 15138 + }, + { + "epoch": 2.02, + "grad_norm": 0.5234375, + "learning_rate": 0.0001156330949354697, + "loss": 0.2887, + "step": 15139 + }, + { + "epoch": 2.02, + "grad_norm": 0.478515625, + "learning_rate": 0.00011562159299124249, + "loss": 0.4528, + "step": 15140 + }, + { + "epoch": 2.02, + "grad_norm": 0.53125, + "learning_rate": 0.00011561009083517644, + "loss": 0.3353, + "step": 15141 + }, + { + "epoch": 2.02, + "grad_norm": 0.515625, + "learning_rate": 0.00011559858846742755, + "loss": 0.4214, + "step": 15142 + }, + { + "epoch": 2.02, + "grad_norm": 0.404296875, + "learning_rate": 0.00011558708588815179, + "loss": 0.1954, + "step": 15143 + }, + { + "epoch": 2.02, + "grad_norm": 0.435546875, + "learning_rate": 0.00011557558309750514, + "loss": 0.3753, + "step": 15144 + }, + { + "epoch": 2.02, + "grad_norm": 0.498046875, + "learning_rate": 0.00011556408009564358, + "loss": 0.4757, + "step": 15145 + }, + { + "epoch": 2.02, + "grad_norm": 0.71875, + "learning_rate": 0.00011555257688272309, + "loss": 0.4584, + "step": 15146 + }, + { + "epoch": 2.02, + "grad_norm": 0.609375, + "learning_rate": 0.00011554107345889971, + "loss": 0.3671, + "step": 15147 + }, + { + "epoch": 2.02, + "grad_norm": 0.478515625, + "learning_rate": 0.00011552956982432936, + "loss": 0.4269, + "step": 15148 + }, + { + "epoch": 2.02, + "grad_norm": 0.5546875, + "learning_rate": 0.00011551806597916812, + "loss": 0.4947, + "step": 15149 + }, + { + "epoch": 2.02, + "grad_norm": 0.51171875, + "learning_rate": 0.00011550656192357194, + "loss": 0.4826, + "step": 15150 + }, + { + "epoch": 2.02, + "grad_norm": 0.4296875, + "learning_rate": 0.00011549505765769679, + "loss": 0.1847, + "step": 15151 + }, + { + "epoch": 2.02, + "grad_norm": 0.50390625, + "learning_rate": 0.00011548355318169875, + "loss": 0.2441, + "step": 15152 + }, + { + "epoch": 2.02, + "grad_norm": 0.46484375, + "learning_rate": 0.00011547204849573376, + "loss": 0.2199, + "step": 15153 + }, + { + "epoch": 2.02, + "grad_norm": 0.47265625, + "learning_rate": 0.0001154605435999579, + "loss": 0.3143, + "step": 15154 + }, + { + "epoch": 2.02, + "grad_norm": 0.71875, + "learning_rate": 0.00011544903849452713, + "loss": 0.4017, + "step": 15155 + }, + { + "epoch": 2.02, + "grad_norm": 0.65234375, + "learning_rate": 0.00011543753317959749, + "loss": 0.3475, + "step": 15156 + }, + { + "epoch": 2.02, + "grad_norm": 0.49609375, + "learning_rate": 0.00011542602765532497, + "loss": 0.5063, + "step": 15157 + }, + { + "epoch": 2.02, + "grad_norm": 0.72265625, + "learning_rate": 0.00011541452192186563, + "loss": 0.624, + "step": 15158 + }, + { + "epoch": 2.02, + "grad_norm": 0.5234375, + "learning_rate": 0.00011540301597937549, + "loss": 0.3737, + "step": 15159 + }, + { + "epoch": 2.02, + "grad_norm": 0.51953125, + "learning_rate": 0.00011539150982801052, + "loss": 0.3453, + "step": 15160 + }, + { + "epoch": 2.02, + "grad_norm": 0.7421875, + "learning_rate": 0.00011538000346792688, + "loss": 0.2174, + "step": 15161 + }, + { + "epoch": 2.02, + "grad_norm": 0.63671875, + "learning_rate": 0.00011536849689928044, + "loss": 0.5079, + "step": 15162 + }, + { + "epoch": 2.02, + "grad_norm": 0.55859375, + "learning_rate": 0.00011535699012222738, + "loss": 0.3804, + "step": 15163 + }, + { + "epoch": 2.02, + "grad_norm": 0.484375, + "learning_rate": 0.00011534548313692364, + "loss": 0.2377, + "step": 15164 + }, + { + "epoch": 2.02, + "grad_norm": 0.40625, + "learning_rate": 0.00011533397594352531, + "loss": 0.3247, + "step": 15165 + }, + { + "epoch": 2.02, + "grad_norm": 0.6484375, + "learning_rate": 0.00011532246854218842, + "loss": 0.3846, + "step": 15166 + }, + { + "epoch": 2.02, + "grad_norm": 0.53515625, + "learning_rate": 0.00011531096093306903, + "loss": 0.4194, + "step": 15167 + }, + { + "epoch": 2.02, + "grad_norm": 0.48046875, + "learning_rate": 0.0001152994531163232, + "loss": 0.2816, + "step": 15168 + }, + { + "epoch": 2.02, + "grad_norm": 0.48046875, + "learning_rate": 0.00011528794509210695, + "loss": 0.2381, + "step": 15169 + }, + { + "epoch": 2.02, + "grad_norm": 0.53125, + "learning_rate": 0.00011527643686057634, + "loss": 0.4761, + "step": 15170 + }, + { + "epoch": 2.02, + "grad_norm": 0.578125, + "learning_rate": 0.00011526492842188745, + "loss": 0.2718, + "step": 15171 + }, + { + "epoch": 2.02, + "grad_norm": 0.5078125, + "learning_rate": 0.00011525341977619635, + "loss": 0.4151, + "step": 15172 + }, + { + "epoch": 2.02, + "grad_norm": 0.59375, + "learning_rate": 0.00011524191092365908, + "loss": 0.3144, + "step": 15173 + }, + { + "epoch": 2.02, + "grad_norm": 0.5, + "learning_rate": 0.0001152304018644317, + "loss": 0.3793, + "step": 15174 + }, + { + "epoch": 2.02, + "grad_norm": 0.55859375, + "learning_rate": 0.00011521889259867032, + "loss": 0.2044, + "step": 15175 + }, + { + "epoch": 2.03, + "grad_norm": 0.58203125, + "learning_rate": 0.00011520738312653097, + "loss": 0.3845, + "step": 15176 + }, + { + "epoch": 2.03, + "grad_norm": 0.546875, + "learning_rate": 0.00011519587344816976, + "loss": 0.2707, + "step": 15177 + }, + { + "epoch": 2.03, + "grad_norm": 0.48828125, + "learning_rate": 0.00011518436356374271, + "loss": 0.1979, + "step": 15178 + }, + { + "epoch": 2.03, + "grad_norm": 0.6015625, + "learning_rate": 0.00011517285347340603, + "loss": 0.1862, + "step": 15179 + }, + { + "epoch": 2.03, + "grad_norm": 0.5234375, + "learning_rate": 0.00011516134317731566, + "loss": 0.2223, + "step": 15180 + }, + { + "epoch": 2.03, + "grad_norm": 0.5625, + "learning_rate": 0.00011514983267562775, + "loss": 0.3376, + "step": 15181 + }, + { + "epoch": 2.03, + "grad_norm": 0.7109375, + "learning_rate": 0.00011513832196849839, + "loss": 0.6547, + "step": 15182 + }, + { + "epoch": 2.03, + "grad_norm": 0.7265625, + "learning_rate": 0.00011512681105608367, + "loss": 0.583, + "step": 15183 + }, + { + "epoch": 2.03, + "grad_norm": 0.55859375, + "learning_rate": 0.00011511529993853966, + "loss": 0.3536, + "step": 15184 + }, + { + "epoch": 2.03, + "grad_norm": 0.48828125, + "learning_rate": 0.00011510378861602254, + "loss": 0.3659, + "step": 15185 + }, + { + "epoch": 2.03, + "grad_norm": 0.498046875, + "learning_rate": 0.0001150922770886883, + "loss": 0.1931, + "step": 15186 + }, + { + "epoch": 2.03, + "grad_norm": 0.64453125, + "learning_rate": 0.00011508076535669312, + "loss": 0.2993, + "step": 15187 + }, + { + "epoch": 2.03, + "grad_norm": 0.5390625, + "learning_rate": 0.00011506925342019308, + "loss": 0.3101, + "step": 15188 + }, + { + "epoch": 2.03, + "grad_norm": 0.50390625, + "learning_rate": 0.00011505774127934428, + "loss": 0.293, + "step": 15189 + }, + { + "epoch": 2.03, + "grad_norm": 0.375, + "learning_rate": 0.00011504622893430285, + "loss": 0.1438, + "step": 15190 + }, + { + "epoch": 2.03, + "grad_norm": 0.447265625, + "learning_rate": 0.00011503471638522491, + "loss": 0.2163, + "step": 15191 + }, + { + "epoch": 2.03, + "grad_norm": 0.51953125, + "learning_rate": 0.00011502320363226655, + "loss": 0.4593, + "step": 15192 + }, + { + "epoch": 2.03, + "grad_norm": 0.65234375, + "learning_rate": 0.00011501169067558393, + "loss": 0.4916, + "step": 15193 + }, + { + "epoch": 2.03, + "grad_norm": 0.73828125, + "learning_rate": 0.00011500017751533312, + "loss": 0.4382, + "step": 15194 + }, + { + "epoch": 2.03, + "grad_norm": 0.4453125, + "learning_rate": 0.00011498866415167031, + "loss": 0.1665, + "step": 15195 + }, + { + "epoch": 2.03, + "grad_norm": 0.47265625, + "learning_rate": 0.00011497715058475156, + "loss": 0.1867, + "step": 15196 + }, + { + "epoch": 2.03, + "grad_norm": 0.6328125, + "learning_rate": 0.00011496563681473307, + "loss": 0.3427, + "step": 15197 + }, + { + "epoch": 2.03, + "grad_norm": 0.5234375, + "learning_rate": 0.00011495412284177092, + "loss": 0.5391, + "step": 15198 + }, + { + "epoch": 2.03, + "grad_norm": 0.5546875, + "learning_rate": 0.00011494260866602124, + "loss": 0.3686, + "step": 15199 + }, + { + "epoch": 2.03, + "grad_norm": 0.5234375, + "learning_rate": 0.00011493109428764024, + "loss": 0.4257, + "step": 15200 + }, + { + "epoch": 2.03, + "grad_norm": 0.5703125, + "learning_rate": 0.000114919579706784, + "loss": 0.6914, + "step": 15201 + }, + { + "epoch": 2.03, + "grad_norm": 0.3984375, + "learning_rate": 0.0001149080649236087, + "loss": 0.1681, + "step": 15202 + }, + { + "epoch": 2.03, + "grad_norm": 0.515625, + "learning_rate": 0.00011489654993827044, + "loss": 0.262, + "step": 15203 + }, + { + "epoch": 2.03, + "grad_norm": 0.4921875, + "learning_rate": 0.00011488503475092544, + "loss": 0.2308, + "step": 15204 + }, + { + "epoch": 2.03, + "grad_norm": 0.671875, + "learning_rate": 0.00011487351936172981, + "loss": 0.2575, + "step": 15205 + }, + { + "epoch": 2.03, + "grad_norm": 0.470703125, + "learning_rate": 0.0001148620037708397, + "loss": 0.3062, + "step": 15206 + }, + { + "epoch": 2.03, + "grad_norm": 0.498046875, + "learning_rate": 0.00011485048797841127, + "loss": 0.3924, + "step": 15207 + }, + { + "epoch": 2.03, + "grad_norm": 0.5078125, + "learning_rate": 0.00011483897198460069, + "loss": 0.4417, + "step": 15208 + }, + { + "epoch": 2.03, + "grad_norm": 0.48828125, + "learning_rate": 0.00011482745578956414, + "loss": 0.3733, + "step": 15209 + }, + { + "epoch": 2.03, + "grad_norm": 0.68359375, + "learning_rate": 0.00011481593939345779, + "loss": 0.5263, + "step": 15210 + }, + { + "epoch": 2.03, + "grad_norm": 0.40234375, + "learning_rate": 0.00011480442279643777, + "loss": 0.2401, + "step": 15211 + }, + { + "epoch": 2.03, + "grad_norm": 0.4609375, + "learning_rate": 0.00011479290599866028, + "loss": 0.1834, + "step": 15212 + }, + { + "epoch": 2.03, + "grad_norm": 0.58203125, + "learning_rate": 0.00011478138900028147, + "loss": 0.7149, + "step": 15213 + }, + { + "epoch": 2.03, + "grad_norm": 0.515625, + "learning_rate": 0.00011476987180145757, + "loss": 0.2256, + "step": 15214 + }, + { + "epoch": 2.03, + "grad_norm": 0.41796875, + "learning_rate": 0.00011475835440234473, + "loss": 0.2269, + "step": 15215 + }, + { + "epoch": 2.03, + "grad_norm": 0.412109375, + "learning_rate": 0.00011474683680309912, + "loss": 0.2446, + "step": 15216 + }, + { + "epoch": 2.03, + "grad_norm": 0.6640625, + "learning_rate": 0.00011473531900387694, + "loss": 0.3881, + "step": 15217 + }, + { + "epoch": 2.03, + "grad_norm": 0.484375, + "learning_rate": 0.00011472380100483438, + "loss": 0.5984, + "step": 15218 + }, + { + "epoch": 2.03, + "grad_norm": 0.4453125, + "learning_rate": 0.0001147122828061276, + "loss": 0.3525, + "step": 15219 + }, + { + "epoch": 2.03, + "grad_norm": 0.486328125, + "learning_rate": 0.00011470076440791286, + "loss": 0.3375, + "step": 15220 + }, + { + "epoch": 2.03, + "grad_norm": 0.45703125, + "learning_rate": 0.0001146892458103463, + "loss": 0.2327, + "step": 15221 + }, + { + "epoch": 2.03, + "grad_norm": 0.60546875, + "learning_rate": 0.00011467772701358414, + "loss": 0.5062, + "step": 15222 + }, + { + "epoch": 2.03, + "grad_norm": 1.03125, + "learning_rate": 0.00011466620801778261, + "loss": 0.3194, + "step": 15223 + }, + { + "epoch": 2.03, + "grad_norm": 0.484375, + "learning_rate": 0.00011465468882309785, + "loss": 0.2306, + "step": 15224 + }, + { + "epoch": 2.03, + "grad_norm": 0.55859375, + "learning_rate": 0.00011464316942968614, + "loss": 0.3458, + "step": 15225 + }, + { + "epoch": 2.03, + "grad_norm": 0.54296875, + "learning_rate": 0.00011463164983770361, + "loss": 0.2986, + "step": 15226 + }, + { + "epoch": 2.03, + "grad_norm": 0.466796875, + "learning_rate": 0.00011462013004730657, + "loss": 0.2818, + "step": 15227 + }, + { + "epoch": 2.03, + "grad_norm": 0.8125, + "learning_rate": 0.00011460861005865115, + "loss": 0.5366, + "step": 15228 + }, + { + "epoch": 2.03, + "grad_norm": 0.416015625, + "learning_rate": 0.00011459708987189362, + "loss": 0.2014, + "step": 15229 + }, + { + "epoch": 2.03, + "grad_norm": 0.66015625, + "learning_rate": 0.00011458556948719015, + "loss": 0.577, + "step": 15230 + }, + { + "epoch": 2.03, + "grad_norm": 0.451171875, + "learning_rate": 0.00011457404890469703, + "loss": 0.2588, + "step": 15231 + }, + { + "epoch": 2.03, + "grad_norm": 0.640625, + "learning_rate": 0.00011456252812457042, + "loss": 0.4899, + "step": 15232 + }, + { + "epoch": 2.03, + "grad_norm": 0.51171875, + "learning_rate": 0.00011455100714696658, + "loss": 0.2128, + "step": 15233 + }, + { + "epoch": 2.03, + "grad_norm": 0.53515625, + "learning_rate": 0.0001145394859720418, + "loss": 0.2033, + "step": 15234 + }, + { + "epoch": 2.03, + "grad_norm": 0.54296875, + "learning_rate": 0.00011452796459995224, + "loss": 0.3942, + "step": 15235 + }, + { + "epoch": 2.03, + "grad_norm": 0.578125, + "learning_rate": 0.00011451644303085414, + "loss": 0.4983, + "step": 15236 + }, + { + "epoch": 2.03, + "grad_norm": 0.71484375, + "learning_rate": 0.00011450492126490373, + "loss": 0.3781, + "step": 15237 + }, + { + "epoch": 2.03, + "grad_norm": 0.6015625, + "learning_rate": 0.0001144933993022573, + "loss": 0.4683, + "step": 15238 + }, + { + "epoch": 2.03, + "grad_norm": 0.5390625, + "learning_rate": 0.00011448187714307108, + "loss": 0.2502, + "step": 15239 + }, + { + "epoch": 2.03, + "grad_norm": 0.62890625, + "learning_rate": 0.00011447035478750132, + "loss": 0.4092, + "step": 15240 + }, + { + "epoch": 2.03, + "grad_norm": 1.2109375, + "learning_rate": 0.00011445883223570425, + "loss": 0.2884, + "step": 15241 + }, + { + "epoch": 2.03, + "grad_norm": 0.5703125, + "learning_rate": 0.00011444730948783615, + "loss": 0.4354, + "step": 15242 + }, + { + "epoch": 2.03, + "grad_norm": 0.56640625, + "learning_rate": 0.00011443578654405325, + "loss": 0.3583, + "step": 15243 + }, + { + "epoch": 2.03, + "grad_norm": 0.61328125, + "learning_rate": 0.0001144242634045118, + "loss": 0.362, + "step": 15244 + }, + { + "epoch": 2.03, + "grad_norm": 0.5625, + "learning_rate": 0.00011441274006936811, + "loss": 0.512, + "step": 15245 + }, + { + "epoch": 2.03, + "grad_norm": 0.40625, + "learning_rate": 0.00011440121653877842, + "loss": 0.2748, + "step": 15246 + }, + { + "epoch": 2.03, + "grad_norm": 0.625, + "learning_rate": 0.00011438969281289897, + "loss": 0.2927, + "step": 15247 + }, + { + "epoch": 2.03, + "grad_norm": 0.51953125, + "learning_rate": 0.00011437816889188607, + "loss": 0.2932, + "step": 15248 + }, + { + "epoch": 2.03, + "grad_norm": 0.58203125, + "learning_rate": 0.00011436664477589593, + "loss": 0.3199, + "step": 15249 + }, + { + "epoch": 2.03, + "grad_norm": 0.470703125, + "learning_rate": 0.00011435512046508491, + "loss": 0.1451, + "step": 15250 + }, + { + "epoch": 2.04, + "grad_norm": 0.625, + "learning_rate": 0.00011434359595960922, + "loss": 0.3311, + "step": 15251 + }, + { + "epoch": 2.04, + "grad_norm": 0.55078125, + "learning_rate": 0.0001143320712596252, + "loss": 0.2719, + "step": 15252 + }, + { + "epoch": 2.04, + "grad_norm": 0.466796875, + "learning_rate": 0.00011432054636528903, + "loss": 0.3395, + "step": 15253 + }, + { + "epoch": 2.04, + "grad_norm": 0.5546875, + "learning_rate": 0.00011430902127675714, + "loss": 0.4621, + "step": 15254 + }, + { + "epoch": 2.04, + "grad_norm": 0.451171875, + "learning_rate": 0.00011429749599418567, + "loss": 0.1635, + "step": 15255 + }, + { + "epoch": 2.04, + "grad_norm": 0.71484375, + "learning_rate": 0.000114285970517731, + "loss": 0.3126, + "step": 15256 + }, + { + "epoch": 2.04, + "grad_norm": 0.443359375, + "learning_rate": 0.0001142744448475494, + "loss": 0.2351, + "step": 15257 + }, + { + "epoch": 2.04, + "grad_norm": 0.51171875, + "learning_rate": 0.00011426291898379718, + "loss": 0.2999, + "step": 15258 + }, + { + "epoch": 2.04, + "grad_norm": 0.5234375, + "learning_rate": 0.00011425139292663063, + "loss": 0.2858, + "step": 15259 + }, + { + "epoch": 2.04, + "grad_norm": 0.46484375, + "learning_rate": 0.00011423986667620601, + "loss": 0.3751, + "step": 15260 + }, + { + "epoch": 2.04, + "grad_norm": 0.4921875, + "learning_rate": 0.0001142283402326797, + "loss": 0.3775, + "step": 15261 + }, + { + "epoch": 2.04, + "grad_norm": 0.65625, + "learning_rate": 0.00011421681359620793, + "loss": 0.3288, + "step": 15262 + }, + { + "epoch": 2.04, + "grad_norm": 0.392578125, + "learning_rate": 0.00011420528676694706, + "loss": 0.2132, + "step": 15263 + }, + { + "epoch": 2.04, + "grad_norm": 0.734375, + "learning_rate": 0.00011419375974505339, + "loss": 0.418, + "step": 15264 + }, + { + "epoch": 2.04, + "grad_norm": 0.578125, + "learning_rate": 0.00011418223253068321, + "loss": 0.4348, + "step": 15265 + }, + { + "epoch": 2.04, + "grad_norm": 0.49609375, + "learning_rate": 0.00011417070512399286, + "loss": 0.3512, + "step": 15266 + }, + { + "epoch": 2.04, + "grad_norm": 0.5859375, + "learning_rate": 0.00011415917752513865, + "loss": 0.2855, + "step": 15267 + }, + { + "epoch": 2.04, + "grad_norm": 0.42578125, + "learning_rate": 0.00011414764973427694, + "loss": 0.1962, + "step": 15268 + }, + { + "epoch": 2.04, + "grad_norm": 0.4765625, + "learning_rate": 0.00011413612175156396, + "loss": 0.2365, + "step": 15269 + }, + { + "epoch": 2.04, + "grad_norm": 0.5859375, + "learning_rate": 0.00011412459357715617, + "loss": 0.2212, + "step": 15270 + }, + { + "epoch": 2.04, + "grad_norm": 0.53125, + "learning_rate": 0.00011411306521120977, + "loss": 0.5558, + "step": 15271 + }, + { + "epoch": 2.04, + "grad_norm": 0.609375, + "learning_rate": 0.00011410153665388118, + "loss": 0.425, + "step": 15272 + }, + { + "epoch": 2.04, + "grad_norm": 0.50390625, + "learning_rate": 0.00011409000790532669, + "loss": 0.3103, + "step": 15273 + }, + { + "epoch": 2.04, + "grad_norm": 0.57421875, + "learning_rate": 0.00011407847896570263, + "loss": 0.3243, + "step": 15274 + }, + { + "epoch": 2.04, + "grad_norm": 0.39453125, + "learning_rate": 0.00011406694983516537, + "loss": 0.2437, + "step": 15275 + }, + { + "epoch": 2.04, + "grad_norm": 0.421875, + "learning_rate": 0.00011405542051387126, + "loss": 0.3014, + "step": 15276 + }, + { + "epoch": 2.04, + "grad_norm": 0.54296875, + "learning_rate": 0.00011404389100197663, + "loss": 0.463, + "step": 15277 + }, + { + "epoch": 2.04, + "grad_norm": 0.52734375, + "learning_rate": 0.00011403236129963781, + "loss": 0.4103, + "step": 15278 + }, + { + "epoch": 2.04, + "grad_norm": 0.515625, + "learning_rate": 0.00011402083140701116, + "loss": 0.3413, + "step": 15279 + }, + { + "epoch": 2.04, + "grad_norm": 0.462890625, + "learning_rate": 0.00011400930132425305, + "loss": 0.2472, + "step": 15280 + }, + { + "epoch": 2.04, + "grad_norm": 0.41796875, + "learning_rate": 0.0001139977710515198, + "loss": 0.4534, + "step": 15281 + }, + { + "epoch": 2.04, + "grad_norm": 0.5078125, + "learning_rate": 0.00011398624058896783, + "loss": 0.3684, + "step": 15282 + }, + { + "epoch": 2.04, + "grad_norm": 0.5546875, + "learning_rate": 0.00011397470993675343, + "loss": 0.3411, + "step": 15283 + }, + { + "epoch": 2.04, + "grad_norm": 0.453125, + "learning_rate": 0.000113963179095033, + "loss": 0.4682, + "step": 15284 + }, + { + "epoch": 2.04, + "grad_norm": 0.373046875, + "learning_rate": 0.0001139516480639629, + "loss": 0.1848, + "step": 15285 + }, + { + "epoch": 2.04, + "grad_norm": 0.60546875, + "learning_rate": 0.0001139401168436995, + "loss": 0.3313, + "step": 15286 + }, + { + "epoch": 2.04, + "grad_norm": 0.59375, + "learning_rate": 0.00011392858543439917, + "loss": 0.3176, + "step": 15287 + }, + { + "epoch": 2.04, + "grad_norm": 0.5234375, + "learning_rate": 0.0001139170538362183, + "loss": 0.2034, + "step": 15288 + }, + { + "epoch": 2.04, + "grad_norm": 0.49609375, + "learning_rate": 0.00011390552204931322, + "loss": 0.3143, + "step": 15289 + }, + { + "epoch": 2.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00011389399007384034, + "loss": 0.4862, + "step": 15290 + }, + { + "epoch": 2.04, + "grad_norm": 0.515625, + "learning_rate": 0.00011388245790995604, + "loss": 0.2844, + "step": 15291 + }, + { + "epoch": 2.04, + "grad_norm": 0.53515625, + "learning_rate": 0.00011387092555781671, + "loss": 0.2037, + "step": 15292 + }, + { + "epoch": 2.04, + "grad_norm": 0.5625, + "learning_rate": 0.00011385939301757874, + "loss": 0.7224, + "step": 15293 + }, + { + "epoch": 2.04, + "grad_norm": 0.609375, + "learning_rate": 0.00011384786028939847, + "loss": 0.526, + "step": 15294 + }, + { + "epoch": 2.04, + "grad_norm": 0.66796875, + "learning_rate": 0.00011383632737343234, + "loss": 0.5819, + "step": 15295 + }, + { + "epoch": 2.04, + "grad_norm": 0.41796875, + "learning_rate": 0.00011382479426983676, + "loss": 0.2016, + "step": 15296 + }, + { + "epoch": 2.04, + "grad_norm": 0.72265625, + "learning_rate": 0.00011381326097876806, + "loss": 0.3843, + "step": 15297 + }, + { + "epoch": 2.04, + "grad_norm": 0.55859375, + "learning_rate": 0.00011380172750038269, + "loss": 0.4238, + "step": 15298 + }, + { + "epoch": 2.04, + "grad_norm": 0.421875, + "learning_rate": 0.00011379019383483702, + "loss": 0.2764, + "step": 15299 + }, + { + "epoch": 2.04, + "grad_norm": 0.67578125, + "learning_rate": 0.0001137786599822875, + "loss": 0.5341, + "step": 15300 + }, + { + "epoch": 2.04, + "grad_norm": 0.53125, + "learning_rate": 0.0001137671259428905, + "loss": 0.3227, + "step": 15301 + }, + { + "epoch": 2.04, + "grad_norm": 0.33984375, + "learning_rate": 0.00011375559171680245, + "loss": 0.2898, + "step": 15302 + }, + { + "epoch": 2.04, + "grad_norm": 0.5625, + "learning_rate": 0.00011374405730417971, + "loss": 0.3673, + "step": 15303 + }, + { + "epoch": 2.04, + "grad_norm": 0.53125, + "learning_rate": 0.00011373252270517876, + "loss": 0.4541, + "step": 15304 + }, + { + "epoch": 2.04, + "grad_norm": 0.73828125, + "learning_rate": 0.00011372098791995598, + "loss": 0.3712, + "step": 15305 + }, + { + "epoch": 2.04, + "grad_norm": 0.65625, + "learning_rate": 0.00011370945294866777, + "loss": 0.1949, + "step": 15306 + }, + { + "epoch": 2.04, + "grad_norm": 0.77734375, + "learning_rate": 0.00011369791779147064, + "loss": 0.2263, + "step": 15307 + }, + { + "epoch": 2.04, + "grad_norm": 0.63671875, + "learning_rate": 0.0001136863824485209, + "loss": 0.3912, + "step": 15308 + }, + { + "epoch": 2.04, + "grad_norm": 0.5078125, + "learning_rate": 0.00011367484691997508, + "loss": 0.2646, + "step": 15309 + }, + { + "epoch": 2.04, + "grad_norm": 0.482421875, + "learning_rate": 0.0001136633112059895, + "loss": 0.3556, + "step": 15310 + }, + { + "epoch": 2.04, + "grad_norm": 0.59375, + "learning_rate": 0.00011365177530672069, + "loss": 0.3739, + "step": 15311 + }, + { + "epoch": 2.04, + "grad_norm": 0.64453125, + "learning_rate": 0.00011364023922232503, + "loss": 0.5504, + "step": 15312 + }, + { + "epoch": 2.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00011362870295295898, + "loss": 0.2662, + "step": 15313 + }, + { + "epoch": 2.04, + "grad_norm": 0.486328125, + "learning_rate": 0.00011361716649877898, + "loss": 0.3553, + "step": 15314 + }, + { + "epoch": 2.04, + "grad_norm": 0.59375, + "learning_rate": 0.00011360562985994146, + "loss": 0.3273, + "step": 15315 + }, + { + "epoch": 2.04, + "grad_norm": 0.515625, + "learning_rate": 0.00011359409303660285, + "loss": 0.4049, + "step": 15316 + }, + { + "epoch": 2.04, + "grad_norm": 0.59765625, + "learning_rate": 0.0001135825560289196, + "loss": 0.4765, + "step": 15317 + }, + { + "epoch": 2.04, + "grad_norm": 0.56640625, + "learning_rate": 0.00011357101883704821, + "loss": 0.4016, + "step": 15318 + }, + { + "epoch": 2.04, + "grad_norm": 0.57421875, + "learning_rate": 0.0001135594814611451, + "loss": 0.5386, + "step": 15319 + }, + { + "epoch": 2.04, + "grad_norm": 0.52734375, + "learning_rate": 0.0001135479439013667, + "loss": 0.2816, + "step": 15320 + }, + { + "epoch": 2.04, + "grad_norm": 0.40234375, + "learning_rate": 0.00011353640615786947, + "loss": 0.1565, + "step": 15321 + }, + { + "epoch": 2.04, + "grad_norm": 0.455078125, + "learning_rate": 0.0001135248682308099, + "loss": 0.4359, + "step": 15322 + }, + { + "epoch": 2.04, + "grad_norm": 0.75, + "learning_rate": 0.00011351333012034444, + "loss": 0.3909, + "step": 15323 + }, + { + "epoch": 2.04, + "grad_norm": 0.515625, + "learning_rate": 0.00011350179182662954, + "loss": 0.3045, + "step": 15324 + }, + { + "epoch": 2.04, + "grad_norm": 0.43359375, + "learning_rate": 0.00011349025334982171, + "loss": 0.3466, + "step": 15325 + }, + { + "epoch": 2.05, + "grad_norm": 0.515625, + "learning_rate": 0.00011347871469007736, + "loss": 0.5097, + "step": 15326 + }, + { + "epoch": 2.05, + "grad_norm": 0.49609375, + "learning_rate": 0.000113467175847553, + "loss": 0.2788, + "step": 15327 + }, + { + "epoch": 2.05, + "grad_norm": 0.64453125, + "learning_rate": 0.00011345563682240505, + "loss": 0.5544, + "step": 15328 + }, + { + "epoch": 2.05, + "grad_norm": 0.625, + "learning_rate": 0.00011344409761479009, + "loss": 0.4691, + "step": 15329 + }, + { + "epoch": 2.05, + "grad_norm": 0.69921875, + "learning_rate": 0.0001134325582248645, + "loss": 0.2776, + "step": 15330 + }, + { + "epoch": 2.05, + "grad_norm": 0.5, + "learning_rate": 0.0001134210186527848, + "loss": 0.4394, + "step": 15331 + }, + { + "epoch": 2.05, + "grad_norm": 0.62109375, + "learning_rate": 0.0001134094788987075, + "loss": 0.4087, + "step": 15332 + }, + { + "epoch": 2.05, + "grad_norm": 0.640625, + "learning_rate": 0.00011339793896278903, + "loss": 0.4718, + "step": 15333 + }, + { + "epoch": 2.05, + "grad_norm": 0.45703125, + "learning_rate": 0.00011338639884518594, + "loss": 0.2911, + "step": 15334 + }, + { + "epoch": 2.05, + "grad_norm": 0.67578125, + "learning_rate": 0.00011337485854605465, + "loss": 0.3634, + "step": 15335 + }, + { + "epoch": 2.05, + "grad_norm": 0.61328125, + "learning_rate": 0.00011336331806555171, + "loss": 0.472, + "step": 15336 + }, + { + "epoch": 2.05, + "grad_norm": 0.55859375, + "learning_rate": 0.00011335177740383363, + "loss": 0.2794, + "step": 15337 + }, + { + "epoch": 2.05, + "grad_norm": 0.546875, + "learning_rate": 0.00011334023656105683, + "loss": 0.342, + "step": 15338 + }, + { + "epoch": 2.05, + "grad_norm": 0.57421875, + "learning_rate": 0.0001133286955373779, + "loss": 0.194, + "step": 15339 + }, + { + "epoch": 2.05, + "grad_norm": 0.7421875, + "learning_rate": 0.00011331715433295329, + "loss": 0.1892, + "step": 15340 + }, + { + "epoch": 2.05, + "grad_norm": 0.55859375, + "learning_rate": 0.00011330561294793952, + "loss": 0.2344, + "step": 15341 + }, + { + "epoch": 2.05, + "grad_norm": 0.453125, + "learning_rate": 0.00011329407138249309, + "loss": 0.3716, + "step": 15342 + }, + { + "epoch": 2.05, + "grad_norm": 0.4140625, + "learning_rate": 0.00011328252963677055, + "loss": 0.3731, + "step": 15343 + }, + { + "epoch": 2.05, + "grad_norm": 0.5078125, + "learning_rate": 0.00011327098771092836, + "loss": 0.3458, + "step": 15344 + }, + { + "epoch": 2.05, + "grad_norm": 0.7890625, + "learning_rate": 0.00011325944560512305, + "loss": 0.6254, + "step": 15345 + }, + { + "epoch": 2.05, + "grad_norm": 0.5859375, + "learning_rate": 0.00011324790331951116, + "loss": 0.4636, + "step": 15346 + }, + { + "epoch": 2.05, + "grad_norm": 0.50390625, + "learning_rate": 0.00011323636085424918, + "loss": 0.3299, + "step": 15347 + }, + { + "epoch": 2.05, + "grad_norm": 0.515625, + "learning_rate": 0.00011322481820949369, + "loss": 0.4056, + "step": 15348 + }, + { + "epoch": 2.05, + "grad_norm": 0.5703125, + "learning_rate": 0.00011321327538540116, + "loss": 0.2386, + "step": 15349 + }, + { + "epoch": 2.05, + "grad_norm": 0.5703125, + "learning_rate": 0.00011320173238212814, + "loss": 0.4153, + "step": 15350 + }, + { + "epoch": 2.05, + "grad_norm": 0.50390625, + "learning_rate": 0.00011319018919983113, + "loss": 0.331, + "step": 15351 + }, + { + "epoch": 2.05, + "grad_norm": 0.431640625, + "learning_rate": 0.00011317864583866673, + "loss": 0.3844, + "step": 15352 + }, + { + "epoch": 2.05, + "grad_norm": 0.63671875, + "learning_rate": 0.0001131671022987914, + "loss": 0.3603, + "step": 15353 + }, + { + "epoch": 2.05, + "grad_norm": 0.5703125, + "learning_rate": 0.00011315555858036173, + "loss": 0.2621, + "step": 15354 + }, + { + "epoch": 2.05, + "grad_norm": 0.451171875, + "learning_rate": 0.00011314401468353425, + "loss": 0.1535, + "step": 15355 + }, + { + "epoch": 2.05, + "grad_norm": 0.6328125, + "learning_rate": 0.00011313247060846547, + "loss": 0.3109, + "step": 15356 + }, + { + "epoch": 2.05, + "grad_norm": 0.69140625, + "learning_rate": 0.000113120926355312, + "loss": 0.2872, + "step": 15357 + }, + { + "epoch": 2.05, + "grad_norm": 0.365234375, + "learning_rate": 0.00011310938192423028, + "loss": 0.1235, + "step": 15358 + }, + { + "epoch": 2.05, + "grad_norm": 0.53125, + "learning_rate": 0.00011309783731537699, + "loss": 0.2989, + "step": 15359 + }, + { + "epoch": 2.05, + "grad_norm": 0.515625, + "learning_rate": 0.00011308629252890859, + "loss": 0.3023, + "step": 15360 + }, + { + "epoch": 2.05, + "grad_norm": 0.55078125, + "learning_rate": 0.00011307474756498169, + "loss": 0.5877, + "step": 15361 + }, + { + "epoch": 2.05, + "grad_norm": 0.71484375, + "learning_rate": 0.00011306320242375279, + "loss": 0.4895, + "step": 15362 + }, + { + "epoch": 2.05, + "grad_norm": 0.5390625, + "learning_rate": 0.00011305165710537848, + "loss": 0.2529, + "step": 15363 + }, + { + "epoch": 2.05, + "grad_norm": 0.58203125, + "learning_rate": 0.00011304011161001535, + "loss": 0.1958, + "step": 15364 + }, + { + "epoch": 2.05, + "grad_norm": 0.61328125, + "learning_rate": 0.00011302856593781993, + "loss": 0.4631, + "step": 15365 + }, + { + "epoch": 2.05, + "grad_norm": 0.625, + "learning_rate": 0.0001130170200889488, + "loss": 0.3524, + "step": 15366 + }, + { + "epoch": 2.05, + "grad_norm": 0.56640625, + "learning_rate": 0.00011300547406355853, + "loss": 0.2185, + "step": 15367 + }, + { + "epoch": 2.05, + "grad_norm": 0.41796875, + "learning_rate": 0.00011299392786180565, + "loss": 0.1161, + "step": 15368 + }, + { + "epoch": 2.05, + "grad_norm": 0.328125, + "learning_rate": 0.00011298238148384679, + "loss": 0.1253, + "step": 15369 + }, + { + "epoch": 2.05, + "grad_norm": 0.50390625, + "learning_rate": 0.00011297083492983848, + "loss": 0.3991, + "step": 15370 + }, + { + "epoch": 2.05, + "grad_norm": 0.6015625, + "learning_rate": 0.00011295928819993736, + "loss": 0.4872, + "step": 15371 + }, + { + "epoch": 2.05, + "grad_norm": 0.451171875, + "learning_rate": 0.00011294774129429995, + "loss": 0.1149, + "step": 15372 + }, + { + "epoch": 2.05, + "grad_norm": 0.609375, + "learning_rate": 0.00011293619421308288, + "loss": 0.5924, + "step": 15373 + }, + { + "epoch": 2.05, + "grad_norm": 0.423828125, + "learning_rate": 0.00011292464695644271, + "loss": 0.2205, + "step": 15374 + }, + { + "epoch": 2.05, + "grad_norm": 0.48828125, + "learning_rate": 0.00011291309952453604, + "loss": 0.282, + "step": 15375 + }, + { + "epoch": 2.05, + "grad_norm": 0.6484375, + "learning_rate": 0.00011290155191751941, + "loss": 0.4057, + "step": 15376 + }, + { + "epoch": 2.05, + "grad_norm": 0.5390625, + "learning_rate": 0.00011289000413554951, + "loss": 0.3137, + "step": 15377 + }, + { + "epoch": 2.05, + "grad_norm": 0.5234375, + "learning_rate": 0.00011287845617878283, + "loss": 0.1529, + "step": 15378 + }, + { + "epoch": 2.05, + "grad_norm": 0.439453125, + "learning_rate": 0.00011286690804737603, + "loss": 0.19, + "step": 15379 + }, + { + "epoch": 2.05, + "grad_norm": 0.5078125, + "learning_rate": 0.00011285535974148576, + "loss": 0.3808, + "step": 15380 + }, + { + "epoch": 2.05, + "grad_norm": 0.466796875, + "learning_rate": 0.00011284381126126848, + "loss": 0.2526, + "step": 15381 + }, + { + "epoch": 2.05, + "grad_norm": 0.55078125, + "learning_rate": 0.00011283226260688094, + "loss": 0.7053, + "step": 15382 + }, + { + "epoch": 2.05, + "grad_norm": 0.515625, + "learning_rate": 0.00011282071377847962, + "loss": 0.2499, + "step": 15383 + }, + { + "epoch": 2.05, + "grad_norm": 0.58984375, + "learning_rate": 0.00011280916477622126, + "loss": 0.4944, + "step": 15384 + }, + { + "epoch": 2.05, + "grad_norm": 0.427734375, + "learning_rate": 0.00011279761560026235, + "loss": 0.2272, + "step": 15385 + }, + { + "epoch": 2.05, + "grad_norm": 0.59375, + "learning_rate": 0.00011278606625075959, + "loss": 0.6157, + "step": 15386 + }, + { + "epoch": 2.05, + "grad_norm": 0.6640625, + "learning_rate": 0.00011277451672786956, + "loss": 0.5232, + "step": 15387 + }, + { + "epoch": 2.05, + "grad_norm": 0.3671875, + "learning_rate": 0.00011276296703174887, + "loss": 0.1551, + "step": 15388 + }, + { + "epoch": 2.05, + "grad_norm": 0.57421875, + "learning_rate": 0.00011275141716255418, + "loss": 0.3438, + "step": 15389 + }, + { + "epoch": 2.05, + "grad_norm": 0.5078125, + "learning_rate": 0.00011273986712044207, + "loss": 0.2603, + "step": 15390 + }, + { + "epoch": 2.05, + "grad_norm": 0.4375, + "learning_rate": 0.00011272831690556919, + "loss": 0.2485, + "step": 15391 + }, + { + "epoch": 2.05, + "grad_norm": 0.546875, + "learning_rate": 0.00011271676651809217, + "loss": 0.2062, + "step": 15392 + }, + { + "epoch": 2.05, + "grad_norm": 0.39453125, + "learning_rate": 0.00011270521595816763, + "loss": 0.2692, + "step": 15393 + }, + { + "epoch": 2.05, + "grad_norm": 0.609375, + "learning_rate": 0.00011269366522595222, + "loss": 0.4201, + "step": 15394 + }, + { + "epoch": 2.05, + "grad_norm": 0.4921875, + "learning_rate": 0.00011268211432160252, + "loss": 0.2306, + "step": 15395 + }, + { + "epoch": 2.05, + "grad_norm": 0.50390625, + "learning_rate": 0.00011267056324527525, + "loss": 0.2454, + "step": 15396 + }, + { + "epoch": 2.05, + "grad_norm": 0.67578125, + "learning_rate": 0.00011265901199712699, + "loss": 0.3094, + "step": 15397 + }, + { + "epoch": 2.05, + "grad_norm": 0.671875, + "learning_rate": 0.00011264746057731445, + "loss": 0.5311, + "step": 15398 + }, + { + "epoch": 2.05, + "grad_norm": 0.45703125, + "learning_rate": 0.00011263590898599416, + "loss": 0.3315, + "step": 15399 + }, + { + "epoch": 2.05, + "grad_norm": 0.52734375, + "learning_rate": 0.0001126243572233229, + "loss": 0.1993, + "step": 15400 + }, + { + "epoch": 2.06, + "grad_norm": 0.5703125, + "learning_rate": 0.00011261280528945719, + "loss": 0.2829, + "step": 15401 + }, + { + "epoch": 2.06, + "grad_norm": 0.62109375, + "learning_rate": 0.0001126012531845538, + "loss": 0.2496, + "step": 15402 + }, + { + "epoch": 2.06, + "grad_norm": 0.498046875, + "learning_rate": 0.00011258970090876929, + "loss": 0.3802, + "step": 15403 + }, + { + "epoch": 2.06, + "grad_norm": 0.4296875, + "learning_rate": 0.00011257814846226038, + "loss": 0.1797, + "step": 15404 + }, + { + "epoch": 2.06, + "grad_norm": 0.388671875, + "learning_rate": 0.00011256659584518369, + "loss": 0.3249, + "step": 15405 + }, + { + "epoch": 2.06, + "grad_norm": 0.56640625, + "learning_rate": 0.00011255504305769589, + "loss": 0.3895, + "step": 15406 + }, + { + "epoch": 2.06, + "grad_norm": 0.5234375, + "learning_rate": 0.00011254349009995366, + "loss": 0.3399, + "step": 15407 + }, + { + "epoch": 2.06, + "grad_norm": 0.65234375, + "learning_rate": 0.00011253193697211364, + "loss": 0.5669, + "step": 15408 + }, + { + "epoch": 2.06, + "grad_norm": 0.5390625, + "learning_rate": 0.00011252038367433253, + "loss": 0.3883, + "step": 15409 + }, + { + "epoch": 2.06, + "grad_norm": 0.443359375, + "learning_rate": 0.000112508830206767, + "loss": 0.2618, + "step": 15410 + }, + { + "epoch": 2.06, + "grad_norm": 0.58203125, + "learning_rate": 0.00011249727656957367, + "loss": 0.2618, + "step": 15411 + }, + { + "epoch": 2.06, + "grad_norm": 0.5078125, + "learning_rate": 0.00011248572276290925, + "loss": 0.3306, + "step": 15412 + }, + { + "epoch": 2.06, + "grad_norm": 0.51953125, + "learning_rate": 0.00011247416878693042, + "loss": 0.5178, + "step": 15413 + }, + { + "epoch": 2.06, + "grad_norm": 0.5625, + "learning_rate": 0.00011246261464179386, + "loss": 0.2882, + "step": 15414 + }, + { + "epoch": 2.06, + "grad_norm": 0.5, + "learning_rate": 0.00011245106032765624, + "loss": 0.5431, + "step": 15415 + }, + { + "epoch": 2.06, + "grad_norm": 0.65234375, + "learning_rate": 0.00011243950584467429, + "loss": 0.3256, + "step": 15416 + }, + { + "epoch": 2.06, + "grad_norm": 0.51953125, + "learning_rate": 0.00011242795119300462, + "loss": 0.3584, + "step": 15417 + }, + { + "epoch": 2.06, + "grad_norm": 0.7734375, + "learning_rate": 0.00011241639637280394, + "loss": 0.7699, + "step": 15418 + }, + { + "epoch": 2.06, + "grad_norm": 0.5234375, + "learning_rate": 0.000112404841384229, + "loss": 0.2791, + "step": 15419 + }, + { + "epoch": 2.06, + "grad_norm": 0.69921875, + "learning_rate": 0.00011239328622743641, + "loss": 0.4567, + "step": 15420 + }, + { + "epoch": 2.06, + "grad_norm": 0.384765625, + "learning_rate": 0.00011238173090258293, + "loss": 0.2201, + "step": 15421 + }, + { + "epoch": 2.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00011237017540982521, + "loss": 0.4673, + "step": 15422 + }, + { + "epoch": 2.06, + "grad_norm": 0.578125, + "learning_rate": 0.00011235861974932, + "loss": 0.4175, + "step": 15423 + }, + { + "epoch": 2.06, + "grad_norm": 0.5390625, + "learning_rate": 0.00011234706392122396, + "loss": 0.4767, + "step": 15424 + }, + { + "epoch": 2.06, + "grad_norm": 0.65234375, + "learning_rate": 0.00011233550792569383, + "loss": 0.4165, + "step": 15425 + }, + { + "epoch": 2.06, + "grad_norm": 0.49609375, + "learning_rate": 0.00011232395176288626, + "loss": 0.2501, + "step": 15426 + }, + { + "epoch": 2.06, + "grad_norm": 0.50390625, + "learning_rate": 0.00011231239543295802, + "loss": 0.3001, + "step": 15427 + }, + { + "epoch": 2.06, + "grad_norm": 0.62890625, + "learning_rate": 0.00011230083893606578, + "loss": 0.2637, + "step": 15428 + }, + { + "epoch": 2.06, + "grad_norm": 0.376953125, + "learning_rate": 0.00011228928227236626, + "loss": 0.1163, + "step": 15429 + }, + { + "epoch": 2.06, + "grad_norm": 0.578125, + "learning_rate": 0.00011227772544201623, + "loss": 0.4544, + "step": 15430 + }, + { + "epoch": 2.06, + "grad_norm": 0.4609375, + "learning_rate": 0.0001122661684451723, + "loss": 0.2406, + "step": 15431 + }, + { + "epoch": 2.06, + "grad_norm": 0.63671875, + "learning_rate": 0.0001122546112819913, + "loss": 0.383, + "step": 15432 + }, + { + "epoch": 2.06, + "grad_norm": 0.51171875, + "learning_rate": 0.00011224305395262988, + "loss": 0.2138, + "step": 15433 + }, + { + "epoch": 2.06, + "grad_norm": 0.6953125, + "learning_rate": 0.00011223149645724482, + "loss": 0.3135, + "step": 15434 + }, + { + "epoch": 2.06, + "grad_norm": 0.57421875, + "learning_rate": 0.0001122199387959928, + "loss": 0.3738, + "step": 15435 + }, + { + "epoch": 2.06, + "grad_norm": 0.451171875, + "learning_rate": 0.00011220838096903057, + "loss": 0.3515, + "step": 15436 + }, + { + "epoch": 2.06, + "grad_norm": 0.50390625, + "learning_rate": 0.00011219682297651487, + "loss": 0.3695, + "step": 15437 + }, + { + "epoch": 2.06, + "grad_norm": 0.412109375, + "learning_rate": 0.00011218526481860239, + "loss": 0.2683, + "step": 15438 + }, + { + "epoch": 2.06, + "grad_norm": 0.478515625, + "learning_rate": 0.00011217370649544991, + "loss": 0.3167, + "step": 15439 + }, + { + "epoch": 2.06, + "grad_norm": 0.5625, + "learning_rate": 0.00011216214800721418, + "loss": 0.4627, + "step": 15440 + }, + { + "epoch": 2.06, + "grad_norm": 0.56640625, + "learning_rate": 0.00011215058935405189, + "loss": 0.3576, + "step": 15441 + }, + { + "epoch": 2.06, + "grad_norm": 0.4296875, + "learning_rate": 0.00011213903053611985, + "loss": 0.2597, + "step": 15442 + }, + { + "epoch": 2.06, + "grad_norm": 0.50390625, + "learning_rate": 0.00011212747155357473, + "loss": 0.3821, + "step": 15443 + }, + { + "epoch": 2.06, + "grad_norm": 0.5546875, + "learning_rate": 0.00011211591240657333, + "loss": 0.3605, + "step": 15444 + }, + { + "epoch": 2.06, + "grad_norm": 0.55078125, + "learning_rate": 0.00011210435309527234, + "loss": 0.21, + "step": 15445 + }, + { + "epoch": 2.06, + "grad_norm": 0.4609375, + "learning_rate": 0.00011209279361982862, + "loss": 0.3342, + "step": 15446 + }, + { + "epoch": 2.06, + "grad_norm": 0.5703125, + "learning_rate": 0.00011208123398039883, + "loss": 0.3461, + "step": 15447 + }, + { + "epoch": 2.06, + "grad_norm": 0.51171875, + "learning_rate": 0.00011206967417713973, + "loss": 0.3025, + "step": 15448 + }, + { + "epoch": 2.06, + "grad_norm": 0.6875, + "learning_rate": 0.00011205811421020813, + "loss": 0.4525, + "step": 15449 + }, + { + "epoch": 2.06, + "grad_norm": 0.41796875, + "learning_rate": 0.00011204655407976074, + "loss": 0.2106, + "step": 15450 + }, + { + "epoch": 2.06, + "grad_norm": 0.609375, + "learning_rate": 0.00011203499378595434, + "loss": 0.3973, + "step": 15451 + }, + { + "epoch": 2.06, + "grad_norm": 0.51953125, + "learning_rate": 0.00011202343332894573, + "loss": 0.3099, + "step": 15452 + }, + { + "epoch": 2.06, + "grad_norm": 0.416015625, + "learning_rate": 0.00011201187270889167, + "loss": 0.3031, + "step": 15453 + }, + { + "epoch": 2.06, + "grad_norm": 0.64453125, + "learning_rate": 0.00011200031192594884, + "loss": 0.5322, + "step": 15454 + }, + { + "epoch": 2.06, + "grad_norm": 0.53515625, + "learning_rate": 0.00011198875098027413, + "loss": 0.4825, + "step": 15455 + }, + { + "epoch": 2.06, + "grad_norm": 0.59375, + "learning_rate": 0.0001119771898720242, + "loss": 0.514, + "step": 15456 + }, + { + "epoch": 2.06, + "grad_norm": 0.55859375, + "learning_rate": 0.00011196562860135595, + "loss": 0.1773, + "step": 15457 + }, + { + "epoch": 2.06, + "grad_norm": 0.49609375, + "learning_rate": 0.00011195406716842607, + "loss": 0.2621, + "step": 15458 + }, + { + "epoch": 2.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00011194250557339139, + "loss": 0.26, + "step": 15459 + }, + { + "epoch": 2.06, + "grad_norm": 0.4921875, + "learning_rate": 0.00011193094381640867, + "loss": 0.1852, + "step": 15460 + }, + { + "epoch": 2.06, + "grad_norm": 0.67578125, + "learning_rate": 0.00011191938189763466, + "loss": 0.4188, + "step": 15461 + }, + { + "epoch": 2.06, + "grad_norm": 0.72265625, + "learning_rate": 0.00011190781981722623, + "loss": 0.2986, + "step": 15462 + }, + { + "epoch": 2.06, + "grad_norm": 0.412109375, + "learning_rate": 0.00011189625757534006, + "loss": 0.2571, + "step": 15463 + }, + { + "epoch": 2.06, + "grad_norm": 0.390625, + "learning_rate": 0.00011188469517213305, + "loss": 0.2192, + "step": 15464 + }, + { + "epoch": 2.06, + "grad_norm": 0.455078125, + "learning_rate": 0.00011187313260776193, + "loss": 0.1884, + "step": 15465 + }, + { + "epoch": 2.06, + "grad_norm": 0.66015625, + "learning_rate": 0.0001118615698823835, + "loss": 0.5928, + "step": 15466 + }, + { + "epoch": 2.06, + "grad_norm": 0.671875, + "learning_rate": 0.0001118500069961546, + "loss": 0.4146, + "step": 15467 + }, + { + "epoch": 2.06, + "grad_norm": 0.5546875, + "learning_rate": 0.00011183844394923196, + "loss": 0.3594, + "step": 15468 + }, + { + "epoch": 2.06, + "grad_norm": 0.62109375, + "learning_rate": 0.00011182688074177245, + "loss": 0.5954, + "step": 15469 + }, + { + "epoch": 2.06, + "grad_norm": 0.6953125, + "learning_rate": 0.00011181531737393282, + "loss": 0.2922, + "step": 15470 + }, + { + "epoch": 2.06, + "grad_norm": 0.56640625, + "learning_rate": 0.00011180375384586996, + "loss": 0.2344, + "step": 15471 + }, + { + "epoch": 2.06, + "grad_norm": 0.515625, + "learning_rate": 0.00011179219015774059, + "loss": 0.4761, + "step": 15472 + }, + { + "epoch": 2.06, + "grad_norm": 0.72265625, + "learning_rate": 0.00011178062630970155, + "loss": 0.462, + "step": 15473 + }, + { + "epoch": 2.06, + "grad_norm": 0.6328125, + "learning_rate": 0.00011176906230190965, + "loss": 0.4959, + "step": 15474 + }, + { + "epoch": 2.06, + "grad_norm": 0.6015625, + "learning_rate": 0.00011175749813452172, + "loss": 0.4934, + "step": 15475 + }, + { + "epoch": 2.07, + "grad_norm": 0.54296875, + "learning_rate": 0.00011174593380769454, + "loss": 0.2216, + "step": 15476 + }, + { + "epoch": 2.07, + "grad_norm": 0.5546875, + "learning_rate": 0.000111734369321585, + "loss": 0.5499, + "step": 15477 + }, + { + "epoch": 2.07, + "grad_norm": 0.63671875, + "learning_rate": 0.00011172280467634988, + "loss": 0.2182, + "step": 15478 + }, + { + "epoch": 2.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00011171123987214598, + "loss": 0.5647, + "step": 15479 + }, + { + "epoch": 2.07, + "grad_norm": 0.7578125, + "learning_rate": 0.00011169967490913016, + "loss": 0.3818, + "step": 15480 + }, + { + "epoch": 2.07, + "grad_norm": 0.3515625, + "learning_rate": 0.00011168810978745923, + "loss": 0.2864, + "step": 15481 + }, + { + "epoch": 2.07, + "grad_norm": 0.44140625, + "learning_rate": 0.00011167654450729007, + "loss": 0.3732, + "step": 15482 + }, + { + "epoch": 2.07, + "grad_norm": 0.62109375, + "learning_rate": 0.00011166497906877944, + "loss": 0.3771, + "step": 15483 + }, + { + "epoch": 2.07, + "grad_norm": 0.443359375, + "learning_rate": 0.0001116534134720842, + "loss": 0.1919, + "step": 15484 + }, + { + "epoch": 2.07, + "grad_norm": 0.59765625, + "learning_rate": 0.00011164184771736122, + "loss": 0.2186, + "step": 15485 + }, + { + "epoch": 2.07, + "grad_norm": 0.6640625, + "learning_rate": 0.00011163028180476729, + "loss": 0.2726, + "step": 15486 + }, + { + "epoch": 2.07, + "grad_norm": 0.423828125, + "learning_rate": 0.00011161871573445928, + "loss": 0.3638, + "step": 15487 + }, + { + "epoch": 2.07, + "grad_norm": 0.609375, + "learning_rate": 0.00011160714950659404, + "loss": 0.3189, + "step": 15488 + }, + { + "epoch": 2.07, + "grad_norm": 0.87109375, + "learning_rate": 0.00011159558312132841, + "loss": 0.4746, + "step": 15489 + }, + { + "epoch": 2.07, + "grad_norm": 0.62890625, + "learning_rate": 0.0001115840165788192, + "loss": 0.4683, + "step": 15490 + }, + { + "epoch": 2.07, + "grad_norm": 0.5625, + "learning_rate": 0.0001115724498792233, + "loss": 0.3795, + "step": 15491 + }, + { + "epoch": 2.07, + "grad_norm": 0.63671875, + "learning_rate": 0.00011156088302269756, + "loss": 0.3518, + "step": 15492 + }, + { + "epoch": 2.07, + "grad_norm": 0.51171875, + "learning_rate": 0.00011154931600939882, + "loss": 0.2409, + "step": 15493 + }, + { + "epoch": 2.07, + "grad_norm": 0.5703125, + "learning_rate": 0.00011153774883948395, + "loss": 0.5402, + "step": 15494 + }, + { + "epoch": 2.07, + "grad_norm": 0.50390625, + "learning_rate": 0.0001115261815131098, + "loss": 0.2938, + "step": 15495 + }, + { + "epoch": 2.07, + "grad_norm": 0.703125, + "learning_rate": 0.00011151461403043326, + "loss": 0.4221, + "step": 15496 + }, + { + "epoch": 2.07, + "grad_norm": 0.54296875, + "learning_rate": 0.00011150304639161111, + "loss": 0.3879, + "step": 15497 + }, + { + "epoch": 2.07, + "grad_norm": 0.5546875, + "learning_rate": 0.00011149147859680031, + "loss": 0.4284, + "step": 15498 + }, + { + "epoch": 2.07, + "grad_norm": 0.44140625, + "learning_rate": 0.00011147991064615766, + "loss": 0.3458, + "step": 15499 + }, + { + "epoch": 2.07, + "grad_norm": 0.50390625, + "learning_rate": 0.00011146834253984006, + "loss": 0.3864, + "step": 15500 + }, + { + "epoch": 2.07, + "grad_norm": 0.66796875, + "learning_rate": 0.00011145677427800438, + "loss": 0.387, + "step": 15501 + }, + { + "epoch": 2.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00011144520586080747, + "loss": 0.2003, + "step": 15502 + }, + { + "epoch": 2.07, + "grad_norm": 0.5703125, + "learning_rate": 0.00011143363728840625, + "loss": 0.2452, + "step": 15503 + }, + { + "epoch": 2.07, + "grad_norm": 0.6875, + "learning_rate": 0.00011142206856095753, + "loss": 0.4434, + "step": 15504 + }, + { + "epoch": 2.07, + "grad_norm": 0.7265625, + "learning_rate": 0.00011141049967861827, + "loss": 0.4427, + "step": 15505 + }, + { + "epoch": 2.07, + "grad_norm": 0.609375, + "learning_rate": 0.00011139893064154529, + "loss": 0.2703, + "step": 15506 + }, + { + "epoch": 2.07, + "grad_norm": 0.380859375, + "learning_rate": 0.00011138736144989552, + "loss": 0.2054, + "step": 15507 + }, + { + "epoch": 2.07, + "grad_norm": 0.578125, + "learning_rate": 0.0001113757921038258, + "loss": 0.4201, + "step": 15508 + }, + { + "epoch": 2.07, + "grad_norm": 0.51171875, + "learning_rate": 0.00011136422260349304, + "loss": 0.2653, + "step": 15509 + }, + { + "epoch": 2.07, + "grad_norm": 0.5078125, + "learning_rate": 0.00011135265294905412, + "loss": 0.1845, + "step": 15510 + }, + { + "epoch": 2.07, + "grad_norm": 0.48828125, + "learning_rate": 0.00011134108314066596, + "loss": 0.2447, + "step": 15511 + }, + { + "epoch": 2.07, + "grad_norm": 0.51953125, + "learning_rate": 0.00011132951317848543, + "loss": 0.269, + "step": 15512 + }, + { + "epoch": 2.07, + "grad_norm": 0.484375, + "learning_rate": 0.00011131794306266942, + "loss": 0.2013, + "step": 15513 + }, + { + "epoch": 2.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00011130637279337483, + "loss": 0.3708, + "step": 15514 + }, + { + "epoch": 2.07, + "grad_norm": 0.58203125, + "learning_rate": 0.0001112948023707586, + "loss": 0.2386, + "step": 15515 + }, + { + "epoch": 2.07, + "grad_norm": 0.84765625, + "learning_rate": 0.00011128323179497757, + "loss": 0.3157, + "step": 15516 + }, + { + "epoch": 2.07, + "grad_norm": 0.4296875, + "learning_rate": 0.00011127166106618869, + "loss": 0.3584, + "step": 15517 + }, + { + "epoch": 2.07, + "grad_norm": 0.56640625, + "learning_rate": 0.00011126009018454884, + "loss": 0.2949, + "step": 15518 + }, + { + "epoch": 2.07, + "grad_norm": 0.734375, + "learning_rate": 0.00011124851915021497, + "loss": 0.2123, + "step": 15519 + }, + { + "epoch": 2.07, + "grad_norm": 0.474609375, + "learning_rate": 0.00011123694796334392, + "loss": 0.2022, + "step": 15520 + }, + { + "epoch": 2.07, + "grad_norm": 0.4765625, + "learning_rate": 0.00011122537662409266, + "loss": 0.347, + "step": 15521 + }, + { + "epoch": 2.07, + "grad_norm": 0.484375, + "learning_rate": 0.0001112138051326181, + "loss": 0.3132, + "step": 15522 + }, + { + "epoch": 2.07, + "grad_norm": 0.46875, + "learning_rate": 0.00011120223348907715, + "loss": 0.2703, + "step": 15523 + }, + { + "epoch": 2.07, + "grad_norm": 0.4375, + "learning_rate": 0.00011119066169362669, + "loss": 0.1469, + "step": 15524 + }, + { + "epoch": 2.07, + "grad_norm": 0.52734375, + "learning_rate": 0.0001111790897464237, + "loss": 0.3883, + "step": 15525 + }, + { + "epoch": 2.07, + "grad_norm": 0.466796875, + "learning_rate": 0.0001111675176476251, + "loss": 0.173, + "step": 15526 + }, + { + "epoch": 2.07, + "grad_norm": 0.47265625, + "learning_rate": 0.00011115594539738774, + "loss": 0.3188, + "step": 15527 + }, + { + "epoch": 2.07, + "grad_norm": 0.71875, + "learning_rate": 0.00011114437299586864, + "loss": 0.2957, + "step": 15528 + }, + { + "epoch": 2.07, + "grad_norm": 0.54296875, + "learning_rate": 0.00011113280044322467, + "loss": 0.3877, + "step": 15529 + }, + { + "epoch": 2.07, + "grad_norm": 0.462890625, + "learning_rate": 0.0001111212277396128, + "loss": 0.3107, + "step": 15530 + }, + { + "epoch": 2.07, + "grad_norm": 0.48828125, + "learning_rate": 0.00011110965488518994, + "loss": 0.3251, + "step": 15531 + }, + { + "epoch": 2.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00011109808188011301, + "loss": 0.377, + "step": 15532 + }, + { + "epoch": 2.07, + "grad_norm": 0.84375, + "learning_rate": 0.000111086508724539, + "loss": 0.4462, + "step": 15533 + }, + { + "epoch": 2.07, + "grad_norm": 0.43359375, + "learning_rate": 0.00011107493541862479, + "loss": 0.2299, + "step": 15534 + }, + { + "epoch": 2.07, + "grad_norm": 0.58984375, + "learning_rate": 0.00011106336196252735, + "loss": 0.4362, + "step": 15535 + }, + { + "epoch": 2.07, + "grad_norm": 0.87109375, + "learning_rate": 0.00011105178835640361, + "loss": 0.2389, + "step": 15536 + }, + { + "epoch": 2.07, + "grad_norm": 0.462890625, + "learning_rate": 0.00011104021460041058, + "loss": 0.3697, + "step": 15537 + }, + { + "epoch": 2.07, + "grad_norm": 0.74609375, + "learning_rate": 0.00011102864069470512, + "loss": 0.6187, + "step": 15538 + }, + { + "epoch": 2.07, + "grad_norm": 0.52734375, + "learning_rate": 0.0001110170666394442, + "loss": 0.2483, + "step": 15539 + }, + { + "epoch": 2.07, + "grad_norm": 0.578125, + "learning_rate": 0.0001110054924347848, + "loss": 0.1906, + "step": 15540 + }, + { + "epoch": 2.07, + "grad_norm": 0.6484375, + "learning_rate": 0.00011099391808088384, + "loss": 0.4019, + "step": 15541 + }, + { + "epoch": 2.07, + "grad_norm": 0.4140625, + "learning_rate": 0.00011098234357789831, + "loss": 0.2677, + "step": 15542 + }, + { + "epoch": 2.07, + "grad_norm": 0.64453125, + "learning_rate": 0.00011097076892598514, + "loss": 0.3501, + "step": 15543 + }, + { + "epoch": 2.07, + "grad_norm": 0.5078125, + "learning_rate": 0.00011095919412530136, + "loss": 0.4528, + "step": 15544 + }, + { + "epoch": 2.07, + "grad_norm": 0.59765625, + "learning_rate": 0.00011094761917600382, + "loss": 0.4781, + "step": 15545 + }, + { + "epoch": 2.07, + "grad_norm": 0.73046875, + "learning_rate": 0.00011093604407824955, + "loss": 0.3417, + "step": 15546 + }, + { + "epoch": 2.07, + "grad_norm": 0.55078125, + "learning_rate": 0.00011092446883219549, + "loss": 0.3666, + "step": 15547 + }, + { + "epoch": 2.07, + "grad_norm": 0.640625, + "learning_rate": 0.00011091289343799864, + "loss": 0.3817, + "step": 15548 + }, + { + "epoch": 2.07, + "grad_norm": 0.578125, + "learning_rate": 0.00011090131789581595, + "loss": 0.467, + "step": 15549 + }, + { + "epoch": 2.07, + "grad_norm": 0.484375, + "learning_rate": 0.00011088974220580439, + "loss": 0.3155, + "step": 15550 + }, + { + "epoch": 2.08, + "grad_norm": 0.640625, + "learning_rate": 0.00011087816636812094, + "loss": 0.3577, + "step": 15551 + }, + { + "epoch": 2.08, + "grad_norm": 0.486328125, + "learning_rate": 0.00011086659038292255, + "loss": 0.3693, + "step": 15552 + }, + { + "epoch": 2.08, + "grad_norm": 0.484375, + "learning_rate": 0.00011085501425036626, + "loss": 0.2929, + "step": 15553 + }, + { + "epoch": 2.08, + "grad_norm": 0.546875, + "learning_rate": 0.00011084343797060897, + "loss": 0.2961, + "step": 15554 + }, + { + "epoch": 2.08, + "grad_norm": 0.48046875, + "learning_rate": 0.00011083186154380774, + "loss": 0.33, + "step": 15555 + }, + { + "epoch": 2.08, + "grad_norm": 0.310546875, + "learning_rate": 0.0001108202849701195, + "loss": 0.1209, + "step": 15556 + }, + { + "epoch": 2.08, + "grad_norm": 0.6875, + "learning_rate": 0.00011080870824970122, + "loss": 0.2012, + "step": 15557 + }, + { + "epoch": 2.08, + "grad_norm": 0.48828125, + "learning_rate": 0.00011079713138270994, + "loss": 0.3662, + "step": 15558 + }, + { + "epoch": 2.08, + "grad_norm": 0.53125, + "learning_rate": 0.00011078555436930265, + "loss": 0.2585, + "step": 15559 + }, + { + "epoch": 2.08, + "grad_norm": 0.63671875, + "learning_rate": 0.00011077397720963631, + "loss": 0.4929, + "step": 15560 + }, + { + "epoch": 2.08, + "grad_norm": 0.578125, + "learning_rate": 0.0001107623999038679, + "loss": 0.2089, + "step": 15561 + }, + { + "epoch": 2.08, + "grad_norm": 0.5703125, + "learning_rate": 0.00011075082245215449, + "loss": 0.3376, + "step": 15562 + }, + { + "epoch": 2.08, + "grad_norm": 0.42578125, + "learning_rate": 0.00011073924485465299, + "loss": 0.2219, + "step": 15563 + }, + { + "epoch": 2.08, + "grad_norm": 0.5703125, + "learning_rate": 0.00011072766711152043, + "loss": 0.2969, + "step": 15564 + }, + { + "epoch": 2.08, + "grad_norm": 0.5078125, + "learning_rate": 0.00011071608922291384, + "loss": 0.2304, + "step": 15565 + }, + { + "epoch": 2.08, + "grad_norm": 0.66015625, + "learning_rate": 0.00011070451118899019, + "loss": 0.6101, + "step": 15566 + }, + { + "epoch": 2.08, + "grad_norm": 0.5234375, + "learning_rate": 0.0001106929330099065, + "loss": 0.4377, + "step": 15567 + }, + { + "epoch": 2.08, + "grad_norm": 0.46875, + "learning_rate": 0.0001106813546858198, + "loss": 0.144, + "step": 15568 + }, + { + "epoch": 2.08, + "grad_norm": 0.640625, + "learning_rate": 0.00011066977621688705, + "loss": 0.2961, + "step": 15569 + }, + { + "epoch": 2.08, + "grad_norm": 0.390625, + "learning_rate": 0.0001106581976032653, + "loss": 0.1486, + "step": 15570 + }, + { + "epoch": 2.08, + "grad_norm": 0.4609375, + "learning_rate": 0.00011064661884511153, + "loss": 0.3038, + "step": 15571 + }, + { + "epoch": 2.08, + "grad_norm": 0.5625, + "learning_rate": 0.0001106350399425828, + "loss": 0.1747, + "step": 15572 + }, + { + "epoch": 2.08, + "grad_norm": 0.47265625, + "learning_rate": 0.00011062346089583608, + "loss": 0.1714, + "step": 15573 + }, + { + "epoch": 2.08, + "grad_norm": 0.494140625, + "learning_rate": 0.00011061188170502843, + "loss": 0.3008, + "step": 15574 + }, + { + "epoch": 2.08, + "grad_norm": 0.51171875, + "learning_rate": 0.00011060030237031685, + "loss": 0.407, + "step": 15575 + }, + { + "epoch": 2.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00011058872289185836, + "loss": 0.2625, + "step": 15576 + }, + { + "epoch": 2.08, + "grad_norm": 0.49609375, + "learning_rate": 0.00011057714326980999, + "loss": 0.4948, + "step": 15577 + }, + { + "epoch": 2.08, + "grad_norm": 0.47265625, + "learning_rate": 0.00011056556350432878, + "loss": 0.3096, + "step": 15578 + }, + { + "epoch": 2.08, + "grad_norm": 0.494140625, + "learning_rate": 0.00011055398359557171, + "loss": 0.1939, + "step": 15579 + }, + { + "epoch": 2.08, + "grad_norm": 0.478515625, + "learning_rate": 0.0001105424035436959, + "loss": 0.2822, + "step": 15580 + }, + { + "epoch": 2.08, + "grad_norm": 0.78125, + "learning_rate": 0.00011053082334885832, + "loss": 0.6394, + "step": 15581 + }, + { + "epoch": 2.08, + "grad_norm": 0.490234375, + "learning_rate": 0.00011051924301121598, + "loss": 0.4518, + "step": 15582 + }, + { + "epoch": 2.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00011050766253092601, + "loss": 0.5086, + "step": 15583 + }, + { + "epoch": 2.08, + "grad_norm": 0.75, + "learning_rate": 0.00011049608190814535, + "loss": 0.5519, + "step": 15584 + }, + { + "epoch": 2.08, + "grad_norm": 0.46875, + "learning_rate": 0.0001104845011430311, + "loss": 0.3242, + "step": 15585 + }, + { + "epoch": 2.08, + "grad_norm": 0.5703125, + "learning_rate": 0.00011047292023574029, + "loss": 0.3022, + "step": 15586 + }, + { + "epoch": 2.08, + "grad_norm": 0.6328125, + "learning_rate": 0.00011046133918642996, + "loss": 0.4122, + "step": 15587 + }, + { + "epoch": 2.08, + "grad_norm": 0.5546875, + "learning_rate": 0.00011044975799525716, + "loss": 0.4305, + "step": 15588 + }, + { + "epoch": 2.08, + "grad_norm": 0.62109375, + "learning_rate": 0.0001104381766623789, + "loss": 0.3557, + "step": 15589 + }, + { + "epoch": 2.08, + "grad_norm": 0.49609375, + "learning_rate": 0.00011042659518795231, + "loss": 0.1753, + "step": 15590 + }, + { + "epoch": 2.08, + "grad_norm": 0.5703125, + "learning_rate": 0.00011041501357213435, + "loss": 0.3585, + "step": 15591 + }, + { + "epoch": 2.08, + "grad_norm": 0.392578125, + "learning_rate": 0.00011040343181508219, + "loss": 0.3313, + "step": 15592 + }, + { + "epoch": 2.08, + "grad_norm": 0.455078125, + "learning_rate": 0.00011039184991695277, + "loss": 0.2077, + "step": 15593 + }, + { + "epoch": 2.08, + "grad_norm": 0.52734375, + "learning_rate": 0.0001103802678779032, + "loss": 0.3886, + "step": 15594 + }, + { + "epoch": 2.08, + "grad_norm": 0.39453125, + "learning_rate": 0.00011036868569809054, + "loss": 0.2753, + "step": 15595 + }, + { + "epoch": 2.08, + "grad_norm": 0.90234375, + "learning_rate": 0.00011035710337767185, + "loss": 0.4083, + "step": 15596 + }, + { + "epoch": 2.08, + "grad_norm": 0.453125, + "learning_rate": 0.00011034552091680417, + "loss": 0.2602, + "step": 15597 + }, + { + "epoch": 2.08, + "grad_norm": 0.431640625, + "learning_rate": 0.0001103339383156446, + "loss": 0.2317, + "step": 15598 + }, + { + "epoch": 2.08, + "grad_norm": 0.482421875, + "learning_rate": 0.00011032235557435023, + "loss": 0.3718, + "step": 15599 + }, + { + "epoch": 2.08, + "grad_norm": 0.56640625, + "learning_rate": 0.00011031077269307804, + "loss": 0.1795, + "step": 15600 + }, + { + "epoch": 2.08, + "grad_norm": 0.4765625, + "learning_rate": 0.00011029918967198519, + "loss": 0.2611, + "step": 15601 + }, + { + "epoch": 2.08, + "grad_norm": 0.51171875, + "learning_rate": 0.00011028760651122869, + "loss": 0.202, + "step": 15602 + }, + { + "epoch": 2.08, + "grad_norm": 0.443359375, + "learning_rate": 0.00011027602321096566, + "loss": 0.1769, + "step": 15603 + }, + { + "epoch": 2.08, + "grad_norm": 0.5703125, + "learning_rate": 0.00011026443977135316, + "loss": 0.4665, + "step": 15604 + }, + { + "epoch": 2.08, + "grad_norm": 0.546875, + "learning_rate": 0.00011025285619254826, + "loss": 0.3411, + "step": 15605 + }, + { + "epoch": 2.08, + "grad_norm": 0.427734375, + "learning_rate": 0.00011024127247470806, + "loss": 0.3118, + "step": 15606 + }, + { + "epoch": 2.08, + "grad_norm": 0.390625, + "learning_rate": 0.00011022968861798961, + "loss": 0.1669, + "step": 15607 + }, + { + "epoch": 2.08, + "grad_norm": 0.455078125, + "learning_rate": 0.00011021810462255007, + "loss": 0.1762, + "step": 15608 + }, + { + "epoch": 2.08, + "grad_norm": 0.5546875, + "learning_rate": 0.00011020652048854641, + "loss": 0.2387, + "step": 15609 + }, + { + "epoch": 2.08, + "grad_norm": 0.515625, + "learning_rate": 0.00011019493621613583, + "loss": 0.221, + "step": 15610 + }, + { + "epoch": 2.08, + "grad_norm": 0.91015625, + "learning_rate": 0.00011018335180547535, + "loss": 0.5682, + "step": 15611 + }, + { + "epoch": 2.08, + "grad_norm": 0.58984375, + "learning_rate": 0.00011017176725672206, + "loss": 0.3189, + "step": 15612 + }, + { + "epoch": 2.08, + "grad_norm": 0.5859375, + "learning_rate": 0.00011016018257003314, + "loss": 0.2148, + "step": 15613 + }, + { + "epoch": 2.08, + "grad_norm": 0.515625, + "learning_rate": 0.00011014859774556556, + "loss": 0.3247, + "step": 15614 + }, + { + "epoch": 2.08, + "grad_norm": 0.6171875, + "learning_rate": 0.00011013701278347653, + "loss": 0.5203, + "step": 15615 + }, + { + "epoch": 2.08, + "grad_norm": 0.470703125, + "learning_rate": 0.00011012542768392309, + "loss": 0.1952, + "step": 15616 + }, + { + "epoch": 2.08, + "grad_norm": 0.53515625, + "learning_rate": 0.00011011384244706233, + "loss": 0.5251, + "step": 15617 + }, + { + "epoch": 2.08, + "grad_norm": 0.53125, + "learning_rate": 0.0001101022570730514, + "loss": 0.4422, + "step": 15618 + }, + { + "epoch": 2.08, + "grad_norm": 0.57421875, + "learning_rate": 0.00011009067156204736, + "loss": 0.3944, + "step": 15619 + }, + { + "epoch": 2.08, + "grad_norm": 0.61328125, + "learning_rate": 0.00011007908591420735, + "loss": 0.2829, + "step": 15620 + }, + { + "epoch": 2.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00011006750012968845, + "loss": 0.5743, + "step": 15621 + }, + { + "epoch": 2.08, + "grad_norm": 0.75, + "learning_rate": 0.00011005591420864782, + "loss": 0.6255, + "step": 15622 + }, + { + "epoch": 2.08, + "grad_norm": 0.625, + "learning_rate": 0.0001100443281512425, + "loss": 0.3436, + "step": 15623 + }, + { + "epoch": 2.08, + "grad_norm": 0.49609375, + "learning_rate": 0.00011003274195762967, + "loss": 0.3164, + "step": 15624 + }, + { + "epoch": 2.09, + "grad_norm": 0.50390625, + "learning_rate": 0.0001100211556279664, + "loss": 0.3451, + "step": 15625 + }, + { + "epoch": 2.09, + "grad_norm": 0.74609375, + "learning_rate": 0.00011000956916240985, + "loss": 0.2853, + "step": 15626 + }, + { + "epoch": 2.09, + "grad_norm": 0.6328125, + "learning_rate": 0.00010999798256111709, + "loss": 0.3619, + "step": 15627 + }, + { + "epoch": 2.09, + "grad_norm": 0.6328125, + "learning_rate": 0.0001099863958242453, + "loss": 0.4374, + "step": 15628 + }, + { + "epoch": 2.09, + "grad_norm": 0.56640625, + "learning_rate": 0.00010997480895195158, + "loss": 0.5763, + "step": 15629 + }, + { + "epoch": 2.09, + "grad_norm": 0.546875, + "learning_rate": 0.00010996322194439302, + "loss": 0.3781, + "step": 15630 + }, + { + "epoch": 2.09, + "grad_norm": 0.369140625, + "learning_rate": 0.00010995163480172679, + "loss": 0.2571, + "step": 15631 + }, + { + "epoch": 2.09, + "grad_norm": 0.466796875, + "learning_rate": 0.00010994004752410998, + "loss": 0.2941, + "step": 15632 + }, + { + "epoch": 2.09, + "grad_norm": 0.55078125, + "learning_rate": 0.00010992846011169977, + "loss": 0.3692, + "step": 15633 + }, + { + "epoch": 2.09, + "grad_norm": 0.4921875, + "learning_rate": 0.00010991687256465326, + "loss": 0.4095, + "step": 15634 + }, + { + "epoch": 2.09, + "grad_norm": 0.52734375, + "learning_rate": 0.00010990528488312764, + "loss": 0.4896, + "step": 15635 + }, + { + "epoch": 2.09, + "grad_norm": 0.4453125, + "learning_rate": 0.00010989369706727994, + "loss": 0.2113, + "step": 15636 + }, + { + "epoch": 2.09, + "grad_norm": 0.69921875, + "learning_rate": 0.00010988210911726738, + "loss": 0.4134, + "step": 15637 + }, + { + "epoch": 2.09, + "grad_norm": 0.4296875, + "learning_rate": 0.00010987052103324707, + "loss": 0.293, + "step": 15638 + }, + { + "epoch": 2.09, + "grad_norm": 0.51171875, + "learning_rate": 0.00010985893281537616, + "loss": 0.2067, + "step": 15639 + }, + { + "epoch": 2.09, + "grad_norm": 0.56640625, + "learning_rate": 0.0001098473444638118, + "loss": 0.4236, + "step": 15640 + }, + { + "epoch": 2.09, + "grad_norm": 0.58203125, + "learning_rate": 0.00010983575597871114, + "loss": 0.477, + "step": 15641 + }, + { + "epoch": 2.09, + "grad_norm": 0.54296875, + "learning_rate": 0.00010982416736023132, + "loss": 0.2465, + "step": 15642 + }, + { + "epoch": 2.09, + "grad_norm": 0.5625, + "learning_rate": 0.00010981257860852945, + "loss": 0.2459, + "step": 15643 + }, + { + "epoch": 2.09, + "grad_norm": 0.68359375, + "learning_rate": 0.00010980098972376276, + "loss": 0.6729, + "step": 15644 + }, + { + "epoch": 2.09, + "grad_norm": 0.5859375, + "learning_rate": 0.00010978940070608832, + "loss": 0.48, + "step": 15645 + }, + { + "epoch": 2.09, + "grad_norm": 0.482421875, + "learning_rate": 0.00010977781155566333, + "loss": 0.3307, + "step": 15646 + }, + { + "epoch": 2.09, + "grad_norm": 0.65625, + "learning_rate": 0.00010976622227264497, + "loss": 0.4311, + "step": 15647 + }, + { + "epoch": 2.09, + "grad_norm": 0.53515625, + "learning_rate": 0.00010975463285719034, + "loss": 0.4227, + "step": 15648 + }, + { + "epoch": 2.09, + "grad_norm": 0.4921875, + "learning_rate": 0.00010974304330945664, + "loss": 0.2064, + "step": 15649 + }, + { + "epoch": 2.09, + "grad_norm": 0.419921875, + "learning_rate": 0.000109731453629601, + "loss": 0.1799, + "step": 15650 + }, + { + "epoch": 2.09, + "grad_norm": 0.51953125, + "learning_rate": 0.00010971986381778064, + "loss": 0.2644, + "step": 15651 + }, + { + "epoch": 2.09, + "grad_norm": 0.59375, + "learning_rate": 0.00010970827387415263, + "loss": 0.4612, + "step": 15652 + }, + { + "epoch": 2.09, + "grad_norm": 0.60546875, + "learning_rate": 0.00010969668379887426, + "loss": 0.2734, + "step": 15653 + }, + { + "epoch": 2.09, + "grad_norm": 0.578125, + "learning_rate": 0.00010968509359210262, + "loss": 0.407, + "step": 15654 + }, + { + "epoch": 2.09, + "grad_norm": 0.439453125, + "learning_rate": 0.00010967350325399485, + "loss": 0.3518, + "step": 15655 + }, + { + "epoch": 2.09, + "grad_norm": 0.60546875, + "learning_rate": 0.00010966191278470822, + "loss": 0.3915, + "step": 15656 + }, + { + "epoch": 2.09, + "grad_norm": 0.74609375, + "learning_rate": 0.00010965032218439981, + "loss": 0.48, + "step": 15657 + }, + { + "epoch": 2.09, + "grad_norm": 0.51171875, + "learning_rate": 0.00010963873145322687, + "loss": 0.3479, + "step": 15658 + }, + { + "epoch": 2.09, + "grad_norm": 0.4921875, + "learning_rate": 0.00010962714059134653, + "loss": 0.3454, + "step": 15659 + }, + { + "epoch": 2.09, + "grad_norm": 0.5, + "learning_rate": 0.00010961554959891598, + "loss": 0.283, + "step": 15660 + }, + { + "epoch": 2.09, + "grad_norm": 0.57421875, + "learning_rate": 0.00010960395847609244, + "loss": 0.3367, + "step": 15661 + }, + { + "epoch": 2.09, + "grad_norm": 0.3984375, + "learning_rate": 0.000109592367223033, + "loss": 0.2211, + "step": 15662 + }, + { + "epoch": 2.09, + "grad_norm": 0.515625, + "learning_rate": 0.00010958077583989496, + "loss": 0.3735, + "step": 15663 + }, + { + "epoch": 2.09, + "grad_norm": 0.64453125, + "learning_rate": 0.00010956918432683543, + "loss": 0.4167, + "step": 15664 + }, + { + "epoch": 2.09, + "grad_norm": 0.7890625, + "learning_rate": 0.00010955759268401165, + "loss": 0.6103, + "step": 15665 + }, + { + "epoch": 2.09, + "grad_norm": 0.416015625, + "learning_rate": 0.00010954600091158072, + "loss": 0.2267, + "step": 15666 + }, + { + "epoch": 2.09, + "grad_norm": 0.67578125, + "learning_rate": 0.00010953440900969994, + "loss": 0.4972, + "step": 15667 + }, + { + "epoch": 2.09, + "grad_norm": 0.60546875, + "learning_rate": 0.00010952281697852641, + "loss": 0.4668, + "step": 15668 + }, + { + "epoch": 2.09, + "grad_norm": 0.5546875, + "learning_rate": 0.0001095112248182174, + "loss": 0.5058, + "step": 15669 + }, + { + "epoch": 2.09, + "grad_norm": 0.69140625, + "learning_rate": 0.00010949963252893005, + "loss": 0.2603, + "step": 15670 + }, + { + "epoch": 2.09, + "grad_norm": 0.326171875, + "learning_rate": 0.00010948804011082164, + "loss": 0.1945, + "step": 15671 + }, + { + "epoch": 2.09, + "grad_norm": 0.5703125, + "learning_rate": 0.00010947644756404929, + "loss": 0.4298, + "step": 15672 + }, + { + "epoch": 2.09, + "grad_norm": 0.68359375, + "learning_rate": 0.0001094648548887702, + "loss": 0.4199, + "step": 15673 + }, + { + "epoch": 2.09, + "grad_norm": 0.63671875, + "learning_rate": 0.00010945326208514164, + "loss": 0.3369, + "step": 15674 + }, + { + "epoch": 2.09, + "grad_norm": 0.57421875, + "learning_rate": 0.00010944166915332075, + "loss": 0.3157, + "step": 15675 + }, + { + "epoch": 2.09, + "grad_norm": 0.5546875, + "learning_rate": 0.00010943007609346478, + "loss": 0.5357, + "step": 15676 + }, + { + "epoch": 2.09, + "grad_norm": 0.478515625, + "learning_rate": 0.00010941848290573093, + "loss": 0.2308, + "step": 15677 + }, + { + "epoch": 2.09, + "grad_norm": 0.6875, + "learning_rate": 0.0001094068895902764, + "loss": 0.4113, + "step": 15678 + }, + { + "epoch": 2.09, + "grad_norm": 0.6171875, + "learning_rate": 0.00010939529614725843, + "loss": 0.2819, + "step": 15679 + }, + { + "epoch": 2.09, + "grad_norm": 0.470703125, + "learning_rate": 0.00010938370257683419, + "loss": 0.1937, + "step": 15680 + }, + { + "epoch": 2.09, + "grad_norm": 0.52734375, + "learning_rate": 0.00010937210887916092, + "loss": 0.3249, + "step": 15681 + }, + { + "epoch": 2.09, + "grad_norm": 0.5078125, + "learning_rate": 0.00010936051505439584, + "loss": 0.3898, + "step": 15682 + }, + { + "epoch": 2.09, + "grad_norm": 0.5546875, + "learning_rate": 0.00010934892110269622, + "loss": 0.2826, + "step": 15683 + }, + { + "epoch": 2.09, + "grad_norm": 0.5703125, + "learning_rate": 0.00010933732702421917, + "loss": 0.3747, + "step": 15684 + }, + { + "epoch": 2.09, + "grad_norm": 0.3984375, + "learning_rate": 0.00010932573281912197, + "loss": 0.126, + "step": 15685 + }, + { + "epoch": 2.09, + "grad_norm": 0.53125, + "learning_rate": 0.00010931413848756186, + "loss": 0.3272, + "step": 15686 + }, + { + "epoch": 2.09, + "grad_norm": 0.396484375, + "learning_rate": 0.00010930254402969607, + "loss": 0.3226, + "step": 15687 + }, + { + "epoch": 2.09, + "grad_norm": 0.56640625, + "learning_rate": 0.00010929094944568182, + "loss": 0.2929, + "step": 15688 + }, + { + "epoch": 2.09, + "grad_norm": 0.4296875, + "learning_rate": 0.00010927935473567632, + "loss": 0.3146, + "step": 15689 + }, + { + "epoch": 2.09, + "grad_norm": 0.482421875, + "learning_rate": 0.00010926775989983682, + "loss": 0.3198, + "step": 15690 + }, + { + "epoch": 2.09, + "grad_norm": 0.625, + "learning_rate": 0.00010925616493832051, + "loss": 0.3863, + "step": 15691 + }, + { + "epoch": 2.09, + "grad_norm": 0.431640625, + "learning_rate": 0.0001092445698512847, + "loss": 0.2583, + "step": 15692 + }, + { + "epoch": 2.09, + "grad_norm": 0.46484375, + "learning_rate": 0.00010923297463888658, + "loss": 0.2426, + "step": 15693 + }, + { + "epoch": 2.09, + "grad_norm": 0.58203125, + "learning_rate": 0.0001092213793012834, + "loss": 0.4092, + "step": 15694 + }, + { + "epoch": 2.09, + "grad_norm": 0.64453125, + "learning_rate": 0.0001092097838386324, + "loss": 0.2347, + "step": 15695 + }, + { + "epoch": 2.09, + "grad_norm": 0.57421875, + "learning_rate": 0.0001091981882510908, + "loss": 0.3902, + "step": 15696 + }, + { + "epoch": 2.09, + "grad_norm": 0.5625, + "learning_rate": 0.00010918659253881591, + "loss": 0.2905, + "step": 15697 + }, + { + "epoch": 2.09, + "grad_norm": 0.5625, + "learning_rate": 0.0001091749967019649, + "loss": 0.2331, + "step": 15698 + }, + { + "epoch": 2.09, + "grad_norm": 0.7734375, + "learning_rate": 0.00010916340074069505, + "loss": 0.6922, + "step": 15699 + }, + { + "epoch": 2.1, + "grad_norm": 0.447265625, + "learning_rate": 0.00010915180465516357, + "loss": 0.2276, + "step": 15700 + }, + { + "epoch": 2.1, + "grad_norm": 0.5078125, + "learning_rate": 0.0001091402084455278, + "loss": 0.4493, + "step": 15701 + }, + { + "epoch": 2.1, + "grad_norm": 0.60546875, + "learning_rate": 0.00010912861211194491, + "loss": 0.4045, + "step": 15702 + }, + { + "epoch": 2.1, + "grad_norm": 0.609375, + "learning_rate": 0.00010911701565457218, + "loss": 0.2849, + "step": 15703 + }, + { + "epoch": 2.1, + "grad_norm": 0.494140625, + "learning_rate": 0.00010910541907356688, + "loss": 0.3865, + "step": 15704 + }, + { + "epoch": 2.1, + "grad_norm": 0.404296875, + "learning_rate": 0.00010909382236908623, + "loss": 0.3738, + "step": 15705 + }, + { + "epoch": 2.1, + "grad_norm": 0.5703125, + "learning_rate": 0.00010908222554128755, + "loss": 0.3066, + "step": 15706 + }, + { + "epoch": 2.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00010907062859032803, + "loss": 0.5477, + "step": 15707 + }, + { + "epoch": 2.1, + "grad_norm": 0.341796875, + "learning_rate": 0.00010905903151636501, + "loss": 0.1263, + "step": 15708 + }, + { + "epoch": 2.1, + "grad_norm": 0.46875, + "learning_rate": 0.00010904743431955564, + "loss": 0.2888, + "step": 15709 + }, + { + "epoch": 2.1, + "grad_norm": 0.6484375, + "learning_rate": 0.00010903583700005728, + "loss": 0.6389, + "step": 15710 + }, + { + "epoch": 2.1, + "grad_norm": 0.6953125, + "learning_rate": 0.00010902423955802719, + "loss": 0.3373, + "step": 15711 + }, + { + "epoch": 2.1, + "grad_norm": 0.59375, + "learning_rate": 0.0001090126419936226, + "loss": 0.4875, + "step": 15712 + }, + { + "epoch": 2.1, + "grad_norm": 0.87890625, + "learning_rate": 0.00010900104430700084, + "loss": 0.4424, + "step": 15713 + }, + { + "epoch": 2.1, + "grad_norm": 0.48046875, + "learning_rate": 0.00010898944649831911, + "loss": 0.2683, + "step": 15714 + }, + { + "epoch": 2.1, + "grad_norm": 0.55859375, + "learning_rate": 0.00010897784856773473, + "loss": 0.3377, + "step": 15715 + }, + { + "epoch": 2.1, + "grad_norm": 0.65625, + "learning_rate": 0.00010896625051540492, + "loss": 0.2497, + "step": 15716 + }, + { + "epoch": 2.1, + "grad_norm": 0.67578125, + "learning_rate": 0.00010895465234148702, + "loss": 0.355, + "step": 15717 + }, + { + "epoch": 2.1, + "grad_norm": 0.6796875, + "learning_rate": 0.0001089430540461383, + "loss": 0.6146, + "step": 15718 + }, + { + "epoch": 2.1, + "grad_norm": 0.54296875, + "learning_rate": 0.00010893145562951603, + "loss": 0.2613, + "step": 15719 + }, + { + "epoch": 2.1, + "grad_norm": 0.61328125, + "learning_rate": 0.00010891985709177749, + "loss": 0.3919, + "step": 15720 + }, + { + "epoch": 2.1, + "grad_norm": 0.474609375, + "learning_rate": 0.00010890825843307994, + "loss": 0.3231, + "step": 15721 + }, + { + "epoch": 2.1, + "grad_norm": 0.48046875, + "learning_rate": 0.00010889665965358071, + "loss": 0.2623, + "step": 15722 + }, + { + "epoch": 2.1, + "grad_norm": 0.484375, + "learning_rate": 0.00010888506075343706, + "loss": 0.3858, + "step": 15723 + }, + { + "epoch": 2.1, + "grad_norm": 0.671875, + "learning_rate": 0.00010887346173280629, + "loss": 0.3057, + "step": 15724 + }, + { + "epoch": 2.1, + "grad_norm": 0.546875, + "learning_rate": 0.00010886186259184565, + "loss": 0.262, + "step": 15725 + }, + { + "epoch": 2.1, + "grad_norm": 0.447265625, + "learning_rate": 0.0001088502633307125, + "loss": 0.214, + "step": 15726 + }, + { + "epoch": 2.1, + "grad_norm": 0.76171875, + "learning_rate": 0.00010883866394956413, + "loss": 0.4834, + "step": 15727 + }, + { + "epoch": 2.1, + "grad_norm": 0.53125, + "learning_rate": 0.00010882706444855774, + "loss": 0.1602, + "step": 15728 + }, + { + "epoch": 2.1, + "grad_norm": 0.40234375, + "learning_rate": 0.00010881546482785072, + "loss": 0.1803, + "step": 15729 + }, + { + "epoch": 2.1, + "grad_norm": 0.4296875, + "learning_rate": 0.00010880386508760032, + "loss": 0.2146, + "step": 15730 + }, + { + "epoch": 2.1, + "grad_norm": 0.55859375, + "learning_rate": 0.00010879226522796388, + "loss": 0.3373, + "step": 15731 + }, + { + "epoch": 2.1, + "grad_norm": 0.59375, + "learning_rate": 0.00010878066524909869, + "loss": 0.2459, + "step": 15732 + }, + { + "epoch": 2.1, + "grad_norm": 0.69140625, + "learning_rate": 0.00010876906515116203, + "loss": 0.5803, + "step": 15733 + }, + { + "epoch": 2.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00010875746493431123, + "loss": 0.3743, + "step": 15734 + }, + { + "epoch": 2.1, + "grad_norm": 0.62109375, + "learning_rate": 0.00010874586459870355, + "loss": 0.4404, + "step": 15735 + }, + { + "epoch": 2.1, + "grad_norm": 0.61328125, + "learning_rate": 0.00010873426414449636, + "loss": 0.3518, + "step": 15736 + }, + { + "epoch": 2.1, + "grad_norm": 0.55859375, + "learning_rate": 0.00010872266357184693, + "loss": 0.1931, + "step": 15737 + }, + { + "epoch": 2.1, + "grad_norm": 0.546875, + "learning_rate": 0.00010871106288091262, + "loss": 0.2536, + "step": 15738 + }, + { + "epoch": 2.1, + "grad_norm": 0.470703125, + "learning_rate": 0.00010869946207185067, + "loss": 0.3201, + "step": 15739 + }, + { + "epoch": 2.1, + "grad_norm": 0.431640625, + "learning_rate": 0.00010868786114481846, + "loss": 0.195, + "step": 15740 + }, + { + "epoch": 2.1, + "grad_norm": 0.439453125, + "learning_rate": 0.00010867626009997323, + "loss": 0.2431, + "step": 15741 + }, + { + "epoch": 2.1, + "grad_norm": 0.6484375, + "learning_rate": 0.00010866465893747238, + "loss": 0.4403, + "step": 15742 + }, + { + "epoch": 2.1, + "grad_norm": 0.498046875, + "learning_rate": 0.00010865305765747317, + "loss": 0.2242, + "step": 15743 + }, + { + "epoch": 2.1, + "grad_norm": 0.546875, + "learning_rate": 0.00010864145626013298, + "loss": 0.2624, + "step": 15744 + }, + { + "epoch": 2.1, + "grad_norm": 0.427734375, + "learning_rate": 0.0001086298547456091, + "loss": 0.2081, + "step": 15745 + }, + { + "epoch": 2.1, + "grad_norm": 0.43359375, + "learning_rate": 0.0001086182531140588, + "loss": 0.1183, + "step": 15746 + }, + { + "epoch": 2.1, + "grad_norm": 0.578125, + "learning_rate": 0.0001086066513656395, + "loss": 0.3366, + "step": 15747 + }, + { + "epoch": 2.1, + "grad_norm": 0.9296875, + "learning_rate": 0.00010859504950050846, + "loss": 0.1598, + "step": 15748 + }, + { + "epoch": 2.1, + "grad_norm": 0.51171875, + "learning_rate": 0.00010858344751882304, + "loss": 0.2662, + "step": 15749 + }, + { + "epoch": 2.1, + "grad_norm": 0.6171875, + "learning_rate": 0.00010857184542074058, + "loss": 0.3558, + "step": 15750 + }, + { + "epoch": 2.1, + "grad_norm": 0.486328125, + "learning_rate": 0.00010856024320641836, + "loss": 0.2504, + "step": 15751 + }, + { + "epoch": 2.1, + "grad_norm": 0.5546875, + "learning_rate": 0.00010854864087601378, + "loss": 0.4227, + "step": 15752 + }, + { + "epoch": 2.1, + "grad_norm": 0.53125, + "learning_rate": 0.00010853703842968413, + "loss": 0.4481, + "step": 15753 + }, + { + "epoch": 2.1, + "grad_norm": 0.48828125, + "learning_rate": 0.00010852543586758674, + "loss": 0.1773, + "step": 15754 + }, + { + "epoch": 2.1, + "grad_norm": 0.78515625, + "learning_rate": 0.00010851383318987897, + "loss": 0.2932, + "step": 15755 + }, + { + "epoch": 2.1, + "grad_norm": 0.7734375, + "learning_rate": 0.00010850223039671822, + "loss": 0.354, + "step": 15756 + }, + { + "epoch": 2.1, + "grad_norm": 0.435546875, + "learning_rate": 0.00010849062748826171, + "loss": 0.2019, + "step": 15757 + }, + { + "epoch": 2.1, + "grad_norm": 0.431640625, + "learning_rate": 0.00010847902446466684, + "loss": 0.3756, + "step": 15758 + }, + { + "epoch": 2.1, + "grad_norm": 0.490234375, + "learning_rate": 0.000108467421326091, + "loss": 0.315, + "step": 15759 + }, + { + "epoch": 2.1, + "grad_norm": 0.482421875, + "learning_rate": 0.00010845581807269145, + "loss": 0.3896, + "step": 15760 + }, + { + "epoch": 2.1, + "grad_norm": 0.65234375, + "learning_rate": 0.0001084442147046256, + "loss": 0.6271, + "step": 15761 + }, + { + "epoch": 2.1, + "grad_norm": 0.66796875, + "learning_rate": 0.00010843261122205079, + "loss": 0.5029, + "step": 15762 + }, + { + "epoch": 2.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00010842100762512435, + "loss": 0.3296, + "step": 15763 + }, + { + "epoch": 2.1, + "grad_norm": 0.65234375, + "learning_rate": 0.00010840940391400363, + "loss": 0.3424, + "step": 15764 + }, + { + "epoch": 2.1, + "grad_norm": 0.5390625, + "learning_rate": 0.00010839780008884602, + "loss": 0.4691, + "step": 15765 + }, + { + "epoch": 2.1, + "grad_norm": 0.408203125, + "learning_rate": 0.0001083861961498088, + "loss": 0.259, + "step": 15766 + }, + { + "epoch": 2.1, + "grad_norm": 0.62890625, + "learning_rate": 0.00010837459209704943, + "loss": 0.3923, + "step": 15767 + }, + { + "epoch": 2.1, + "grad_norm": 0.470703125, + "learning_rate": 0.00010836298793072521, + "loss": 0.2261, + "step": 15768 + }, + { + "epoch": 2.1, + "grad_norm": 0.6484375, + "learning_rate": 0.00010835138365099348, + "loss": 0.4009, + "step": 15769 + }, + { + "epoch": 2.1, + "grad_norm": 0.640625, + "learning_rate": 0.00010833977925801164, + "loss": 0.3538, + "step": 15770 + }, + { + "epoch": 2.1, + "grad_norm": 0.75390625, + "learning_rate": 0.00010832817475193704, + "loss": 0.4063, + "step": 15771 + }, + { + "epoch": 2.1, + "grad_norm": 0.56640625, + "learning_rate": 0.00010831657013292706, + "loss": 0.2646, + "step": 15772 + }, + { + "epoch": 2.1, + "grad_norm": 0.7734375, + "learning_rate": 0.00010830496540113903, + "loss": 0.2929, + "step": 15773 + }, + { + "epoch": 2.1, + "grad_norm": 0.48828125, + "learning_rate": 0.00010829336055673036, + "loss": 0.2537, + "step": 15774 + }, + { + "epoch": 2.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001082817555998584, + "loss": 0.1593, + "step": 15775 + }, + { + "epoch": 2.11, + "grad_norm": 0.486328125, + "learning_rate": 0.00010827015053068048, + "loss": 0.1959, + "step": 15776 + }, + { + "epoch": 2.11, + "grad_norm": 0.5, + "learning_rate": 0.00010825854534935405, + "loss": 0.1842, + "step": 15777 + }, + { + "epoch": 2.11, + "grad_norm": 0.5390625, + "learning_rate": 0.00010824694005603641, + "loss": 0.7788, + "step": 15778 + }, + { + "epoch": 2.11, + "grad_norm": 0.6328125, + "learning_rate": 0.00010823533465088503, + "loss": 0.4108, + "step": 15779 + }, + { + "epoch": 2.11, + "grad_norm": 0.4765625, + "learning_rate": 0.00010822372913405718, + "loss": 0.2455, + "step": 15780 + }, + { + "epoch": 2.11, + "grad_norm": 0.51953125, + "learning_rate": 0.00010821212350571031, + "loss": 0.4364, + "step": 15781 + }, + { + "epoch": 2.11, + "grad_norm": 0.4375, + "learning_rate": 0.00010820051776600175, + "loss": 0.37, + "step": 15782 + }, + { + "epoch": 2.11, + "grad_norm": 0.6796875, + "learning_rate": 0.00010818891191508888, + "loss": 0.5809, + "step": 15783 + }, + { + "epoch": 2.11, + "grad_norm": 0.44921875, + "learning_rate": 0.00010817730595312916, + "loss": 0.2482, + "step": 15784 + }, + { + "epoch": 2.11, + "grad_norm": 0.392578125, + "learning_rate": 0.00010816569988027989, + "loss": 0.1736, + "step": 15785 + }, + { + "epoch": 2.11, + "grad_norm": 0.734375, + "learning_rate": 0.00010815409369669856, + "loss": 0.4092, + "step": 15786 + }, + { + "epoch": 2.11, + "grad_norm": 0.54296875, + "learning_rate": 0.00010814248740254241, + "loss": 0.3782, + "step": 15787 + }, + { + "epoch": 2.11, + "grad_norm": 0.498046875, + "learning_rate": 0.00010813088099796894, + "loss": 0.372, + "step": 15788 + }, + { + "epoch": 2.11, + "grad_norm": 0.447265625, + "learning_rate": 0.00010811927448313548, + "loss": 0.2851, + "step": 15789 + }, + { + "epoch": 2.11, + "grad_norm": 0.515625, + "learning_rate": 0.00010810766785819946, + "loss": 0.2196, + "step": 15790 + }, + { + "epoch": 2.11, + "grad_norm": 0.55078125, + "learning_rate": 0.00010809606112331824, + "loss": 0.1868, + "step": 15791 + }, + { + "epoch": 2.11, + "grad_norm": 0.61328125, + "learning_rate": 0.00010808445427864926, + "loss": 0.2467, + "step": 15792 + }, + { + "epoch": 2.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001080728473243499, + "loss": 0.4926, + "step": 15793 + }, + { + "epoch": 2.11, + "grad_norm": 0.37109375, + "learning_rate": 0.00010806124026057754, + "loss": 0.2327, + "step": 15794 + }, + { + "epoch": 2.11, + "grad_norm": 0.65625, + "learning_rate": 0.00010804963308748959, + "loss": 0.4032, + "step": 15795 + }, + { + "epoch": 2.11, + "grad_norm": 0.66796875, + "learning_rate": 0.00010803802580524342, + "loss": 0.2449, + "step": 15796 + }, + { + "epoch": 2.11, + "grad_norm": 0.478515625, + "learning_rate": 0.0001080264184139965, + "loss": 0.3, + "step": 15797 + }, + { + "epoch": 2.11, + "grad_norm": 0.56640625, + "learning_rate": 0.00010801481091390617, + "loss": 0.6597, + "step": 15798 + }, + { + "epoch": 2.11, + "grad_norm": 0.58203125, + "learning_rate": 0.00010800320330512987, + "loss": 0.4113, + "step": 15799 + }, + { + "epoch": 2.11, + "grad_norm": 0.46484375, + "learning_rate": 0.000107991595587825, + "loss": 0.2363, + "step": 15800 + }, + { + "epoch": 2.11, + "grad_norm": 0.5546875, + "learning_rate": 0.00010797998776214895, + "loss": 0.2435, + "step": 15801 + }, + { + "epoch": 2.11, + "grad_norm": 0.515625, + "learning_rate": 0.00010796837982825916, + "loss": 0.2817, + "step": 15802 + }, + { + "epoch": 2.11, + "grad_norm": 0.5546875, + "learning_rate": 0.00010795677178631301, + "loss": 0.2641, + "step": 15803 + }, + { + "epoch": 2.11, + "grad_norm": 0.46484375, + "learning_rate": 0.00010794516363646797, + "loss": 0.4859, + "step": 15804 + }, + { + "epoch": 2.11, + "grad_norm": 0.41796875, + "learning_rate": 0.00010793355537888136, + "loss": 0.3533, + "step": 15805 + }, + { + "epoch": 2.11, + "grad_norm": 0.72265625, + "learning_rate": 0.00010792194701371065, + "loss": 0.3095, + "step": 15806 + }, + { + "epoch": 2.11, + "grad_norm": 0.65234375, + "learning_rate": 0.0001079103385411133, + "loss": 0.5951, + "step": 15807 + }, + { + "epoch": 2.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00010789872996124663, + "loss": 0.2859, + "step": 15808 + }, + { + "epoch": 2.11, + "grad_norm": 0.6484375, + "learning_rate": 0.00010788712127426817, + "loss": 0.417, + "step": 15809 + }, + { + "epoch": 2.11, + "grad_norm": 0.427734375, + "learning_rate": 0.00010787551248033523, + "loss": 0.2036, + "step": 15810 + }, + { + "epoch": 2.11, + "grad_norm": 0.4609375, + "learning_rate": 0.00010786390357960534, + "loss": 0.3196, + "step": 15811 + }, + { + "epoch": 2.11, + "grad_norm": 0.68359375, + "learning_rate": 0.00010785229457223582, + "loss": 0.3546, + "step": 15812 + }, + { + "epoch": 2.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00010784068545838417, + "loss": 0.6193, + "step": 15813 + }, + { + "epoch": 2.11, + "grad_norm": 0.6171875, + "learning_rate": 0.00010782907623820778, + "loss": 0.3801, + "step": 15814 + }, + { + "epoch": 2.11, + "grad_norm": 0.6640625, + "learning_rate": 0.0001078174669118641, + "loss": 0.3642, + "step": 15815 + }, + { + "epoch": 2.11, + "grad_norm": 0.59765625, + "learning_rate": 0.00010780585747951053, + "loss": 0.3672, + "step": 15816 + }, + { + "epoch": 2.11, + "grad_norm": 0.486328125, + "learning_rate": 0.00010779424794130455, + "loss": 0.4227, + "step": 15817 + }, + { + "epoch": 2.11, + "grad_norm": 0.40234375, + "learning_rate": 0.00010778263829740359, + "loss": 0.1822, + "step": 15818 + }, + { + "epoch": 2.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00010777102854796499, + "loss": 0.5001, + "step": 15819 + }, + { + "epoch": 2.11, + "grad_norm": 0.58203125, + "learning_rate": 0.00010775941869314633, + "loss": 0.5621, + "step": 15820 + }, + { + "epoch": 2.11, + "grad_norm": 0.6640625, + "learning_rate": 0.00010774780873310491, + "loss": 0.237, + "step": 15821 + }, + { + "epoch": 2.11, + "grad_norm": 0.67578125, + "learning_rate": 0.00010773619866799827, + "loss": 0.5477, + "step": 15822 + }, + { + "epoch": 2.11, + "grad_norm": 0.8984375, + "learning_rate": 0.0001077245884979838, + "loss": 0.4127, + "step": 15823 + }, + { + "epoch": 2.11, + "grad_norm": 0.4765625, + "learning_rate": 0.00010771297822321893, + "loss": 0.3326, + "step": 15824 + }, + { + "epoch": 2.11, + "grad_norm": 0.5625, + "learning_rate": 0.00010770136784386116, + "loss": 0.1732, + "step": 15825 + }, + { + "epoch": 2.11, + "grad_norm": 0.5234375, + "learning_rate": 0.00010768975736006788, + "loss": 0.3289, + "step": 15826 + }, + { + "epoch": 2.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00010767814677199657, + "loss": 0.3501, + "step": 15827 + }, + { + "epoch": 2.11, + "grad_norm": 0.76953125, + "learning_rate": 0.00010766653607980463, + "loss": 0.419, + "step": 15828 + }, + { + "epoch": 2.11, + "grad_norm": 0.5625, + "learning_rate": 0.0001076549252836496, + "loss": 0.4396, + "step": 15829 + }, + { + "epoch": 2.11, + "grad_norm": 0.53515625, + "learning_rate": 0.00010764331438368883, + "loss": 0.4087, + "step": 15830 + }, + { + "epoch": 2.11, + "grad_norm": 0.62109375, + "learning_rate": 0.00010763170338007978, + "loss": 0.5762, + "step": 15831 + }, + { + "epoch": 2.11, + "grad_norm": 0.490234375, + "learning_rate": 0.00010762009227297998, + "loss": 0.4347, + "step": 15832 + }, + { + "epoch": 2.11, + "grad_norm": 0.5234375, + "learning_rate": 0.00010760848106254682, + "loss": 0.1705, + "step": 15833 + }, + { + "epoch": 2.11, + "grad_norm": 0.625, + "learning_rate": 0.00010759686974893779, + "loss": 0.4705, + "step": 15834 + }, + { + "epoch": 2.11, + "grad_norm": 0.498046875, + "learning_rate": 0.00010758525833231032, + "loss": 0.2589, + "step": 15835 + }, + { + "epoch": 2.11, + "grad_norm": 0.46875, + "learning_rate": 0.0001075736468128219, + "loss": 0.4578, + "step": 15836 + }, + { + "epoch": 2.11, + "grad_norm": 0.68359375, + "learning_rate": 0.00010756203519062992, + "loss": 0.2852, + "step": 15837 + }, + { + "epoch": 2.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00010755042346589192, + "loss": 0.195, + "step": 15838 + }, + { + "epoch": 2.11, + "grad_norm": 0.6484375, + "learning_rate": 0.00010753881163876534, + "loss": 0.4346, + "step": 15839 + }, + { + "epoch": 2.11, + "grad_norm": 0.70703125, + "learning_rate": 0.00010752719970940764, + "loss": 0.5249, + "step": 15840 + }, + { + "epoch": 2.11, + "grad_norm": 0.4453125, + "learning_rate": 0.00010751558767797625, + "loss": 0.2087, + "step": 15841 + }, + { + "epoch": 2.11, + "grad_norm": 0.67578125, + "learning_rate": 0.00010750397554462867, + "loss": 0.2906, + "step": 15842 + }, + { + "epoch": 2.11, + "grad_norm": 0.6484375, + "learning_rate": 0.00010749236330952239, + "loss": 0.2204, + "step": 15843 + }, + { + "epoch": 2.11, + "grad_norm": 0.64453125, + "learning_rate": 0.00010748075097281485, + "loss": 0.4681, + "step": 15844 + }, + { + "epoch": 2.11, + "grad_norm": 0.5546875, + "learning_rate": 0.00010746913853466351, + "loss": 0.3263, + "step": 15845 + }, + { + "epoch": 2.11, + "grad_norm": 0.546875, + "learning_rate": 0.00010745752599522588, + "loss": 0.4872, + "step": 15846 + }, + { + "epoch": 2.11, + "grad_norm": 0.796875, + "learning_rate": 0.00010744591335465939, + "loss": 0.525, + "step": 15847 + }, + { + "epoch": 2.11, + "grad_norm": 0.60546875, + "learning_rate": 0.00010743430061312157, + "loss": 0.224, + "step": 15848 + }, + { + "epoch": 2.11, + "grad_norm": 0.46484375, + "learning_rate": 0.00010742268777076981, + "loss": 0.1533, + "step": 15849 + }, + { + "epoch": 2.12, + "grad_norm": 0.67578125, + "learning_rate": 0.00010741107482776167, + "loss": 0.3624, + "step": 15850 + }, + { + "epoch": 2.12, + "grad_norm": 0.515625, + "learning_rate": 0.0001073994617842546, + "loss": 0.333, + "step": 15851 + }, + { + "epoch": 2.12, + "grad_norm": 0.57421875, + "learning_rate": 0.00010738784864040607, + "loss": 0.5219, + "step": 15852 + }, + { + "epoch": 2.12, + "grad_norm": 0.5, + "learning_rate": 0.0001073762353963736, + "loss": 0.1309, + "step": 15853 + }, + { + "epoch": 2.12, + "grad_norm": 0.546875, + "learning_rate": 0.00010736462205231462, + "loss": 0.2054, + "step": 15854 + }, + { + "epoch": 2.12, + "grad_norm": 0.5546875, + "learning_rate": 0.00010735300860838661, + "loss": 0.1766, + "step": 15855 + }, + { + "epoch": 2.12, + "grad_norm": 0.5, + "learning_rate": 0.00010734139506474713, + "loss": 0.2699, + "step": 15856 + }, + { + "epoch": 2.12, + "grad_norm": 0.55078125, + "learning_rate": 0.0001073297814215536, + "loss": 0.247, + "step": 15857 + }, + { + "epoch": 2.12, + "grad_norm": 0.65234375, + "learning_rate": 0.00010731816767896352, + "loss": 0.3728, + "step": 15858 + }, + { + "epoch": 2.12, + "grad_norm": 0.60546875, + "learning_rate": 0.00010730655383713444, + "loss": 0.4598, + "step": 15859 + }, + { + "epoch": 2.12, + "grad_norm": 0.4453125, + "learning_rate": 0.00010729493989622375, + "loss": 0.3091, + "step": 15860 + }, + { + "epoch": 2.12, + "grad_norm": 0.421875, + "learning_rate": 0.000107283325856389, + "loss": 0.1498, + "step": 15861 + }, + { + "epoch": 2.12, + "grad_norm": 0.46875, + "learning_rate": 0.00010727171171778769, + "loss": 0.3418, + "step": 15862 + }, + { + "epoch": 2.12, + "grad_norm": 0.65625, + "learning_rate": 0.00010726009748057731, + "loss": 0.2402, + "step": 15863 + }, + { + "epoch": 2.12, + "grad_norm": 0.5390625, + "learning_rate": 0.00010724848314491531, + "loss": 0.2128, + "step": 15864 + }, + { + "epoch": 2.12, + "grad_norm": 0.71875, + "learning_rate": 0.00010723686871095926, + "loss": 0.5988, + "step": 15865 + }, + { + "epoch": 2.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00010722525417886664, + "loss": 0.2702, + "step": 15866 + }, + { + "epoch": 2.12, + "grad_norm": 0.7578125, + "learning_rate": 0.00010721363954879488, + "loss": 0.4772, + "step": 15867 + }, + { + "epoch": 2.12, + "grad_norm": 0.63671875, + "learning_rate": 0.00010720202482090161, + "loss": 0.4547, + "step": 15868 + }, + { + "epoch": 2.12, + "grad_norm": 0.578125, + "learning_rate": 0.0001071904099953442, + "loss": 0.2132, + "step": 15869 + }, + { + "epoch": 2.12, + "grad_norm": 0.6015625, + "learning_rate": 0.00010717879507228024, + "loss": 0.271, + "step": 15870 + }, + { + "epoch": 2.12, + "grad_norm": 0.443359375, + "learning_rate": 0.00010716718005186723, + "loss": 0.2893, + "step": 15871 + }, + { + "epoch": 2.12, + "grad_norm": 0.60546875, + "learning_rate": 0.00010715556493426262, + "loss": 0.3861, + "step": 15872 + }, + { + "epoch": 2.12, + "grad_norm": 0.65234375, + "learning_rate": 0.00010714394971962398, + "loss": 0.2167, + "step": 15873 + }, + { + "epoch": 2.12, + "grad_norm": 0.41015625, + "learning_rate": 0.00010713233440810879, + "loss": 0.1286, + "step": 15874 + }, + { + "epoch": 2.12, + "grad_norm": 0.671875, + "learning_rate": 0.00010712071899987458, + "loss": 0.4851, + "step": 15875 + }, + { + "epoch": 2.12, + "grad_norm": 0.65234375, + "learning_rate": 0.0001071091034950788, + "loss": 0.3588, + "step": 15876 + }, + { + "epoch": 2.12, + "grad_norm": 0.53125, + "learning_rate": 0.0001070974878938791, + "loss": 0.4555, + "step": 15877 + }, + { + "epoch": 2.12, + "grad_norm": 0.5, + "learning_rate": 0.00010708587219643287, + "loss": 0.4064, + "step": 15878 + }, + { + "epoch": 2.12, + "grad_norm": 0.7109375, + "learning_rate": 0.00010707425640289764, + "loss": 0.3873, + "step": 15879 + }, + { + "epoch": 2.12, + "grad_norm": 0.58203125, + "learning_rate": 0.00010706264051343098, + "loss": 0.6044, + "step": 15880 + }, + { + "epoch": 2.12, + "grad_norm": 0.62109375, + "learning_rate": 0.00010705102452819035, + "loss": 0.28, + "step": 15881 + }, + { + "epoch": 2.12, + "grad_norm": 0.58203125, + "learning_rate": 0.00010703940844733336, + "loss": 0.2094, + "step": 15882 + }, + { + "epoch": 2.12, + "grad_norm": 0.734375, + "learning_rate": 0.00010702779227101742, + "loss": 0.2873, + "step": 15883 + }, + { + "epoch": 2.12, + "grad_norm": 0.6484375, + "learning_rate": 0.00010701617599940015, + "loss": 0.3792, + "step": 15884 + }, + { + "epoch": 2.12, + "grad_norm": 0.5703125, + "learning_rate": 0.000107004559632639, + "loss": 0.3465, + "step": 15885 + }, + { + "epoch": 2.12, + "grad_norm": 0.640625, + "learning_rate": 0.00010699294317089152, + "loss": 0.3973, + "step": 15886 + }, + { + "epoch": 2.12, + "grad_norm": 0.41015625, + "learning_rate": 0.00010698132661431526, + "loss": 0.4152, + "step": 15887 + }, + { + "epoch": 2.12, + "grad_norm": 0.4765625, + "learning_rate": 0.00010696970996306772, + "loss": 0.3933, + "step": 15888 + }, + { + "epoch": 2.12, + "grad_norm": 0.51171875, + "learning_rate": 0.00010695809321730643, + "loss": 0.2808, + "step": 15889 + }, + { + "epoch": 2.12, + "grad_norm": 0.494140625, + "learning_rate": 0.00010694647637718897, + "loss": 0.3117, + "step": 15890 + }, + { + "epoch": 2.12, + "grad_norm": 0.72265625, + "learning_rate": 0.00010693485944287283, + "loss": 0.3911, + "step": 15891 + }, + { + "epoch": 2.12, + "grad_norm": 0.486328125, + "learning_rate": 0.0001069232424145155, + "loss": 0.2951, + "step": 15892 + }, + { + "epoch": 2.12, + "grad_norm": 0.45703125, + "learning_rate": 0.00010691162529227459, + "loss": 0.1878, + "step": 15893 + }, + { + "epoch": 2.12, + "grad_norm": 0.51953125, + "learning_rate": 0.00010690000807630758, + "loss": 0.2536, + "step": 15894 + }, + { + "epoch": 2.12, + "grad_norm": 0.455078125, + "learning_rate": 0.00010688839076677206, + "loss": 0.4108, + "step": 15895 + }, + { + "epoch": 2.12, + "grad_norm": 0.58984375, + "learning_rate": 0.00010687677336382553, + "loss": 0.3143, + "step": 15896 + }, + { + "epoch": 2.12, + "grad_norm": 0.515625, + "learning_rate": 0.00010686515586762553, + "loss": 0.304, + "step": 15897 + }, + { + "epoch": 2.12, + "grad_norm": 0.462890625, + "learning_rate": 0.00010685353827832964, + "loss": 0.3879, + "step": 15898 + }, + { + "epoch": 2.12, + "grad_norm": 0.7265625, + "learning_rate": 0.00010684192059609534, + "loss": 0.5136, + "step": 15899 + }, + { + "epoch": 2.12, + "grad_norm": 0.63671875, + "learning_rate": 0.00010683030282108022, + "loss": 0.5008, + "step": 15900 + }, + { + "epoch": 2.12, + "grad_norm": 0.40625, + "learning_rate": 0.0001068186849534418, + "loss": 0.2831, + "step": 15901 + }, + { + "epoch": 2.12, + "grad_norm": 0.435546875, + "learning_rate": 0.00010680706699333767, + "loss": 0.3836, + "step": 15902 + }, + { + "epoch": 2.12, + "grad_norm": 0.515625, + "learning_rate": 0.0001067954489409253, + "loss": 0.44, + "step": 15903 + }, + { + "epoch": 2.12, + "grad_norm": 0.3671875, + "learning_rate": 0.0001067838307963623, + "loss": 0.173, + "step": 15904 + }, + { + "epoch": 2.12, + "grad_norm": 0.59375, + "learning_rate": 0.00010677221255980621, + "loss": 0.3426, + "step": 15905 + }, + { + "epoch": 2.12, + "grad_norm": 0.46875, + "learning_rate": 0.00010676059423141454, + "loss": 0.2547, + "step": 15906 + }, + { + "epoch": 2.12, + "grad_norm": 0.67578125, + "learning_rate": 0.00010674897581134491, + "loss": 0.5915, + "step": 15907 + }, + { + "epoch": 2.12, + "grad_norm": 0.59765625, + "learning_rate": 0.00010673735729975482, + "loss": 0.3311, + "step": 15908 + }, + { + "epoch": 2.12, + "grad_norm": 0.48046875, + "learning_rate": 0.00010672573869680185, + "loss": 0.2257, + "step": 15909 + }, + { + "epoch": 2.12, + "grad_norm": 0.47265625, + "learning_rate": 0.00010671412000264353, + "loss": 0.3186, + "step": 15910 + }, + { + "epoch": 2.12, + "grad_norm": 0.59765625, + "learning_rate": 0.00010670250121743745, + "loss": 0.2467, + "step": 15911 + }, + { + "epoch": 2.12, + "grad_norm": 0.6171875, + "learning_rate": 0.00010669088234134113, + "loss": 0.2122, + "step": 15912 + }, + { + "epoch": 2.12, + "grad_norm": 0.83203125, + "learning_rate": 0.00010667926337451217, + "loss": 0.559, + "step": 15913 + }, + { + "epoch": 2.12, + "grad_norm": 0.515625, + "learning_rate": 0.00010666764431710812, + "loss": 0.3081, + "step": 15914 + }, + { + "epoch": 2.12, + "grad_norm": 0.640625, + "learning_rate": 0.0001066560251692865, + "loss": 0.3712, + "step": 15915 + }, + { + "epoch": 2.12, + "grad_norm": 0.765625, + "learning_rate": 0.00010664440593120493, + "loss": 0.5721, + "step": 15916 + }, + { + "epoch": 2.12, + "grad_norm": 0.50390625, + "learning_rate": 0.00010663278660302094, + "loss": 0.1615, + "step": 15917 + }, + { + "epoch": 2.12, + "grad_norm": 0.671875, + "learning_rate": 0.00010662116718489212, + "loss": 0.7122, + "step": 15918 + }, + { + "epoch": 2.12, + "grad_norm": 0.515625, + "learning_rate": 0.00010660954767697602, + "loss": 0.3056, + "step": 15919 + }, + { + "epoch": 2.12, + "grad_norm": 0.451171875, + "learning_rate": 0.00010659792807943021, + "loss": 0.3341, + "step": 15920 + }, + { + "epoch": 2.12, + "grad_norm": 0.451171875, + "learning_rate": 0.00010658630839241229, + "loss": 0.3043, + "step": 15921 + }, + { + "epoch": 2.12, + "grad_norm": 0.84765625, + "learning_rate": 0.00010657468861607973, + "loss": 0.2864, + "step": 15922 + }, + { + "epoch": 2.12, + "grad_norm": 0.3515625, + "learning_rate": 0.00010656306875059024, + "loss": 0.1086, + "step": 15923 + }, + { + "epoch": 2.12, + "grad_norm": 0.6484375, + "learning_rate": 0.00010655144879610128, + "loss": 0.3792, + "step": 15924 + }, + { + "epoch": 2.13, + "grad_norm": 0.486328125, + "learning_rate": 0.00010653982875277052, + "loss": 0.3304, + "step": 15925 + }, + { + "epoch": 2.13, + "grad_norm": 0.55859375, + "learning_rate": 0.00010652820862075546, + "loss": 0.3993, + "step": 15926 + }, + { + "epoch": 2.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00010651658840021368, + "loss": 0.2243, + "step": 15927 + }, + { + "epoch": 2.13, + "grad_norm": 0.5859375, + "learning_rate": 0.0001065049680913028, + "loss": 0.3001, + "step": 15928 + }, + { + "epoch": 2.13, + "grad_norm": 0.50390625, + "learning_rate": 0.00010649334769418036, + "loss": 0.289, + "step": 15929 + }, + { + "epoch": 2.13, + "grad_norm": 0.7265625, + "learning_rate": 0.00010648172720900398, + "loss": 0.5274, + "step": 15930 + }, + { + "epoch": 2.13, + "grad_norm": 0.65234375, + "learning_rate": 0.00010647010663593118, + "loss": 0.4675, + "step": 15931 + }, + { + "epoch": 2.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00010645848597511965, + "loss": 0.4228, + "step": 15932 + }, + { + "epoch": 2.13, + "grad_norm": 0.46484375, + "learning_rate": 0.00010644686522672684, + "loss": 0.2146, + "step": 15933 + }, + { + "epoch": 2.13, + "grad_norm": 0.365234375, + "learning_rate": 0.00010643524439091046, + "loss": 0.121, + "step": 15934 + }, + { + "epoch": 2.13, + "grad_norm": 0.71875, + "learning_rate": 0.00010642362346782798, + "loss": 0.7813, + "step": 15935 + }, + { + "epoch": 2.13, + "grad_norm": 0.58984375, + "learning_rate": 0.00010641200245763708, + "loss": 0.1913, + "step": 15936 + }, + { + "epoch": 2.13, + "grad_norm": 0.53125, + "learning_rate": 0.00010640038136049528, + "loss": 0.2184, + "step": 15937 + }, + { + "epoch": 2.13, + "grad_norm": 0.69140625, + "learning_rate": 0.00010638876017656023, + "loss": 0.3093, + "step": 15938 + }, + { + "epoch": 2.13, + "grad_norm": 0.470703125, + "learning_rate": 0.00010637713890598948, + "loss": 0.2653, + "step": 15939 + }, + { + "epoch": 2.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00010636551754894062, + "loss": 0.2411, + "step": 15940 + }, + { + "epoch": 2.13, + "grad_norm": 0.5546875, + "learning_rate": 0.00010635389610557129, + "loss": 0.3894, + "step": 15941 + }, + { + "epoch": 2.13, + "grad_norm": 0.6171875, + "learning_rate": 0.00010634227457603901, + "loss": 0.4329, + "step": 15942 + }, + { + "epoch": 2.13, + "grad_norm": 0.6640625, + "learning_rate": 0.00010633065296050144, + "loss": 0.3423, + "step": 15943 + }, + { + "epoch": 2.13, + "grad_norm": 0.400390625, + "learning_rate": 0.00010631903125911617, + "loss": 0.1306, + "step": 15944 + }, + { + "epoch": 2.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00010630740947204074, + "loss": 0.277, + "step": 15945 + }, + { + "epoch": 2.13, + "grad_norm": 0.6796875, + "learning_rate": 0.00010629578759943281, + "loss": 0.3146, + "step": 15946 + }, + { + "epoch": 2.13, + "grad_norm": 0.4765625, + "learning_rate": 0.00010628416564144997, + "loss": 0.1703, + "step": 15947 + }, + { + "epoch": 2.13, + "grad_norm": 0.439453125, + "learning_rate": 0.0001062725435982498, + "loss": 0.306, + "step": 15948 + }, + { + "epoch": 2.13, + "grad_norm": 0.62109375, + "learning_rate": 0.0001062609214699899, + "loss": 0.3784, + "step": 15949 + }, + { + "epoch": 2.13, + "grad_norm": 0.65625, + "learning_rate": 0.00010624929925682794, + "loss": 0.415, + "step": 15950 + }, + { + "epoch": 2.13, + "grad_norm": 0.60546875, + "learning_rate": 0.00010623767695892142, + "loss": 0.5154, + "step": 15951 + }, + { + "epoch": 2.13, + "grad_norm": 0.75, + "learning_rate": 0.00010622605457642798, + "loss": 0.2972, + "step": 15952 + }, + { + "epoch": 2.13, + "grad_norm": 0.5703125, + "learning_rate": 0.00010621443210950528, + "loss": 0.2929, + "step": 15953 + }, + { + "epoch": 2.13, + "grad_norm": 0.7890625, + "learning_rate": 0.00010620280955831087, + "loss": 0.4178, + "step": 15954 + }, + { + "epoch": 2.13, + "grad_norm": 0.55859375, + "learning_rate": 0.0001061911869230024, + "loss": 0.4541, + "step": 15955 + }, + { + "epoch": 2.13, + "grad_norm": 0.671875, + "learning_rate": 0.0001061795642037375, + "loss": 0.6396, + "step": 15956 + }, + { + "epoch": 2.13, + "grad_norm": 0.40625, + "learning_rate": 0.0001061679414006737, + "loss": 0.1853, + "step": 15957 + }, + { + "epoch": 2.13, + "grad_norm": 0.5234375, + "learning_rate": 0.00010615631851396864, + "loss": 0.2387, + "step": 15958 + }, + { + "epoch": 2.13, + "grad_norm": 0.7578125, + "learning_rate": 0.00010614469554377997, + "loss": 0.4527, + "step": 15959 + }, + { + "epoch": 2.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00010613307249026528, + "loss": 0.5683, + "step": 15960 + }, + { + "epoch": 2.13, + "grad_norm": 0.62109375, + "learning_rate": 0.0001061214493535822, + "loss": 0.515, + "step": 15961 + }, + { + "epoch": 2.13, + "grad_norm": 0.57421875, + "learning_rate": 0.00010610982613388833, + "loss": 0.2203, + "step": 15962 + }, + { + "epoch": 2.13, + "grad_norm": 0.5234375, + "learning_rate": 0.00010609820283134128, + "loss": 0.2643, + "step": 15963 + }, + { + "epoch": 2.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00010608657944609872, + "loss": 0.2548, + "step": 15964 + }, + { + "epoch": 2.13, + "grad_norm": 0.61328125, + "learning_rate": 0.0001060749559783182, + "loss": 0.3614, + "step": 15965 + }, + { + "epoch": 2.13, + "grad_norm": 0.5390625, + "learning_rate": 0.00010606333242815742, + "loss": 0.2013, + "step": 15966 + }, + { + "epoch": 2.13, + "grad_norm": 0.69921875, + "learning_rate": 0.00010605170879577392, + "loss": 0.4836, + "step": 15967 + }, + { + "epoch": 2.13, + "grad_norm": 0.70703125, + "learning_rate": 0.00010604008508132539, + "loss": 0.1498, + "step": 15968 + }, + { + "epoch": 2.13, + "grad_norm": 0.61328125, + "learning_rate": 0.00010602846128496943, + "loss": 0.4934, + "step": 15969 + }, + { + "epoch": 2.13, + "grad_norm": 0.359375, + "learning_rate": 0.00010601683740686366, + "loss": 0.1239, + "step": 15970 + }, + { + "epoch": 2.13, + "grad_norm": 0.412109375, + "learning_rate": 0.00010600521344716572, + "loss": 0.2325, + "step": 15971 + }, + { + "epoch": 2.13, + "grad_norm": 0.54296875, + "learning_rate": 0.00010599358940603322, + "loss": 0.2527, + "step": 15972 + }, + { + "epoch": 2.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00010598196528362381, + "loss": 0.5002, + "step": 15973 + }, + { + "epoch": 2.13, + "grad_norm": 0.474609375, + "learning_rate": 0.00010597034108009509, + "loss": 0.2396, + "step": 15974 + }, + { + "epoch": 2.13, + "grad_norm": 0.6015625, + "learning_rate": 0.00010595871679560477, + "loss": 0.3672, + "step": 15975 + }, + { + "epoch": 2.13, + "grad_norm": 0.5703125, + "learning_rate": 0.00010594709243031036, + "loss": 0.4516, + "step": 15976 + }, + { + "epoch": 2.13, + "grad_norm": 0.546875, + "learning_rate": 0.00010593546798436958, + "loss": 0.3233, + "step": 15977 + }, + { + "epoch": 2.13, + "grad_norm": 0.51953125, + "learning_rate": 0.00010592384345794004, + "loss": 0.4356, + "step": 15978 + }, + { + "epoch": 2.13, + "grad_norm": 0.7734375, + "learning_rate": 0.00010591221885117939, + "loss": 0.4366, + "step": 15979 + }, + { + "epoch": 2.13, + "grad_norm": 0.6875, + "learning_rate": 0.00010590059416424527, + "loss": 0.4578, + "step": 15980 + }, + { + "epoch": 2.13, + "grad_norm": 0.474609375, + "learning_rate": 0.0001058889693972953, + "loss": 0.3693, + "step": 15981 + }, + { + "epoch": 2.13, + "grad_norm": 0.49609375, + "learning_rate": 0.00010587734455048713, + "loss": 0.459, + "step": 15982 + }, + { + "epoch": 2.13, + "grad_norm": 0.400390625, + "learning_rate": 0.00010586571962397837, + "loss": 0.1577, + "step": 15983 + }, + { + "epoch": 2.13, + "grad_norm": 0.62109375, + "learning_rate": 0.00010585409461792673, + "loss": 0.4598, + "step": 15984 + }, + { + "epoch": 2.13, + "grad_norm": 0.65625, + "learning_rate": 0.00010584246953248975, + "loss": 0.2606, + "step": 15985 + }, + { + "epoch": 2.13, + "grad_norm": 0.78125, + "learning_rate": 0.00010583084436782518, + "loss": 0.5269, + "step": 15986 + }, + { + "epoch": 2.13, + "grad_norm": 0.55859375, + "learning_rate": 0.00010581921912409063, + "loss": 0.3076, + "step": 15987 + }, + { + "epoch": 2.13, + "grad_norm": 0.55078125, + "learning_rate": 0.0001058075938014437, + "loss": 0.3939, + "step": 15988 + }, + { + "epoch": 2.13, + "grad_norm": 0.62890625, + "learning_rate": 0.00010579596840004207, + "loss": 0.3218, + "step": 15989 + }, + { + "epoch": 2.13, + "grad_norm": 0.443359375, + "learning_rate": 0.0001057843429200434, + "loss": 0.2728, + "step": 15990 + }, + { + "epoch": 2.13, + "grad_norm": 0.5, + "learning_rate": 0.00010577271736160533, + "loss": 0.2601, + "step": 15991 + }, + { + "epoch": 2.13, + "grad_norm": 0.59765625, + "learning_rate": 0.00010576109172488549, + "loss": 0.2853, + "step": 15992 + }, + { + "epoch": 2.13, + "grad_norm": 0.63671875, + "learning_rate": 0.00010574946601004157, + "loss": 0.5966, + "step": 15993 + }, + { + "epoch": 2.13, + "grad_norm": 0.5859375, + "learning_rate": 0.0001057378402172312, + "loss": 0.3786, + "step": 15994 + }, + { + "epoch": 2.13, + "grad_norm": 0.490234375, + "learning_rate": 0.000105726214346612, + "loss": 0.2155, + "step": 15995 + }, + { + "epoch": 2.13, + "grad_norm": 0.62890625, + "learning_rate": 0.00010571458839834171, + "loss": 0.2385, + "step": 15996 + }, + { + "epoch": 2.13, + "grad_norm": 0.57421875, + "learning_rate": 0.00010570296237257791, + "loss": 0.3623, + "step": 15997 + }, + { + "epoch": 2.13, + "grad_norm": 0.65625, + "learning_rate": 0.00010569133626947827, + "loss": 0.1326, + "step": 15998 + }, + { + "epoch": 2.13, + "grad_norm": 0.43359375, + "learning_rate": 0.00010567971008920048, + "loss": 0.2736, + "step": 15999 + }, + { + "epoch": 2.14, + "grad_norm": 0.44921875, + "learning_rate": 0.00010566808383190219, + "loss": 0.2997, + "step": 16000 + }, + { + "epoch": 2.14, + "grad_norm": 0.57421875, + "learning_rate": 0.000105656457497741, + "loss": 0.3491, + "step": 16001 + }, + { + "epoch": 2.14, + "grad_norm": 0.58984375, + "learning_rate": 0.00010564483108687462, + "loss": 0.4093, + "step": 16002 + }, + { + "epoch": 2.14, + "grad_norm": 0.5390625, + "learning_rate": 0.00010563320459946073, + "loss": 0.2116, + "step": 16003 + }, + { + "epoch": 2.14, + "grad_norm": 0.74609375, + "learning_rate": 0.00010562157803565695, + "loss": 0.2927, + "step": 16004 + }, + { + "epoch": 2.14, + "grad_norm": 0.6015625, + "learning_rate": 0.000105609951395621, + "loss": 0.2488, + "step": 16005 + }, + { + "epoch": 2.14, + "grad_norm": 0.43359375, + "learning_rate": 0.00010559832467951048, + "loss": 0.1974, + "step": 16006 + }, + { + "epoch": 2.14, + "grad_norm": 0.62109375, + "learning_rate": 0.0001055866978874831, + "loss": 0.1925, + "step": 16007 + }, + { + "epoch": 2.14, + "grad_norm": 0.68359375, + "learning_rate": 0.00010557507101969648, + "loss": 0.3627, + "step": 16008 + }, + { + "epoch": 2.14, + "grad_norm": 0.470703125, + "learning_rate": 0.00010556344407630836, + "loss": 0.3397, + "step": 16009 + }, + { + "epoch": 2.14, + "grad_norm": 0.470703125, + "learning_rate": 0.00010555181705747632, + "loss": 0.3344, + "step": 16010 + }, + { + "epoch": 2.14, + "grad_norm": 0.7734375, + "learning_rate": 0.00010554018996335812, + "loss": 0.3731, + "step": 16011 + }, + { + "epoch": 2.14, + "grad_norm": 0.78125, + "learning_rate": 0.00010552856279411139, + "loss": 0.3441, + "step": 16012 + }, + { + "epoch": 2.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00010551693554989376, + "loss": 0.336, + "step": 16013 + }, + { + "epoch": 2.14, + "grad_norm": 0.55078125, + "learning_rate": 0.00010550530823086298, + "loss": 0.4154, + "step": 16014 + }, + { + "epoch": 2.14, + "grad_norm": 0.64453125, + "learning_rate": 0.00010549368083717665, + "loss": 0.5161, + "step": 16015 + }, + { + "epoch": 2.14, + "grad_norm": 0.578125, + "learning_rate": 0.00010548205336899252, + "loss": 0.5713, + "step": 16016 + }, + { + "epoch": 2.14, + "grad_norm": 0.427734375, + "learning_rate": 0.00010547042582646822, + "loss": 0.2191, + "step": 16017 + }, + { + "epoch": 2.14, + "grad_norm": 0.458984375, + "learning_rate": 0.00010545879820976144, + "loss": 0.3317, + "step": 16018 + }, + { + "epoch": 2.14, + "grad_norm": 0.404296875, + "learning_rate": 0.00010544717051902984, + "loss": 0.3331, + "step": 16019 + }, + { + "epoch": 2.14, + "grad_norm": 0.640625, + "learning_rate": 0.0001054355427544311, + "loss": 0.4084, + "step": 16020 + }, + { + "epoch": 2.14, + "grad_norm": 0.55859375, + "learning_rate": 0.00010542391491612293, + "loss": 0.3581, + "step": 16021 + }, + { + "epoch": 2.14, + "grad_norm": 0.51953125, + "learning_rate": 0.00010541228700426298, + "loss": 0.1391, + "step": 16022 + }, + { + "epoch": 2.14, + "grad_norm": 0.59375, + "learning_rate": 0.00010540065901900897, + "loss": 0.2814, + "step": 16023 + }, + { + "epoch": 2.14, + "grad_norm": 0.61328125, + "learning_rate": 0.00010538903096051854, + "loss": 0.2029, + "step": 16024 + }, + { + "epoch": 2.14, + "grad_norm": 0.7421875, + "learning_rate": 0.00010537740282894937, + "loss": 0.223, + "step": 16025 + }, + { + "epoch": 2.14, + "grad_norm": 0.53125, + "learning_rate": 0.0001053657746244592, + "loss": 0.2898, + "step": 16026 + }, + { + "epoch": 2.14, + "grad_norm": 0.83984375, + "learning_rate": 0.00010535414634720566, + "loss": 0.5167, + "step": 16027 + }, + { + "epoch": 2.14, + "grad_norm": 0.67578125, + "learning_rate": 0.0001053425179973465, + "loss": 0.4757, + "step": 16028 + }, + { + "epoch": 2.14, + "grad_norm": 0.458984375, + "learning_rate": 0.00010533088957503933, + "loss": 0.2288, + "step": 16029 + }, + { + "epoch": 2.14, + "grad_norm": 0.392578125, + "learning_rate": 0.0001053192610804419, + "loss": 0.291, + "step": 16030 + }, + { + "epoch": 2.14, + "grad_norm": 0.50390625, + "learning_rate": 0.00010530763251371183, + "loss": 0.2053, + "step": 16031 + }, + { + "epoch": 2.14, + "grad_norm": 0.84375, + "learning_rate": 0.00010529600387500691, + "loss": 0.3178, + "step": 16032 + }, + { + "epoch": 2.14, + "grad_norm": 0.5625, + "learning_rate": 0.00010528437516448473, + "loss": 0.3253, + "step": 16033 + }, + { + "epoch": 2.14, + "grad_norm": 0.7109375, + "learning_rate": 0.00010527274638230307, + "loss": 0.3775, + "step": 16034 + }, + { + "epoch": 2.14, + "grad_norm": 0.490234375, + "learning_rate": 0.00010526111752861958, + "loss": 0.2615, + "step": 16035 + }, + { + "epoch": 2.14, + "grad_norm": 0.5234375, + "learning_rate": 0.00010524948860359193, + "loss": 0.249, + "step": 16036 + }, + { + "epoch": 2.14, + "grad_norm": 0.546875, + "learning_rate": 0.00010523785960737788, + "loss": 0.6854, + "step": 16037 + }, + { + "epoch": 2.14, + "grad_norm": 0.46484375, + "learning_rate": 0.00010522623054013508, + "loss": 0.1754, + "step": 16038 + }, + { + "epoch": 2.14, + "grad_norm": 0.6328125, + "learning_rate": 0.00010521460140202124, + "loss": 0.4863, + "step": 16039 + }, + { + "epoch": 2.14, + "grad_norm": 0.54296875, + "learning_rate": 0.00010520297219319403, + "loss": 0.3484, + "step": 16040 + }, + { + "epoch": 2.14, + "grad_norm": 0.8359375, + "learning_rate": 0.00010519134291381121, + "loss": 0.5048, + "step": 16041 + }, + { + "epoch": 2.14, + "grad_norm": 0.4375, + "learning_rate": 0.00010517971356403044, + "loss": 0.4205, + "step": 16042 + }, + { + "epoch": 2.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00010516808414400942, + "loss": 0.4781, + "step": 16043 + }, + { + "epoch": 2.14, + "grad_norm": 0.45703125, + "learning_rate": 0.0001051564546539059, + "loss": 0.2318, + "step": 16044 + }, + { + "epoch": 2.14, + "grad_norm": 0.5625, + "learning_rate": 0.00010514482509387751, + "loss": 0.4402, + "step": 16045 + }, + { + "epoch": 2.14, + "grad_norm": 0.6015625, + "learning_rate": 0.000105133195464082, + "loss": 0.1574, + "step": 16046 + }, + { + "epoch": 2.14, + "grad_norm": 0.546875, + "learning_rate": 0.00010512156576467706, + "loss": 0.2173, + "step": 16047 + }, + { + "epoch": 2.14, + "grad_norm": 0.48046875, + "learning_rate": 0.00010510993599582044, + "loss": 0.3046, + "step": 16048 + }, + { + "epoch": 2.14, + "grad_norm": 0.5390625, + "learning_rate": 0.00010509830615766976, + "loss": 0.4013, + "step": 16049 + }, + { + "epoch": 2.14, + "grad_norm": 0.482421875, + "learning_rate": 0.00010508667625038278, + "loss": 0.5655, + "step": 16050 + }, + { + "epoch": 2.14, + "grad_norm": 0.58203125, + "learning_rate": 0.00010507504627411722, + "loss": 0.3249, + "step": 16051 + }, + { + "epoch": 2.14, + "grad_norm": 0.45703125, + "learning_rate": 0.00010506341622903075, + "loss": 0.3337, + "step": 16052 + }, + { + "epoch": 2.14, + "grad_norm": 0.57421875, + "learning_rate": 0.00010505178611528114, + "loss": 0.3356, + "step": 16053 + }, + { + "epoch": 2.14, + "grad_norm": 0.7109375, + "learning_rate": 0.00010504015593302605, + "loss": 0.4361, + "step": 16054 + }, + { + "epoch": 2.14, + "grad_norm": 0.59765625, + "learning_rate": 0.00010502852568242323, + "loss": 0.548, + "step": 16055 + }, + { + "epoch": 2.14, + "grad_norm": 0.55859375, + "learning_rate": 0.00010501689536363035, + "loss": 0.3949, + "step": 16056 + }, + { + "epoch": 2.14, + "grad_norm": 0.6640625, + "learning_rate": 0.00010500526497680516, + "loss": 0.3452, + "step": 16057 + }, + { + "epoch": 2.14, + "grad_norm": 0.51171875, + "learning_rate": 0.00010499363452210535, + "loss": 0.4287, + "step": 16058 + }, + { + "epoch": 2.14, + "grad_norm": 1.0078125, + "learning_rate": 0.00010498200399968869, + "loss": 0.4777, + "step": 16059 + }, + { + "epoch": 2.14, + "grad_norm": 0.455078125, + "learning_rate": 0.00010497037340971285, + "loss": 0.381, + "step": 16060 + }, + { + "epoch": 2.14, + "grad_norm": 0.53515625, + "learning_rate": 0.00010495874275233553, + "loss": 0.2277, + "step": 16061 + }, + { + "epoch": 2.14, + "grad_norm": 0.62890625, + "learning_rate": 0.00010494711202771448, + "loss": 0.3342, + "step": 16062 + }, + { + "epoch": 2.14, + "grad_norm": 0.59765625, + "learning_rate": 0.00010493548123600741, + "loss": 0.2444, + "step": 16063 + }, + { + "epoch": 2.14, + "grad_norm": 0.6484375, + "learning_rate": 0.00010492385037737207, + "loss": 0.3695, + "step": 16064 + }, + { + "epoch": 2.14, + "grad_norm": 0.79296875, + "learning_rate": 0.00010491221945196615, + "loss": 0.4136, + "step": 16065 + }, + { + "epoch": 2.14, + "grad_norm": 0.6875, + "learning_rate": 0.00010490058845994737, + "loss": 0.4496, + "step": 16066 + }, + { + "epoch": 2.14, + "grad_norm": 0.5, + "learning_rate": 0.00010488895740147349, + "loss": 0.5653, + "step": 16067 + }, + { + "epoch": 2.14, + "grad_norm": 0.5703125, + "learning_rate": 0.00010487732627670217, + "loss": 0.3339, + "step": 16068 + }, + { + "epoch": 2.14, + "grad_norm": 0.462890625, + "learning_rate": 0.0001048656950857912, + "loss": 0.3303, + "step": 16069 + }, + { + "epoch": 2.14, + "grad_norm": 0.59375, + "learning_rate": 0.00010485406382889829, + "loss": 0.4694, + "step": 16070 + }, + { + "epoch": 2.14, + "grad_norm": 0.4765625, + "learning_rate": 0.00010484243250618115, + "loss": 0.1713, + "step": 16071 + }, + { + "epoch": 2.14, + "grad_norm": 0.49609375, + "learning_rate": 0.00010483080111779753, + "loss": 0.3366, + "step": 16072 + }, + { + "epoch": 2.14, + "grad_norm": 0.92578125, + "learning_rate": 0.00010481916966390515, + "loss": 0.3915, + "step": 16073 + }, + { + "epoch": 2.14, + "grad_norm": 0.64453125, + "learning_rate": 0.00010480753814466171, + "loss": 0.2407, + "step": 16074 + }, + { + "epoch": 2.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00010479590656022495, + "loss": 0.4702, + "step": 16075 + }, + { + "epoch": 2.15, + "grad_norm": 0.4921875, + "learning_rate": 0.00010478427491075267, + "loss": 0.3386, + "step": 16076 + }, + { + "epoch": 2.15, + "grad_norm": 0.52734375, + "learning_rate": 0.00010477264319640252, + "loss": 0.3953, + "step": 16077 + }, + { + "epoch": 2.15, + "grad_norm": 0.494140625, + "learning_rate": 0.0001047610114173323, + "loss": 0.3183, + "step": 16078 + }, + { + "epoch": 2.15, + "grad_norm": 0.48046875, + "learning_rate": 0.00010474937957369966, + "loss": 0.2872, + "step": 16079 + }, + { + "epoch": 2.15, + "grad_norm": 0.640625, + "learning_rate": 0.00010473774766566242, + "loss": 0.4599, + "step": 16080 + }, + { + "epoch": 2.15, + "grad_norm": 0.63671875, + "learning_rate": 0.00010472611569337827, + "loss": 0.4364, + "step": 16081 + }, + { + "epoch": 2.15, + "grad_norm": 0.453125, + "learning_rate": 0.00010471448365700496, + "loss": 0.5084, + "step": 16082 + }, + { + "epoch": 2.15, + "grad_norm": 0.51953125, + "learning_rate": 0.00010470285155670021, + "loss": 0.2574, + "step": 16083 + }, + { + "epoch": 2.15, + "grad_norm": 0.466796875, + "learning_rate": 0.00010469121939262179, + "loss": 0.2538, + "step": 16084 + }, + { + "epoch": 2.15, + "grad_norm": 0.494140625, + "learning_rate": 0.00010467958716492743, + "loss": 0.2007, + "step": 16085 + }, + { + "epoch": 2.15, + "grad_norm": 0.69921875, + "learning_rate": 0.00010466795487377485, + "loss": 0.4665, + "step": 16086 + }, + { + "epoch": 2.15, + "grad_norm": 0.38671875, + "learning_rate": 0.00010465632251932183, + "loss": 0.1691, + "step": 16087 + }, + { + "epoch": 2.15, + "grad_norm": 0.4453125, + "learning_rate": 0.00010464469010172605, + "loss": 0.1604, + "step": 16088 + }, + { + "epoch": 2.15, + "grad_norm": 0.84765625, + "learning_rate": 0.00010463305762114532, + "loss": 0.3357, + "step": 16089 + }, + { + "epoch": 2.15, + "grad_norm": 0.7265625, + "learning_rate": 0.00010462142507773736, + "loss": 0.4023, + "step": 16090 + }, + { + "epoch": 2.15, + "grad_norm": 0.69921875, + "learning_rate": 0.00010460979247165987, + "loss": 0.4641, + "step": 16091 + }, + { + "epoch": 2.15, + "grad_norm": 0.53125, + "learning_rate": 0.00010459815980307068, + "loss": 0.4263, + "step": 16092 + }, + { + "epoch": 2.15, + "grad_norm": 0.55859375, + "learning_rate": 0.00010458652707212746, + "loss": 0.3673, + "step": 16093 + }, + { + "epoch": 2.15, + "grad_norm": 0.76953125, + "learning_rate": 0.00010457489427898801, + "loss": 0.4761, + "step": 16094 + }, + { + "epoch": 2.15, + "grad_norm": 0.65234375, + "learning_rate": 0.00010456326142381006, + "loss": 0.328, + "step": 16095 + }, + { + "epoch": 2.15, + "grad_norm": 0.423828125, + "learning_rate": 0.00010455162850675137, + "loss": 0.3509, + "step": 16096 + }, + { + "epoch": 2.15, + "grad_norm": 0.5546875, + "learning_rate": 0.00010453999552796964, + "loss": 0.3706, + "step": 16097 + }, + { + "epoch": 2.15, + "grad_norm": 0.56640625, + "learning_rate": 0.00010452836248762265, + "loss": 0.1986, + "step": 16098 + }, + { + "epoch": 2.15, + "grad_norm": 0.447265625, + "learning_rate": 0.0001045167293858682, + "loss": 0.213, + "step": 16099 + }, + { + "epoch": 2.15, + "grad_norm": 0.5078125, + "learning_rate": 0.00010450509622286398, + "loss": 0.3507, + "step": 16100 + }, + { + "epoch": 2.15, + "grad_norm": 0.60546875, + "learning_rate": 0.00010449346299876776, + "loss": 0.4327, + "step": 16101 + }, + { + "epoch": 2.15, + "grad_norm": 0.66796875, + "learning_rate": 0.00010448182971373732, + "loss": 0.2233, + "step": 16102 + }, + { + "epoch": 2.15, + "grad_norm": 0.39453125, + "learning_rate": 0.00010447019636793039, + "loss": 0.2164, + "step": 16103 + }, + { + "epoch": 2.15, + "grad_norm": 0.640625, + "learning_rate": 0.00010445856296150469, + "loss": 0.5278, + "step": 16104 + }, + { + "epoch": 2.15, + "grad_norm": 0.609375, + "learning_rate": 0.00010444692949461806, + "loss": 0.5068, + "step": 16105 + }, + { + "epoch": 2.15, + "grad_norm": 0.546875, + "learning_rate": 0.00010443529596742816, + "loss": 0.4721, + "step": 16106 + }, + { + "epoch": 2.15, + "grad_norm": 0.5234375, + "learning_rate": 0.00010442366238009285, + "loss": 0.256, + "step": 16107 + }, + { + "epoch": 2.15, + "grad_norm": 0.474609375, + "learning_rate": 0.0001044120287327698, + "loss": 0.2706, + "step": 16108 + }, + { + "epoch": 2.15, + "grad_norm": 0.82421875, + "learning_rate": 0.00010440039502561682, + "loss": 0.3308, + "step": 16109 + }, + { + "epoch": 2.15, + "grad_norm": 0.6953125, + "learning_rate": 0.00010438876125879169, + "loss": 0.3428, + "step": 16110 + }, + { + "epoch": 2.15, + "grad_norm": 0.859375, + "learning_rate": 0.00010437712743245209, + "loss": 0.4892, + "step": 16111 + }, + { + "epoch": 2.15, + "grad_norm": 0.63671875, + "learning_rate": 0.0001043654935467559, + "loss": 0.3535, + "step": 16112 + }, + { + "epoch": 2.15, + "grad_norm": 0.78515625, + "learning_rate": 0.00010435385960186074, + "loss": 0.3639, + "step": 16113 + }, + { + "epoch": 2.15, + "grad_norm": 0.625, + "learning_rate": 0.00010434222559792451, + "loss": 0.4356, + "step": 16114 + }, + { + "epoch": 2.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00010433059153510489, + "loss": 0.1649, + "step": 16115 + }, + { + "epoch": 2.15, + "grad_norm": 0.41015625, + "learning_rate": 0.00010431895741355966, + "loss": 0.2772, + "step": 16116 + }, + { + "epoch": 2.15, + "grad_norm": 0.59375, + "learning_rate": 0.0001043073232334466, + "loss": 0.5086, + "step": 16117 + }, + { + "epoch": 2.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00010429568899492348, + "loss": 0.3958, + "step": 16118 + }, + { + "epoch": 2.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00010428405469814806, + "loss": 0.2149, + "step": 16119 + }, + { + "epoch": 2.15, + "grad_norm": 0.73828125, + "learning_rate": 0.00010427242034327811, + "loss": 0.437, + "step": 16120 + }, + { + "epoch": 2.15, + "grad_norm": 0.5859375, + "learning_rate": 0.00010426078593047141, + "loss": 0.34, + "step": 16121 + }, + { + "epoch": 2.15, + "grad_norm": 0.6953125, + "learning_rate": 0.0001042491514598857, + "loss": 0.3647, + "step": 16122 + }, + { + "epoch": 2.15, + "grad_norm": 0.61328125, + "learning_rate": 0.00010423751693167876, + "loss": 0.385, + "step": 16123 + }, + { + "epoch": 2.15, + "grad_norm": 0.58203125, + "learning_rate": 0.00010422588234600839, + "loss": 0.495, + "step": 16124 + }, + { + "epoch": 2.15, + "grad_norm": 0.5625, + "learning_rate": 0.00010421424770303232, + "loss": 0.1615, + "step": 16125 + }, + { + "epoch": 2.15, + "grad_norm": 0.65234375, + "learning_rate": 0.00010420261300290839, + "loss": 0.5851, + "step": 16126 + }, + { + "epoch": 2.15, + "grad_norm": 1.0390625, + "learning_rate": 0.0001041909782457943, + "loss": 0.5194, + "step": 16127 + }, + { + "epoch": 2.15, + "grad_norm": 0.59765625, + "learning_rate": 0.00010417934343184785, + "loss": 0.5223, + "step": 16128 + }, + { + "epoch": 2.15, + "grad_norm": 0.4921875, + "learning_rate": 0.00010416770856122682, + "loss": 0.2341, + "step": 16129 + }, + { + "epoch": 2.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00010415607363408902, + "loss": 0.1958, + "step": 16130 + }, + { + "epoch": 2.15, + "grad_norm": 0.7734375, + "learning_rate": 0.00010414443865059214, + "loss": 0.3158, + "step": 16131 + }, + { + "epoch": 2.15, + "grad_norm": 0.5859375, + "learning_rate": 0.00010413280361089405, + "loss": 0.364, + "step": 16132 + }, + { + "epoch": 2.15, + "grad_norm": 0.53125, + "learning_rate": 0.0001041211685151525, + "loss": 0.2496, + "step": 16133 + }, + { + "epoch": 2.15, + "grad_norm": 0.421875, + "learning_rate": 0.00010410953336352522, + "loss": 0.2168, + "step": 16134 + }, + { + "epoch": 2.15, + "grad_norm": 0.69921875, + "learning_rate": 0.00010409789815617009, + "loss": 0.32, + "step": 16135 + }, + { + "epoch": 2.15, + "grad_norm": 0.546875, + "learning_rate": 0.00010408626289324476, + "loss": 0.239, + "step": 16136 + }, + { + "epoch": 2.15, + "grad_norm": 0.73828125, + "learning_rate": 0.00010407462757490714, + "loss": 0.4535, + "step": 16137 + }, + { + "epoch": 2.15, + "grad_norm": 0.671875, + "learning_rate": 0.00010406299220131492, + "loss": 0.3285, + "step": 16138 + }, + { + "epoch": 2.15, + "grad_norm": 0.65625, + "learning_rate": 0.00010405135677262593, + "loss": 0.3947, + "step": 16139 + }, + { + "epoch": 2.15, + "grad_norm": 0.40625, + "learning_rate": 0.00010403972128899796, + "loss": 0.213, + "step": 16140 + }, + { + "epoch": 2.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00010402808575058873, + "loss": 0.5012, + "step": 16141 + }, + { + "epoch": 2.15, + "grad_norm": 0.578125, + "learning_rate": 0.00010401645015755612, + "loss": 0.4128, + "step": 16142 + }, + { + "epoch": 2.15, + "grad_norm": 0.55078125, + "learning_rate": 0.00010400481451005783, + "loss": 0.4273, + "step": 16143 + }, + { + "epoch": 2.15, + "grad_norm": 0.3515625, + "learning_rate": 0.0001039931788082517, + "loss": 0.1657, + "step": 16144 + }, + { + "epoch": 2.15, + "grad_norm": 0.56640625, + "learning_rate": 0.00010398154305229552, + "loss": 0.3848, + "step": 16145 + }, + { + "epoch": 2.15, + "grad_norm": 0.6171875, + "learning_rate": 0.00010396990724234705, + "loss": 0.2476, + "step": 16146 + }, + { + "epoch": 2.15, + "grad_norm": 0.47265625, + "learning_rate": 0.00010395827137856408, + "loss": 0.1599, + "step": 16147 + }, + { + "epoch": 2.15, + "grad_norm": 0.5625, + "learning_rate": 0.00010394663546110442, + "loss": 0.3875, + "step": 16148 + }, + { + "epoch": 2.15, + "grad_norm": 0.494140625, + "learning_rate": 0.00010393499949012583, + "loss": 0.3354, + "step": 16149 + }, + { + "epoch": 2.16, + "grad_norm": 0.69140625, + "learning_rate": 0.00010392336346578616, + "loss": 0.3125, + "step": 16150 + }, + { + "epoch": 2.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00010391172738824315, + "loss": 0.1978, + "step": 16151 + }, + { + "epoch": 2.16, + "grad_norm": 0.48828125, + "learning_rate": 0.00010390009125765458, + "loss": 0.1628, + "step": 16152 + }, + { + "epoch": 2.16, + "grad_norm": 0.447265625, + "learning_rate": 0.0001038884550741783, + "loss": 0.2913, + "step": 16153 + }, + { + "epoch": 2.16, + "grad_norm": 0.474609375, + "learning_rate": 0.00010387681883797204, + "loss": 0.3069, + "step": 16154 + }, + { + "epoch": 2.16, + "grad_norm": 0.62109375, + "learning_rate": 0.00010386518254919367, + "loss": 0.5417, + "step": 16155 + }, + { + "epoch": 2.16, + "grad_norm": 0.494140625, + "learning_rate": 0.00010385354620800092, + "loss": 0.3777, + "step": 16156 + }, + { + "epoch": 2.16, + "grad_norm": 0.71484375, + "learning_rate": 0.00010384190981455162, + "loss": 0.2739, + "step": 16157 + }, + { + "epoch": 2.16, + "grad_norm": 0.6171875, + "learning_rate": 0.00010383027336900355, + "loss": 0.6233, + "step": 16158 + }, + { + "epoch": 2.16, + "grad_norm": 0.482421875, + "learning_rate": 0.0001038186368715145, + "loss": 0.2552, + "step": 16159 + }, + { + "epoch": 2.16, + "grad_norm": 0.56640625, + "learning_rate": 0.0001038070003222423, + "loss": 0.4421, + "step": 16160 + }, + { + "epoch": 2.16, + "grad_norm": 0.71875, + "learning_rate": 0.00010379536372134473, + "loss": 0.5337, + "step": 16161 + }, + { + "epoch": 2.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00010378372706897961, + "loss": 0.1528, + "step": 16162 + }, + { + "epoch": 2.16, + "grad_norm": 0.71484375, + "learning_rate": 0.00010377209036530471, + "loss": 0.2659, + "step": 16163 + }, + { + "epoch": 2.16, + "grad_norm": 0.5859375, + "learning_rate": 0.00010376045361047784, + "loss": 0.2418, + "step": 16164 + }, + { + "epoch": 2.16, + "grad_norm": 0.51171875, + "learning_rate": 0.0001037488168046568, + "loss": 0.325, + "step": 16165 + }, + { + "epoch": 2.16, + "grad_norm": 0.609375, + "learning_rate": 0.00010373717994799939, + "loss": 0.3418, + "step": 16166 + }, + { + "epoch": 2.16, + "grad_norm": 0.54296875, + "learning_rate": 0.00010372554304066344, + "loss": 0.368, + "step": 16167 + }, + { + "epoch": 2.16, + "grad_norm": 0.53125, + "learning_rate": 0.00010371390608280673, + "loss": 0.4733, + "step": 16168 + }, + { + "epoch": 2.16, + "grad_norm": 0.56640625, + "learning_rate": 0.00010370226907458709, + "loss": 0.2087, + "step": 16169 + }, + { + "epoch": 2.16, + "grad_norm": 0.4765625, + "learning_rate": 0.00010369063201616227, + "loss": 0.3643, + "step": 16170 + }, + { + "epoch": 2.16, + "grad_norm": 0.5, + "learning_rate": 0.00010367899490769013, + "loss": 0.2706, + "step": 16171 + }, + { + "epoch": 2.16, + "grad_norm": 0.51953125, + "learning_rate": 0.00010366735774932845, + "loss": 0.3952, + "step": 16172 + }, + { + "epoch": 2.16, + "grad_norm": 0.609375, + "learning_rate": 0.00010365572054123504, + "loss": 0.3171, + "step": 16173 + }, + { + "epoch": 2.16, + "grad_norm": 0.6875, + "learning_rate": 0.00010364408328356774, + "loss": 0.3742, + "step": 16174 + }, + { + "epoch": 2.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00010363244597648431, + "loss": 0.4672, + "step": 16175 + }, + { + "epoch": 2.16, + "grad_norm": 0.578125, + "learning_rate": 0.0001036208086201426, + "loss": 0.5618, + "step": 16176 + }, + { + "epoch": 2.16, + "grad_norm": 0.71484375, + "learning_rate": 0.00010360917121470038, + "loss": 0.3885, + "step": 16177 + }, + { + "epoch": 2.16, + "grad_norm": 0.416015625, + "learning_rate": 0.00010359753376031549, + "loss": 0.2768, + "step": 16178 + }, + { + "epoch": 2.16, + "grad_norm": 0.703125, + "learning_rate": 0.00010358589625714572, + "loss": 0.1989, + "step": 16179 + }, + { + "epoch": 2.16, + "grad_norm": 0.7890625, + "learning_rate": 0.00010357425870534891, + "loss": 0.6178, + "step": 16180 + }, + { + "epoch": 2.16, + "grad_norm": 0.7578125, + "learning_rate": 0.00010356262110508287, + "loss": 0.5179, + "step": 16181 + }, + { + "epoch": 2.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00010355098345650538, + "loss": 0.2037, + "step": 16182 + }, + { + "epoch": 2.16, + "grad_norm": 0.58203125, + "learning_rate": 0.00010353934575977429, + "loss": 0.3372, + "step": 16183 + }, + { + "epoch": 2.16, + "grad_norm": 0.490234375, + "learning_rate": 0.00010352770801504739, + "loss": 0.3023, + "step": 16184 + }, + { + "epoch": 2.16, + "grad_norm": 0.546875, + "learning_rate": 0.00010351607022248251, + "loss": 0.4444, + "step": 16185 + }, + { + "epoch": 2.16, + "grad_norm": 0.59375, + "learning_rate": 0.00010350443238223744, + "loss": 0.3471, + "step": 16186 + }, + { + "epoch": 2.16, + "grad_norm": 0.53515625, + "learning_rate": 0.00010349279449447004, + "loss": 0.3639, + "step": 16187 + }, + { + "epoch": 2.16, + "grad_norm": 0.5390625, + "learning_rate": 0.00010348115655933812, + "loss": 0.2367, + "step": 16188 + }, + { + "epoch": 2.16, + "grad_norm": 0.46875, + "learning_rate": 0.00010346951857699945, + "loss": 0.3842, + "step": 16189 + }, + { + "epoch": 2.16, + "grad_norm": 0.39453125, + "learning_rate": 0.0001034578805476119, + "loss": 0.1737, + "step": 16190 + }, + { + "epoch": 2.16, + "grad_norm": 0.52734375, + "learning_rate": 0.00010344624247133325, + "loss": 0.5805, + "step": 16191 + }, + { + "epoch": 2.16, + "grad_norm": 0.45703125, + "learning_rate": 0.00010343460434832137, + "loss": 0.1129, + "step": 16192 + }, + { + "epoch": 2.16, + "grad_norm": 0.46875, + "learning_rate": 0.00010342296617873406, + "loss": 0.3112, + "step": 16193 + }, + { + "epoch": 2.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00010341132796272912, + "loss": 0.2566, + "step": 16194 + }, + { + "epoch": 2.16, + "grad_norm": 0.75390625, + "learning_rate": 0.00010339968970046435, + "loss": 0.5182, + "step": 16195 + }, + { + "epoch": 2.16, + "grad_norm": 0.59765625, + "learning_rate": 0.00010338805139209762, + "loss": 0.2325, + "step": 16196 + }, + { + "epoch": 2.16, + "grad_norm": 0.61328125, + "learning_rate": 0.00010337641303778676, + "loss": 0.4662, + "step": 16197 + }, + { + "epoch": 2.16, + "grad_norm": 0.486328125, + "learning_rate": 0.00010336477463768955, + "loss": 0.2909, + "step": 16198 + }, + { + "epoch": 2.16, + "grad_norm": 0.46875, + "learning_rate": 0.00010335313619196387, + "loss": 0.3226, + "step": 16199 + }, + { + "epoch": 2.16, + "grad_norm": 0.59765625, + "learning_rate": 0.00010334149770076747, + "loss": 0.4636, + "step": 16200 + }, + { + "epoch": 2.16, + "grad_norm": 0.5625, + "learning_rate": 0.00010332985916425824, + "loss": 0.5221, + "step": 16201 + }, + { + "epoch": 2.16, + "grad_norm": 0.5078125, + "learning_rate": 0.00010331822058259397, + "loss": 0.3678, + "step": 16202 + }, + { + "epoch": 2.16, + "grad_norm": 0.59765625, + "learning_rate": 0.0001033065819559325, + "loss": 0.5573, + "step": 16203 + }, + { + "epoch": 2.16, + "grad_norm": 0.5625, + "learning_rate": 0.00010329494328443166, + "loss": 0.3855, + "step": 16204 + }, + { + "epoch": 2.16, + "grad_norm": 0.57421875, + "learning_rate": 0.00010328330456824929, + "loss": 0.4375, + "step": 16205 + }, + { + "epoch": 2.16, + "grad_norm": 0.392578125, + "learning_rate": 0.0001032716658075432, + "loss": 0.241, + "step": 16206 + }, + { + "epoch": 2.16, + "grad_norm": 0.625, + "learning_rate": 0.00010326002700247119, + "loss": 0.1798, + "step": 16207 + }, + { + "epoch": 2.16, + "grad_norm": 0.43359375, + "learning_rate": 0.00010324838815319116, + "loss": 0.2995, + "step": 16208 + }, + { + "epoch": 2.16, + "grad_norm": 0.439453125, + "learning_rate": 0.00010323674925986086, + "loss": 0.2086, + "step": 16209 + }, + { + "epoch": 2.16, + "grad_norm": 0.66015625, + "learning_rate": 0.00010322511032263819, + "loss": 0.3035, + "step": 16210 + }, + { + "epoch": 2.16, + "grad_norm": 0.482421875, + "learning_rate": 0.00010321347134168098, + "loss": 0.2653, + "step": 16211 + }, + { + "epoch": 2.16, + "grad_norm": 0.51171875, + "learning_rate": 0.000103201832317147, + "loss": 0.2341, + "step": 16212 + }, + { + "epoch": 2.16, + "grad_norm": 0.578125, + "learning_rate": 0.00010319019324919412, + "loss": 0.2916, + "step": 16213 + }, + { + "epoch": 2.16, + "grad_norm": 0.74609375, + "learning_rate": 0.00010317855413798017, + "loss": 0.512, + "step": 16214 + }, + { + "epoch": 2.16, + "grad_norm": 0.6875, + "learning_rate": 0.000103166914983663, + "loss": 0.6255, + "step": 16215 + }, + { + "epoch": 2.16, + "grad_norm": 0.4765625, + "learning_rate": 0.0001031552757864004, + "loss": 0.3201, + "step": 16216 + }, + { + "epoch": 2.16, + "grad_norm": 0.61328125, + "learning_rate": 0.00010314363654635031, + "loss": 0.3026, + "step": 16217 + }, + { + "epoch": 2.16, + "grad_norm": 0.453125, + "learning_rate": 0.00010313199726367042, + "loss": 0.2233, + "step": 16218 + }, + { + "epoch": 2.16, + "grad_norm": 0.4921875, + "learning_rate": 0.00010312035793851866, + "loss": 0.4612, + "step": 16219 + }, + { + "epoch": 2.16, + "grad_norm": 0.43359375, + "learning_rate": 0.00010310871857105284, + "loss": 0.3981, + "step": 16220 + }, + { + "epoch": 2.16, + "grad_norm": 0.51953125, + "learning_rate": 0.0001030970791614308, + "loss": 0.3484, + "step": 16221 + }, + { + "epoch": 2.16, + "grad_norm": 0.64453125, + "learning_rate": 0.00010308543970981037, + "loss": 0.3554, + "step": 16222 + }, + { + "epoch": 2.16, + "grad_norm": 0.462890625, + "learning_rate": 0.00010307380021634942, + "loss": 0.1935, + "step": 16223 + }, + { + "epoch": 2.16, + "grad_norm": 0.5859375, + "learning_rate": 0.00010306216068120578, + "loss": 0.3388, + "step": 16224 + }, + { + "epoch": 2.17, + "grad_norm": 0.8671875, + "learning_rate": 0.00010305052110453723, + "loss": 0.1684, + "step": 16225 + }, + { + "epoch": 2.17, + "grad_norm": 0.66796875, + "learning_rate": 0.0001030388814865017, + "loss": 0.4568, + "step": 16226 + }, + { + "epoch": 2.17, + "grad_norm": 0.53125, + "learning_rate": 0.00010302724182725694, + "loss": 0.3875, + "step": 16227 + }, + { + "epoch": 2.17, + "grad_norm": 0.55859375, + "learning_rate": 0.00010301560212696087, + "loss": 0.3545, + "step": 16228 + }, + { + "epoch": 2.17, + "grad_norm": 0.63671875, + "learning_rate": 0.0001030039623857713, + "loss": 0.3398, + "step": 16229 + }, + { + "epoch": 2.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00010299232260384605, + "loss": 0.4856, + "step": 16230 + }, + { + "epoch": 2.17, + "grad_norm": 0.47265625, + "learning_rate": 0.00010298068278134301, + "loss": 0.2085, + "step": 16231 + }, + { + "epoch": 2.17, + "grad_norm": 0.3671875, + "learning_rate": 0.00010296904291841998, + "loss": 0.1335, + "step": 16232 + }, + { + "epoch": 2.17, + "grad_norm": 0.44921875, + "learning_rate": 0.00010295740301523482, + "loss": 0.2169, + "step": 16233 + }, + { + "epoch": 2.17, + "grad_norm": 0.7265625, + "learning_rate": 0.00010294576307194539, + "loss": 0.2505, + "step": 16234 + }, + { + "epoch": 2.17, + "grad_norm": 0.427734375, + "learning_rate": 0.00010293412308870953, + "loss": 0.2559, + "step": 16235 + }, + { + "epoch": 2.17, + "grad_norm": 0.54296875, + "learning_rate": 0.00010292248306568506, + "loss": 0.4639, + "step": 16236 + }, + { + "epoch": 2.17, + "grad_norm": 0.59375, + "learning_rate": 0.00010291084300302984, + "loss": 0.4116, + "step": 16237 + }, + { + "epoch": 2.17, + "grad_norm": 0.68359375, + "learning_rate": 0.00010289920290090176, + "loss": 0.6862, + "step": 16238 + }, + { + "epoch": 2.17, + "grad_norm": 0.68359375, + "learning_rate": 0.00010288756275945857, + "loss": 0.5264, + "step": 16239 + }, + { + "epoch": 2.17, + "grad_norm": 0.55078125, + "learning_rate": 0.0001028759225788582, + "loss": 0.2321, + "step": 16240 + }, + { + "epoch": 2.17, + "grad_norm": 0.5, + "learning_rate": 0.00010286428235925849, + "loss": 0.2678, + "step": 16241 + }, + { + "epoch": 2.17, + "grad_norm": 0.494140625, + "learning_rate": 0.00010285264210081728, + "loss": 0.2502, + "step": 16242 + }, + { + "epoch": 2.17, + "grad_norm": 0.47265625, + "learning_rate": 0.00010284100180369238, + "loss": 0.2039, + "step": 16243 + }, + { + "epoch": 2.17, + "grad_norm": 0.734375, + "learning_rate": 0.00010282936146804167, + "loss": 0.2972, + "step": 16244 + }, + { + "epoch": 2.17, + "grad_norm": 0.67578125, + "learning_rate": 0.00010281772109402304, + "loss": 0.2416, + "step": 16245 + }, + { + "epoch": 2.17, + "grad_norm": 0.5546875, + "learning_rate": 0.00010280608068179427, + "loss": 0.7467, + "step": 16246 + }, + { + "epoch": 2.17, + "grad_norm": 0.48828125, + "learning_rate": 0.00010279444023151328, + "loss": 0.3585, + "step": 16247 + }, + { + "epoch": 2.17, + "grad_norm": 0.494140625, + "learning_rate": 0.00010278279974333787, + "loss": 0.2047, + "step": 16248 + }, + { + "epoch": 2.17, + "grad_norm": 0.69921875, + "learning_rate": 0.0001027711592174259, + "loss": 0.3992, + "step": 16249 + }, + { + "epoch": 2.17, + "grad_norm": 0.447265625, + "learning_rate": 0.00010275951865393523, + "loss": 0.1985, + "step": 16250 + }, + { + "epoch": 2.17, + "grad_norm": 0.50390625, + "learning_rate": 0.00010274787805302373, + "loss": 0.3708, + "step": 16251 + }, + { + "epoch": 2.17, + "grad_norm": 0.53515625, + "learning_rate": 0.00010273623741484923, + "loss": 0.2702, + "step": 16252 + }, + { + "epoch": 2.17, + "grad_norm": 0.609375, + "learning_rate": 0.0001027245967395696, + "loss": 0.379, + "step": 16253 + }, + { + "epoch": 2.17, + "grad_norm": 0.5, + "learning_rate": 0.0001027129560273427, + "loss": 0.3915, + "step": 16254 + }, + { + "epoch": 2.17, + "grad_norm": 0.4921875, + "learning_rate": 0.00010270131527832633, + "loss": 0.398, + "step": 16255 + }, + { + "epoch": 2.17, + "grad_norm": 0.76953125, + "learning_rate": 0.00010268967449267846, + "loss": 0.5768, + "step": 16256 + }, + { + "epoch": 2.17, + "grad_norm": 0.400390625, + "learning_rate": 0.00010267803367055682, + "loss": 0.2707, + "step": 16257 + }, + { + "epoch": 2.17, + "grad_norm": 0.578125, + "learning_rate": 0.00010266639281211936, + "loss": 0.2074, + "step": 16258 + }, + { + "epoch": 2.17, + "grad_norm": 0.5546875, + "learning_rate": 0.00010265475191752387, + "loss": 0.4453, + "step": 16259 + }, + { + "epoch": 2.17, + "grad_norm": 0.5234375, + "learning_rate": 0.00010264311098692827, + "loss": 0.4783, + "step": 16260 + }, + { + "epoch": 2.17, + "grad_norm": 0.447265625, + "learning_rate": 0.00010263147002049038, + "loss": 0.2456, + "step": 16261 + }, + { + "epoch": 2.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00010261982901836805, + "loss": 0.4919, + "step": 16262 + }, + { + "epoch": 2.17, + "grad_norm": 0.6484375, + "learning_rate": 0.00010260818798071918, + "loss": 0.5246, + "step": 16263 + }, + { + "epoch": 2.17, + "grad_norm": 0.796875, + "learning_rate": 0.0001025965469077016, + "loss": 0.4457, + "step": 16264 + }, + { + "epoch": 2.17, + "grad_norm": 0.38671875, + "learning_rate": 0.00010258490579947318, + "loss": 0.1782, + "step": 16265 + }, + { + "epoch": 2.17, + "grad_norm": 0.53125, + "learning_rate": 0.00010257326465619177, + "loss": 0.4284, + "step": 16266 + }, + { + "epoch": 2.17, + "grad_norm": 0.5703125, + "learning_rate": 0.00010256162347801525, + "loss": 0.5285, + "step": 16267 + }, + { + "epoch": 2.17, + "grad_norm": 0.62109375, + "learning_rate": 0.00010254998226510146, + "loss": 0.2058, + "step": 16268 + }, + { + "epoch": 2.17, + "grad_norm": 0.498046875, + "learning_rate": 0.00010253834101760826, + "loss": 0.2202, + "step": 16269 + }, + { + "epoch": 2.17, + "grad_norm": 0.6875, + "learning_rate": 0.00010252669973569356, + "loss": 0.4326, + "step": 16270 + }, + { + "epoch": 2.17, + "grad_norm": 0.62890625, + "learning_rate": 0.00010251505841951516, + "loss": 0.5144, + "step": 16271 + }, + { + "epoch": 2.17, + "grad_norm": 0.7109375, + "learning_rate": 0.00010250341706923102, + "loss": 0.2294, + "step": 16272 + }, + { + "epoch": 2.17, + "grad_norm": 0.67578125, + "learning_rate": 0.00010249177568499886, + "loss": 0.3148, + "step": 16273 + }, + { + "epoch": 2.17, + "grad_norm": 0.46875, + "learning_rate": 0.00010248013426697667, + "loss": 0.1383, + "step": 16274 + }, + { + "epoch": 2.17, + "grad_norm": 0.51953125, + "learning_rate": 0.00010246849281532222, + "loss": 0.3619, + "step": 16275 + }, + { + "epoch": 2.17, + "grad_norm": 0.44921875, + "learning_rate": 0.00010245685133019347, + "loss": 0.1187, + "step": 16276 + }, + { + "epoch": 2.17, + "grad_norm": 0.53515625, + "learning_rate": 0.00010244520981174822, + "loss": 0.2322, + "step": 16277 + }, + { + "epoch": 2.17, + "grad_norm": 0.5625, + "learning_rate": 0.00010243356826014434, + "loss": 0.3502, + "step": 16278 + }, + { + "epoch": 2.17, + "grad_norm": 0.59375, + "learning_rate": 0.00010242192667553973, + "loss": 0.3587, + "step": 16279 + }, + { + "epoch": 2.17, + "grad_norm": 0.59765625, + "learning_rate": 0.00010241028505809225, + "loss": 0.5433, + "step": 16280 + }, + { + "epoch": 2.17, + "grad_norm": 0.54296875, + "learning_rate": 0.00010239864340795975, + "loss": 0.3372, + "step": 16281 + }, + { + "epoch": 2.17, + "grad_norm": 0.52734375, + "learning_rate": 0.00010238700172530009, + "loss": 0.357, + "step": 16282 + }, + { + "epoch": 2.17, + "grad_norm": 0.58203125, + "learning_rate": 0.00010237536001027117, + "loss": 0.2495, + "step": 16283 + }, + { + "epoch": 2.17, + "grad_norm": 0.59765625, + "learning_rate": 0.00010236371826303086, + "loss": 0.1996, + "step": 16284 + }, + { + "epoch": 2.17, + "grad_norm": 0.55078125, + "learning_rate": 0.00010235207648373697, + "loss": 0.3964, + "step": 16285 + }, + { + "epoch": 2.17, + "grad_norm": 0.48046875, + "learning_rate": 0.00010234043467254745, + "loss": 0.2875, + "step": 16286 + }, + { + "epoch": 2.17, + "grad_norm": 0.609375, + "learning_rate": 0.00010232879282962011, + "loss": 0.3123, + "step": 16287 + }, + { + "epoch": 2.17, + "grad_norm": 0.67578125, + "learning_rate": 0.00010231715095511287, + "loss": 0.2511, + "step": 16288 + }, + { + "epoch": 2.17, + "grad_norm": 0.5703125, + "learning_rate": 0.00010230550904918355, + "loss": 0.5184, + "step": 16289 + }, + { + "epoch": 2.17, + "grad_norm": 0.5546875, + "learning_rate": 0.00010229386711199007, + "loss": 0.2443, + "step": 16290 + }, + { + "epoch": 2.17, + "grad_norm": 0.62890625, + "learning_rate": 0.00010228222514369025, + "loss": 0.4113, + "step": 16291 + }, + { + "epoch": 2.17, + "grad_norm": 0.6015625, + "learning_rate": 0.00010227058314444204, + "loss": 0.2648, + "step": 16292 + }, + { + "epoch": 2.17, + "grad_norm": 0.5625, + "learning_rate": 0.00010225894111440321, + "loss": 0.3882, + "step": 16293 + }, + { + "epoch": 2.17, + "grad_norm": 1.4765625, + "learning_rate": 0.00010224729905373171, + "loss": 0.3014, + "step": 16294 + }, + { + "epoch": 2.17, + "grad_norm": 0.62890625, + "learning_rate": 0.00010223565696258542, + "loss": 0.4809, + "step": 16295 + }, + { + "epoch": 2.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00010222401484112216, + "loss": 0.2657, + "step": 16296 + }, + { + "epoch": 2.17, + "grad_norm": 0.44140625, + "learning_rate": 0.00010221237268949986, + "loss": 0.4436, + "step": 16297 + }, + { + "epoch": 2.17, + "grad_norm": 0.69140625, + "learning_rate": 0.00010220073050787631, + "loss": 0.4115, + "step": 16298 + }, + { + "epoch": 2.17, + "grad_norm": 0.71484375, + "learning_rate": 0.0001021890882964095, + "loss": 0.6987, + "step": 16299 + }, + { + "epoch": 2.18, + "grad_norm": 0.5859375, + "learning_rate": 0.0001021774460552572, + "loss": 0.6042, + "step": 16300 + }, + { + "epoch": 2.18, + "grad_norm": 0.56640625, + "learning_rate": 0.00010216580378457736, + "loss": 0.5272, + "step": 16301 + }, + { + "epoch": 2.18, + "grad_norm": 0.57421875, + "learning_rate": 0.00010215416148452783, + "loss": 0.2159, + "step": 16302 + }, + { + "epoch": 2.18, + "grad_norm": 0.41015625, + "learning_rate": 0.00010214251915526646, + "loss": 0.1939, + "step": 16303 + }, + { + "epoch": 2.18, + "grad_norm": 0.5390625, + "learning_rate": 0.00010213087679695122, + "loss": 0.3825, + "step": 16304 + }, + { + "epoch": 2.18, + "grad_norm": 0.62890625, + "learning_rate": 0.00010211923440973984, + "loss": 0.3073, + "step": 16305 + }, + { + "epoch": 2.18, + "grad_norm": 0.5234375, + "learning_rate": 0.00010210759199379034, + "loss": 0.4811, + "step": 16306 + }, + { + "epoch": 2.18, + "grad_norm": 0.486328125, + "learning_rate": 0.00010209594954926051, + "loss": 0.1292, + "step": 16307 + }, + { + "epoch": 2.18, + "grad_norm": 0.67578125, + "learning_rate": 0.00010208430707630829, + "loss": 0.2556, + "step": 16308 + }, + { + "epoch": 2.18, + "grad_norm": 0.484375, + "learning_rate": 0.00010207266457509153, + "loss": 0.1639, + "step": 16309 + }, + { + "epoch": 2.18, + "grad_norm": 0.60546875, + "learning_rate": 0.00010206102204576807, + "loss": 0.4125, + "step": 16310 + }, + { + "epoch": 2.18, + "grad_norm": 0.6484375, + "learning_rate": 0.00010204937948849587, + "loss": 0.5307, + "step": 16311 + }, + { + "epoch": 2.18, + "grad_norm": 0.447265625, + "learning_rate": 0.00010203773690343272, + "loss": 0.2842, + "step": 16312 + }, + { + "epoch": 2.18, + "grad_norm": 0.423828125, + "learning_rate": 0.0001020260942907366, + "loss": 0.2388, + "step": 16313 + }, + { + "epoch": 2.18, + "grad_norm": 0.65234375, + "learning_rate": 0.00010201445165056531, + "loss": 0.3095, + "step": 16314 + }, + { + "epoch": 2.18, + "grad_norm": 0.6875, + "learning_rate": 0.00010200280898307679, + "loss": 0.4077, + "step": 16315 + }, + { + "epoch": 2.18, + "grad_norm": 0.75, + "learning_rate": 0.00010199116628842886, + "loss": 0.3891, + "step": 16316 + }, + { + "epoch": 2.18, + "grad_norm": 0.4609375, + "learning_rate": 0.00010197952356677942, + "loss": 0.2037, + "step": 16317 + }, + { + "epoch": 2.18, + "grad_norm": 0.478515625, + "learning_rate": 0.00010196788081828643, + "loss": 0.224, + "step": 16318 + }, + { + "epoch": 2.18, + "grad_norm": 0.640625, + "learning_rate": 0.00010195623804310766, + "loss": 0.4032, + "step": 16319 + }, + { + "epoch": 2.18, + "grad_norm": 0.67578125, + "learning_rate": 0.00010194459524140108, + "loss": 0.281, + "step": 16320 + }, + { + "epoch": 2.18, + "grad_norm": 0.671875, + "learning_rate": 0.00010193295241332454, + "loss": 0.3546, + "step": 16321 + }, + { + "epoch": 2.18, + "grad_norm": 0.470703125, + "learning_rate": 0.00010192130955903593, + "loss": 0.2286, + "step": 16322 + }, + { + "epoch": 2.18, + "grad_norm": 0.58203125, + "learning_rate": 0.0001019096666786931, + "loss": 0.5123, + "step": 16323 + }, + { + "epoch": 2.18, + "grad_norm": 0.65625, + "learning_rate": 0.00010189802377245398, + "loss": 0.2547, + "step": 16324 + }, + { + "epoch": 2.18, + "grad_norm": 0.640625, + "learning_rate": 0.00010188638084047643, + "loss": 0.365, + "step": 16325 + }, + { + "epoch": 2.18, + "grad_norm": 0.484375, + "learning_rate": 0.00010187473788291838, + "loss": 0.2587, + "step": 16326 + }, + { + "epoch": 2.18, + "grad_norm": 0.63671875, + "learning_rate": 0.00010186309489993766, + "loss": 0.3731, + "step": 16327 + }, + { + "epoch": 2.18, + "grad_norm": 0.45703125, + "learning_rate": 0.00010185145189169217, + "loss": 0.3496, + "step": 16328 + }, + { + "epoch": 2.18, + "grad_norm": 0.63671875, + "learning_rate": 0.00010183980885833982, + "loss": 0.2686, + "step": 16329 + }, + { + "epoch": 2.18, + "grad_norm": 0.4140625, + "learning_rate": 0.00010182816580003844, + "loss": 0.134, + "step": 16330 + }, + { + "epoch": 2.18, + "grad_norm": 0.47265625, + "learning_rate": 0.000101816522716946, + "loss": 0.3464, + "step": 16331 + }, + { + "epoch": 2.18, + "grad_norm": 0.6015625, + "learning_rate": 0.00010180487960922032, + "loss": 0.3509, + "step": 16332 + }, + { + "epoch": 2.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00010179323647701936, + "loss": 0.195, + "step": 16333 + }, + { + "epoch": 2.18, + "grad_norm": 0.58203125, + "learning_rate": 0.00010178159332050092, + "loss": 0.6309, + "step": 16334 + }, + { + "epoch": 2.18, + "grad_norm": 0.69140625, + "learning_rate": 0.00010176995013982293, + "loss": 0.5956, + "step": 16335 + }, + { + "epoch": 2.18, + "grad_norm": 0.44921875, + "learning_rate": 0.0001017583069351433, + "loss": 0.1793, + "step": 16336 + }, + { + "epoch": 2.18, + "grad_norm": 0.63671875, + "learning_rate": 0.00010174666370661987, + "loss": 0.5032, + "step": 16337 + }, + { + "epoch": 2.18, + "grad_norm": 0.59765625, + "learning_rate": 0.00010173502045441059, + "loss": 0.2355, + "step": 16338 + }, + { + "epoch": 2.18, + "grad_norm": 0.71484375, + "learning_rate": 0.00010172337717867333, + "loss": 0.2359, + "step": 16339 + }, + { + "epoch": 2.18, + "grad_norm": 0.6640625, + "learning_rate": 0.00010171173387956594, + "loss": 0.1827, + "step": 16340 + }, + { + "epoch": 2.18, + "grad_norm": 0.82421875, + "learning_rate": 0.00010170009055724635, + "loss": 0.1964, + "step": 16341 + }, + { + "epoch": 2.18, + "grad_norm": 0.494140625, + "learning_rate": 0.00010168844721187243, + "loss": 0.2616, + "step": 16342 + }, + { + "epoch": 2.18, + "grad_norm": 0.67578125, + "learning_rate": 0.0001016768038436021, + "loss": 0.5008, + "step": 16343 + }, + { + "epoch": 2.18, + "grad_norm": 0.546875, + "learning_rate": 0.0001016651604525932, + "loss": 0.2279, + "step": 16344 + }, + { + "epoch": 2.18, + "grad_norm": 0.56640625, + "learning_rate": 0.00010165351703900372, + "loss": 0.3955, + "step": 16345 + }, + { + "epoch": 2.18, + "grad_norm": 0.482421875, + "learning_rate": 0.00010164187360299142, + "loss": 0.1803, + "step": 16346 + }, + { + "epoch": 2.18, + "grad_norm": 0.498046875, + "learning_rate": 0.00010163023014471431, + "loss": 0.4109, + "step": 16347 + }, + { + "epoch": 2.18, + "grad_norm": 0.40625, + "learning_rate": 0.0001016185866643302, + "loss": 0.2153, + "step": 16348 + }, + { + "epoch": 2.18, + "grad_norm": 0.7421875, + "learning_rate": 0.00010160694316199703, + "loss": 0.6578, + "step": 16349 + }, + { + "epoch": 2.18, + "grad_norm": 0.5078125, + "learning_rate": 0.00010159529963787267, + "loss": 0.3146, + "step": 16350 + }, + { + "epoch": 2.18, + "grad_norm": 0.494140625, + "learning_rate": 0.00010158365609211504, + "loss": 0.2671, + "step": 16351 + }, + { + "epoch": 2.18, + "grad_norm": 0.54296875, + "learning_rate": 0.00010157201252488202, + "loss": 0.2888, + "step": 16352 + }, + { + "epoch": 2.18, + "grad_norm": 0.6328125, + "learning_rate": 0.00010156036893633147, + "loss": 0.3997, + "step": 16353 + }, + { + "epoch": 2.18, + "grad_norm": 0.55078125, + "learning_rate": 0.00010154872532662134, + "loss": 0.2732, + "step": 16354 + }, + { + "epoch": 2.18, + "grad_norm": 0.498046875, + "learning_rate": 0.00010153708169590948, + "loss": 0.2005, + "step": 16355 + }, + { + "epoch": 2.18, + "grad_norm": 0.671875, + "learning_rate": 0.00010152543804435384, + "loss": 0.4452, + "step": 16356 + }, + { + "epoch": 2.18, + "grad_norm": 0.79296875, + "learning_rate": 0.00010151379437211225, + "loss": 0.4385, + "step": 16357 + }, + { + "epoch": 2.18, + "grad_norm": 0.41796875, + "learning_rate": 0.00010150215067934265, + "loss": 0.191, + "step": 16358 + }, + { + "epoch": 2.18, + "grad_norm": 0.53125, + "learning_rate": 0.00010149050696620292, + "loss": 0.3868, + "step": 16359 + }, + { + "epoch": 2.18, + "grad_norm": 0.60546875, + "learning_rate": 0.00010147886323285096, + "loss": 0.2425, + "step": 16360 + }, + { + "epoch": 2.18, + "grad_norm": 0.70703125, + "learning_rate": 0.00010146721947944467, + "loss": 0.3307, + "step": 16361 + }, + { + "epoch": 2.18, + "grad_norm": 0.515625, + "learning_rate": 0.00010145557570614192, + "loss": 0.5674, + "step": 16362 + }, + { + "epoch": 2.18, + "grad_norm": 0.625, + "learning_rate": 0.00010144393191310068, + "loss": 0.3772, + "step": 16363 + }, + { + "epoch": 2.18, + "grad_norm": 0.498046875, + "learning_rate": 0.00010143228810047875, + "loss": 0.277, + "step": 16364 + }, + { + "epoch": 2.18, + "grad_norm": 0.56640625, + "learning_rate": 0.0001014206442684341, + "loss": 0.3869, + "step": 16365 + }, + { + "epoch": 2.18, + "grad_norm": 0.5625, + "learning_rate": 0.00010140900041712458, + "loss": 0.2223, + "step": 16366 + }, + { + "epoch": 2.18, + "grad_norm": 0.61328125, + "learning_rate": 0.0001013973565467081, + "loss": 0.2218, + "step": 16367 + }, + { + "epoch": 2.18, + "grad_norm": 0.609375, + "learning_rate": 0.0001013857126573426, + "loss": 0.2987, + "step": 16368 + }, + { + "epoch": 2.18, + "grad_norm": 0.76171875, + "learning_rate": 0.00010137406874918596, + "loss": 0.5519, + "step": 16369 + }, + { + "epoch": 2.18, + "grad_norm": 0.546875, + "learning_rate": 0.00010136242482239603, + "loss": 0.2806, + "step": 16370 + }, + { + "epoch": 2.18, + "grad_norm": 0.57421875, + "learning_rate": 0.00010135078087713075, + "loss": 0.1981, + "step": 16371 + }, + { + "epoch": 2.18, + "grad_norm": 0.50390625, + "learning_rate": 0.00010133913691354804, + "loss": 0.3294, + "step": 16372 + }, + { + "epoch": 2.18, + "grad_norm": 0.40625, + "learning_rate": 0.00010132749293180573, + "loss": 0.2204, + "step": 16373 + }, + { + "epoch": 2.18, + "grad_norm": 0.77734375, + "learning_rate": 0.0001013158489320618, + "loss": 0.4605, + "step": 16374 + }, + { + "epoch": 2.19, + "grad_norm": 0.498046875, + "learning_rate": 0.00010130420491447409, + "loss": 0.5524, + "step": 16375 + }, + { + "epoch": 2.19, + "grad_norm": 0.62109375, + "learning_rate": 0.00010129256087920054, + "loss": 0.4274, + "step": 16376 + }, + { + "epoch": 2.19, + "grad_norm": 0.6796875, + "learning_rate": 0.00010128091682639904, + "loss": 0.2034, + "step": 16377 + }, + { + "epoch": 2.19, + "grad_norm": 0.58984375, + "learning_rate": 0.00010126927275622746, + "loss": 0.1878, + "step": 16378 + }, + { + "epoch": 2.19, + "grad_norm": 0.69140625, + "learning_rate": 0.00010125762866884376, + "loss": 0.5723, + "step": 16379 + }, + { + "epoch": 2.19, + "grad_norm": 0.458984375, + "learning_rate": 0.00010124598456440578, + "loss": 0.3274, + "step": 16380 + }, + { + "epoch": 2.19, + "grad_norm": 0.51953125, + "learning_rate": 0.00010123434044307148, + "loss": 0.2957, + "step": 16381 + }, + { + "epoch": 2.19, + "grad_norm": 0.6328125, + "learning_rate": 0.00010122269630499873, + "loss": 0.2198, + "step": 16382 + }, + { + "epoch": 2.19, + "grad_norm": 0.546875, + "learning_rate": 0.00010121105215034539, + "loss": 0.2464, + "step": 16383 + }, + { + "epoch": 2.19, + "grad_norm": 0.62890625, + "learning_rate": 0.00010119940797926943, + "loss": 0.3556, + "step": 16384 + }, + { + "epoch": 2.19, + "grad_norm": 0.79296875, + "learning_rate": 0.00010118776379192872, + "loss": 0.2814, + "step": 16385 + }, + { + "epoch": 2.19, + "grad_norm": 0.62109375, + "learning_rate": 0.00010117611958848119, + "loss": 0.6312, + "step": 16386 + }, + { + "epoch": 2.19, + "grad_norm": 0.40234375, + "learning_rate": 0.00010116447536908474, + "loss": 0.2676, + "step": 16387 + }, + { + "epoch": 2.19, + "grad_norm": 0.68359375, + "learning_rate": 0.00010115283113389722, + "loss": 0.7098, + "step": 16388 + }, + { + "epoch": 2.19, + "grad_norm": 1.5234375, + "learning_rate": 0.00010114118688307657, + "loss": 0.5406, + "step": 16389 + }, + { + "epoch": 2.19, + "grad_norm": 0.439453125, + "learning_rate": 0.00010112954261678069, + "loss": 0.3605, + "step": 16390 + }, + { + "epoch": 2.19, + "grad_norm": 0.5703125, + "learning_rate": 0.00010111789833516751, + "loss": 0.3251, + "step": 16391 + }, + { + "epoch": 2.19, + "grad_norm": 0.7578125, + "learning_rate": 0.00010110625403839488, + "loss": 0.3507, + "step": 16392 + }, + { + "epoch": 2.19, + "grad_norm": 0.6171875, + "learning_rate": 0.00010109460972662081, + "loss": 0.5587, + "step": 16393 + }, + { + "epoch": 2.19, + "grad_norm": 0.609375, + "learning_rate": 0.00010108296540000305, + "loss": 0.3727, + "step": 16394 + }, + { + "epoch": 2.19, + "grad_norm": 0.474609375, + "learning_rate": 0.00010107132105869963, + "loss": 0.2316, + "step": 16395 + }, + { + "epoch": 2.19, + "grad_norm": 0.61328125, + "learning_rate": 0.00010105967670286838, + "loss": 0.5083, + "step": 16396 + }, + { + "epoch": 2.19, + "grad_norm": 0.6953125, + "learning_rate": 0.00010104803233266726, + "loss": 0.5786, + "step": 16397 + }, + { + "epoch": 2.19, + "grad_norm": 0.51171875, + "learning_rate": 0.00010103638794825411, + "loss": 0.3003, + "step": 16398 + }, + { + "epoch": 2.19, + "grad_norm": 0.55078125, + "learning_rate": 0.0001010247435497869, + "loss": 0.1965, + "step": 16399 + }, + { + "epoch": 2.19, + "grad_norm": 0.6484375, + "learning_rate": 0.00010101309913742352, + "loss": 0.2926, + "step": 16400 + }, + { + "epoch": 2.19, + "grad_norm": 0.49609375, + "learning_rate": 0.00010100145471132185, + "loss": 0.424, + "step": 16401 + }, + { + "epoch": 2.19, + "grad_norm": 0.57421875, + "learning_rate": 0.00010098981027163983, + "loss": 0.5225, + "step": 16402 + }, + { + "epoch": 2.19, + "grad_norm": 0.369140625, + "learning_rate": 0.00010097816581853532, + "loss": 0.1117, + "step": 16403 + }, + { + "epoch": 2.19, + "grad_norm": 0.54296875, + "learning_rate": 0.00010096652135216627, + "loss": 0.4401, + "step": 16404 + }, + { + "epoch": 2.19, + "grad_norm": 0.462890625, + "learning_rate": 0.00010095487687269054, + "loss": 0.3379, + "step": 16405 + }, + { + "epoch": 2.19, + "grad_norm": 0.74609375, + "learning_rate": 0.00010094323238026611, + "loss": 0.3022, + "step": 16406 + }, + { + "epoch": 2.19, + "grad_norm": 0.58203125, + "learning_rate": 0.00010093158787505083, + "loss": 0.5399, + "step": 16407 + }, + { + "epoch": 2.19, + "grad_norm": 0.58203125, + "learning_rate": 0.0001009199433572026, + "loss": 0.4598, + "step": 16408 + }, + { + "epoch": 2.19, + "grad_norm": 0.60546875, + "learning_rate": 0.00010090829882687938, + "loss": 0.5355, + "step": 16409 + }, + { + "epoch": 2.19, + "grad_norm": 0.5703125, + "learning_rate": 0.00010089665428423903, + "loss": 0.4018, + "step": 16410 + }, + { + "epoch": 2.19, + "grad_norm": 0.6875, + "learning_rate": 0.00010088500972943946, + "loss": 0.4374, + "step": 16411 + }, + { + "epoch": 2.19, + "grad_norm": 0.55859375, + "learning_rate": 0.0001008733651626386, + "loss": 0.272, + "step": 16412 + }, + { + "epoch": 2.19, + "grad_norm": 0.45703125, + "learning_rate": 0.00010086172058399437, + "loss": 0.2915, + "step": 16413 + }, + { + "epoch": 2.19, + "grad_norm": 0.5625, + "learning_rate": 0.00010085007599366463, + "loss": 0.2181, + "step": 16414 + }, + { + "epoch": 2.19, + "grad_norm": 0.458984375, + "learning_rate": 0.00010083843139180731, + "loss": 0.3738, + "step": 16415 + }, + { + "epoch": 2.19, + "grad_norm": 0.69140625, + "learning_rate": 0.00010082678677858033, + "loss": 0.3436, + "step": 16416 + }, + { + "epoch": 2.19, + "grad_norm": 0.58203125, + "learning_rate": 0.00010081514215414158, + "loss": 0.2505, + "step": 16417 + }, + { + "epoch": 2.19, + "grad_norm": 0.67578125, + "learning_rate": 0.00010080349751864902, + "loss": 0.3679, + "step": 16418 + }, + { + "epoch": 2.19, + "grad_norm": 0.6328125, + "learning_rate": 0.00010079185287226048, + "loss": 0.2504, + "step": 16419 + }, + { + "epoch": 2.19, + "grad_norm": 0.60546875, + "learning_rate": 0.00010078020821513392, + "loss": 0.4068, + "step": 16420 + }, + { + "epoch": 2.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00010076856354742722, + "loss": 0.1518, + "step": 16421 + }, + { + "epoch": 2.19, + "grad_norm": 0.66796875, + "learning_rate": 0.00010075691886929833, + "loss": 0.3976, + "step": 16422 + }, + { + "epoch": 2.19, + "grad_norm": 0.5546875, + "learning_rate": 0.00010074527418090511, + "loss": 0.3652, + "step": 16423 + }, + { + "epoch": 2.19, + "grad_norm": 0.67578125, + "learning_rate": 0.00010073362948240552, + "loss": 0.3247, + "step": 16424 + }, + { + "epoch": 2.19, + "grad_norm": 0.8671875, + "learning_rate": 0.00010072198477395742, + "loss": 0.5428, + "step": 16425 + }, + { + "epoch": 2.19, + "grad_norm": 0.470703125, + "learning_rate": 0.00010071034005571875, + "loss": 0.4072, + "step": 16426 + }, + { + "epoch": 2.19, + "grad_norm": 0.67578125, + "learning_rate": 0.00010069869532784741, + "loss": 0.3569, + "step": 16427 + }, + { + "epoch": 2.19, + "grad_norm": 0.443359375, + "learning_rate": 0.00010068705059050132, + "loss": 0.1795, + "step": 16428 + }, + { + "epoch": 2.19, + "grad_norm": 0.435546875, + "learning_rate": 0.00010067540584383839, + "loss": 0.2861, + "step": 16429 + }, + { + "epoch": 2.19, + "grad_norm": 0.6875, + "learning_rate": 0.00010066376108801651, + "loss": 0.4224, + "step": 16430 + }, + { + "epoch": 2.19, + "grad_norm": 0.451171875, + "learning_rate": 0.0001006521163231936, + "loss": 0.2681, + "step": 16431 + }, + { + "epoch": 2.19, + "grad_norm": 0.46875, + "learning_rate": 0.0001006404715495276, + "loss": 0.4248, + "step": 16432 + }, + { + "epoch": 2.19, + "grad_norm": 0.578125, + "learning_rate": 0.00010062882676717635, + "loss": 0.3793, + "step": 16433 + }, + { + "epoch": 2.19, + "grad_norm": 0.74609375, + "learning_rate": 0.00010061718197629786, + "loss": 0.3146, + "step": 16434 + }, + { + "epoch": 2.19, + "grad_norm": 0.4453125, + "learning_rate": 0.00010060553717704994, + "loss": 0.3476, + "step": 16435 + }, + { + "epoch": 2.19, + "grad_norm": 0.765625, + "learning_rate": 0.0001005938923695906, + "loss": 0.4261, + "step": 16436 + }, + { + "epoch": 2.19, + "grad_norm": 0.546875, + "learning_rate": 0.00010058224755407764, + "loss": 0.4209, + "step": 16437 + }, + { + "epoch": 2.19, + "grad_norm": 0.53125, + "learning_rate": 0.00010057060273066907, + "loss": 0.3767, + "step": 16438 + }, + { + "epoch": 2.19, + "grad_norm": 0.671875, + "learning_rate": 0.00010055895789952272, + "loss": 0.3852, + "step": 16439 + }, + { + "epoch": 2.19, + "grad_norm": 0.5546875, + "learning_rate": 0.00010054731306079656, + "loss": 0.4494, + "step": 16440 + }, + { + "epoch": 2.19, + "grad_norm": 0.70703125, + "learning_rate": 0.0001005356682146485, + "loss": 0.3872, + "step": 16441 + }, + { + "epoch": 2.19, + "grad_norm": 0.8515625, + "learning_rate": 0.00010052402336123644, + "loss": 0.424, + "step": 16442 + }, + { + "epoch": 2.19, + "grad_norm": 0.486328125, + "learning_rate": 0.00010051237850071828, + "loss": 0.308, + "step": 16443 + }, + { + "epoch": 2.19, + "grad_norm": 0.490234375, + "learning_rate": 0.00010050073363325192, + "loss": 0.2592, + "step": 16444 + }, + { + "epoch": 2.19, + "grad_norm": 0.65234375, + "learning_rate": 0.0001004890887589953, + "loss": 0.2126, + "step": 16445 + }, + { + "epoch": 2.19, + "grad_norm": 0.75, + "learning_rate": 0.00010047744387810632, + "loss": 0.77, + "step": 16446 + }, + { + "epoch": 2.19, + "grad_norm": 0.390625, + "learning_rate": 0.0001004657989907429, + "loss": 0.2892, + "step": 16447 + }, + { + "epoch": 2.19, + "grad_norm": 0.71875, + "learning_rate": 0.00010045415409706296, + "loss": 0.2319, + "step": 16448 + }, + { + "epoch": 2.19, + "grad_norm": 0.76171875, + "learning_rate": 0.00010044250919722435, + "loss": 0.3508, + "step": 16449 + }, + { + "epoch": 2.2, + "grad_norm": 0.72265625, + "learning_rate": 0.00010043086429138508, + "loss": 0.5344, + "step": 16450 + }, + { + "epoch": 2.2, + "grad_norm": 0.5625, + "learning_rate": 0.000100419219379703, + "loss": 0.33, + "step": 16451 + }, + { + "epoch": 2.2, + "grad_norm": 0.66796875, + "learning_rate": 0.00010040757446233603, + "loss": 0.5361, + "step": 16452 + }, + { + "epoch": 2.2, + "grad_norm": 0.59765625, + "learning_rate": 0.00010039592953944209, + "loss": 0.326, + "step": 16453 + }, + { + "epoch": 2.2, + "grad_norm": 0.515625, + "learning_rate": 0.0001003842846111791, + "loss": 0.6121, + "step": 16454 + }, + { + "epoch": 2.2, + "grad_norm": 0.69140625, + "learning_rate": 0.00010037263967770498, + "loss": 0.4802, + "step": 16455 + }, + { + "epoch": 2.2, + "grad_norm": 0.6015625, + "learning_rate": 0.00010036099473917759, + "loss": 0.5106, + "step": 16456 + }, + { + "epoch": 2.2, + "grad_norm": 0.6171875, + "learning_rate": 0.0001003493497957549, + "loss": 0.4809, + "step": 16457 + }, + { + "epoch": 2.2, + "grad_norm": 0.8125, + "learning_rate": 0.0001003377048475948, + "loss": 0.2232, + "step": 16458 + }, + { + "epoch": 2.2, + "grad_norm": 0.40625, + "learning_rate": 0.00010032605989485519, + "loss": 0.2046, + "step": 16459 + }, + { + "epoch": 2.2, + "grad_norm": 0.44140625, + "learning_rate": 0.00010031441493769403, + "loss": 0.2325, + "step": 16460 + }, + { + "epoch": 2.2, + "grad_norm": 0.40234375, + "learning_rate": 0.00010030276997626919, + "loss": 0.306, + "step": 16461 + }, + { + "epoch": 2.2, + "grad_norm": 0.578125, + "learning_rate": 0.00010029112501073858, + "loss": 0.5445, + "step": 16462 + }, + { + "epoch": 2.2, + "grad_norm": 0.51171875, + "learning_rate": 0.00010027948004126013, + "loss": 0.2189, + "step": 16463 + }, + { + "epoch": 2.2, + "grad_norm": 0.66015625, + "learning_rate": 0.00010026783506799176, + "loss": 0.3832, + "step": 16464 + }, + { + "epoch": 2.2, + "grad_norm": 0.494140625, + "learning_rate": 0.00010025619009109136, + "loss": 0.2915, + "step": 16465 + }, + { + "epoch": 2.2, + "grad_norm": 0.6640625, + "learning_rate": 0.0001002445451107169, + "loss": 0.5165, + "step": 16466 + }, + { + "epoch": 2.2, + "grad_norm": 0.60546875, + "learning_rate": 0.00010023290012702622, + "loss": 0.2725, + "step": 16467 + }, + { + "epoch": 2.2, + "grad_norm": 0.63671875, + "learning_rate": 0.00010022125514017726, + "loss": 0.2048, + "step": 16468 + }, + { + "epoch": 2.2, + "grad_norm": 0.64453125, + "learning_rate": 0.00010020961015032794, + "loss": 0.4299, + "step": 16469 + }, + { + "epoch": 2.2, + "grad_norm": 0.56640625, + "learning_rate": 0.00010019796515763618, + "loss": 0.5033, + "step": 16470 + }, + { + "epoch": 2.2, + "grad_norm": 0.734375, + "learning_rate": 0.00010018632016225988, + "loss": 0.3885, + "step": 16471 + }, + { + "epoch": 2.2, + "grad_norm": 0.62109375, + "learning_rate": 0.00010017467516435697, + "loss": 0.6741, + "step": 16472 + }, + { + "epoch": 2.2, + "grad_norm": 0.58984375, + "learning_rate": 0.00010016303016408535, + "loss": 0.5199, + "step": 16473 + }, + { + "epoch": 2.2, + "grad_norm": 0.47265625, + "learning_rate": 0.00010015138516160291, + "loss": 0.1739, + "step": 16474 + }, + { + "epoch": 2.2, + "grad_norm": 0.484375, + "learning_rate": 0.00010013974015706762, + "loss": 0.4259, + "step": 16475 + }, + { + "epoch": 2.2, + "grad_norm": 0.578125, + "learning_rate": 0.00010012809515063734, + "loss": 0.2558, + "step": 16476 + }, + { + "epoch": 2.2, + "grad_norm": 0.58984375, + "learning_rate": 0.00010011645014247004, + "loss": 0.4176, + "step": 16477 + }, + { + "epoch": 2.2, + "grad_norm": 0.5, + "learning_rate": 0.00010010480513272358, + "loss": 0.3246, + "step": 16478 + }, + { + "epoch": 2.2, + "grad_norm": 0.6328125, + "learning_rate": 0.0001000931601215559, + "loss": 0.2302, + "step": 16479 + }, + { + "epoch": 2.2, + "grad_norm": 0.404296875, + "learning_rate": 0.00010008151510912492, + "loss": 0.2301, + "step": 16480 + }, + { + "epoch": 2.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00010006987009558851, + "loss": 0.2108, + "step": 16481 + }, + { + "epoch": 2.2, + "grad_norm": 0.68359375, + "learning_rate": 0.00010005822508110466, + "loss": 0.5934, + "step": 16482 + }, + { + "epoch": 2.2, + "grad_norm": 0.4375, + "learning_rate": 0.00010004658006583118, + "loss": 0.2351, + "step": 16483 + }, + { + "epoch": 2.2, + "grad_norm": 0.4921875, + "learning_rate": 0.00010003493504992612, + "loss": 0.2232, + "step": 16484 + }, + { + "epoch": 2.2, + "grad_norm": 0.61328125, + "learning_rate": 0.00010002329003354727, + "loss": 0.5475, + "step": 16485 + }, + { + "epoch": 2.2, + "grad_norm": 0.640625, + "learning_rate": 0.00010001164501685261, + "loss": 0.5698, + "step": 16486 + }, + { + "epoch": 2.2, + "grad_norm": 0.6484375, + "learning_rate": 0.0001, + "loss": 0.2717, + "step": 16487 + }, + { + "epoch": 2.2, + "grad_norm": 0.49609375, + "learning_rate": 9.998835498314742e-05, + "loss": 0.2947, + "step": 16488 + }, + { + "epoch": 2.2, + "grad_norm": 0.53125, + "learning_rate": 9.997670996645275e-05, + "loss": 0.2678, + "step": 16489 + }, + { + "epoch": 2.2, + "grad_norm": 0.51953125, + "learning_rate": 9.99650649500739e-05, + "loss": 0.4884, + "step": 16490 + }, + { + "epoch": 2.2, + "grad_norm": 0.46875, + "learning_rate": 9.995341993416882e-05, + "loss": 0.2893, + "step": 16491 + }, + { + "epoch": 2.2, + "grad_norm": 0.515625, + "learning_rate": 9.994177491889535e-05, + "loss": 0.286, + "step": 16492 + }, + { + "epoch": 2.2, + "grad_norm": 0.486328125, + "learning_rate": 9.99301299044115e-05, + "loss": 0.2381, + "step": 16493 + }, + { + "epoch": 2.2, + "grad_norm": 0.5703125, + "learning_rate": 9.991848489087513e-05, + "loss": 0.4318, + "step": 16494 + }, + { + "epoch": 2.2, + "grad_norm": 0.5546875, + "learning_rate": 9.990683987844412e-05, + "loss": 0.3309, + "step": 16495 + }, + { + "epoch": 2.2, + "grad_norm": 0.49609375, + "learning_rate": 9.989519486727645e-05, + "loss": 0.1886, + "step": 16496 + }, + { + "epoch": 2.2, + "grad_norm": 0.67578125, + "learning_rate": 9.988354985752998e-05, + "loss": 0.4559, + "step": 16497 + }, + { + "epoch": 2.2, + "grad_norm": 0.734375, + "learning_rate": 9.987190484936268e-05, + "loss": 0.3698, + "step": 16498 + }, + { + "epoch": 2.2, + "grad_norm": 0.578125, + "learning_rate": 9.98602598429324e-05, + "loss": 0.255, + "step": 16499 + }, + { + "epoch": 2.2, + "grad_norm": 0.5625, + "learning_rate": 9.98486148383971e-05, + "loss": 0.3115, + "step": 16500 + }, + { + "epoch": 2.2, + "grad_norm": 0.58984375, + "learning_rate": 9.983696983591467e-05, + "loss": 0.463, + "step": 16501 + }, + { + "epoch": 2.2, + "grad_norm": 0.55859375, + "learning_rate": 9.982532483564306e-05, + "loss": 0.1906, + "step": 16502 + }, + { + "epoch": 2.2, + "grad_norm": 0.400390625, + "learning_rate": 9.981367983774013e-05, + "loss": 0.1999, + "step": 16503 + }, + { + "epoch": 2.2, + "grad_norm": 0.703125, + "learning_rate": 9.980203484236383e-05, + "loss": 0.3763, + "step": 16504 + }, + { + "epoch": 2.2, + "grad_norm": 0.609375, + "learning_rate": 9.979038984967207e-05, + "loss": 0.1782, + "step": 16505 + }, + { + "epoch": 2.2, + "grad_norm": 0.44140625, + "learning_rate": 9.977874485982278e-05, + "loss": 0.3618, + "step": 16506 + }, + { + "epoch": 2.2, + "grad_norm": 0.55859375, + "learning_rate": 9.976709987297383e-05, + "loss": 0.3773, + "step": 16507 + }, + { + "epoch": 2.2, + "grad_norm": 0.80078125, + "learning_rate": 9.975545488928314e-05, + "loss": 0.2287, + "step": 16508 + }, + { + "epoch": 2.2, + "grad_norm": 0.322265625, + "learning_rate": 9.974380990890865e-05, + "loss": 0.117, + "step": 16509 + }, + { + "epoch": 2.2, + "grad_norm": 0.5546875, + "learning_rate": 9.973216493200826e-05, + "loss": 0.3675, + "step": 16510 + }, + { + "epoch": 2.2, + "grad_norm": 0.6796875, + "learning_rate": 9.97205199587399e-05, + "loss": 0.275, + "step": 16511 + }, + { + "epoch": 2.2, + "grad_norm": 0.52734375, + "learning_rate": 9.970887498926145e-05, + "loss": 0.299, + "step": 16512 + }, + { + "epoch": 2.2, + "grad_norm": 0.875, + "learning_rate": 9.969723002373082e-05, + "loss": 0.3195, + "step": 16513 + }, + { + "epoch": 2.2, + "grad_norm": 0.55859375, + "learning_rate": 9.968558506230598e-05, + "loss": 0.1757, + "step": 16514 + }, + { + "epoch": 2.2, + "grad_norm": 0.3984375, + "learning_rate": 9.96739401051448e-05, + "loss": 0.1563, + "step": 16515 + }, + { + "epoch": 2.2, + "grad_norm": 0.6328125, + "learning_rate": 9.966229515240521e-05, + "loss": 0.2316, + "step": 16516 + }, + { + "epoch": 2.2, + "grad_norm": 0.5859375, + "learning_rate": 9.96506502042451e-05, + "loss": 0.3624, + "step": 16517 + }, + { + "epoch": 2.2, + "grad_norm": 0.671875, + "learning_rate": 9.963900526082244e-05, + "loss": 0.4656, + "step": 16518 + }, + { + "epoch": 2.2, + "grad_norm": 0.625, + "learning_rate": 9.962736032229507e-05, + "loss": 0.4401, + "step": 16519 + }, + { + "epoch": 2.2, + "grad_norm": 0.53125, + "learning_rate": 9.961571538882092e-05, + "loss": 0.2799, + "step": 16520 + }, + { + "epoch": 2.2, + "grad_norm": 0.546875, + "learning_rate": 9.960407046055792e-05, + "loss": 0.4245, + "step": 16521 + }, + { + "epoch": 2.2, + "grad_norm": 0.56640625, + "learning_rate": 9.959242553766398e-05, + "loss": 0.2848, + "step": 16522 + }, + { + "epoch": 2.2, + "grad_norm": 0.51953125, + "learning_rate": 9.958078062029703e-05, + "loss": 0.1685, + "step": 16523 + }, + { + "epoch": 2.2, + "grad_norm": 1.09375, + "learning_rate": 9.956913570861494e-05, + "loss": 0.5707, + "step": 16524 + }, + { + "epoch": 2.21, + "grad_norm": 0.78125, + "learning_rate": 9.955749080277565e-05, + "loss": 0.6968, + "step": 16525 + }, + { + "epoch": 2.21, + "grad_norm": 0.48046875, + "learning_rate": 9.954584590293706e-05, + "loss": 0.3064, + "step": 16526 + }, + { + "epoch": 2.21, + "grad_norm": 0.6171875, + "learning_rate": 9.953420100925712e-05, + "loss": 0.4368, + "step": 16527 + }, + { + "epoch": 2.21, + "grad_norm": 0.62890625, + "learning_rate": 9.952255612189368e-05, + "loss": 0.5364, + "step": 16528 + }, + { + "epoch": 2.21, + "grad_norm": 0.62109375, + "learning_rate": 9.951091124100472e-05, + "loss": 0.2568, + "step": 16529 + }, + { + "epoch": 2.21, + "grad_norm": 0.470703125, + "learning_rate": 9.949926636674813e-05, + "loss": 0.3515, + "step": 16530 + }, + { + "epoch": 2.21, + "grad_norm": 0.5, + "learning_rate": 9.948762149928177e-05, + "loss": 0.1727, + "step": 16531 + }, + { + "epoch": 2.21, + "grad_norm": 0.64453125, + "learning_rate": 9.947597663876361e-05, + "loss": 0.2738, + "step": 16532 + }, + { + "epoch": 2.21, + "grad_norm": 0.52734375, + "learning_rate": 9.946433178535153e-05, + "loss": 0.2989, + "step": 16533 + }, + { + "epoch": 2.21, + "grad_norm": 0.515625, + "learning_rate": 9.945268693920346e-05, + "loss": 0.3499, + "step": 16534 + }, + { + "epoch": 2.21, + "grad_norm": 0.5078125, + "learning_rate": 9.94410421004773e-05, + "loss": 0.3386, + "step": 16535 + }, + { + "epoch": 2.21, + "grad_norm": 0.64453125, + "learning_rate": 9.942939726933096e-05, + "loss": 0.3886, + "step": 16536 + }, + { + "epoch": 2.21, + "grad_norm": 0.67578125, + "learning_rate": 9.941775244592237e-05, + "loss": 0.2104, + "step": 16537 + }, + { + "epoch": 2.21, + "grad_norm": 0.55859375, + "learning_rate": 9.940610763040943e-05, + "loss": 0.192, + "step": 16538 + }, + { + "epoch": 2.21, + "grad_norm": 0.546875, + "learning_rate": 9.939446282295007e-05, + "loss": 0.3026, + "step": 16539 + }, + { + "epoch": 2.21, + "grad_norm": 0.49609375, + "learning_rate": 9.938281802370215e-05, + "loss": 0.2189, + "step": 16540 + }, + { + "epoch": 2.21, + "grad_norm": 0.58203125, + "learning_rate": 9.937117323282363e-05, + "loss": 0.1653, + "step": 16541 + }, + { + "epoch": 2.21, + "grad_norm": 0.65625, + "learning_rate": 9.935952845047245e-05, + "loss": 0.6037, + "step": 16542 + }, + { + "epoch": 2.21, + "grad_norm": 0.57421875, + "learning_rate": 9.934788367680642e-05, + "loss": 0.4586, + "step": 16543 + }, + { + "epoch": 2.21, + "grad_norm": 0.65625, + "learning_rate": 9.933623891198353e-05, + "loss": 0.2752, + "step": 16544 + }, + { + "epoch": 2.21, + "grad_norm": 0.482421875, + "learning_rate": 9.932459415616163e-05, + "loss": 0.2508, + "step": 16545 + }, + { + "epoch": 2.21, + "grad_norm": 0.5078125, + "learning_rate": 9.93129494094987e-05, + "loss": 0.2292, + "step": 16546 + }, + { + "epoch": 2.21, + "grad_norm": 0.484375, + "learning_rate": 9.93013046721526e-05, + "loss": 0.2902, + "step": 16547 + }, + { + "epoch": 2.21, + "grad_norm": 0.69140625, + "learning_rate": 9.928965994428126e-05, + "loss": 0.383, + "step": 16548 + }, + { + "epoch": 2.21, + "grad_norm": 0.640625, + "learning_rate": 9.927801522604259e-05, + "loss": 0.5475, + "step": 16549 + }, + { + "epoch": 2.21, + "grad_norm": 0.56640625, + "learning_rate": 9.92663705175945e-05, + "loss": 0.3952, + "step": 16550 + }, + { + "epoch": 2.21, + "grad_norm": 0.5859375, + "learning_rate": 9.925472581909489e-05, + "loss": 0.4235, + "step": 16551 + }, + { + "epoch": 2.21, + "grad_norm": 0.48046875, + "learning_rate": 9.924308113070168e-05, + "loss": 0.1396, + "step": 16552 + }, + { + "epoch": 2.21, + "grad_norm": 0.765625, + "learning_rate": 9.923143645257279e-05, + "loss": 0.5202, + "step": 16553 + }, + { + "epoch": 2.21, + "grad_norm": 0.83984375, + "learning_rate": 9.92197917848661e-05, + "loss": 0.4342, + "step": 16554 + }, + { + "epoch": 2.21, + "grad_norm": 0.5234375, + "learning_rate": 9.920814712773956e-05, + "loss": 0.4717, + "step": 16555 + }, + { + "epoch": 2.21, + "grad_norm": 0.455078125, + "learning_rate": 9.919650248135102e-05, + "loss": 0.3465, + "step": 16556 + }, + { + "epoch": 2.21, + "grad_norm": 0.50390625, + "learning_rate": 9.918485784585844e-05, + "loss": 0.3565, + "step": 16557 + }, + { + "epoch": 2.21, + "grad_norm": 0.54296875, + "learning_rate": 9.91732132214197e-05, + "loss": 0.3711, + "step": 16558 + }, + { + "epoch": 2.21, + "grad_norm": 0.578125, + "learning_rate": 9.916156860819272e-05, + "loss": 0.2202, + "step": 16559 + }, + { + "epoch": 2.21, + "grad_norm": 0.48046875, + "learning_rate": 9.914992400633539e-05, + "loss": 0.1364, + "step": 16560 + }, + { + "epoch": 2.21, + "grad_norm": 0.46484375, + "learning_rate": 9.913827941600564e-05, + "loss": 0.211, + "step": 16561 + }, + { + "epoch": 2.21, + "grad_norm": 0.6171875, + "learning_rate": 9.91266348373614e-05, + "loss": 0.2774, + "step": 16562 + }, + { + "epoch": 2.21, + "grad_norm": 0.53515625, + "learning_rate": 9.911499027056054e-05, + "loss": 0.3814, + "step": 16563 + }, + { + "epoch": 2.21, + "grad_norm": 0.62890625, + "learning_rate": 9.910334571576098e-05, + "loss": 0.425, + "step": 16564 + }, + { + "epoch": 2.21, + "grad_norm": 0.6484375, + "learning_rate": 9.909170117312063e-05, + "loss": 0.2567, + "step": 16565 + }, + { + "epoch": 2.21, + "grad_norm": 0.66796875, + "learning_rate": 9.908005664279742e-05, + "loss": 0.4797, + "step": 16566 + }, + { + "epoch": 2.21, + "grad_norm": 0.62109375, + "learning_rate": 9.906841212494922e-05, + "loss": 0.4764, + "step": 16567 + }, + { + "epoch": 2.21, + "grad_norm": 0.65625, + "learning_rate": 9.905676761973391e-05, + "loss": 0.3405, + "step": 16568 + }, + { + "epoch": 2.21, + "grad_norm": 0.765625, + "learning_rate": 9.904512312730948e-05, + "loss": 0.4263, + "step": 16569 + }, + { + "epoch": 2.21, + "grad_norm": 0.421875, + "learning_rate": 9.903347864783376e-05, + "loss": 0.2389, + "step": 16570 + }, + { + "epoch": 2.21, + "grad_norm": 0.62109375, + "learning_rate": 9.902183418146471e-05, + "loss": 0.4775, + "step": 16571 + }, + { + "epoch": 2.21, + "grad_norm": 0.90625, + "learning_rate": 9.901018972836021e-05, + "loss": 0.2213, + "step": 16572 + }, + { + "epoch": 2.21, + "grad_norm": 0.5234375, + "learning_rate": 9.899854528867817e-05, + "loss": 0.4034, + "step": 16573 + }, + { + "epoch": 2.21, + "grad_norm": 0.6953125, + "learning_rate": 9.898690086257649e-05, + "loss": 0.2887, + "step": 16574 + }, + { + "epoch": 2.21, + "grad_norm": 0.4453125, + "learning_rate": 9.897525645021312e-05, + "loss": 0.2726, + "step": 16575 + }, + { + "epoch": 2.21, + "grad_norm": 0.609375, + "learning_rate": 9.89636120517459e-05, + "loss": 0.4891, + "step": 16576 + }, + { + "epoch": 2.21, + "grad_norm": 0.5078125, + "learning_rate": 9.895196766733277e-05, + "loss": 0.335, + "step": 16577 + }, + { + "epoch": 2.21, + "grad_norm": 0.53125, + "learning_rate": 9.894032329713162e-05, + "loss": 0.3154, + "step": 16578 + }, + { + "epoch": 2.21, + "grad_norm": 0.62890625, + "learning_rate": 9.892867894130041e-05, + "loss": 0.3795, + "step": 16579 + }, + { + "epoch": 2.21, + "grad_norm": 0.546875, + "learning_rate": 9.891703459999698e-05, + "loss": 0.2567, + "step": 16580 + }, + { + "epoch": 2.21, + "grad_norm": 0.50390625, + "learning_rate": 9.890539027337924e-05, + "loss": 0.3598, + "step": 16581 + }, + { + "epoch": 2.21, + "grad_norm": 0.58203125, + "learning_rate": 9.889374596160513e-05, + "loss": 0.3727, + "step": 16582 + }, + { + "epoch": 2.21, + "grad_norm": 0.50390625, + "learning_rate": 9.888210166483251e-05, + "loss": 0.1624, + "step": 16583 + }, + { + "epoch": 2.21, + "grad_norm": 0.75390625, + "learning_rate": 9.887045738321933e-05, + "loss": 0.4935, + "step": 16584 + }, + { + "epoch": 2.21, + "grad_norm": 0.515625, + "learning_rate": 9.885881311692344e-05, + "loss": 0.4016, + "step": 16585 + }, + { + "epoch": 2.21, + "grad_norm": 0.6953125, + "learning_rate": 9.88471688661028e-05, + "loss": 0.6454, + "step": 16586 + }, + { + "epoch": 2.21, + "grad_norm": 0.6484375, + "learning_rate": 9.88355246309153e-05, + "loss": 0.2388, + "step": 16587 + }, + { + "epoch": 2.21, + "grad_norm": 0.478515625, + "learning_rate": 9.882388041151881e-05, + "loss": 0.2925, + "step": 16588 + }, + { + "epoch": 2.21, + "grad_norm": 0.462890625, + "learning_rate": 9.881223620807128e-05, + "loss": 0.3202, + "step": 16589 + }, + { + "epoch": 2.21, + "grad_norm": 0.54296875, + "learning_rate": 9.880059202073056e-05, + "loss": 0.5655, + "step": 16590 + }, + { + "epoch": 2.21, + "grad_norm": 0.47265625, + "learning_rate": 9.878894784965463e-05, + "loss": 0.2707, + "step": 16591 + }, + { + "epoch": 2.21, + "grad_norm": 0.4453125, + "learning_rate": 9.877730369500134e-05, + "loss": 0.2956, + "step": 16592 + }, + { + "epoch": 2.21, + "grad_norm": 0.53125, + "learning_rate": 9.876565955692854e-05, + "loss": 0.3343, + "step": 16593 + }, + { + "epoch": 2.21, + "grad_norm": 0.546875, + "learning_rate": 9.875401543559423e-05, + "loss": 0.193, + "step": 16594 + }, + { + "epoch": 2.21, + "grad_norm": 0.50390625, + "learning_rate": 9.874237133115626e-05, + "loss": 0.2275, + "step": 16595 + }, + { + "epoch": 2.21, + "grad_norm": 0.435546875, + "learning_rate": 9.873072724377255e-05, + "loss": 0.2003, + "step": 16596 + }, + { + "epoch": 2.21, + "grad_norm": 0.62109375, + "learning_rate": 9.871908317360098e-05, + "loss": 0.2576, + "step": 16597 + }, + { + "epoch": 2.21, + "grad_norm": 0.63671875, + "learning_rate": 9.870743912079948e-05, + "loss": 0.5619, + "step": 16598 + }, + { + "epoch": 2.21, + "grad_norm": 0.5234375, + "learning_rate": 9.869579508552592e-05, + "loss": 0.2881, + "step": 16599 + }, + { + "epoch": 2.22, + "grad_norm": 0.59765625, + "learning_rate": 9.868415106793822e-05, + "loss": 0.3156, + "step": 16600 + }, + { + "epoch": 2.22, + "grad_norm": 0.5859375, + "learning_rate": 9.867250706819427e-05, + "loss": 0.2741, + "step": 16601 + }, + { + "epoch": 2.22, + "grad_norm": 0.76953125, + "learning_rate": 9.866086308645199e-05, + "loss": 0.3797, + "step": 16602 + }, + { + "epoch": 2.22, + "grad_norm": 0.51953125, + "learning_rate": 9.86492191228693e-05, + "loss": 0.3351, + "step": 16603 + }, + { + "epoch": 2.22, + "grad_norm": 0.68359375, + "learning_rate": 9.8637575177604e-05, + "loss": 0.2433, + "step": 16604 + }, + { + "epoch": 2.22, + "grad_norm": 0.66796875, + "learning_rate": 9.862593125081409e-05, + "loss": 0.2695, + "step": 16605 + }, + { + "epoch": 2.22, + "grad_norm": 0.87890625, + "learning_rate": 9.861428734265741e-05, + "loss": 0.2651, + "step": 16606 + }, + { + "epoch": 2.22, + "grad_norm": 0.62890625, + "learning_rate": 9.860264345329193e-05, + "loss": 0.2689, + "step": 16607 + }, + { + "epoch": 2.22, + "grad_norm": 0.6640625, + "learning_rate": 9.859099958287545e-05, + "loss": 0.2292, + "step": 16608 + }, + { + "epoch": 2.22, + "grad_norm": 0.6015625, + "learning_rate": 9.857935573156591e-05, + "loss": 0.3494, + "step": 16609 + }, + { + "epoch": 2.22, + "grad_norm": 0.51171875, + "learning_rate": 9.856771189952126e-05, + "loss": 0.4295, + "step": 16610 + }, + { + "epoch": 2.22, + "grad_norm": 0.64453125, + "learning_rate": 9.855606808689933e-05, + "loss": 0.5337, + "step": 16611 + }, + { + "epoch": 2.22, + "grad_norm": 0.58203125, + "learning_rate": 9.854442429385807e-05, + "loss": 0.3123, + "step": 16612 + }, + { + "epoch": 2.22, + "grad_norm": 0.6171875, + "learning_rate": 9.853278052055532e-05, + "loss": 0.5186, + "step": 16613 + }, + { + "epoch": 2.22, + "grad_norm": 0.51171875, + "learning_rate": 9.852113676714905e-05, + "loss": 0.1725, + "step": 16614 + }, + { + "epoch": 2.22, + "grad_norm": 0.55859375, + "learning_rate": 9.850949303379711e-05, + "loss": 0.4704, + "step": 16615 + }, + { + "epoch": 2.22, + "grad_norm": 0.451171875, + "learning_rate": 9.849784932065737e-05, + "loss": 0.3146, + "step": 16616 + }, + { + "epoch": 2.22, + "grad_norm": 0.7890625, + "learning_rate": 9.848620562788778e-05, + "loss": 0.4667, + "step": 16617 + }, + { + "epoch": 2.22, + "grad_norm": 0.4765625, + "learning_rate": 9.847456195564618e-05, + "loss": 0.3375, + "step": 16618 + }, + { + "epoch": 2.22, + "grad_norm": 0.6484375, + "learning_rate": 9.846291830409053e-05, + "loss": 0.2724, + "step": 16619 + }, + { + "epoch": 2.22, + "grad_norm": 0.61328125, + "learning_rate": 9.845127467337868e-05, + "loss": 0.41, + "step": 16620 + }, + { + "epoch": 2.22, + "grad_norm": 0.56640625, + "learning_rate": 9.843963106366855e-05, + "loss": 0.2221, + "step": 16621 + }, + { + "epoch": 2.22, + "grad_norm": 0.4140625, + "learning_rate": 9.842798747511802e-05, + "loss": 0.2263, + "step": 16622 + }, + { + "epoch": 2.22, + "grad_norm": 0.4765625, + "learning_rate": 9.841634390788499e-05, + "loss": 0.312, + "step": 16623 + }, + { + "epoch": 2.22, + "grad_norm": 0.6484375, + "learning_rate": 9.840470036212733e-05, + "loss": 0.4161, + "step": 16624 + }, + { + "epoch": 2.22, + "grad_norm": 0.66796875, + "learning_rate": 9.839305683800298e-05, + "loss": 0.4726, + "step": 16625 + }, + { + "epoch": 2.22, + "grad_norm": 0.458984375, + "learning_rate": 9.83814133356698e-05, + "loss": 0.1787, + "step": 16626 + }, + { + "epoch": 2.22, + "grad_norm": 0.396484375, + "learning_rate": 9.836976985528573e-05, + "loss": 0.1653, + "step": 16627 + }, + { + "epoch": 2.22, + "grad_norm": 0.52734375, + "learning_rate": 9.83581263970086e-05, + "loss": 0.2301, + "step": 16628 + }, + { + "epoch": 2.22, + "grad_norm": 0.8125, + "learning_rate": 9.834648296099633e-05, + "loss": 0.5119, + "step": 16629 + }, + { + "epoch": 2.22, + "grad_norm": 0.51171875, + "learning_rate": 9.833483954740683e-05, + "loss": 0.1917, + "step": 16630 + }, + { + "epoch": 2.22, + "grad_norm": 0.58203125, + "learning_rate": 9.832319615639793e-05, + "loss": 0.5277, + "step": 16631 + }, + { + "epoch": 2.22, + "grad_norm": 0.73828125, + "learning_rate": 9.83115527881276e-05, + "loss": 0.6219, + "step": 16632 + }, + { + "epoch": 2.22, + "grad_norm": 0.5078125, + "learning_rate": 9.829990944275368e-05, + "loss": 0.4115, + "step": 16633 + }, + { + "epoch": 2.22, + "grad_norm": 0.54296875, + "learning_rate": 9.828826612043407e-05, + "loss": 0.3921, + "step": 16634 + }, + { + "epoch": 2.22, + "grad_norm": 0.609375, + "learning_rate": 9.82766228213267e-05, + "loss": 0.4702, + "step": 16635 + }, + { + "epoch": 2.22, + "grad_norm": 0.6484375, + "learning_rate": 9.826497954558941e-05, + "loss": 0.2855, + "step": 16636 + }, + { + "epoch": 2.22, + "grad_norm": 0.4765625, + "learning_rate": 9.825333629338013e-05, + "loss": 0.2568, + "step": 16637 + }, + { + "epoch": 2.22, + "grad_norm": 0.62890625, + "learning_rate": 9.82416930648567e-05, + "loss": 0.295, + "step": 16638 + }, + { + "epoch": 2.22, + "grad_norm": 0.49609375, + "learning_rate": 9.82300498601771e-05, + "loss": 0.3312, + "step": 16639 + }, + { + "epoch": 2.22, + "grad_norm": 0.58984375, + "learning_rate": 9.821840667949913e-05, + "loss": 0.3326, + "step": 16640 + }, + { + "epoch": 2.22, + "grad_norm": 0.578125, + "learning_rate": 9.820676352298068e-05, + "loss": 0.1733, + "step": 16641 + }, + { + "epoch": 2.22, + "grad_norm": 0.458984375, + "learning_rate": 9.81951203907797e-05, + "loss": 0.2526, + "step": 16642 + }, + { + "epoch": 2.22, + "grad_norm": 0.67578125, + "learning_rate": 9.818347728305401e-05, + "loss": 0.2026, + "step": 16643 + }, + { + "epoch": 2.22, + "grad_norm": 0.62109375, + "learning_rate": 9.817183419996158e-05, + "loss": 0.3211, + "step": 16644 + }, + { + "epoch": 2.22, + "grad_norm": 0.435546875, + "learning_rate": 9.816019114166021e-05, + "loss": 0.206, + "step": 16645 + }, + { + "epoch": 2.22, + "grad_norm": 0.734375, + "learning_rate": 9.814854810830786e-05, + "loss": 0.4826, + "step": 16646 + }, + { + "epoch": 2.22, + "grad_norm": 0.546875, + "learning_rate": 9.813690510006236e-05, + "loss": 0.3082, + "step": 16647 + }, + { + "epoch": 2.22, + "grad_norm": 0.5, + "learning_rate": 9.812526211708163e-05, + "loss": 0.3487, + "step": 16648 + }, + { + "epoch": 2.22, + "grad_norm": 0.69140625, + "learning_rate": 9.811361915952357e-05, + "loss": 0.2744, + "step": 16649 + }, + { + "epoch": 2.22, + "grad_norm": 0.5, + "learning_rate": 9.810197622754603e-05, + "loss": 0.4796, + "step": 16650 + }, + { + "epoch": 2.22, + "grad_norm": 0.6171875, + "learning_rate": 9.809033332130693e-05, + "loss": 0.2914, + "step": 16651 + }, + { + "epoch": 2.22, + "grad_norm": 0.4921875, + "learning_rate": 9.807869044096412e-05, + "loss": 0.2676, + "step": 16652 + }, + { + "epoch": 2.22, + "grad_norm": 0.484375, + "learning_rate": 9.80670475866755e-05, + "loss": 0.3496, + "step": 16653 + }, + { + "epoch": 2.22, + "grad_norm": 0.384765625, + "learning_rate": 9.805540475859893e-05, + "loss": 0.2258, + "step": 16654 + }, + { + "epoch": 2.22, + "grad_norm": 0.546875, + "learning_rate": 9.804376195689235e-05, + "loss": 0.3154, + "step": 16655 + }, + { + "epoch": 2.22, + "grad_norm": 0.63671875, + "learning_rate": 9.80321191817136e-05, + "loss": 0.4086, + "step": 16656 + }, + { + "epoch": 2.22, + "grad_norm": 0.48828125, + "learning_rate": 9.80204764332206e-05, + "loss": 0.4604, + "step": 16657 + }, + { + "epoch": 2.22, + "grad_norm": 0.44140625, + "learning_rate": 9.800883371157116e-05, + "loss": 0.2595, + "step": 16658 + }, + { + "epoch": 2.22, + "grad_norm": 0.58203125, + "learning_rate": 9.799719101692324e-05, + "loss": 0.308, + "step": 16659 + }, + { + "epoch": 2.22, + "grad_norm": 0.65234375, + "learning_rate": 9.798554834943471e-05, + "loss": 0.4718, + "step": 16660 + }, + { + "epoch": 2.22, + "grad_norm": 0.5859375, + "learning_rate": 9.797390570926341e-05, + "loss": 0.2577, + "step": 16661 + }, + { + "epoch": 2.22, + "grad_norm": 0.490234375, + "learning_rate": 9.796226309656727e-05, + "loss": 0.1619, + "step": 16662 + }, + { + "epoch": 2.22, + "grad_norm": 0.57421875, + "learning_rate": 9.795062051150414e-05, + "loss": 0.4669, + "step": 16663 + }, + { + "epoch": 2.22, + "grad_norm": 0.66015625, + "learning_rate": 9.793897795423194e-05, + "loss": 0.3346, + "step": 16664 + }, + { + "epoch": 2.22, + "grad_norm": 0.57421875, + "learning_rate": 9.792733542490852e-05, + "loss": 0.3167, + "step": 16665 + }, + { + "epoch": 2.22, + "grad_norm": 0.51171875, + "learning_rate": 9.791569292369173e-05, + "loss": 0.2443, + "step": 16666 + }, + { + "epoch": 2.22, + "grad_norm": 0.50390625, + "learning_rate": 9.790405045073951e-05, + "loss": 0.3555, + "step": 16667 + }, + { + "epoch": 2.22, + "grad_norm": 0.609375, + "learning_rate": 9.789240800620967e-05, + "loss": 0.3787, + "step": 16668 + }, + { + "epoch": 2.22, + "grad_norm": 0.48046875, + "learning_rate": 9.788076559026016e-05, + "loss": 0.2095, + "step": 16669 + }, + { + "epoch": 2.22, + "grad_norm": 0.578125, + "learning_rate": 9.786912320304882e-05, + "loss": 0.4295, + "step": 16670 + }, + { + "epoch": 2.22, + "grad_norm": 0.51953125, + "learning_rate": 9.785748084473355e-05, + "loss": 0.3078, + "step": 16671 + }, + { + "epoch": 2.22, + "grad_norm": 0.69140625, + "learning_rate": 9.784583851547218e-05, + "loss": 0.4036, + "step": 16672 + }, + { + "epoch": 2.22, + "grad_norm": 0.515625, + "learning_rate": 9.783419621542265e-05, + "loss": 0.407, + "step": 16673 + }, + { + "epoch": 2.22, + "grad_norm": 0.482421875, + "learning_rate": 9.78225539447428e-05, + "loss": 0.4301, + "step": 16674 + }, + { + "epoch": 2.23, + "grad_norm": 0.58203125, + "learning_rate": 9.781091170359054e-05, + "loss": 0.2504, + "step": 16675 + }, + { + "epoch": 2.23, + "grad_norm": 0.4375, + "learning_rate": 9.779926949212373e-05, + "loss": 0.1096, + "step": 16676 + }, + { + "epoch": 2.23, + "grad_norm": 0.5234375, + "learning_rate": 9.77876273105002e-05, + "loss": 0.2146, + "step": 16677 + }, + { + "epoch": 2.23, + "grad_norm": 0.72265625, + "learning_rate": 9.777598515887787e-05, + "loss": 0.3921, + "step": 16678 + }, + { + "epoch": 2.23, + "grad_norm": 0.51953125, + "learning_rate": 9.776434303741462e-05, + "loss": 0.4758, + "step": 16679 + }, + { + "epoch": 2.23, + "grad_norm": 0.494140625, + "learning_rate": 9.775270094626831e-05, + "loss": 0.3013, + "step": 16680 + }, + { + "epoch": 2.23, + "grad_norm": 0.56640625, + "learning_rate": 9.774105888559681e-05, + "loss": 0.4349, + "step": 16681 + }, + { + "epoch": 2.23, + "grad_norm": 0.59765625, + "learning_rate": 9.772941685555799e-05, + "loss": 0.3683, + "step": 16682 + }, + { + "epoch": 2.23, + "grad_norm": 0.578125, + "learning_rate": 9.771777485630976e-05, + "loss": 0.4303, + "step": 16683 + }, + { + "epoch": 2.23, + "grad_norm": 0.546875, + "learning_rate": 9.770613288800994e-05, + "loss": 0.3174, + "step": 16684 + }, + { + "epoch": 2.23, + "grad_norm": 0.578125, + "learning_rate": 9.769449095081646e-05, + "loss": 0.4097, + "step": 16685 + }, + { + "epoch": 2.23, + "grad_norm": 0.609375, + "learning_rate": 9.768284904488714e-05, + "loss": 0.419, + "step": 16686 + }, + { + "epoch": 2.23, + "grad_norm": 0.58984375, + "learning_rate": 9.76712071703799e-05, + "loss": 0.5181, + "step": 16687 + }, + { + "epoch": 2.23, + "grad_norm": 0.6484375, + "learning_rate": 9.76595653274526e-05, + "loss": 0.5729, + "step": 16688 + }, + { + "epoch": 2.23, + "grad_norm": 0.5078125, + "learning_rate": 9.764792351626305e-05, + "loss": 0.3222, + "step": 16689 + }, + { + "epoch": 2.23, + "grad_norm": 0.58984375, + "learning_rate": 9.763628173696919e-05, + "loss": 0.4118, + "step": 16690 + }, + { + "epoch": 2.23, + "grad_norm": 0.59375, + "learning_rate": 9.762463998972885e-05, + "loss": 0.4833, + "step": 16691 + }, + { + "epoch": 2.23, + "grad_norm": 0.7109375, + "learning_rate": 9.761299827469992e-05, + "loss": 0.4988, + "step": 16692 + }, + { + "epoch": 2.23, + "grad_norm": 0.5234375, + "learning_rate": 9.760135659204027e-05, + "loss": 0.3885, + "step": 16693 + }, + { + "epoch": 2.23, + "grad_norm": 0.67578125, + "learning_rate": 9.758971494190779e-05, + "loss": 0.3591, + "step": 16694 + }, + { + "epoch": 2.23, + "grad_norm": 0.31640625, + "learning_rate": 9.757807332446028e-05, + "loss": 0.1194, + "step": 16695 + }, + { + "epoch": 2.23, + "grad_norm": 0.58984375, + "learning_rate": 9.756643173985567e-05, + "loss": 0.3302, + "step": 16696 + }, + { + "epoch": 2.23, + "grad_norm": 0.49609375, + "learning_rate": 9.75547901882518e-05, + "loss": 0.1714, + "step": 16697 + }, + { + "epoch": 2.23, + "grad_norm": 0.6171875, + "learning_rate": 9.754314866980655e-05, + "loss": 0.4914, + "step": 16698 + }, + { + "epoch": 2.23, + "grad_norm": 0.625, + "learning_rate": 9.753150718467777e-05, + "loss": 0.2194, + "step": 16699 + }, + { + "epoch": 2.23, + "grad_norm": 0.74609375, + "learning_rate": 9.751986573302338e-05, + "loss": 0.5926, + "step": 16700 + }, + { + "epoch": 2.23, + "grad_norm": 0.671875, + "learning_rate": 9.750822431500119e-05, + "loss": 0.4903, + "step": 16701 + }, + { + "epoch": 2.23, + "grad_norm": 0.5546875, + "learning_rate": 9.749658293076903e-05, + "loss": 0.583, + "step": 16702 + }, + { + "epoch": 2.23, + "grad_norm": 0.921875, + "learning_rate": 9.748494158048486e-05, + "loss": 0.4842, + "step": 16703 + }, + { + "epoch": 2.23, + "grad_norm": 0.64453125, + "learning_rate": 9.747330026430645e-05, + "loss": 0.2342, + "step": 16704 + }, + { + "epoch": 2.23, + "grad_norm": 0.5546875, + "learning_rate": 9.746165898239176e-05, + "loss": 0.2827, + "step": 16705 + }, + { + "epoch": 2.23, + "grad_norm": 0.55859375, + "learning_rate": 9.745001773489856e-05, + "loss": 0.1634, + "step": 16706 + }, + { + "epoch": 2.23, + "grad_norm": 0.6328125, + "learning_rate": 9.743837652198476e-05, + "loss": 0.3712, + "step": 16707 + }, + { + "epoch": 2.23, + "grad_norm": 0.4765625, + "learning_rate": 9.742673534380825e-05, + "loss": 0.3178, + "step": 16708 + }, + { + "epoch": 2.23, + "grad_norm": 0.462890625, + "learning_rate": 9.741509420052683e-05, + "loss": 0.1668, + "step": 16709 + }, + { + "epoch": 2.23, + "grad_norm": 0.7109375, + "learning_rate": 9.740345309229841e-05, + "loss": 0.4662, + "step": 16710 + }, + { + "epoch": 2.23, + "grad_norm": 0.57421875, + "learning_rate": 9.739181201928081e-05, + "loss": 0.4455, + "step": 16711 + }, + { + "epoch": 2.23, + "grad_norm": 0.53515625, + "learning_rate": 9.738017098163196e-05, + "loss": 0.2004, + "step": 16712 + }, + { + "epoch": 2.23, + "grad_norm": 0.890625, + "learning_rate": 9.736852997950967e-05, + "loss": 0.3293, + "step": 16713 + }, + { + "epoch": 2.23, + "grad_norm": 0.74609375, + "learning_rate": 9.735688901307176e-05, + "loss": 0.5313, + "step": 16714 + }, + { + "epoch": 2.23, + "grad_norm": 0.69140625, + "learning_rate": 9.734524808247616e-05, + "loss": 0.4446, + "step": 16715 + }, + { + "epoch": 2.23, + "grad_norm": 0.59765625, + "learning_rate": 9.733360718788067e-05, + "loss": 0.5287, + "step": 16716 + }, + { + "epoch": 2.23, + "grad_norm": 0.64453125, + "learning_rate": 9.732196632944321e-05, + "loss": 0.4174, + "step": 16717 + }, + { + "epoch": 2.23, + "grad_norm": 0.58984375, + "learning_rate": 9.731032550732158e-05, + "loss": 0.2784, + "step": 16718 + }, + { + "epoch": 2.23, + "grad_norm": 0.373046875, + "learning_rate": 9.729868472167368e-05, + "loss": 0.1773, + "step": 16719 + }, + { + "epoch": 2.23, + "grad_norm": 0.6484375, + "learning_rate": 9.728704397265732e-05, + "loss": 0.4146, + "step": 16720 + }, + { + "epoch": 2.23, + "grad_norm": 0.6015625, + "learning_rate": 9.727540326043043e-05, + "loss": 0.4901, + "step": 16721 + }, + { + "epoch": 2.23, + "grad_norm": 0.447265625, + "learning_rate": 9.726376258515078e-05, + "loss": 0.3593, + "step": 16722 + }, + { + "epoch": 2.23, + "grad_norm": 0.59375, + "learning_rate": 9.725212194697629e-05, + "loss": 0.3823, + "step": 16723 + }, + { + "epoch": 2.23, + "grad_norm": 0.625, + "learning_rate": 9.724048134606481e-05, + "loss": 0.4362, + "step": 16724 + }, + { + "epoch": 2.23, + "grad_norm": 0.6484375, + "learning_rate": 9.722884078257414e-05, + "loss": 0.4369, + "step": 16725 + }, + { + "epoch": 2.23, + "grad_norm": 0.482421875, + "learning_rate": 9.721720025666218e-05, + "loss": 0.1965, + "step": 16726 + }, + { + "epoch": 2.23, + "grad_norm": 0.578125, + "learning_rate": 9.720555976848676e-05, + "loss": 0.3379, + "step": 16727 + }, + { + "epoch": 2.23, + "grad_norm": 0.470703125, + "learning_rate": 9.719391931820575e-05, + "loss": 0.3735, + "step": 16728 + }, + { + "epoch": 2.23, + "grad_norm": 0.5078125, + "learning_rate": 9.718227890597697e-05, + "loss": 0.2691, + "step": 16729 + }, + { + "epoch": 2.23, + "grad_norm": 0.58984375, + "learning_rate": 9.717063853195834e-05, + "loss": 0.261, + "step": 16730 + }, + { + "epoch": 2.23, + "grad_norm": 0.6484375, + "learning_rate": 9.715899819630763e-05, + "loss": 0.4636, + "step": 16731 + }, + { + "epoch": 2.23, + "grad_norm": 0.578125, + "learning_rate": 9.714735789918274e-05, + "loss": 0.2558, + "step": 16732 + }, + { + "epoch": 2.23, + "grad_norm": 0.640625, + "learning_rate": 9.713571764074152e-05, + "loss": 0.4197, + "step": 16733 + }, + { + "epoch": 2.23, + "grad_norm": 0.5, + "learning_rate": 9.712407742114178e-05, + "loss": 0.2759, + "step": 16734 + }, + { + "epoch": 2.23, + "grad_norm": 0.5, + "learning_rate": 9.711243724054142e-05, + "loss": 0.39, + "step": 16735 + }, + { + "epoch": 2.23, + "grad_norm": 0.482421875, + "learning_rate": 9.710079709909829e-05, + "loss": 0.2613, + "step": 16736 + }, + { + "epoch": 2.23, + "grad_norm": 0.7421875, + "learning_rate": 9.708915699697017e-05, + "loss": 0.3289, + "step": 16737 + }, + { + "epoch": 2.23, + "grad_norm": 0.51953125, + "learning_rate": 9.707751693431497e-05, + "loss": 0.2451, + "step": 16738 + }, + { + "epoch": 2.23, + "grad_norm": 0.37890625, + "learning_rate": 9.70658769112905e-05, + "loss": 0.0961, + "step": 16739 + }, + { + "epoch": 2.23, + "grad_norm": 0.46484375, + "learning_rate": 9.705423692805463e-05, + "loss": 0.1937, + "step": 16740 + }, + { + "epoch": 2.23, + "grad_norm": 0.67578125, + "learning_rate": 9.70425969847652e-05, + "loss": 0.4634, + "step": 16741 + }, + { + "epoch": 2.23, + "grad_norm": 0.392578125, + "learning_rate": 9.703095708158006e-05, + "loss": 0.1943, + "step": 16742 + }, + { + "epoch": 2.23, + "grad_norm": 0.7265625, + "learning_rate": 9.701931721865701e-05, + "loss": 0.4588, + "step": 16743 + }, + { + "epoch": 2.23, + "grad_norm": 0.56640625, + "learning_rate": 9.700767739615397e-05, + "loss": 0.3459, + "step": 16744 + }, + { + "epoch": 2.23, + "grad_norm": 0.68359375, + "learning_rate": 9.699603761422871e-05, + "loss": 0.1975, + "step": 16745 + }, + { + "epoch": 2.23, + "grad_norm": 0.609375, + "learning_rate": 9.698439787303915e-05, + "loss": 0.352, + "step": 16746 + }, + { + "epoch": 2.23, + "grad_norm": 0.50390625, + "learning_rate": 9.697275817274306e-05, + "loss": 0.2926, + "step": 16747 + }, + { + "epoch": 2.23, + "grad_norm": 0.5234375, + "learning_rate": 9.696111851349834e-05, + "loss": 0.1887, + "step": 16748 + }, + { + "epoch": 2.23, + "grad_norm": 0.5078125, + "learning_rate": 9.69494788954628e-05, + "loss": 0.2384, + "step": 16749 + }, + { + "epoch": 2.24, + "grad_norm": 0.47265625, + "learning_rate": 9.693783931879427e-05, + "loss": 0.1821, + "step": 16750 + }, + { + "epoch": 2.24, + "grad_norm": 0.5234375, + "learning_rate": 9.692619978365061e-05, + "loss": 0.5864, + "step": 16751 + }, + { + "epoch": 2.24, + "grad_norm": 0.52734375, + "learning_rate": 9.691456029018964e-05, + "loss": 0.1717, + "step": 16752 + }, + { + "epoch": 2.24, + "grad_norm": 0.828125, + "learning_rate": 9.690292083856923e-05, + "loss": 0.6476, + "step": 16753 + }, + { + "epoch": 2.24, + "grad_norm": 0.458984375, + "learning_rate": 9.689128142894718e-05, + "loss": 0.1826, + "step": 16754 + }, + { + "epoch": 2.24, + "grad_norm": 0.48046875, + "learning_rate": 9.687964206148135e-05, + "loss": 0.2898, + "step": 16755 + }, + { + "epoch": 2.24, + "grad_norm": 0.640625, + "learning_rate": 9.68680027363296e-05, + "loss": 0.31, + "step": 16756 + }, + { + "epoch": 2.24, + "grad_norm": 0.64453125, + "learning_rate": 9.685636345364971e-05, + "loss": 0.3485, + "step": 16757 + }, + { + "epoch": 2.24, + "grad_norm": 0.41015625, + "learning_rate": 9.684472421359958e-05, + "loss": 0.3049, + "step": 16758 + }, + { + "epoch": 2.24, + "grad_norm": 0.6640625, + "learning_rate": 9.6833085016337e-05, + "loss": 0.7947, + "step": 16759 + }, + { + "epoch": 2.24, + "grad_norm": 0.51953125, + "learning_rate": 9.682144586201983e-05, + "loss": 0.3702, + "step": 16760 + }, + { + "epoch": 2.24, + "grad_norm": 0.58203125, + "learning_rate": 9.680980675080592e-05, + "loss": 0.3225, + "step": 16761 + }, + { + "epoch": 2.24, + "grad_norm": 0.51953125, + "learning_rate": 9.679816768285304e-05, + "loss": 0.298, + "step": 16762 + }, + { + "epoch": 2.24, + "grad_norm": 0.451171875, + "learning_rate": 9.678652865831907e-05, + "loss": 0.2633, + "step": 16763 + }, + { + "epoch": 2.24, + "grad_norm": 0.67578125, + "learning_rate": 9.677488967736183e-05, + "loss": 0.4562, + "step": 16764 + }, + { + "epoch": 2.24, + "grad_norm": 0.53515625, + "learning_rate": 9.676325074013915e-05, + "loss": 0.5264, + "step": 16765 + }, + { + "epoch": 2.24, + "grad_norm": 0.63671875, + "learning_rate": 9.675161184680886e-05, + "loss": 0.3997, + "step": 16766 + }, + { + "epoch": 2.24, + "grad_norm": 0.58203125, + "learning_rate": 9.673997299752883e-05, + "loss": 0.1835, + "step": 16767 + }, + { + "epoch": 2.24, + "grad_norm": 0.60546875, + "learning_rate": 9.672833419245683e-05, + "loss": 0.2756, + "step": 16768 + }, + { + "epoch": 2.24, + "grad_norm": 0.59375, + "learning_rate": 9.671669543175073e-05, + "loss": 0.505, + "step": 16769 + }, + { + "epoch": 2.24, + "grad_norm": 0.484375, + "learning_rate": 9.670505671556835e-05, + "loss": 0.4347, + "step": 16770 + }, + { + "epoch": 2.24, + "grad_norm": 0.61328125, + "learning_rate": 9.66934180440675e-05, + "loss": 0.6318, + "step": 16771 + }, + { + "epoch": 2.24, + "grad_norm": 0.478515625, + "learning_rate": 9.668177941740604e-05, + "loss": 0.1961, + "step": 16772 + }, + { + "epoch": 2.24, + "grad_norm": 0.6640625, + "learning_rate": 9.66701408357418e-05, + "loss": 0.478, + "step": 16773 + }, + { + "epoch": 2.24, + "grad_norm": 0.56640625, + "learning_rate": 9.665850229923258e-05, + "loss": 0.2724, + "step": 16774 + }, + { + "epoch": 2.24, + "grad_norm": 0.765625, + "learning_rate": 9.664686380803618e-05, + "loss": 0.2503, + "step": 16775 + }, + { + "epoch": 2.24, + "grad_norm": 0.60546875, + "learning_rate": 9.663522536231049e-05, + "loss": 0.3567, + "step": 16776 + }, + { + "epoch": 2.24, + "grad_norm": 0.419921875, + "learning_rate": 9.662358696221328e-05, + "loss": 0.2366, + "step": 16777 + }, + { + "epoch": 2.24, + "grad_norm": 0.7265625, + "learning_rate": 9.661194860790241e-05, + "loss": 0.4862, + "step": 16778 + }, + { + "epoch": 2.24, + "grad_norm": 0.70703125, + "learning_rate": 9.660031029953566e-05, + "loss": 0.4444, + "step": 16779 + }, + { + "epoch": 2.24, + "grad_norm": 0.5234375, + "learning_rate": 9.658867203727092e-05, + "loss": 0.5928, + "step": 16780 + }, + { + "epoch": 2.24, + "grad_norm": 0.66015625, + "learning_rate": 9.657703382126597e-05, + "loss": 0.1518, + "step": 16781 + }, + { + "epoch": 2.24, + "grad_norm": 0.54296875, + "learning_rate": 9.656539565167862e-05, + "loss": 0.3442, + "step": 16782 + }, + { + "epoch": 2.24, + "grad_norm": 0.56640625, + "learning_rate": 9.655375752866674e-05, + "loss": 0.4267, + "step": 16783 + }, + { + "epoch": 2.24, + "grad_norm": 0.70703125, + "learning_rate": 9.65421194523881e-05, + "loss": 0.3201, + "step": 16784 + }, + { + "epoch": 2.24, + "grad_norm": 0.703125, + "learning_rate": 9.653048142300057e-05, + "loss": 0.3818, + "step": 16785 + }, + { + "epoch": 2.24, + "grad_norm": 0.412109375, + "learning_rate": 9.651884344066192e-05, + "loss": 0.1565, + "step": 16786 + }, + { + "epoch": 2.24, + "grad_norm": 0.703125, + "learning_rate": 9.650720550552997e-05, + "loss": 0.451, + "step": 16787 + }, + { + "epoch": 2.24, + "grad_norm": 0.388671875, + "learning_rate": 9.649556761776258e-05, + "loss": 0.2169, + "step": 16788 + }, + { + "epoch": 2.24, + "grad_norm": 0.5234375, + "learning_rate": 9.648392977751751e-05, + "loss": 0.5919, + "step": 16789 + }, + { + "epoch": 2.24, + "grad_norm": 0.58203125, + "learning_rate": 9.647229198495265e-05, + "loss": 0.2234, + "step": 16790 + }, + { + "epoch": 2.24, + "grad_norm": 0.640625, + "learning_rate": 9.646065424022574e-05, + "loss": 0.3025, + "step": 16791 + }, + { + "epoch": 2.24, + "grad_norm": 0.70703125, + "learning_rate": 9.644901654349464e-05, + "loss": 0.5231, + "step": 16792 + }, + { + "epoch": 2.24, + "grad_norm": 0.7109375, + "learning_rate": 9.643737889491715e-05, + "loss": 0.3649, + "step": 16793 + }, + { + "epoch": 2.24, + "grad_norm": 0.62890625, + "learning_rate": 9.642574129465111e-05, + "loss": 0.5595, + "step": 16794 + }, + { + "epoch": 2.24, + "grad_norm": 0.478515625, + "learning_rate": 9.641410374285428e-05, + "loss": 0.5012, + "step": 16795 + }, + { + "epoch": 2.24, + "grad_norm": 0.59375, + "learning_rate": 9.640246623968453e-05, + "loss": 0.4226, + "step": 16796 + }, + { + "epoch": 2.24, + "grad_norm": 0.52734375, + "learning_rate": 9.639082878529967e-05, + "loss": 0.3907, + "step": 16797 + }, + { + "epoch": 2.24, + "grad_norm": 0.5390625, + "learning_rate": 9.637919137985744e-05, + "loss": 0.4142, + "step": 16798 + }, + { + "epoch": 2.24, + "grad_norm": 0.5625, + "learning_rate": 9.636755402351572e-05, + "loss": 0.3424, + "step": 16799 + }, + { + "epoch": 2.24, + "grad_norm": 0.64453125, + "learning_rate": 9.635591671643229e-05, + "loss": 0.5259, + "step": 16800 + }, + { + "epoch": 2.24, + "grad_norm": 0.451171875, + "learning_rate": 9.634427945876499e-05, + "loss": 0.1635, + "step": 16801 + }, + { + "epoch": 2.24, + "grad_norm": 0.53125, + "learning_rate": 9.633264225067156e-05, + "loss": 0.1765, + "step": 16802 + }, + { + "epoch": 2.24, + "grad_norm": 0.54296875, + "learning_rate": 9.63210050923099e-05, + "loss": 0.3743, + "step": 16803 + }, + { + "epoch": 2.24, + "grad_norm": 0.73828125, + "learning_rate": 9.630936798383775e-05, + "loss": 0.4314, + "step": 16804 + }, + { + "epoch": 2.24, + "grad_norm": 0.47265625, + "learning_rate": 9.629773092541293e-05, + "loss": 0.2865, + "step": 16805 + }, + { + "epoch": 2.24, + "grad_norm": 0.57421875, + "learning_rate": 9.628609391719328e-05, + "loss": 0.2778, + "step": 16806 + }, + { + "epoch": 2.24, + "grad_norm": 0.55078125, + "learning_rate": 9.627445695933656e-05, + "loss": 0.1972, + "step": 16807 + }, + { + "epoch": 2.24, + "grad_norm": 0.55859375, + "learning_rate": 9.626282005200061e-05, + "loss": 0.4519, + "step": 16808 + }, + { + "epoch": 2.24, + "grad_norm": 0.61328125, + "learning_rate": 9.625118319534324e-05, + "loss": 0.6406, + "step": 16809 + }, + { + "epoch": 2.24, + "grad_norm": 0.58984375, + "learning_rate": 9.62395463895222e-05, + "loss": 0.3361, + "step": 16810 + }, + { + "epoch": 2.24, + "grad_norm": 0.53515625, + "learning_rate": 9.622790963469534e-05, + "loss": 0.4247, + "step": 16811 + }, + { + "epoch": 2.24, + "grad_norm": 0.6171875, + "learning_rate": 9.621627293102041e-05, + "loss": 0.418, + "step": 16812 + }, + { + "epoch": 2.24, + "grad_norm": 0.6015625, + "learning_rate": 9.62046362786553e-05, + "loss": 0.4698, + "step": 16813 + }, + { + "epoch": 2.24, + "grad_norm": 0.53515625, + "learning_rate": 9.61929996777577e-05, + "loss": 0.2193, + "step": 16814 + }, + { + "epoch": 2.24, + "grad_norm": 0.63671875, + "learning_rate": 9.618136312848551e-05, + "loss": 0.2917, + "step": 16815 + }, + { + "epoch": 2.24, + "grad_norm": 0.51171875, + "learning_rate": 9.616972663099647e-05, + "loss": 0.2972, + "step": 16816 + }, + { + "epoch": 2.24, + "grad_norm": 0.5234375, + "learning_rate": 9.615809018544841e-05, + "loss": 0.2473, + "step": 16817 + }, + { + "epoch": 2.24, + "grad_norm": 1.03125, + "learning_rate": 9.614645379199909e-05, + "loss": 0.2681, + "step": 16818 + }, + { + "epoch": 2.24, + "grad_norm": 0.45703125, + "learning_rate": 9.613481745080634e-05, + "loss": 0.2417, + "step": 16819 + }, + { + "epoch": 2.24, + "grad_norm": 0.5859375, + "learning_rate": 9.612318116202794e-05, + "loss": 0.2777, + "step": 16820 + }, + { + "epoch": 2.24, + "grad_norm": 0.5078125, + "learning_rate": 9.611154492582174e-05, + "loss": 0.2896, + "step": 16821 + }, + { + "epoch": 2.24, + "grad_norm": 0.6484375, + "learning_rate": 9.609990874234545e-05, + "loss": 0.3127, + "step": 16822 + }, + { + "epoch": 2.24, + "grad_norm": 0.58203125, + "learning_rate": 9.608827261175689e-05, + "loss": 0.5809, + "step": 16823 + }, + { + "epoch": 2.24, + "grad_norm": 0.82421875, + "learning_rate": 9.607663653421388e-05, + "loss": 0.3286, + "step": 16824 + }, + { + "epoch": 2.25, + "grad_norm": 0.609375, + "learning_rate": 9.606500050987418e-05, + "loss": 0.3786, + "step": 16825 + }, + { + "epoch": 2.25, + "grad_norm": 0.75390625, + "learning_rate": 9.605336453889561e-05, + "loss": 0.41, + "step": 16826 + }, + { + "epoch": 2.25, + "grad_norm": 0.62109375, + "learning_rate": 9.604172862143593e-05, + "loss": 0.5138, + "step": 16827 + }, + { + "epoch": 2.25, + "grad_norm": 0.59765625, + "learning_rate": 9.603009275765296e-05, + "loss": 0.2241, + "step": 16828 + }, + { + "epoch": 2.25, + "grad_norm": 0.5703125, + "learning_rate": 9.601845694770449e-05, + "loss": 0.442, + "step": 16829 + }, + { + "epoch": 2.25, + "grad_norm": 0.54296875, + "learning_rate": 9.600682119174828e-05, + "loss": 0.1997, + "step": 16830 + }, + { + "epoch": 2.25, + "grad_norm": 0.53125, + "learning_rate": 9.599518548994218e-05, + "loss": 0.5206, + "step": 16831 + }, + { + "epoch": 2.25, + "grad_norm": 0.66015625, + "learning_rate": 9.598354984244389e-05, + "loss": 0.3925, + "step": 16832 + }, + { + "epoch": 2.25, + "grad_norm": 0.478515625, + "learning_rate": 9.597191424941126e-05, + "loss": 0.1926, + "step": 16833 + }, + { + "epoch": 2.25, + "grad_norm": 0.42578125, + "learning_rate": 9.59602787110021e-05, + "loss": 0.3053, + "step": 16834 + }, + { + "epoch": 2.25, + "grad_norm": 0.59375, + "learning_rate": 9.594864322737408e-05, + "loss": 0.4264, + "step": 16835 + }, + { + "epoch": 2.25, + "grad_norm": 0.546875, + "learning_rate": 9.593700779868512e-05, + "loss": 0.3047, + "step": 16836 + }, + { + "epoch": 2.25, + "grad_norm": 0.625, + "learning_rate": 9.59253724250929e-05, + "loss": 0.3596, + "step": 16837 + }, + { + "epoch": 2.25, + "grad_norm": 0.640625, + "learning_rate": 9.591373710675526e-05, + "loss": 0.2954, + "step": 16838 + }, + { + "epoch": 2.25, + "grad_norm": 0.49609375, + "learning_rate": 9.590210184382995e-05, + "loss": 0.2835, + "step": 16839 + }, + { + "epoch": 2.25, + "grad_norm": 0.8203125, + "learning_rate": 9.58904666364748e-05, + "loss": 0.3301, + "step": 16840 + }, + { + "epoch": 2.25, + "grad_norm": 0.6484375, + "learning_rate": 9.587883148484752e-05, + "loss": 0.5067, + "step": 16841 + }, + { + "epoch": 2.25, + "grad_norm": 0.486328125, + "learning_rate": 9.586719638910597e-05, + "loss": 0.2118, + "step": 16842 + }, + { + "epoch": 2.25, + "grad_norm": 0.51953125, + "learning_rate": 9.585556134940786e-05, + "loss": 0.4181, + "step": 16843 + }, + { + "epoch": 2.25, + "grad_norm": 0.51953125, + "learning_rate": 9.5843926365911e-05, + "loss": 0.3877, + "step": 16844 + }, + { + "epoch": 2.25, + "grad_norm": 0.48828125, + "learning_rate": 9.583229143877318e-05, + "loss": 0.2841, + "step": 16845 + }, + { + "epoch": 2.25, + "grad_norm": 0.6171875, + "learning_rate": 9.582065656815218e-05, + "loss": 0.3127, + "step": 16846 + }, + { + "epoch": 2.25, + "grad_norm": 0.5234375, + "learning_rate": 9.580902175420575e-05, + "loss": 0.3614, + "step": 16847 + }, + { + "epoch": 2.25, + "grad_norm": 0.41015625, + "learning_rate": 9.579738699709166e-05, + "loss": 0.2756, + "step": 16848 + }, + { + "epoch": 2.25, + "grad_norm": 0.72265625, + "learning_rate": 9.57857522969677e-05, + "loss": 0.3939, + "step": 16849 + }, + { + "epoch": 2.25, + "grad_norm": 0.64453125, + "learning_rate": 9.577411765399163e-05, + "loss": 0.501, + "step": 16850 + }, + { + "epoch": 2.25, + "grad_norm": 0.796875, + "learning_rate": 9.576248306832127e-05, + "loss": 0.507, + "step": 16851 + }, + { + "epoch": 2.25, + "grad_norm": 0.61328125, + "learning_rate": 9.575084854011434e-05, + "loss": 0.4571, + "step": 16852 + }, + { + "epoch": 2.25, + "grad_norm": 0.72265625, + "learning_rate": 9.573921406952861e-05, + "loss": 0.2831, + "step": 16853 + }, + { + "epoch": 2.25, + "grad_norm": 0.5, + "learning_rate": 9.57275796567219e-05, + "loss": 0.4647, + "step": 16854 + }, + { + "epoch": 2.25, + "grad_norm": 0.703125, + "learning_rate": 9.571594530185193e-05, + "loss": 0.6637, + "step": 16855 + }, + { + "epoch": 2.25, + "grad_norm": 0.55078125, + "learning_rate": 9.570431100507651e-05, + "loss": 0.4655, + "step": 16856 + }, + { + "epoch": 2.25, + "grad_norm": 0.58203125, + "learning_rate": 9.569267676655339e-05, + "loss": 0.5369, + "step": 16857 + }, + { + "epoch": 2.25, + "grad_norm": 0.5390625, + "learning_rate": 9.568104258644036e-05, + "loss": 0.3208, + "step": 16858 + }, + { + "epoch": 2.25, + "grad_norm": 0.55078125, + "learning_rate": 9.566940846489515e-05, + "loss": 0.3365, + "step": 16859 + }, + { + "epoch": 2.25, + "grad_norm": 0.453125, + "learning_rate": 9.565777440207551e-05, + "loss": 0.1846, + "step": 16860 + }, + { + "epoch": 2.25, + "grad_norm": 0.64453125, + "learning_rate": 9.564614039813927e-05, + "loss": 0.4102, + "step": 16861 + }, + { + "epoch": 2.25, + "grad_norm": 0.41015625, + "learning_rate": 9.563450645324414e-05, + "loss": 0.1364, + "step": 16862 + }, + { + "epoch": 2.25, + "grad_norm": 0.53515625, + "learning_rate": 9.562287256754792e-05, + "loss": 0.468, + "step": 16863 + }, + { + "epoch": 2.25, + "grad_norm": 0.6171875, + "learning_rate": 9.561123874120832e-05, + "loss": 0.3671, + "step": 16864 + }, + { + "epoch": 2.25, + "grad_norm": 0.51953125, + "learning_rate": 9.559960497438319e-05, + "loss": 0.4428, + "step": 16865 + }, + { + "epoch": 2.25, + "grad_norm": 0.486328125, + "learning_rate": 9.558797126723021e-05, + "loss": 0.351, + "step": 16866 + }, + { + "epoch": 2.25, + "grad_norm": 0.95703125, + "learning_rate": 9.557633761990719e-05, + "loss": 0.3331, + "step": 16867 + }, + { + "epoch": 2.25, + "grad_norm": 0.57421875, + "learning_rate": 9.556470403257183e-05, + "loss": 0.3356, + "step": 16868 + }, + { + "epoch": 2.25, + "grad_norm": 0.55859375, + "learning_rate": 9.555307050538198e-05, + "loss": 0.7299, + "step": 16869 + }, + { + "epoch": 2.25, + "grad_norm": 0.484375, + "learning_rate": 9.554143703849536e-05, + "loss": 0.247, + "step": 16870 + }, + { + "epoch": 2.25, + "grad_norm": 0.6328125, + "learning_rate": 9.552980363206968e-05, + "loss": 0.3437, + "step": 16871 + }, + { + "epoch": 2.25, + "grad_norm": 0.60546875, + "learning_rate": 9.551817028626273e-05, + "loss": 0.5424, + "step": 16872 + }, + { + "epoch": 2.25, + "grad_norm": 0.5625, + "learning_rate": 9.550653700123226e-05, + "loss": 0.2795, + "step": 16873 + }, + { + "epoch": 2.25, + "grad_norm": 0.6328125, + "learning_rate": 9.549490377713606e-05, + "loss": 0.4235, + "step": 16874 + }, + { + "epoch": 2.25, + "grad_norm": 0.80859375, + "learning_rate": 9.548327061413183e-05, + "loss": 0.7269, + "step": 16875 + }, + { + "epoch": 2.25, + "grad_norm": 0.53125, + "learning_rate": 9.547163751237737e-05, + "loss": 0.3494, + "step": 16876 + }, + { + "epoch": 2.25, + "grad_norm": 0.5078125, + "learning_rate": 9.546000447203038e-05, + "loss": 0.2341, + "step": 16877 + }, + { + "epoch": 2.25, + "grad_norm": 0.53515625, + "learning_rate": 9.544837149324865e-05, + "loss": 0.2969, + "step": 16878 + }, + { + "epoch": 2.25, + "grad_norm": 0.490234375, + "learning_rate": 9.543673857618995e-05, + "loss": 0.5095, + "step": 16879 + }, + { + "epoch": 2.25, + "grad_norm": 0.5078125, + "learning_rate": 9.542510572101197e-05, + "loss": 0.1275, + "step": 16880 + }, + { + "epoch": 2.25, + "grad_norm": 0.396484375, + "learning_rate": 9.541347292787253e-05, + "loss": 0.2238, + "step": 16881 + }, + { + "epoch": 2.25, + "grad_norm": 0.55859375, + "learning_rate": 9.540184019692935e-05, + "loss": 0.5264, + "step": 16882 + }, + { + "epoch": 2.25, + "grad_norm": 0.435546875, + "learning_rate": 9.539020752834014e-05, + "loss": 0.304, + "step": 16883 + }, + { + "epoch": 2.25, + "grad_norm": 0.6171875, + "learning_rate": 9.537857492226269e-05, + "loss": 0.2931, + "step": 16884 + }, + { + "epoch": 2.25, + "grad_norm": 0.5625, + "learning_rate": 9.53669423788547e-05, + "loss": 0.3291, + "step": 16885 + }, + { + "epoch": 2.25, + "grad_norm": 0.439453125, + "learning_rate": 9.535530989827396e-05, + "loss": 0.2031, + "step": 16886 + }, + { + "epoch": 2.25, + "grad_norm": 0.34375, + "learning_rate": 9.534367748067819e-05, + "loss": 0.2028, + "step": 16887 + }, + { + "epoch": 2.25, + "grad_norm": 0.44140625, + "learning_rate": 9.533204512622517e-05, + "loss": 0.1375, + "step": 16888 + }, + { + "epoch": 2.25, + "grad_norm": 0.4609375, + "learning_rate": 9.532041283507258e-05, + "loss": 0.3224, + "step": 16889 + }, + { + "epoch": 2.25, + "grad_norm": 0.443359375, + "learning_rate": 9.530878060737822e-05, + "loss": 0.227, + "step": 16890 + }, + { + "epoch": 2.25, + "grad_norm": 0.4453125, + "learning_rate": 9.529714844329979e-05, + "loss": 0.2098, + "step": 16891 + }, + { + "epoch": 2.25, + "grad_norm": 0.5625, + "learning_rate": 9.528551634299506e-05, + "loss": 0.4483, + "step": 16892 + }, + { + "epoch": 2.25, + "grad_norm": 0.408203125, + "learning_rate": 9.527388430662174e-05, + "loss": 0.2083, + "step": 16893 + }, + { + "epoch": 2.25, + "grad_norm": 0.58984375, + "learning_rate": 9.526225233433763e-05, + "loss": 0.2829, + "step": 16894 + }, + { + "epoch": 2.25, + "grad_norm": 0.5, + "learning_rate": 9.525062042630038e-05, + "loss": 0.3664, + "step": 16895 + }, + { + "epoch": 2.25, + "grad_norm": 0.6171875, + "learning_rate": 9.523898858266775e-05, + "loss": 0.237, + "step": 16896 + }, + { + "epoch": 2.25, + "grad_norm": 0.578125, + "learning_rate": 9.52273568035975e-05, + "loss": 0.5051, + "step": 16897 + }, + { + "epoch": 2.25, + "grad_norm": 0.546875, + "learning_rate": 9.521572508924735e-05, + "loss": 0.5283, + "step": 16898 + }, + { + "epoch": 2.26, + "grad_norm": 0.56640625, + "learning_rate": 9.520409343977506e-05, + "loss": 0.1982, + "step": 16899 + }, + { + "epoch": 2.26, + "grad_norm": 0.65625, + "learning_rate": 9.519246185533832e-05, + "loss": 0.3779, + "step": 16900 + }, + { + "epoch": 2.26, + "grad_norm": 0.5078125, + "learning_rate": 9.518083033609488e-05, + "loss": 0.261, + "step": 16901 + }, + { + "epoch": 2.26, + "grad_norm": 0.52734375, + "learning_rate": 9.516919888220249e-05, + "loss": 0.2035, + "step": 16902 + }, + { + "epoch": 2.26, + "grad_norm": 0.486328125, + "learning_rate": 9.515756749381885e-05, + "loss": 0.2194, + "step": 16903 + }, + { + "epoch": 2.26, + "grad_norm": 0.52734375, + "learning_rate": 9.514593617110171e-05, + "loss": 0.3803, + "step": 16904 + }, + { + "epoch": 2.26, + "grad_norm": 0.5859375, + "learning_rate": 9.513430491420878e-05, + "loss": 0.258, + "step": 16905 + }, + { + "epoch": 2.26, + "grad_norm": 0.72265625, + "learning_rate": 9.512267372329786e-05, + "loss": 0.4249, + "step": 16906 + }, + { + "epoch": 2.26, + "grad_norm": 0.59375, + "learning_rate": 9.511104259852656e-05, + "loss": 0.3534, + "step": 16907 + }, + { + "epoch": 2.26, + "grad_norm": 0.54296875, + "learning_rate": 9.509941154005265e-05, + "loss": 0.3769, + "step": 16908 + }, + { + "epoch": 2.26, + "grad_norm": 0.4609375, + "learning_rate": 9.508778054803389e-05, + "loss": 0.1908, + "step": 16909 + }, + { + "epoch": 2.26, + "grad_norm": 0.4609375, + "learning_rate": 9.507614962262795e-05, + "loss": 0.1848, + "step": 16910 + }, + { + "epoch": 2.26, + "grad_norm": 0.68359375, + "learning_rate": 9.50645187639926e-05, + "loss": 0.3935, + "step": 16911 + }, + { + "epoch": 2.26, + "grad_norm": 0.6953125, + "learning_rate": 9.505288797228554e-05, + "loss": 0.5047, + "step": 16912 + }, + { + "epoch": 2.26, + "grad_norm": 0.5546875, + "learning_rate": 9.504125724766451e-05, + "loss": 0.2476, + "step": 16913 + }, + { + "epoch": 2.26, + "grad_norm": 0.55859375, + "learning_rate": 9.502962659028718e-05, + "loss": 0.2324, + "step": 16914 + }, + { + "epoch": 2.26, + "grad_norm": 0.55078125, + "learning_rate": 9.501799600031134e-05, + "loss": 0.3498, + "step": 16915 + }, + { + "epoch": 2.26, + "grad_norm": 0.671875, + "learning_rate": 9.500636547789463e-05, + "loss": 0.3242, + "step": 16916 + }, + { + "epoch": 2.26, + "grad_norm": 0.70703125, + "learning_rate": 9.499473502319485e-05, + "loss": 0.4343, + "step": 16917 + }, + { + "epoch": 2.26, + "grad_norm": 0.4921875, + "learning_rate": 9.498310463636965e-05, + "loss": 0.2633, + "step": 16918 + }, + { + "epoch": 2.26, + "grad_norm": 0.55078125, + "learning_rate": 9.49714743175768e-05, + "loss": 0.4726, + "step": 16919 + }, + { + "epoch": 2.26, + "grad_norm": 0.46875, + "learning_rate": 9.495984406697399e-05, + "loss": 0.2518, + "step": 16920 + }, + { + "epoch": 2.26, + "grad_norm": 0.69140625, + "learning_rate": 9.494821388471888e-05, + "loss": 0.3438, + "step": 16921 + }, + { + "epoch": 2.26, + "grad_norm": 0.578125, + "learning_rate": 9.493658377096927e-05, + "loss": 0.1958, + "step": 16922 + }, + { + "epoch": 2.26, + "grad_norm": 0.50390625, + "learning_rate": 9.492495372588282e-05, + "loss": 0.2448, + "step": 16923 + }, + { + "epoch": 2.26, + "grad_norm": 0.5859375, + "learning_rate": 9.491332374961726e-05, + "loss": 0.2097, + "step": 16924 + }, + { + "epoch": 2.26, + "grad_norm": 0.55859375, + "learning_rate": 9.490169384233028e-05, + "loss": 0.3979, + "step": 16925 + }, + { + "epoch": 2.26, + "grad_norm": 0.45703125, + "learning_rate": 9.48900640041796e-05, + "loss": 0.2567, + "step": 16926 + }, + { + "epoch": 2.26, + "grad_norm": 0.5078125, + "learning_rate": 9.487843423532295e-05, + "loss": 0.429, + "step": 16927 + }, + { + "epoch": 2.26, + "grad_norm": 0.453125, + "learning_rate": 9.4866804535918e-05, + "loss": 0.3647, + "step": 16928 + }, + { + "epoch": 2.26, + "grad_norm": 0.63671875, + "learning_rate": 9.485517490612249e-05, + "loss": 0.2017, + "step": 16929 + }, + { + "epoch": 2.26, + "grad_norm": 0.75390625, + "learning_rate": 9.48435453460941e-05, + "loss": 0.5222, + "step": 16930 + }, + { + "epoch": 2.26, + "grad_norm": 0.58984375, + "learning_rate": 9.483191585599059e-05, + "loss": 0.2922, + "step": 16931 + }, + { + "epoch": 2.26, + "grad_norm": 0.6171875, + "learning_rate": 9.482028643596959e-05, + "loss": 0.3155, + "step": 16932 + }, + { + "epoch": 2.26, + "grad_norm": 0.484375, + "learning_rate": 9.48086570861888e-05, + "loss": 0.2493, + "step": 16933 + }, + { + "epoch": 2.26, + "grad_norm": 0.47265625, + "learning_rate": 9.479702780680599e-05, + "loss": 0.301, + "step": 16934 + }, + { + "epoch": 2.26, + "grad_norm": 0.5, + "learning_rate": 9.47853985979788e-05, + "loss": 0.4299, + "step": 16935 + }, + { + "epoch": 2.26, + "grad_norm": 0.62109375, + "learning_rate": 9.477376945986496e-05, + "loss": 0.2372, + "step": 16936 + }, + { + "epoch": 2.26, + "grad_norm": 0.52734375, + "learning_rate": 9.476214039262214e-05, + "loss": 0.2909, + "step": 16937 + }, + { + "epoch": 2.26, + "grad_norm": 0.48046875, + "learning_rate": 9.475051139640809e-05, + "loss": 0.283, + "step": 16938 + }, + { + "epoch": 2.26, + "grad_norm": 0.482421875, + "learning_rate": 9.473888247138043e-05, + "loss": 0.3223, + "step": 16939 + }, + { + "epoch": 2.26, + "grad_norm": 0.515625, + "learning_rate": 9.472725361769696e-05, + "loss": 0.2832, + "step": 16940 + }, + { + "epoch": 2.26, + "grad_norm": 0.578125, + "learning_rate": 9.471562483551527e-05, + "loss": 0.1751, + "step": 16941 + }, + { + "epoch": 2.26, + "grad_norm": 0.4765625, + "learning_rate": 9.470399612499312e-05, + "loss": 0.3481, + "step": 16942 + }, + { + "epoch": 2.26, + "grad_norm": 0.62109375, + "learning_rate": 9.46923674862882e-05, + "loss": 0.2883, + "step": 16943 + }, + { + "epoch": 2.26, + "grad_norm": 0.72265625, + "learning_rate": 9.468073891955816e-05, + "loss": 0.3583, + "step": 16944 + }, + { + "epoch": 2.26, + "grad_norm": 0.66015625, + "learning_rate": 9.466911042496072e-05, + "loss": 0.4061, + "step": 16945 + }, + { + "epoch": 2.26, + "grad_norm": 0.5859375, + "learning_rate": 9.465748200265354e-05, + "loss": 0.5094, + "step": 16946 + }, + { + "epoch": 2.26, + "grad_norm": 0.70703125, + "learning_rate": 9.464585365279435e-05, + "loss": 0.4446, + "step": 16947 + }, + { + "epoch": 2.26, + "grad_norm": 0.58203125, + "learning_rate": 9.463422537554082e-05, + "loss": 0.2881, + "step": 16948 + }, + { + "epoch": 2.26, + "grad_norm": 0.41015625, + "learning_rate": 9.462259717105064e-05, + "loss": 0.225, + "step": 16949 + }, + { + "epoch": 2.26, + "grad_norm": 0.66015625, + "learning_rate": 9.461096903948147e-05, + "loss": 0.4366, + "step": 16950 + }, + { + "epoch": 2.26, + "grad_norm": 0.8046875, + "learning_rate": 9.459934098099104e-05, + "loss": 0.6165, + "step": 16951 + }, + { + "epoch": 2.26, + "grad_norm": 0.625, + "learning_rate": 9.458771299573702e-05, + "loss": 0.4431, + "step": 16952 + }, + { + "epoch": 2.26, + "grad_norm": 0.58203125, + "learning_rate": 9.457608508387706e-05, + "loss": 0.2728, + "step": 16953 + }, + { + "epoch": 2.26, + "grad_norm": 0.5390625, + "learning_rate": 9.45644572455689e-05, + "loss": 0.4135, + "step": 16954 + }, + { + "epoch": 2.26, + "grad_norm": 0.76953125, + "learning_rate": 9.455282948097021e-05, + "loss": 0.3241, + "step": 16955 + }, + { + "epoch": 2.26, + "grad_norm": 0.5546875, + "learning_rate": 9.45412017902386e-05, + "loss": 0.1877, + "step": 16956 + }, + { + "epoch": 2.26, + "grad_norm": 0.640625, + "learning_rate": 9.452957417353182e-05, + "loss": 0.392, + "step": 16957 + }, + { + "epoch": 2.26, + "grad_norm": 0.52734375, + "learning_rate": 9.45179466310075e-05, + "loss": 0.5542, + "step": 16958 + }, + { + "epoch": 2.26, + "grad_norm": 0.52734375, + "learning_rate": 9.450631916282336e-05, + "loss": 0.2116, + "step": 16959 + }, + { + "epoch": 2.26, + "grad_norm": 0.6796875, + "learning_rate": 9.449469176913704e-05, + "loss": 0.2649, + "step": 16960 + }, + { + "epoch": 2.26, + "grad_norm": 0.5234375, + "learning_rate": 9.448306445010626e-05, + "loss": 0.2698, + "step": 16961 + }, + { + "epoch": 2.26, + "grad_norm": 0.55078125, + "learning_rate": 9.447143720588864e-05, + "loss": 0.3139, + "step": 16962 + }, + { + "epoch": 2.26, + "grad_norm": 0.56640625, + "learning_rate": 9.44598100366419e-05, + "loss": 0.3848, + "step": 16963 + }, + { + "epoch": 2.26, + "grad_norm": 0.52734375, + "learning_rate": 9.444818294252368e-05, + "loss": 0.3867, + "step": 16964 + }, + { + "epoch": 2.26, + "grad_norm": 0.48046875, + "learning_rate": 9.443655592369167e-05, + "loss": 0.1711, + "step": 16965 + }, + { + "epoch": 2.26, + "grad_norm": 0.58984375, + "learning_rate": 9.442492898030352e-05, + "loss": 0.6822, + "step": 16966 + }, + { + "epoch": 2.26, + "grad_norm": 0.48828125, + "learning_rate": 9.441330211251696e-05, + "loss": 0.3436, + "step": 16967 + }, + { + "epoch": 2.26, + "grad_norm": 0.46484375, + "learning_rate": 9.440167532048957e-05, + "loss": 0.2237, + "step": 16968 + }, + { + "epoch": 2.26, + "grad_norm": 0.8125, + "learning_rate": 9.439004860437905e-05, + "loss": 0.2639, + "step": 16969 + }, + { + "epoch": 2.26, + "grad_norm": 0.7109375, + "learning_rate": 9.437842196434308e-05, + "loss": 0.4007, + "step": 16970 + }, + { + "epoch": 2.26, + "grad_norm": 0.55859375, + "learning_rate": 9.43667954005393e-05, + "loss": 0.1416, + "step": 16971 + }, + { + "epoch": 2.26, + "grad_norm": 0.78125, + "learning_rate": 9.435516891312542e-05, + "loss": 0.4984, + "step": 16972 + }, + { + "epoch": 2.26, + "grad_norm": 0.58203125, + "learning_rate": 9.434354250225903e-05, + "loss": 0.3075, + "step": 16973 + }, + { + "epoch": 2.27, + "grad_norm": 0.515625, + "learning_rate": 9.433191616809785e-05, + "loss": 0.3373, + "step": 16974 + }, + { + "epoch": 2.27, + "grad_norm": 0.71484375, + "learning_rate": 9.432028991079954e-05, + "loss": 0.45, + "step": 16975 + }, + { + "epoch": 2.27, + "grad_norm": 0.609375, + "learning_rate": 9.430866373052172e-05, + "loss": 0.3412, + "step": 16976 + }, + { + "epoch": 2.27, + "grad_norm": 0.58984375, + "learning_rate": 9.42970376274221e-05, + "loss": 0.5156, + "step": 16977 + }, + { + "epoch": 2.27, + "grad_norm": 0.60546875, + "learning_rate": 9.42854116016583e-05, + "loss": 0.1527, + "step": 16978 + }, + { + "epoch": 2.27, + "grad_norm": 0.625, + "learning_rate": 9.4273785653388e-05, + "loss": 0.3483, + "step": 16979 + }, + { + "epoch": 2.27, + "grad_norm": 0.53125, + "learning_rate": 9.426215978276884e-05, + "loss": 0.3151, + "step": 16980 + }, + { + "epoch": 2.27, + "grad_norm": 0.5, + "learning_rate": 9.425053398995845e-05, + "loss": 0.3429, + "step": 16981 + }, + { + "epoch": 2.27, + "grad_norm": 0.55078125, + "learning_rate": 9.423890827511453e-05, + "loss": 0.4643, + "step": 16982 + }, + { + "epoch": 2.27, + "grad_norm": 0.5625, + "learning_rate": 9.42272826383947e-05, + "loss": 0.229, + "step": 16983 + }, + { + "epoch": 2.27, + "grad_norm": 0.59375, + "learning_rate": 9.421565707995663e-05, + "loss": 0.3229, + "step": 16984 + }, + { + "epoch": 2.27, + "grad_norm": 0.56640625, + "learning_rate": 9.420403159995794e-05, + "loss": 0.3851, + "step": 16985 + }, + { + "epoch": 2.27, + "grad_norm": 0.65625, + "learning_rate": 9.419240619855634e-05, + "loss": 0.4568, + "step": 16986 + }, + { + "epoch": 2.27, + "grad_norm": 0.59375, + "learning_rate": 9.418078087590941e-05, + "loss": 0.4019, + "step": 16987 + }, + { + "epoch": 2.27, + "grad_norm": 0.765625, + "learning_rate": 9.416915563217483e-05, + "loss": 0.5663, + "step": 16988 + }, + { + "epoch": 2.27, + "grad_norm": 0.42578125, + "learning_rate": 9.415753046751024e-05, + "loss": 0.2136, + "step": 16989 + }, + { + "epoch": 2.27, + "grad_norm": 0.68359375, + "learning_rate": 9.414590538207331e-05, + "loss": 0.3681, + "step": 16990 + }, + { + "epoch": 2.27, + "grad_norm": 0.357421875, + "learning_rate": 9.413428037602167e-05, + "loss": 0.2416, + "step": 16991 + }, + { + "epoch": 2.27, + "grad_norm": 0.78125, + "learning_rate": 9.412265544951291e-05, + "loss": 0.2479, + "step": 16992 + }, + { + "epoch": 2.27, + "grad_norm": 0.59765625, + "learning_rate": 9.411103060270474e-05, + "loss": 0.3953, + "step": 16993 + }, + { + "epoch": 2.27, + "grad_norm": 0.62109375, + "learning_rate": 9.409940583575475e-05, + "loss": 0.5143, + "step": 16994 + }, + { + "epoch": 2.27, + "grad_norm": 0.5703125, + "learning_rate": 9.408778114882063e-05, + "loss": 0.2549, + "step": 16995 + }, + { + "epoch": 2.27, + "grad_norm": 0.72265625, + "learning_rate": 9.407615654205997e-05, + "loss": 0.4318, + "step": 16996 + }, + { + "epoch": 2.27, + "grad_norm": 0.6015625, + "learning_rate": 9.406453201563044e-05, + "loss": 0.5139, + "step": 16997 + }, + { + "epoch": 2.27, + "grad_norm": 0.54296875, + "learning_rate": 9.405290756968966e-05, + "loss": 0.3665, + "step": 16998 + }, + { + "epoch": 2.27, + "grad_norm": 0.53515625, + "learning_rate": 9.404128320439527e-05, + "loss": 0.5325, + "step": 16999 + }, + { + "epoch": 2.27, + "grad_norm": 0.427734375, + "learning_rate": 9.402965891990492e-05, + "loss": 0.4744, + "step": 17000 + }, + { + "epoch": 2.27, + "grad_norm": 0.5078125, + "learning_rate": 9.40180347163762e-05, + "loss": 0.1739, + "step": 17001 + }, + { + "epoch": 2.27, + "grad_norm": 0.53125, + "learning_rate": 9.400641059396679e-05, + "loss": 0.2136, + "step": 17002 + }, + { + "epoch": 2.27, + "grad_norm": 0.79296875, + "learning_rate": 9.399478655283428e-05, + "loss": 0.4975, + "step": 17003 + }, + { + "epoch": 2.27, + "grad_norm": 0.7734375, + "learning_rate": 9.398316259313637e-05, + "loss": 0.2779, + "step": 17004 + }, + { + "epoch": 2.27, + "grad_norm": 0.51953125, + "learning_rate": 9.39715387150306e-05, + "loss": 0.2651, + "step": 17005 + }, + { + "epoch": 2.27, + "grad_norm": 0.57421875, + "learning_rate": 9.395991491867463e-05, + "loss": 0.2862, + "step": 17006 + }, + { + "epoch": 2.27, + "grad_norm": 0.70703125, + "learning_rate": 9.39482912042261e-05, + "loss": 0.4766, + "step": 17007 + }, + { + "epoch": 2.27, + "grad_norm": 0.54296875, + "learning_rate": 9.39366675718426e-05, + "loss": 0.3662, + "step": 17008 + }, + { + "epoch": 2.27, + "grad_norm": 0.58984375, + "learning_rate": 9.392504402168181e-05, + "loss": 0.4236, + "step": 17009 + }, + { + "epoch": 2.27, + "grad_norm": 0.48046875, + "learning_rate": 9.39134205539013e-05, + "loss": 0.3507, + "step": 17010 + }, + { + "epoch": 2.27, + "grad_norm": 0.67578125, + "learning_rate": 9.390179716865873e-05, + "loss": 0.4997, + "step": 17011 + }, + { + "epoch": 2.27, + "grad_norm": 0.5859375, + "learning_rate": 9.389017386611169e-05, + "loss": 0.3844, + "step": 17012 + }, + { + "epoch": 2.27, + "grad_norm": 0.5546875, + "learning_rate": 9.387855064641782e-05, + "loss": 0.3024, + "step": 17013 + }, + { + "epoch": 2.27, + "grad_norm": 0.5234375, + "learning_rate": 9.386692750973473e-05, + "loss": 0.2977, + "step": 17014 + }, + { + "epoch": 2.27, + "grad_norm": 0.84375, + "learning_rate": 9.385530445622006e-05, + "loss": 0.2683, + "step": 17015 + }, + { + "epoch": 2.27, + "grad_norm": 0.63671875, + "learning_rate": 9.384368148603141e-05, + "loss": 0.4677, + "step": 17016 + }, + { + "epoch": 2.27, + "grad_norm": 0.7109375, + "learning_rate": 9.383205859932634e-05, + "loss": 0.3736, + "step": 17017 + }, + { + "epoch": 2.27, + "grad_norm": 0.5625, + "learning_rate": 9.382043579626257e-05, + "loss": 0.2231, + "step": 17018 + }, + { + "epoch": 2.27, + "grad_norm": 0.66796875, + "learning_rate": 9.380881307699761e-05, + "loss": 0.2178, + "step": 17019 + }, + { + "epoch": 2.27, + "grad_norm": 0.53515625, + "learning_rate": 9.379719044168915e-05, + "loss": 0.3346, + "step": 17020 + }, + { + "epoch": 2.27, + "grad_norm": 0.59765625, + "learning_rate": 9.378556789049473e-05, + "loss": 0.3066, + "step": 17021 + }, + { + "epoch": 2.27, + "grad_norm": 0.609375, + "learning_rate": 9.377394542357203e-05, + "loss": 0.2334, + "step": 17022 + }, + { + "epoch": 2.27, + "grad_norm": 0.431640625, + "learning_rate": 9.376232304107862e-05, + "loss": 0.2779, + "step": 17023 + }, + { + "epoch": 2.27, + "grad_norm": 0.51953125, + "learning_rate": 9.37507007431721e-05, + "loss": 0.3683, + "step": 17024 + }, + { + "epoch": 2.27, + "grad_norm": 0.44921875, + "learning_rate": 9.37390785300101e-05, + "loss": 0.228, + "step": 17025 + }, + { + "epoch": 2.27, + "grad_norm": 0.578125, + "learning_rate": 9.37274564017502e-05, + "loss": 0.4923, + "step": 17026 + }, + { + "epoch": 2.27, + "grad_norm": 0.7109375, + "learning_rate": 9.371583435855003e-05, + "loss": 0.3161, + "step": 17027 + }, + { + "epoch": 2.27, + "grad_norm": 0.8671875, + "learning_rate": 9.370421240056721e-05, + "loss": 0.3403, + "step": 17028 + }, + { + "epoch": 2.27, + "grad_norm": 0.5625, + "learning_rate": 9.369259052795928e-05, + "loss": 0.6288, + "step": 17029 + }, + { + "epoch": 2.27, + "grad_norm": 0.5078125, + "learning_rate": 9.368096874088388e-05, + "loss": 0.1884, + "step": 17030 + }, + { + "epoch": 2.27, + "grad_norm": 0.64453125, + "learning_rate": 9.366934703949858e-05, + "loss": 0.3039, + "step": 17031 + }, + { + "epoch": 2.27, + "grad_norm": 0.58203125, + "learning_rate": 9.3657725423961e-05, + "loss": 0.2931, + "step": 17032 + }, + { + "epoch": 2.27, + "grad_norm": 0.72265625, + "learning_rate": 9.364610389442874e-05, + "loss": 0.4608, + "step": 17033 + }, + { + "epoch": 2.27, + "grad_norm": 0.5, + "learning_rate": 9.36344824510594e-05, + "loss": 0.3924, + "step": 17034 + }, + { + "epoch": 2.27, + "grad_norm": 0.5, + "learning_rate": 9.362286109401054e-05, + "loss": 0.3393, + "step": 17035 + }, + { + "epoch": 2.27, + "grad_norm": 0.58984375, + "learning_rate": 9.36112398234398e-05, + "loss": 0.2759, + "step": 17036 + }, + { + "epoch": 2.27, + "grad_norm": 0.703125, + "learning_rate": 9.359961863950473e-05, + "loss": 0.4838, + "step": 17037 + }, + { + "epoch": 2.27, + "grad_norm": 0.369140625, + "learning_rate": 9.358799754236295e-05, + "loss": 0.1426, + "step": 17038 + }, + { + "epoch": 2.27, + "grad_norm": 0.671875, + "learning_rate": 9.357637653217202e-05, + "loss": 0.3211, + "step": 17039 + }, + { + "epoch": 2.27, + "grad_norm": 0.63671875, + "learning_rate": 9.35647556090896e-05, + "loss": 0.7519, + "step": 17040 + }, + { + "epoch": 2.27, + "grad_norm": 0.57421875, + "learning_rate": 9.355313477327319e-05, + "loss": 0.2506, + "step": 17041 + }, + { + "epoch": 2.27, + "grad_norm": 0.376953125, + "learning_rate": 9.354151402488039e-05, + "loss": 0.1343, + "step": 17042 + }, + { + "epoch": 2.27, + "grad_norm": 0.4609375, + "learning_rate": 9.352989336406883e-05, + "loss": 0.1619, + "step": 17043 + }, + { + "epoch": 2.27, + "grad_norm": 0.4921875, + "learning_rate": 9.351827279099606e-05, + "loss": 0.3302, + "step": 17044 + }, + { + "epoch": 2.27, + "grad_norm": 0.40625, + "learning_rate": 9.350665230581968e-05, + "loss": 0.2867, + "step": 17045 + }, + { + "epoch": 2.27, + "grad_norm": 0.44921875, + "learning_rate": 9.349503190869723e-05, + "loss": 0.2549, + "step": 17046 + }, + { + "epoch": 2.27, + "grad_norm": 0.46875, + "learning_rate": 9.348341159978633e-05, + "loss": 0.2314, + "step": 17047 + }, + { + "epoch": 2.27, + "grad_norm": 0.55859375, + "learning_rate": 9.347179137924458e-05, + "loss": 0.3133, + "step": 17048 + }, + { + "epoch": 2.28, + "grad_norm": 0.6015625, + "learning_rate": 9.34601712472295e-05, + "loss": 0.4209, + "step": 17049 + }, + { + "epoch": 2.28, + "grad_norm": 0.435546875, + "learning_rate": 9.344855120389872e-05, + "loss": 0.2987, + "step": 17050 + }, + { + "epoch": 2.28, + "grad_norm": 0.58984375, + "learning_rate": 9.343693124940977e-05, + "loss": 0.4111, + "step": 17051 + }, + { + "epoch": 2.28, + "grad_norm": 0.5234375, + "learning_rate": 9.342531138392028e-05, + "loss": 0.3091, + "step": 17052 + }, + { + "epoch": 2.28, + "grad_norm": 0.59375, + "learning_rate": 9.341369160758778e-05, + "loss": 0.3011, + "step": 17053 + }, + { + "epoch": 2.28, + "grad_norm": 0.361328125, + "learning_rate": 9.340207192056981e-05, + "loss": 0.1481, + "step": 17054 + }, + { + "epoch": 2.28, + "grad_norm": 0.69140625, + "learning_rate": 9.339045232302402e-05, + "loss": 0.3712, + "step": 17055 + }, + { + "epoch": 2.28, + "grad_norm": 0.65234375, + "learning_rate": 9.337883281510789e-05, + "loss": 0.3519, + "step": 17056 + }, + { + "epoch": 2.28, + "grad_norm": 0.439453125, + "learning_rate": 9.336721339697909e-05, + "loss": 0.1753, + "step": 17057 + }, + { + "epoch": 2.28, + "grad_norm": 0.5078125, + "learning_rate": 9.335559406879508e-05, + "loss": 0.2936, + "step": 17058 + }, + { + "epoch": 2.28, + "grad_norm": 0.59765625, + "learning_rate": 9.334397483071352e-05, + "loss": 0.4807, + "step": 17059 + }, + { + "epoch": 2.28, + "grad_norm": 0.39453125, + "learning_rate": 9.33323556828919e-05, + "loss": 0.2457, + "step": 17060 + }, + { + "epoch": 2.28, + "grad_norm": 0.75390625, + "learning_rate": 9.332073662548784e-05, + "loss": 0.4219, + "step": 17061 + }, + { + "epoch": 2.28, + "grad_norm": 0.6328125, + "learning_rate": 9.330911765865887e-05, + "loss": 0.5094, + "step": 17062 + }, + { + "epoch": 2.28, + "grad_norm": 0.6328125, + "learning_rate": 9.329749878256257e-05, + "loss": 0.4912, + "step": 17063 + }, + { + "epoch": 2.28, + "grad_norm": 0.55859375, + "learning_rate": 9.328587999735652e-05, + "loss": 0.172, + "step": 17064 + }, + { + "epoch": 2.28, + "grad_norm": 0.63671875, + "learning_rate": 9.32742613031982e-05, + "loss": 0.4664, + "step": 17065 + }, + { + "epoch": 2.28, + "grad_norm": 0.6796875, + "learning_rate": 9.326264270024523e-05, + "loss": 0.5642, + "step": 17066 + }, + { + "epoch": 2.28, + "grad_norm": 0.484375, + "learning_rate": 9.325102418865513e-05, + "loss": 0.3655, + "step": 17067 + }, + { + "epoch": 2.28, + "grad_norm": 0.5546875, + "learning_rate": 9.323940576858549e-05, + "loss": 0.4167, + "step": 17068 + }, + { + "epoch": 2.28, + "grad_norm": 0.64453125, + "learning_rate": 9.322778744019383e-05, + "loss": 0.4009, + "step": 17069 + }, + { + "epoch": 2.28, + "grad_norm": 0.40625, + "learning_rate": 9.321616920363772e-05, + "loss": 0.2478, + "step": 17070 + }, + { + "epoch": 2.28, + "grad_norm": 0.53515625, + "learning_rate": 9.320455105907471e-05, + "loss": 0.4761, + "step": 17071 + }, + { + "epoch": 2.28, + "grad_norm": 0.423828125, + "learning_rate": 9.319293300666234e-05, + "loss": 0.1989, + "step": 17072 + }, + { + "epoch": 2.28, + "grad_norm": 0.53125, + "learning_rate": 9.31813150465582e-05, + "loss": 0.2194, + "step": 17073 + }, + { + "epoch": 2.28, + "grad_norm": 0.5390625, + "learning_rate": 9.316969717891978e-05, + "loss": 0.4403, + "step": 17074 + }, + { + "epoch": 2.28, + "grad_norm": 0.5390625, + "learning_rate": 9.315807940390467e-05, + "loss": 0.3018, + "step": 17075 + }, + { + "epoch": 2.28, + "grad_norm": 0.53125, + "learning_rate": 9.314646172167041e-05, + "loss": 0.299, + "step": 17076 + }, + { + "epoch": 2.28, + "grad_norm": 0.734375, + "learning_rate": 9.31348441323745e-05, + "loss": 0.4601, + "step": 17077 + }, + { + "epoch": 2.28, + "grad_norm": 0.55078125, + "learning_rate": 9.31232266361745e-05, + "loss": 0.2731, + "step": 17078 + }, + { + "epoch": 2.28, + "grad_norm": 0.5, + "learning_rate": 9.311160923322796e-05, + "loss": 0.3336, + "step": 17079 + }, + { + "epoch": 2.28, + "grad_norm": 0.55078125, + "learning_rate": 9.309999192369245e-05, + "loss": 0.3579, + "step": 17080 + }, + { + "epoch": 2.28, + "grad_norm": 0.58203125, + "learning_rate": 9.308837470772543e-05, + "loss": 0.2885, + "step": 17081 + }, + { + "epoch": 2.28, + "grad_norm": 0.67578125, + "learning_rate": 9.307675758548453e-05, + "loss": 0.4366, + "step": 17082 + }, + { + "epoch": 2.28, + "grad_norm": 0.5234375, + "learning_rate": 9.306514055712721e-05, + "loss": 0.4327, + "step": 17083 + }, + { + "epoch": 2.28, + "grad_norm": 0.5, + "learning_rate": 9.305352362281105e-05, + "loss": 0.2092, + "step": 17084 + }, + { + "epoch": 2.28, + "grad_norm": 0.5859375, + "learning_rate": 9.304190678269356e-05, + "loss": 0.2738, + "step": 17085 + }, + { + "epoch": 2.28, + "grad_norm": 0.54296875, + "learning_rate": 9.303029003693229e-05, + "loss": 0.3827, + "step": 17086 + }, + { + "epoch": 2.28, + "grad_norm": 0.55078125, + "learning_rate": 9.301867338568475e-05, + "loss": 0.4435, + "step": 17087 + }, + { + "epoch": 2.28, + "grad_norm": 0.5625, + "learning_rate": 9.30070568291085e-05, + "loss": 0.5279, + "step": 17088 + }, + { + "epoch": 2.28, + "grad_norm": 0.640625, + "learning_rate": 9.299544036736105e-05, + "loss": 0.3332, + "step": 17089 + }, + { + "epoch": 2.28, + "grad_norm": 0.5859375, + "learning_rate": 9.298382400059989e-05, + "loss": 0.3475, + "step": 17090 + }, + { + "epoch": 2.28, + "grad_norm": 0.53125, + "learning_rate": 9.29722077289826e-05, + "loss": 0.5683, + "step": 17091 + }, + { + "epoch": 2.28, + "grad_norm": 0.55859375, + "learning_rate": 9.296059155266668e-05, + "loss": 0.5484, + "step": 17092 + }, + { + "epoch": 2.28, + "grad_norm": 0.5234375, + "learning_rate": 9.294897547180966e-05, + "loss": 0.3185, + "step": 17093 + }, + { + "epoch": 2.28, + "grad_norm": 0.94140625, + "learning_rate": 9.293735948656905e-05, + "loss": 0.3787, + "step": 17094 + }, + { + "epoch": 2.28, + "grad_norm": 0.66015625, + "learning_rate": 9.292574359710239e-05, + "loss": 0.5169, + "step": 17095 + }, + { + "epoch": 2.28, + "grad_norm": 0.5234375, + "learning_rate": 9.291412780356717e-05, + "loss": 0.1719, + "step": 17096 + }, + { + "epoch": 2.28, + "grad_norm": 0.59765625, + "learning_rate": 9.290251210612091e-05, + "loss": 0.3433, + "step": 17097 + }, + { + "epoch": 2.28, + "grad_norm": 0.5390625, + "learning_rate": 9.289089650492118e-05, + "loss": 0.3217, + "step": 17098 + }, + { + "epoch": 2.28, + "grad_norm": 0.56640625, + "learning_rate": 9.287928100012543e-05, + "loss": 0.4351, + "step": 17099 + }, + { + "epoch": 2.28, + "grad_norm": 0.65234375, + "learning_rate": 9.286766559189122e-05, + "loss": 0.3563, + "step": 17100 + }, + { + "epoch": 2.28, + "grad_norm": 0.66796875, + "learning_rate": 9.285605028037606e-05, + "loss": 0.5602, + "step": 17101 + }, + { + "epoch": 2.28, + "grad_norm": 0.5625, + "learning_rate": 9.28444350657374e-05, + "loss": 0.3319, + "step": 17102 + }, + { + "epoch": 2.28, + "grad_norm": 0.7421875, + "learning_rate": 9.283281994813282e-05, + "loss": 0.4469, + "step": 17103 + }, + { + "epoch": 2.28, + "grad_norm": 0.6484375, + "learning_rate": 9.282120492771977e-05, + "loss": 0.461, + "step": 17104 + }, + { + "epoch": 2.28, + "grad_norm": 0.55859375, + "learning_rate": 9.280959000465582e-05, + "loss": 0.3272, + "step": 17105 + }, + { + "epoch": 2.28, + "grad_norm": 0.51953125, + "learning_rate": 9.279797517909843e-05, + "loss": 0.2949, + "step": 17106 + }, + { + "epoch": 2.28, + "grad_norm": 0.61328125, + "learning_rate": 9.278636045120513e-05, + "loss": 0.2819, + "step": 17107 + }, + { + "epoch": 2.28, + "grad_norm": 0.5390625, + "learning_rate": 9.277474582113338e-05, + "loss": 0.2501, + "step": 17108 + }, + { + "epoch": 2.28, + "grad_norm": 0.87109375, + "learning_rate": 9.276313128904075e-05, + "loss": 0.2181, + "step": 17109 + }, + { + "epoch": 2.28, + "grad_norm": 0.7265625, + "learning_rate": 9.275151685508468e-05, + "loss": 0.1812, + "step": 17110 + }, + { + "epoch": 2.28, + "grad_norm": 0.57421875, + "learning_rate": 9.273990251942272e-05, + "loss": 0.2836, + "step": 17111 + }, + { + "epoch": 2.28, + "grad_norm": 0.453125, + "learning_rate": 9.272828828221232e-05, + "loss": 0.2148, + "step": 17112 + }, + { + "epoch": 2.28, + "grad_norm": 0.365234375, + "learning_rate": 9.271667414361102e-05, + "loss": 0.1933, + "step": 17113 + }, + { + "epoch": 2.28, + "grad_norm": 0.6328125, + "learning_rate": 9.27050601037763e-05, + "loss": 0.7367, + "step": 17114 + }, + { + "epoch": 2.28, + "grad_norm": 0.52734375, + "learning_rate": 9.26934461628656e-05, + "loss": 0.2306, + "step": 17115 + }, + { + "epoch": 2.28, + "grad_norm": 0.52734375, + "learning_rate": 9.26818323210365e-05, + "loss": 0.2882, + "step": 17116 + }, + { + "epoch": 2.28, + "grad_norm": 0.76953125, + "learning_rate": 9.267021857844642e-05, + "loss": 0.4098, + "step": 17117 + }, + { + "epoch": 2.28, + "grad_norm": 0.578125, + "learning_rate": 9.26586049352529e-05, + "loss": 0.4091, + "step": 17118 + }, + { + "epoch": 2.28, + "grad_norm": 1.0703125, + "learning_rate": 9.26469913916134e-05, + "loss": 0.3937, + "step": 17119 + }, + { + "epoch": 2.28, + "grad_norm": 0.67578125, + "learning_rate": 9.263537794768539e-05, + "loss": 0.6079, + "step": 17120 + }, + { + "epoch": 2.28, + "grad_norm": 0.51171875, + "learning_rate": 9.262376460362642e-05, + "loss": 0.4608, + "step": 17121 + }, + { + "epoch": 2.28, + "grad_norm": 0.52734375, + "learning_rate": 9.261215135959393e-05, + "loss": 0.2085, + "step": 17122 + }, + { + "epoch": 2.28, + "grad_norm": 0.7109375, + "learning_rate": 9.260053821574541e-05, + "loss": 0.3039, + "step": 17123 + }, + { + "epoch": 2.29, + "grad_norm": 0.53125, + "learning_rate": 9.258892517223831e-05, + "loss": 0.4483, + "step": 17124 + }, + { + "epoch": 2.29, + "grad_norm": 0.53125, + "learning_rate": 9.25773122292302e-05, + "loss": 0.3262, + "step": 17125 + }, + { + "epoch": 2.29, + "grad_norm": 0.5703125, + "learning_rate": 9.25656993868785e-05, + "loss": 0.1616, + "step": 17126 + }, + { + "epoch": 2.29, + "grad_norm": 0.609375, + "learning_rate": 9.255408664534064e-05, + "loss": 0.4474, + "step": 17127 + }, + { + "epoch": 2.29, + "grad_norm": 0.6875, + "learning_rate": 9.254247400477416e-05, + "loss": 0.49, + "step": 17128 + }, + { + "epoch": 2.29, + "grad_norm": 0.59375, + "learning_rate": 9.253086146533651e-05, + "loss": 0.4898, + "step": 17129 + }, + { + "epoch": 2.29, + "grad_norm": 0.63671875, + "learning_rate": 9.251924902718518e-05, + "loss": 0.5855, + "step": 17130 + }, + { + "epoch": 2.29, + "grad_norm": 0.51953125, + "learning_rate": 9.250763669047763e-05, + "loss": 0.3332, + "step": 17131 + }, + { + "epoch": 2.29, + "grad_norm": 0.49609375, + "learning_rate": 9.249602445537134e-05, + "loss": 0.2482, + "step": 17132 + }, + { + "epoch": 2.29, + "grad_norm": 0.44921875, + "learning_rate": 9.248441232202376e-05, + "loss": 0.3286, + "step": 17133 + }, + { + "epoch": 2.29, + "grad_norm": 0.474609375, + "learning_rate": 9.24728002905924e-05, + "loss": 0.1878, + "step": 17134 + }, + { + "epoch": 2.29, + "grad_norm": 0.53515625, + "learning_rate": 9.246118836123467e-05, + "loss": 0.2971, + "step": 17135 + }, + { + "epoch": 2.29, + "grad_norm": 0.609375, + "learning_rate": 9.244957653410809e-05, + "loss": 0.4088, + "step": 17136 + }, + { + "epoch": 2.29, + "grad_norm": 0.5546875, + "learning_rate": 9.243796480937011e-05, + "loss": 0.2897, + "step": 17137 + }, + { + "epoch": 2.29, + "grad_norm": 0.64453125, + "learning_rate": 9.242635318717815e-05, + "loss": 0.3979, + "step": 17138 + }, + { + "epoch": 2.29, + "grad_norm": 0.62109375, + "learning_rate": 9.241474166768972e-05, + "loss": 0.4172, + "step": 17139 + }, + { + "epoch": 2.29, + "grad_norm": 0.55078125, + "learning_rate": 9.240313025106224e-05, + "loss": 0.4233, + "step": 17140 + }, + { + "epoch": 2.29, + "grad_norm": 0.498046875, + "learning_rate": 9.239151893745322e-05, + "loss": 0.2525, + "step": 17141 + }, + { + "epoch": 2.29, + "grad_norm": 0.55859375, + "learning_rate": 9.237990772702005e-05, + "loss": 0.3275, + "step": 17142 + }, + { + "epoch": 2.29, + "grad_norm": 0.5703125, + "learning_rate": 9.236829661992023e-05, + "loss": 0.3362, + "step": 17143 + }, + { + "epoch": 2.29, + "grad_norm": 0.38671875, + "learning_rate": 9.235668561631121e-05, + "loss": 0.1968, + "step": 17144 + }, + { + "epoch": 2.29, + "grad_norm": 0.54296875, + "learning_rate": 9.234507471635043e-05, + "loss": 0.3706, + "step": 17145 + }, + { + "epoch": 2.29, + "grad_norm": 0.62109375, + "learning_rate": 9.233346392019538e-05, + "loss": 0.261, + "step": 17146 + }, + { + "epoch": 2.29, + "grad_norm": 0.466796875, + "learning_rate": 9.232185322800344e-05, + "loss": 0.3493, + "step": 17147 + }, + { + "epoch": 2.29, + "grad_norm": 0.55859375, + "learning_rate": 9.231024263993213e-05, + "loss": 0.3217, + "step": 17148 + }, + { + "epoch": 2.29, + "grad_norm": 0.482421875, + "learning_rate": 9.229863215613888e-05, + "loss": 0.1883, + "step": 17149 + }, + { + "epoch": 2.29, + "grad_norm": 0.50390625, + "learning_rate": 9.22870217767811e-05, + "loss": 0.197, + "step": 17150 + }, + { + "epoch": 2.29, + "grad_norm": 0.734375, + "learning_rate": 9.227541150201624e-05, + "loss": 0.3534, + "step": 17151 + }, + { + "epoch": 2.29, + "grad_norm": 0.515625, + "learning_rate": 9.226380133200177e-05, + "loss": 0.2724, + "step": 17152 + }, + { + "epoch": 2.29, + "grad_norm": 0.5390625, + "learning_rate": 9.225219126689511e-05, + "loss": 0.3773, + "step": 17153 + }, + { + "epoch": 2.29, + "grad_norm": 0.484375, + "learning_rate": 9.224058130685371e-05, + "loss": 0.189, + "step": 17154 + }, + { + "epoch": 2.29, + "grad_norm": 0.6953125, + "learning_rate": 9.222897145203502e-05, + "loss": 0.4307, + "step": 17155 + }, + { + "epoch": 2.29, + "grad_norm": 0.64453125, + "learning_rate": 9.221736170259645e-05, + "loss": 0.2824, + "step": 17156 + }, + { + "epoch": 2.29, + "grad_norm": 0.546875, + "learning_rate": 9.220575205869546e-05, + "loss": 0.2615, + "step": 17157 + }, + { + "epoch": 2.29, + "grad_norm": 0.412109375, + "learning_rate": 9.219414252048947e-05, + "loss": 0.207, + "step": 17158 + }, + { + "epoch": 2.29, + "grad_norm": 0.515625, + "learning_rate": 9.218253308813592e-05, + "loss": 0.2612, + "step": 17159 + }, + { + "epoch": 2.29, + "grad_norm": 0.44921875, + "learning_rate": 9.217092376179223e-05, + "loss": 0.3119, + "step": 17160 + }, + { + "epoch": 2.29, + "grad_norm": 0.51953125, + "learning_rate": 9.215931454161587e-05, + "loss": 0.3295, + "step": 17161 + }, + { + "epoch": 2.29, + "grad_norm": 0.625, + "learning_rate": 9.214770542776423e-05, + "loss": 0.4448, + "step": 17162 + }, + { + "epoch": 2.29, + "grad_norm": 0.640625, + "learning_rate": 9.213609642039471e-05, + "loss": 0.478, + "step": 17163 + }, + { + "epoch": 2.29, + "grad_norm": 0.5546875, + "learning_rate": 9.21244875196648e-05, + "loss": 0.2754, + "step": 17164 + }, + { + "epoch": 2.29, + "grad_norm": 0.37890625, + "learning_rate": 9.211287872573187e-05, + "loss": 0.1901, + "step": 17165 + }, + { + "epoch": 2.29, + "grad_norm": 0.46875, + "learning_rate": 9.210127003875338e-05, + "loss": 0.1403, + "step": 17166 + }, + { + "epoch": 2.29, + "grad_norm": 0.5703125, + "learning_rate": 9.208966145888672e-05, + "loss": 0.4573, + "step": 17167 + }, + { + "epoch": 2.29, + "grad_norm": 0.50390625, + "learning_rate": 9.207805298628936e-05, + "loss": 0.1722, + "step": 17168 + }, + { + "epoch": 2.29, + "grad_norm": 0.6015625, + "learning_rate": 9.206644462111866e-05, + "loss": 0.3091, + "step": 17169 + }, + { + "epoch": 2.29, + "grad_norm": 0.63671875, + "learning_rate": 9.205483636353204e-05, + "loss": 0.3764, + "step": 17170 + }, + { + "epoch": 2.29, + "grad_norm": 0.5625, + "learning_rate": 9.204322821368699e-05, + "loss": 0.5909, + "step": 17171 + }, + { + "epoch": 2.29, + "grad_norm": 0.55859375, + "learning_rate": 9.203162017174083e-05, + "loss": 0.3473, + "step": 17172 + }, + { + "epoch": 2.29, + "grad_norm": 0.62890625, + "learning_rate": 9.202001223785108e-05, + "loss": 0.4222, + "step": 17173 + }, + { + "epoch": 2.29, + "grad_norm": 0.4296875, + "learning_rate": 9.200840441217504e-05, + "loss": 0.2142, + "step": 17174 + }, + { + "epoch": 2.29, + "grad_norm": 0.46484375, + "learning_rate": 9.199679669487016e-05, + "loss": 0.3464, + "step": 17175 + }, + { + "epoch": 2.29, + "grad_norm": 0.48046875, + "learning_rate": 9.198518908609386e-05, + "loss": 0.2989, + "step": 17176 + }, + { + "epoch": 2.29, + "grad_norm": 0.490234375, + "learning_rate": 9.197358158600353e-05, + "loss": 0.2294, + "step": 17177 + }, + { + "epoch": 2.29, + "grad_norm": 0.5625, + "learning_rate": 9.196197419475661e-05, + "loss": 0.231, + "step": 17178 + }, + { + "epoch": 2.29, + "grad_norm": 0.5703125, + "learning_rate": 9.195036691251044e-05, + "loss": 0.5782, + "step": 17179 + }, + { + "epoch": 2.29, + "grad_norm": 0.5859375, + "learning_rate": 9.19387597394225e-05, + "loss": 0.3317, + "step": 17180 + }, + { + "epoch": 2.29, + "grad_norm": 0.52734375, + "learning_rate": 9.192715267565012e-05, + "loss": 0.3834, + "step": 17181 + }, + { + "epoch": 2.29, + "grad_norm": 0.6640625, + "learning_rate": 9.191554572135075e-05, + "loss": 0.3227, + "step": 17182 + }, + { + "epoch": 2.29, + "grad_norm": 0.578125, + "learning_rate": 9.190393887668175e-05, + "loss": 0.1194, + "step": 17183 + }, + { + "epoch": 2.29, + "grad_norm": 0.3828125, + "learning_rate": 9.189233214180056e-05, + "loss": 0.1273, + "step": 17184 + }, + { + "epoch": 2.29, + "grad_norm": 0.51171875, + "learning_rate": 9.188072551686453e-05, + "loss": 0.6024, + "step": 17185 + }, + { + "epoch": 2.29, + "grad_norm": 0.455078125, + "learning_rate": 9.186911900203111e-05, + "loss": 0.4465, + "step": 17186 + }, + { + "epoch": 2.29, + "grad_norm": 0.859375, + "learning_rate": 9.185751259745764e-05, + "loss": 0.2158, + "step": 17187 + }, + { + "epoch": 2.29, + "grad_norm": 0.9140625, + "learning_rate": 9.18459063033015e-05, + "loss": 0.3536, + "step": 17188 + }, + { + "epoch": 2.29, + "grad_norm": 0.71484375, + "learning_rate": 9.183430011972012e-05, + "loss": 0.2863, + "step": 17189 + }, + { + "epoch": 2.29, + "grad_norm": 0.423828125, + "learning_rate": 9.182269404687087e-05, + "loss": 0.1345, + "step": 17190 + }, + { + "epoch": 2.29, + "grad_norm": 0.4921875, + "learning_rate": 9.181108808491113e-05, + "loss": 0.3784, + "step": 17191 + }, + { + "epoch": 2.29, + "grad_norm": 0.53515625, + "learning_rate": 9.179948223399828e-05, + "loss": 0.4482, + "step": 17192 + }, + { + "epoch": 2.29, + "grad_norm": 0.37109375, + "learning_rate": 9.178787649428971e-05, + "loss": 0.1591, + "step": 17193 + }, + { + "epoch": 2.29, + "grad_norm": 0.62109375, + "learning_rate": 9.177627086594283e-05, + "loss": 0.2879, + "step": 17194 + }, + { + "epoch": 2.29, + "grad_norm": 0.54296875, + "learning_rate": 9.176466534911498e-05, + "loss": 0.2929, + "step": 17195 + }, + { + "epoch": 2.29, + "grad_norm": 0.5390625, + "learning_rate": 9.175305994396358e-05, + "loss": 0.2762, + "step": 17196 + }, + { + "epoch": 2.29, + "grad_norm": 0.671875, + "learning_rate": 9.174145465064594e-05, + "loss": 0.6648, + "step": 17197 + }, + { + "epoch": 2.29, + "grad_norm": 0.55859375, + "learning_rate": 9.172984946931954e-05, + "loss": 0.3885, + "step": 17198 + }, + { + "epoch": 2.3, + "grad_norm": 0.61328125, + "learning_rate": 9.171824440014165e-05, + "loss": 0.3798, + "step": 17199 + }, + { + "epoch": 2.3, + "grad_norm": 0.60546875, + "learning_rate": 9.170663944326968e-05, + "loss": 0.2842, + "step": 17200 + }, + { + "epoch": 2.3, + "grad_norm": 0.443359375, + "learning_rate": 9.169503459886099e-05, + "loss": 0.1797, + "step": 17201 + }, + { + "epoch": 2.3, + "grad_norm": 0.5859375, + "learning_rate": 9.168342986707296e-05, + "loss": 0.2356, + "step": 17202 + }, + { + "epoch": 2.3, + "grad_norm": 0.478515625, + "learning_rate": 9.167182524806298e-05, + "loss": 0.2314, + "step": 17203 + }, + { + "epoch": 2.3, + "grad_norm": 0.58203125, + "learning_rate": 9.166022074198837e-05, + "loss": 0.5582, + "step": 17204 + }, + { + "epoch": 2.3, + "grad_norm": 0.6640625, + "learning_rate": 9.164861634900654e-05, + "loss": 0.3622, + "step": 17205 + }, + { + "epoch": 2.3, + "grad_norm": 0.578125, + "learning_rate": 9.163701206927482e-05, + "loss": 0.506, + "step": 17206 + }, + { + "epoch": 2.3, + "grad_norm": 0.455078125, + "learning_rate": 9.162540790295058e-05, + "loss": 0.1973, + "step": 17207 + }, + { + "epoch": 2.3, + "grad_norm": 0.5625, + "learning_rate": 9.16138038501912e-05, + "loss": 0.3444, + "step": 17208 + }, + { + "epoch": 2.3, + "grad_norm": 0.498046875, + "learning_rate": 9.160219991115402e-05, + "loss": 0.1612, + "step": 17209 + }, + { + "epoch": 2.3, + "grad_norm": 0.5078125, + "learning_rate": 9.159059608599642e-05, + "loss": 0.4918, + "step": 17210 + }, + { + "epoch": 2.3, + "grad_norm": 0.546875, + "learning_rate": 9.15789923748757e-05, + "loss": 0.3469, + "step": 17211 + }, + { + "epoch": 2.3, + "grad_norm": 0.53125, + "learning_rate": 9.156738877794926e-05, + "loss": 0.4836, + "step": 17212 + }, + { + "epoch": 2.3, + "grad_norm": 0.6953125, + "learning_rate": 9.155578529537442e-05, + "loss": 0.3241, + "step": 17213 + }, + { + "epoch": 2.3, + "grad_norm": 0.53125, + "learning_rate": 9.154418192730858e-05, + "loss": 0.3455, + "step": 17214 + }, + { + "epoch": 2.3, + "grad_norm": 0.474609375, + "learning_rate": 9.153257867390903e-05, + "loss": 0.2143, + "step": 17215 + }, + { + "epoch": 2.3, + "grad_norm": 0.60546875, + "learning_rate": 9.152097553533317e-05, + "loss": 0.3002, + "step": 17216 + }, + { + "epoch": 2.3, + "grad_norm": 0.49609375, + "learning_rate": 9.15093725117383e-05, + "loss": 0.2867, + "step": 17217 + }, + { + "epoch": 2.3, + "grad_norm": 0.55078125, + "learning_rate": 9.14977696032818e-05, + "loss": 0.3832, + "step": 17218 + }, + { + "epoch": 2.3, + "grad_norm": 0.40234375, + "learning_rate": 9.148616681012101e-05, + "loss": 0.2061, + "step": 17219 + }, + { + "epoch": 2.3, + "grad_norm": 0.65234375, + "learning_rate": 9.147456413241324e-05, + "loss": 0.3516, + "step": 17220 + }, + { + "epoch": 2.3, + "grad_norm": 0.67578125, + "learning_rate": 9.146296157031588e-05, + "loss": 0.3847, + "step": 17221 + }, + { + "epoch": 2.3, + "grad_norm": 0.65625, + "learning_rate": 9.145135912398627e-05, + "loss": 0.5175, + "step": 17222 + }, + { + "epoch": 2.3, + "grad_norm": 0.625, + "learning_rate": 9.143975679358166e-05, + "loss": 0.2753, + "step": 17223 + }, + { + "epoch": 2.3, + "grad_norm": 0.51171875, + "learning_rate": 9.142815457925947e-05, + "loss": 0.4742, + "step": 17224 + }, + { + "epoch": 2.3, + "grad_norm": 0.57421875, + "learning_rate": 9.141655248117698e-05, + "loss": 0.1993, + "step": 17225 + }, + { + "epoch": 2.3, + "grad_norm": 0.71875, + "learning_rate": 9.140495049949157e-05, + "loss": 0.2897, + "step": 17226 + }, + { + "epoch": 2.3, + "grad_norm": 0.498046875, + "learning_rate": 9.139334863436053e-05, + "loss": 0.2556, + "step": 17227 + }, + { + "epoch": 2.3, + "grad_norm": 0.6953125, + "learning_rate": 9.138174688594122e-05, + "loss": 0.209, + "step": 17228 + }, + { + "epoch": 2.3, + "grad_norm": 0.5390625, + "learning_rate": 9.137014525439094e-05, + "loss": 0.2081, + "step": 17229 + }, + { + "epoch": 2.3, + "grad_norm": 0.6875, + "learning_rate": 9.135854373986704e-05, + "loss": 0.4541, + "step": 17230 + }, + { + "epoch": 2.3, + "grad_norm": 0.4296875, + "learning_rate": 9.134694234252682e-05, + "loss": 0.2736, + "step": 17231 + }, + { + "epoch": 2.3, + "grad_norm": 0.42578125, + "learning_rate": 9.133534106252763e-05, + "loss": 0.2508, + "step": 17232 + }, + { + "epoch": 2.3, + "grad_norm": 0.81640625, + "learning_rate": 9.132373990002677e-05, + "loss": 0.7636, + "step": 17233 + }, + { + "epoch": 2.3, + "grad_norm": 0.5390625, + "learning_rate": 9.13121388551816e-05, + "loss": 0.5683, + "step": 17234 + }, + { + "epoch": 2.3, + "grad_norm": 0.416015625, + "learning_rate": 9.130053792814938e-05, + "loss": 0.1965, + "step": 17235 + }, + { + "epoch": 2.3, + "grad_norm": 0.51171875, + "learning_rate": 9.128893711908742e-05, + "loss": 0.2607, + "step": 17236 + }, + { + "epoch": 2.3, + "grad_norm": 0.400390625, + "learning_rate": 9.127733642815309e-05, + "loss": 0.1159, + "step": 17237 + }, + { + "epoch": 2.3, + "grad_norm": 0.53125, + "learning_rate": 9.126573585550365e-05, + "loss": 0.3787, + "step": 17238 + }, + { + "epoch": 2.3, + "grad_norm": 0.56640625, + "learning_rate": 9.125413540129648e-05, + "loss": 0.3728, + "step": 17239 + }, + { + "epoch": 2.3, + "grad_norm": 0.7109375, + "learning_rate": 9.124253506568881e-05, + "loss": 0.437, + "step": 17240 + }, + { + "epoch": 2.3, + "grad_norm": 0.455078125, + "learning_rate": 9.1230934848838e-05, + "loss": 0.2178, + "step": 17241 + }, + { + "epoch": 2.3, + "grad_norm": 0.486328125, + "learning_rate": 9.121933475090134e-05, + "loss": 0.4814, + "step": 17242 + }, + { + "epoch": 2.3, + "grad_norm": 0.5, + "learning_rate": 9.120773477203611e-05, + "loss": 0.406, + "step": 17243 + }, + { + "epoch": 2.3, + "grad_norm": 0.59375, + "learning_rate": 9.119613491239968e-05, + "loss": 0.3973, + "step": 17244 + }, + { + "epoch": 2.3, + "grad_norm": 0.5703125, + "learning_rate": 9.118453517214927e-05, + "loss": 0.3191, + "step": 17245 + }, + { + "epoch": 2.3, + "grad_norm": 0.5859375, + "learning_rate": 9.117293555144228e-05, + "loss": 0.3623, + "step": 17246 + }, + { + "epoch": 2.3, + "grad_norm": 0.6015625, + "learning_rate": 9.116133605043594e-05, + "loss": 0.3922, + "step": 17247 + }, + { + "epoch": 2.3, + "grad_norm": 0.466796875, + "learning_rate": 9.11497366692875e-05, + "loss": 0.2082, + "step": 17248 + }, + { + "epoch": 2.3, + "grad_norm": 0.70703125, + "learning_rate": 9.113813740815436e-05, + "loss": 0.5588, + "step": 17249 + }, + { + "epoch": 2.3, + "grad_norm": 0.55078125, + "learning_rate": 9.112653826719372e-05, + "loss": 0.3032, + "step": 17250 + }, + { + "epoch": 2.3, + "grad_norm": 0.66796875, + "learning_rate": 9.111493924656297e-05, + "loss": 0.4314, + "step": 17251 + }, + { + "epoch": 2.3, + "grad_norm": 0.62890625, + "learning_rate": 9.11033403464193e-05, + "loss": 0.2321, + "step": 17252 + }, + { + "epoch": 2.3, + "grad_norm": 0.73046875, + "learning_rate": 9.109174156692008e-05, + "loss": 0.2288, + "step": 17253 + }, + { + "epoch": 2.3, + "grad_norm": 0.6171875, + "learning_rate": 9.108014290822253e-05, + "loss": 0.3353, + "step": 17254 + }, + { + "epoch": 2.3, + "grad_norm": 0.64453125, + "learning_rate": 9.106854437048398e-05, + "loss": 0.5729, + "step": 17255 + }, + { + "epoch": 2.3, + "grad_norm": 0.447265625, + "learning_rate": 9.10569459538617e-05, + "loss": 0.1605, + "step": 17256 + }, + { + "epoch": 2.3, + "grad_norm": 0.94140625, + "learning_rate": 9.104534765851299e-05, + "loss": 0.4584, + "step": 17257 + }, + { + "epoch": 2.3, + "grad_norm": 0.6875, + "learning_rate": 9.103374948459512e-05, + "loss": 0.4785, + "step": 17258 + }, + { + "epoch": 2.3, + "grad_norm": 0.68359375, + "learning_rate": 9.102215143226534e-05, + "loss": 0.3487, + "step": 17259 + }, + { + "epoch": 2.3, + "grad_norm": 0.6484375, + "learning_rate": 9.101055350168096e-05, + "loss": 0.6016, + "step": 17260 + }, + { + "epoch": 2.3, + "grad_norm": 0.671875, + "learning_rate": 9.099895569299921e-05, + "loss": 0.6655, + "step": 17261 + }, + { + "epoch": 2.3, + "grad_norm": 0.58203125, + "learning_rate": 9.098735800637742e-05, + "loss": 0.2983, + "step": 17262 + }, + { + "epoch": 2.3, + "grad_norm": 0.6640625, + "learning_rate": 9.097576044197283e-05, + "loss": 0.3653, + "step": 17263 + }, + { + "epoch": 2.3, + "grad_norm": 0.58203125, + "learning_rate": 9.096416299994273e-05, + "loss": 0.2003, + "step": 17264 + }, + { + "epoch": 2.3, + "grad_norm": 0.421875, + "learning_rate": 9.095256568044437e-05, + "loss": 0.1595, + "step": 17265 + }, + { + "epoch": 2.3, + "grad_norm": 0.63671875, + "learning_rate": 9.094096848363502e-05, + "loss": 0.2934, + "step": 17266 + }, + { + "epoch": 2.3, + "grad_norm": 0.6328125, + "learning_rate": 9.092937140967198e-05, + "loss": 0.2818, + "step": 17267 + }, + { + "epoch": 2.3, + "grad_norm": 0.73046875, + "learning_rate": 9.091777445871246e-05, + "loss": 0.5446, + "step": 17268 + }, + { + "epoch": 2.3, + "grad_norm": 0.51953125, + "learning_rate": 9.090617763091378e-05, + "loss": 0.384, + "step": 17269 + }, + { + "epoch": 2.3, + "grad_norm": 0.322265625, + "learning_rate": 9.089458092643311e-05, + "loss": 0.131, + "step": 17270 + }, + { + "epoch": 2.3, + "grad_norm": 0.65625, + "learning_rate": 9.088298434542784e-05, + "loss": 0.192, + "step": 17271 + }, + { + "epoch": 2.3, + "grad_norm": 0.59765625, + "learning_rate": 9.087138788805512e-05, + "loss": 0.3638, + "step": 17272 + }, + { + "epoch": 2.3, + "grad_norm": 0.52734375, + "learning_rate": 9.085979155447223e-05, + "loss": 0.1862, + "step": 17273 + }, + { + "epoch": 2.31, + "grad_norm": 0.6953125, + "learning_rate": 9.084819534483644e-05, + "loss": 0.4661, + "step": 17274 + }, + { + "epoch": 2.31, + "grad_norm": 0.62109375, + "learning_rate": 9.083659925930499e-05, + "loss": 0.3975, + "step": 17275 + }, + { + "epoch": 2.31, + "grad_norm": 0.7421875, + "learning_rate": 9.082500329803514e-05, + "loss": 0.3146, + "step": 17276 + }, + { + "epoch": 2.31, + "grad_norm": 0.6640625, + "learning_rate": 9.081340746118412e-05, + "loss": 0.3413, + "step": 17277 + }, + { + "epoch": 2.31, + "grad_norm": 0.47265625, + "learning_rate": 9.080181174890921e-05, + "loss": 0.3353, + "step": 17278 + }, + { + "epoch": 2.31, + "grad_norm": 0.6328125, + "learning_rate": 9.079021616136762e-05, + "loss": 0.2027, + "step": 17279 + }, + { + "epoch": 2.31, + "grad_norm": 0.62890625, + "learning_rate": 9.077862069871662e-05, + "loss": 0.2396, + "step": 17280 + }, + { + "epoch": 2.31, + "grad_norm": 0.66015625, + "learning_rate": 9.076702536111342e-05, + "loss": 0.3297, + "step": 17281 + }, + { + "epoch": 2.31, + "grad_norm": 0.5234375, + "learning_rate": 9.075543014871532e-05, + "loss": 0.2619, + "step": 17282 + }, + { + "epoch": 2.31, + "grad_norm": 0.6953125, + "learning_rate": 9.074383506167952e-05, + "loss": 0.6694, + "step": 17283 + }, + { + "epoch": 2.31, + "grad_norm": 0.69140625, + "learning_rate": 9.073224010016323e-05, + "loss": 0.4985, + "step": 17284 + }, + { + "epoch": 2.31, + "grad_norm": 0.6484375, + "learning_rate": 9.072064526432373e-05, + "loss": 0.4758, + "step": 17285 + }, + { + "epoch": 2.31, + "grad_norm": 0.44140625, + "learning_rate": 9.070905055431822e-05, + "loss": 0.1888, + "step": 17286 + }, + { + "epoch": 2.31, + "grad_norm": 0.5625, + "learning_rate": 9.069745597030396e-05, + "loss": 0.3085, + "step": 17287 + }, + { + "epoch": 2.31, + "grad_norm": 0.458984375, + "learning_rate": 9.068586151243815e-05, + "loss": 0.4131, + "step": 17288 + }, + { + "epoch": 2.31, + "grad_norm": 0.5234375, + "learning_rate": 9.067426718087805e-05, + "loss": 0.4724, + "step": 17289 + }, + { + "epoch": 2.31, + "grad_norm": 0.66796875, + "learning_rate": 9.066267297578086e-05, + "loss": 0.2894, + "step": 17290 + }, + { + "epoch": 2.31, + "grad_norm": 0.75, + "learning_rate": 9.065107889730381e-05, + "loss": 0.3493, + "step": 17291 + }, + { + "epoch": 2.31, + "grad_norm": 0.4921875, + "learning_rate": 9.063948494560416e-05, + "loss": 0.2589, + "step": 17292 + }, + { + "epoch": 2.31, + "grad_norm": 0.498046875, + "learning_rate": 9.062789112083906e-05, + "loss": 0.3315, + "step": 17293 + }, + { + "epoch": 2.31, + "grad_norm": 0.76953125, + "learning_rate": 9.061629742316582e-05, + "loss": 0.3644, + "step": 17294 + }, + { + "epoch": 2.31, + "grad_norm": 0.796875, + "learning_rate": 9.060470385274162e-05, + "loss": 0.6167, + "step": 17295 + }, + { + "epoch": 2.31, + "grad_norm": 0.7109375, + "learning_rate": 9.059311040972362e-05, + "loss": 0.5564, + "step": 17296 + }, + { + "epoch": 2.31, + "grad_norm": 0.373046875, + "learning_rate": 9.05815170942691e-05, + "loss": 0.1511, + "step": 17297 + }, + { + "epoch": 2.31, + "grad_norm": 0.41015625, + "learning_rate": 9.056992390653524e-05, + "loss": 0.3008, + "step": 17298 + }, + { + "epoch": 2.31, + "grad_norm": 0.56640625, + "learning_rate": 9.055833084667929e-05, + "loss": 0.3939, + "step": 17299 + }, + { + "epoch": 2.31, + "grad_norm": 0.5390625, + "learning_rate": 9.054673791485839e-05, + "loss": 0.4856, + "step": 17300 + }, + { + "epoch": 2.31, + "grad_norm": 0.75390625, + "learning_rate": 9.053514511122982e-05, + "loss": 0.3075, + "step": 17301 + }, + { + "epoch": 2.31, + "grad_norm": 0.6015625, + "learning_rate": 9.052355243595074e-05, + "loss": 0.4706, + "step": 17302 + }, + { + "epoch": 2.31, + "grad_norm": 0.546875, + "learning_rate": 9.05119598891784e-05, + "loss": 0.4702, + "step": 17303 + }, + { + "epoch": 2.31, + "grad_norm": 0.51171875, + "learning_rate": 9.050036747106993e-05, + "loss": 0.3036, + "step": 17304 + }, + { + "epoch": 2.31, + "grad_norm": 0.578125, + "learning_rate": 9.048877518178262e-05, + "loss": 0.308, + "step": 17305 + }, + { + "epoch": 2.31, + "grad_norm": 0.5078125, + "learning_rate": 9.04771830214736e-05, + "loss": 0.2145, + "step": 17306 + }, + { + "epoch": 2.31, + "grad_norm": 0.48046875, + "learning_rate": 9.046559099030012e-05, + "loss": 0.3841, + "step": 17307 + }, + { + "epoch": 2.31, + "grad_norm": 0.578125, + "learning_rate": 9.045399908841932e-05, + "loss": 0.28, + "step": 17308 + }, + { + "epoch": 2.31, + "grad_norm": 0.609375, + "learning_rate": 9.04424073159884e-05, + "loss": 0.4695, + "step": 17309 + }, + { + "epoch": 2.31, + "grad_norm": 0.48046875, + "learning_rate": 9.04308156731646e-05, + "loss": 0.2573, + "step": 17310 + }, + { + "epoch": 2.31, + "grad_norm": 0.83203125, + "learning_rate": 9.041922416010506e-05, + "loss": 0.2839, + "step": 17311 + }, + { + "epoch": 2.31, + "grad_norm": 0.474609375, + "learning_rate": 9.0407632776967e-05, + "loss": 0.3556, + "step": 17312 + }, + { + "epoch": 2.31, + "grad_norm": 0.6328125, + "learning_rate": 9.03960415239076e-05, + "loss": 0.5662, + "step": 17313 + }, + { + "epoch": 2.31, + "grad_norm": 0.7421875, + "learning_rate": 9.038445040108403e-05, + "loss": 0.5026, + "step": 17314 + }, + { + "epoch": 2.31, + "grad_norm": 0.79296875, + "learning_rate": 9.037285940865348e-05, + "loss": 0.6238, + "step": 17315 + }, + { + "epoch": 2.31, + "grad_norm": 0.63671875, + "learning_rate": 9.036126854677313e-05, + "loss": 0.2997, + "step": 17316 + }, + { + "epoch": 2.31, + "grad_norm": 0.47265625, + "learning_rate": 9.034967781560018e-05, + "loss": 0.3001, + "step": 17317 + }, + { + "epoch": 2.31, + "grad_norm": 0.4921875, + "learning_rate": 9.03380872152918e-05, + "loss": 0.2497, + "step": 17318 + }, + { + "epoch": 2.31, + "grad_norm": 0.57421875, + "learning_rate": 9.032649674600516e-05, + "loss": 0.394, + "step": 17319 + }, + { + "epoch": 2.31, + "grad_norm": 0.7734375, + "learning_rate": 9.031490640789743e-05, + "loss": 0.6328, + "step": 17320 + }, + { + "epoch": 2.31, + "grad_norm": 0.5, + "learning_rate": 9.030331620112578e-05, + "loss": 0.3252, + "step": 17321 + }, + { + "epoch": 2.31, + "grad_norm": 0.486328125, + "learning_rate": 9.029172612584738e-05, + "loss": 0.3701, + "step": 17322 + }, + { + "epoch": 2.31, + "grad_norm": 0.45703125, + "learning_rate": 9.02801361822194e-05, + "loss": 0.1285, + "step": 17323 + }, + { + "epoch": 2.31, + "grad_norm": 0.458984375, + "learning_rate": 9.026854637039902e-05, + "loss": 0.1695, + "step": 17324 + }, + { + "epoch": 2.31, + "grad_norm": 0.47265625, + "learning_rate": 9.025695669054338e-05, + "loss": 0.394, + "step": 17325 + }, + { + "epoch": 2.31, + "grad_norm": 0.546875, + "learning_rate": 9.024536714280969e-05, + "loss": 0.3401, + "step": 17326 + }, + { + "epoch": 2.31, + "grad_norm": 0.54296875, + "learning_rate": 9.023377772735505e-05, + "loss": 0.2586, + "step": 17327 + }, + { + "epoch": 2.31, + "grad_norm": 0.578125, + "learning_rate": 9.022218844433669e-05, + "loss": 0.6439, + "step": 17328 + }, + { + "epoch": 2.31, + "grad_norm": 0.41015625, + "learning_rate": 9.021059929391169e-05, + "loss": 0.2334, + "step": 17329 + }, + { + "epoch": 2.31, + "grad_norm": 0.7734375, + "learning_rate": 9.019901027623727e-05, + "loss": 0.2828, + "step": 17330 + }, + { + "epoch": 2.31, + "grad_norm": 0.9140625, + "learning_rate": 9.01874213914706e-05, + "loss": 0.3529, + "step": 17331 + }, + { + "epoch": 2.31, + "grad_norm": 0.703125, + "learning_rate": 9.017583263976873e-05, + "loss": 0.3589, + "step": 17332 + }, + { + "epoch": 2.31, + "grad_norm": 0.462890625, + "learning_rate": 9.01642440212889e-05, + "loss": 0.181, + "step": 17333 + }, + { + "epoch": 2.31, + "grad_norm": 0.3984375, + "learning_rate": 9.015265553618821e-05, + "loss": 0.1266, + "step": 17334 + }, + { + "epoch": 2.31, + "grad_norm": 0.6328125, + "learning_rate": 9.014106718462387e-05, + "loss": 0.4054, + "step": 17335 + }, + { + "epoch": 2.31, + "grad_norm": 0.451171875, + "learning_rate": 9.012947896675294e-05, + "loss": 0.1239, + "step": 17336 + }, + { + "epoch": 2.31, + "grad_norm": 0.5546875, + "learning_rate": 9.011789088273266e-05, + "loss": 0.4672, + "step": 17337 + }, + { + "epoch": 2.31, + "grad_norm": 0.72265625, + "learning_rate": 9.010630293272008e-05, + "loss": 0.3713, + "step": 17338 + }, + { + "epoch": 2.31, + "grad_norm": 0.67578125, + "learning_rate": 9.009471511687238e-05, + "loss": 0.3632, + "step": 17339 + }, + { + "epoch": 2.31, + "grad_norm": 0.494140625, + "learning_rate": 9.008312743534674e-05, + "loss": 0.3332, + "step": 17340 + }, + { + "epoch": 2.31, + "grad_norm": 0.69140625, + "learning_rate": 9.007153988830022e-05, + "loss": 0.4272, + "step": 17341 + }, + { + "epoch": 2.31, + "grad_norm": 0.65625, + "learning_rate": 9.005995247589002e-05, + "loss": 0.5263, + "step": 17342 + }, + { + "epoch": 2.31, + "grad_norm": 0.62890625, + "learning_rate": 9.004836519827325e-05, + "loss": 0.2275, + "step": 17343 + }, + { + "epoch": 2.31, + "grad_norm": 0.5, + "learning_rate": 9.0036778055607e-05, + "loss": 0.2348, + "step": 17344 + }, + { + "epoch": 2.31, + "grad_norm": 0.53125, + "learning_rate": 9.002519104804847e-05, + "loss": 0.3975, + "step": 17345 + }, + { + "epoch": 2.31, + "grad_norm": 0.53515625, + "learning_rate": 9.001360417575472e-05, + "loss": 0.4838, + "step": 17346 + }, + { + "epoch": 2.31, + "grad_norm": 0.578125, + "learning_rate": 9.000201743888292e-05, + "loss": 0.4587, + "step": 17347 + }, + { + "epoch": 2.31, + "grad_norm": 0.52734375, + "learning_rate": 8.999043083759017e-05, + "loss": 0.3972, + "step": 17348 + }, + { + "epoch": 2.32, + "grad_norm": 0.6640625, + "learning_rate": 8.997884437203362e-05, + "loss": 0.2229, + "step": 17349 + }, + { + "epoch": 2.32, + "grad_norm": 0.86328125, + "learning_rate": 8.996725804237035e-05, + "loss": 0.4981, + "step": 17350 + }, + { + "epoch": 2.32, + "grad_norm": 0.71875, + "learning_rate": 8.995567184875751e-05, + "loss": 0.4401, + "step": 17351 + }, + { + "epoch": 2.32, + "grad_norm": 0.41015625, + "learning_rate": 8.99440857913522e-05, + "loss": 0.2979, + "step": 17352 + }, + { + "epoch": 2.32, + "grad_norm": 0.53515625, + "learning_rate": 8.993249987031156e-05, + "loss": 0.5892, + "step": 17353 + }, + { + "epoch": 2.32, + "grad_norm": 0.6875, + "learning_rate": 8.992091408579266e-05, + "loss": 0.5408, + "step": 17354 + }, + { + "epoch": 2.32, + "grad_norm": 0.58203125, + "learning_rate": 8.990932843795265e-05, + "loss": 0.4472, + "step": 17355 + }, + { + "epoch": 2.32, + "grad_norm": 0.65234375, + "learning_rate": 8.989774292694866e-05, + "loss": 0.5185, + "step": 17356 + }, + { + "epoch": 2.32, + "grad_norm": 0.48828125, + "learning_rate": 8.98861575529377e-05, + "loss": 0.4093, + "step": 17357 + }, + { + "epoch": 2.32, + "grad_norm": 0.5625, + "learning_rate": 8.987457231607696e-05, + "loss": 0.2216, + "step": 17358 + }, + { + "epoch": 2.32, + "grad_norm": 0.5625, + "learning_rate": 8.98629872165235e-05, + "loss": 0.5201, + "step": 17359 + }, + { + "epoch": 2.32, + "grad_norm": 0.625, + "learning_rate": 8.985140225443446e-05, + "loss": 0.3887, + "step": 17360 + }, + { + "epoch": 2.32, + "grad_norm": 0.6171875, + "learning_rate": 8.983981742996689e-05, + "loss": 0.3059, + "step": 17361 + }, + { + "epoch": 2.32, + "grad_norm": 0.55078125, + "learning_rate": 8.982823274327795e-05, + "loss": 0.4593, + "step": 17362 + }, + { + "epoch": 2.32, + "grad_norm": 0.5859375, + "learning_rate": 8.981664819452466e-05, + "loss": 0.266, + "step": 17363 + }, + { + "epoch": 2.32, + "grad_norm": 0.55859375, + "learning_rate": 8.980506378386417e-05, + "loss": 0.2452, + "step": 17364 + }, + { + "epoch": 2.32, + "grad_norm": 0.69140625, + "learning_rate": 8.979347951145359e-05, + "loss": 0.4851, + "step": 17365 + }, + { + "epoch": 2.32, + "grad_norm": 0.5703125, + "learning_rate": 8.978189537744994e-05, + "loss": 0.2896, + "step": 17366 + }, + { + "epoch": 2.32, + "grad_norm": 0.56640625, + "learning_rate": 8.977031138201038e-05, + "loss": 0.2986, + "step": 17367 + }, + { + "epoch": 2.32, + "grad_norm": 0.66015625, + "learning_rate": 8.975872752529198e-05, + "loss": 0.3956, + "step": 17368 + }, + { + "epoch": 2.32, + "grad_norm": 0.7109375, + "learning_rate": 8.974714380745177e-05, + "loss": 0.4127, + "step": 17369 + }, + { + "epoch": 2.32, + "grad_norm": 0.486328125, + "learning_rate": 8.973556022864687e-05, + "loss": 0.2926, + "step": 17370 + }, + { + "epoch": 2.32, + "grad_norm": 0.447265625, + "learning_rate": 8.972397678903436e-05, + "loss": 0.1465, + "step": 17371 + }, + { + "epoch": 2.32, + "grad_norm": 0.66015625, + "learning_rate": 8.971239348877134e-05, + "loss": 0.3514, + "step": 17372 + }, + { + "epoch": 2.32, + "grad_norm": 0.404296875, + "learning_rate": 8.970081032801484e-05, + "loss": 0.1884, + "step": 17373 + }, + { + "epoch": 2.32, + "grad_norm": 0.7109375, + "learning_rate": 8.968922730692198e-05, + "loss": 0.3679, + "step": 17374 + }, + { + "epoch": 2.32, + "grad_norm": 0.34765625, + "learning_rate": 8.96776444256498e-05, + "loss": 0.1019, + "step": 17375 + }, + { + "epoch": 2.32, + "grad_norm": 0.671875, + "learning_rate": 8.966606168435541e-05, + "loss": 0.5127, + "step": 17376 + }, + { + "epoch": 2.32, + "grad_norm": 0.61328125, + "learning_rate": 8.965447908319584e-05, + "loss": 0.3454, + "step": 17377 + }, + { + "epoch": 2.32, + "grad_norm": 0.6171875, + "learning_rate": 8.964289662232818e-05, + "loss": 0.3735, + "step": 17378 + }, + { + "epoch": 2.32, + "grad_norm": 0.57421875, + "learning_rate": 8.963131430190947e-05, + "loss": 0.5433, + "step": 17379 + }, + { + "epoch": 2.32, + "grad_norm": 0.58984375, + "learning_rate": 8.961973212209684e-05, + "loss": 0.6498, + "step": 17380 + }, + { + "epoch": 2.32, + "grad_norm": 0.51171875, + "learning_rate": 8.960815008304728e-05, + "loss": 0.3025, + "step": 17381 + }, + { + "epoch": 2.32, + "grad_norm": 0.408203125, + "learning_rate": 8.959656818491784e-05, + "loss": 0.1459, + "step": 17382 + }, + { + "epoch": 2.32, + "grad_norm": 0.56640625, + "learning_rate": 8.958498642786566e-05, + "loss": 0.3953, + "step": 17383 + }, + { + "epoch": 2.32, + "grad_norm": 0.5546875, + "learning_rate": 8.957340481204772e-05, + "loss": 0.3497, + "step": 17384 + }, + { + "epoch": 2.32, + "grad_norm": 0.5234375, + "learning_rate": 8.956182333762112e-05, + "loss": 0.4132, + "step": 17385 + }, + { + "epoch": 2.32, + "grad_norm": 0.40625, + "learning_rate": 8.955024200474287e-05, + "loss": 0.1784, + "step": 17386 + }, + { + "epoch": 2.32, + "grad_norm": 0.515625, + "learning_rate": 8.953866081357006e-05, + "loss": 0.4533, + "step": 17387 + }, + { + "epoch": 2.32, + "grad_norm": 0.52734375, + "learning_rate": 8.952707976425972e-05, + "loss": 0.3401, + "step": 17388 + }, + { + "epoch": 2.32, + "grad_norm": 1.1796875, + "learning_rate": 8.951549885696889e-05, + "loss": 0.4051, + "step": 17389 + }, + { + "epoch": 2.32, + "grad_norm": 0.60546875, + "learning_rate": 8.950391809185464e-05, + "loss": 0.4557, + "step": 17390 + }, + { + "epoch": 2.32, + "grad_norm": 0.75390625, + "learning_rate": 8.949233746907398e-05, + "loss": 0.3247, + "step": 17391 + }, + { + "epoch": 2.32, + "grad_norm": 0.8515625, + "learning_rate": 8.948075698878402e-05, + "loss": 0.3201, + "step": 17392 + }, + { + "epoch": 2.32, + "grad_norm": 0.50390625, + "learning_rate": 8.946917665114172e-05, + "loss": 0.2386, + "step": 17393 + }, + { + "epoch": 2.32, + "grad_norm": 0.7109375, + "learning_rate": 8.945759645630411e-05, + "loss": 0.2573, + "step": 17394 + }, + { + "epoch": 2.32, + "grad_norm": 0.7421875, + "learning_rate": 8.94460164044283e-05, + "loss": 0.4167, + "step": 17395 + }, + { + "epoch": 2.32, + "grad_norm": 0.5, + "learning_rate": 8.943443649567124e-05, + "loss": 0.3585, + "step": 17396 + }, + { + "epoch": 2.32, + "grad_norm": 0.5390625, + "learning_rate": 8.942285673019004e-05, + "loss": 0.4287, + "step": 17397 + }, + { + "epoch": 2.32, + "grad_norm": 0.671875, + "learning_rate": 8.941127710814165e-05, + "loss": 0.1796, + "step": 17398 + }, + { + "epoch": 2.32, + "grad_norm": 0.66015625, + "learning_rate": 8.939969762968318e-05, + "loss": 0.5657, + "step": 17399 + }, + { + "epoch": 2.32, + "grad_norm": 0.61328125, + "learning_rate": 8.938811829497157e-05, + "loss": 0.22, + "step": 17400 + }, + { + "epoch": 2.32, + "grad_norm": 0.40625, + "learning_rate": 8.937653910416393e-05, + "loss": 0.2588, + "step": 17401 + }, + { + "epoch": 2.32, + "grad_norm": 0.423828125, + "learning_rate": 8.936496005741721e-05, + "loss": 0.1166, + "step": 17402 + }, + { + "epoch": 2.32, + "grad_norm": 0.5625, + "learning_rate": 8.935338115488848e-05, + "loss": 0.5026, + "step": 17403 + }, + { + "epoch": 2.32, + "grad_norm": 0.60546875, + "learning_rate": 8.934180239673475e-05, + "loss": 0.3997, + "step": 17404 + }, + { + "epoch": 2.32, + "grad_norm": 0.6484375, + "learning_rate": 8.933022378311297e-05, + "loss": 0.4445, + "step": 17405 + }, + { + "epoch": 2.32, + "grad_norm": 0.7890625, + "learning_rate": 8.931864531418025e-05, + "loss": 0.3983, + "step": 17406 + }, + { + "epoch": 2.32, + "grad_norm": 0.65234375, + "learning_rate": 8.930706699009352e-05, + "loss": 0.4726, + "step": 17407 + }, + { + "epoch": 2.32, + "grad_norm": 0.4921875, + "learning_rate": 8.929548881100985e-05, + "loss": 0.3112, + "step": 17408 + }, + { + "epoch": 2.32, + "grad_norm": 0.357421875, + "learning_rate": 8.928391077708617e-05, + "loss": 0.1492, + "step": 17409 + }, + { + "epoch": 2.32, + "grad_norm": 0.8046875, + "learning_rate": 8.92723328884796e-05, + "loss": 0.42, + "step": 17410 + }, + { + "epoch": 2.32, + "grad_norm": 0.609375, + "learning_rate": 8.926075514534703e-05, + "loss": 0.1717, + "step": 17411 + }, + { + "epoch": 2.32, + "grad_norm": 0.51953125, + "learning_rate": 8.924917754784552e-05, + "loss": 0.2149, + "step": 17412 + }, + { + "epoch": 2.32, + "grad_norm": 0.453125, + "learning_rate": 8.92376000961321e-05, + "loss": 0.3447, + "step": 17413 + }, + { + "epoch": 2.32, + "grad_norm": 0.60546875, + "learning_rate": 8.92260227903637e-05, + "loss": 0.3315, + "step": 17414 + }, + { + "epoch": 2.32, + "grad_norm": 0.6015625, + "learning_rate": 8.921444563069736e-05, + "loss": 0.2962, + "step": 17415 + }, + { + "epoch": 2.32, + "grad_norm": 0.50390625, + "learning_rate": 8.920286861729008e-05, + "loss": 0.1767, + "step": 17416 + }, + { + "epoch": 2.32, + "grad_norm": 0.5703125, + "learning_rate": 8.91912917502988e-05, + "loss": 0.4349, + "step": 17417 + }, + { + "epoch": 2.32, + "grad_norm": 0.6015625, + "learning_rate": 8.917971502988055e-05, + "loss": 0.3312, + "step": 17418 + }, + { + "epoch": 2.32, + "grad_norm": 0.51953125, + "learning_rate": 8.91681384561923e-05, + "loss": 0.3653, + "step": 17419 + }, + { + "epoch": 2.32, + "grad_norm": 0.51953125, + "learning_rate": 8.915656202939106e-05, + "loss": 0.183, + "step": 17420 + }, + { + "epoch": 2.32, + "grad_norm": 0.7734375, + "learning_rate": 8.914498574963377e-05, + "loss": 0.2488, + "step": 17421 + }, + { + "epoch": 2.32, + "grad_norm": 0.71484375, + "learning_rate": 8.913340961707746e-05, + "loss": 0.6023, + "step": 17422 + }, + { + "epoch": 2.32, + "grad_norm": 0.50390625, + "learning_rate": 8.912183363187908e-05, + "loss": 0.4062, + "step": 17423 + }, + { + "epoch": 2.33, + "grad_norm": 0.51171875, + "learning_rate": 8.911025779419562e-05, + "loss": 0.2298, + "step": 17424 + }, + { + "epoch": 2.33, + "grad_norm": 0.62109375, + "learning_rate": 8.909868210418406e-05, + "loss": 0.484, + "step": 17425 + }, + { + "epoch": 2.33, + "grad_norm": 0.73828125, + "learning_rate": 8.908710656200137e-05, + "loss": 0.2441, + "step": 17426 + }, + { + "epoch": 2.33, + "grad_norm": 0.546875, + "learning_rate": 8.907553116780451e-05, + "loss": 0.3851, + "step": 17427 + }, + { + "epoch": 2.33, + "grad_norm": 0.4375, + "learning_rate": 8.906395592175049e-05, + "loss": 0.3555, + "step": 17428 + }, + { + "epoch": 2.33, + "grad_norm": 0.546875, + "learning_rate": 8.905238082399623e-05, + "loss": 0.2744, + "step": 17429 + }, + { + "epoch": 2.33, + "grad_norm": 0.64453125, + "learning_rate": 8.904080587469868e-05, + "loss": 0.1833, + "step": 17430 + }, + { + "epoch": 2.33, + "grad_norm": 0.58203125, + "learning_rate": 8.902923107401487e-05, + "loss": 0.4547, + "step": 17431 + }, + { + "epoch": 2.33, + "grad_norm": 0.640625, + "learning_rate": 8.90176564221017e-05, + "loss": 0.4296, + "step": 17432 + }, + { + "epoch": 2.33, + "grad_norm": 0.65625, + "learning_rate": 8.900608191911618e-05, + "loss": 0.3423, + "step": 17433 + }, + { + "epoch": 2.33, + "grad_norm": 0.5625, + "learning_rate": 8.899450756521522e-05, + "loss": 0.4894, + "step": 17434 + }, + { + "epoch": 2.33, + "grad_norm": 0.451171875, + "learning_rate": 8.898293336055583e-05, + "loss": 0.1779, + "step": 17435 + }, + { + "epoch": 2.33, + "grad_norm": 0.53515625, + "learning_rate": 8.897135930529491e-05, + "loss": 0.2795, + "step": 17436 + }, + { + "epoch": 2.33, + "grad_norm": 0.5546875, + "learning_rate": 8.895978539958944e-05, + "loss": 0.3181, + "step": 17437 + }, + { + "epoch": 2.33, + "grad_norm": 0.671875, + "learning_rate": 8.894821164359638e-05, + "loss": 0.5049, + "step": 17438 + }, + { + "epoch": 2.33, + "grad_norm": 0.60546875, + "learning_rate": 8.893663803747264e-05, + "loss": 0.2053, + "step": 17439 + }, + { + "epoch": 2.33, + "grad_norm": 0.6015625, + "learning_rate": 8.892506458137522e-05, + "loss": 0.5403, + "step": 17440 + }, + { + "epoch": 2.33, + "grad_norm": 0.388671875, + "learning_rate": 8.891349127546105e-05, + "loss": 0.151, + "step": 17441 + }, + { + "epoch": 2.33, + "grad_norm": 0.61328125, + "learning_rate": 8.890191811988701e-05, + "loss": 0.2125, + "step": 17442 + }, + { + "epoch": 2.33, + "grad_norm": 0.48046875, + "learning_rate": 8.889034511481009e-05, + "loss": 0.2435, + "step": 17443 + }, + { + "epoch": 2.33, + "grad_norm": 0.7890625, + "learning_rate": 8.887877226038723e-05, + "loss": 0.4753, + "step": 17444 + }, + { + "epoch": 2.33, + "grad_norm": 0.55078125, + "learning_rate": 8.886719955677535e-05, + "loss": 0.2856, + "step": 17445 + }, + { + "epoch": 2.33, + "grad_norm": 0.55078125, + "learning_rate": 8.885562700413137e-05, + "loss": 0.3584, + "step": 17446 + }, + { + "epoch": 2.33, + "grad_norm": 0.390625, + "learning_rate": 8.884405460261227e-05, + "loss": 0.2193, + "step": 17447 + }, + { + "epoch": 2.33, + "grad_norm": 0.50390625, + "learning_rate": 8.883248235237493e-05, + "loss": 0.3759, + "step": 17448 + }, + { + "epoch": 2.33, + "grad_norm": 0.546875, + "learning_rate": 8.882091025357631e-05, + "loss": 0.3685, + "step": 17449 + }, + { + "epoch": 2.33, + "grad_norm": 0.5234375, + "learning_rate": 8.880933830637331e-05, + "loss": 0.2348, + "step": 17450 + }, + { + "epoch": 2.33, + "grad_norm": 0.546875, + "learning_rate": 8.879776651092287e-05, + "loss": 0.5599, + "step": 17451 + }, + { + "epoch": 2.33, + "grad_norm": 0.78125, + "learning_rate": 8.87861948673819e-05, + "loss": 0.9273, + "step": 17452 + }, + { + "epoch": 2.33, + "grad_norm": 0.474609375, + "learning_rate": 8.877462337590736e-05, + "loss": 0.2087, + "step": 17453 + }, + { + "epoch": 2.33, + "grad_norm": 0.37890625, + "learning_rate": 8.876305203665612e-05, + "loss": 0.1963, + "step": 17454 + }, + { + "epoch": 2.33, + "grad_norm": 0.53515625, + "learning_rate": 8.875148084978507e-05, + "loss": 0.4595, + "step": 17455 + }, + { + "epoch": 2.33, + "grad_norm": 0.61328125, + "learning_rate": 8.87399098154512e-05, + "loss": 0.1617, + "step": 17456 + }, + { + "epoch": 2.33, + "grad_norm": 0.76171875, + "learning_rate": 8.872833893381134e-05, + "loss": 0.5719, + "step": 17457 + }, + { + "epoch": 2.33, + "grad_norm": 0.796875, + "learning_rate": 8.871676820502246e-05, + "loss": 0.4485, + "step": 17458 + }, + { + "epoch": 2.33, + "grad_norm": 0.64453125, + "learning_rate": 8.870519762924143e-05, + "loss": 0.5386, + "step": 17459 + }, + { + "epoch": 2.33, + "grad_norm": 0.431640625, + "learning_rate": 8.869362720662519e-05, + "loss": 0.2339, + "step": 17460 + }, + { + "epoch": 2.33, + "grad_norm": 0.62890625, + "learning_rate": 8.86820569373306e-05, + "loss": 0.4011, + "step": 17461 + }, + { + "epoch": 2.33, + "grad_norm": 0.5859375, + "learning_rate": 8.867048682151458e-05, + "loss": 0.3211, + "step": 17462 + }, + { + "epoch": 2.33, + "grad_norm": 0.53125, + "learning_rate": 8.865891685933405e-05, + "loss": 0.3684, + "step": 17463 + }, + { + "epoch": 2.33, + "grad_norm": 0.64453125, + "learning_rate": 8.864734705094586e-05, + "loss": 0.4067, + "step": 17464 + }, + { + "epoch": 2.33, + "grad_norm": 0.49609375, + "learning_rate": 8.863577739650698e-05, + "loss": 0.1473, + "step": 17465 + }, + { + "epoch": 2.33, + "grad_norm": 0.7265625, + "learning_rate": 8.862420789617424e-05, + "loss": 0.4892, + "step": 17466 + }, + { + "epoch": 2.33, + "grad_norm": 0.50390625, + "learning_rate": 8.861263855010451e-05, + "loss": 0.1648, + "step": 17467 + }, + { + "epoch": 2.33, + "grad_norm": 0.5625, + "learning_rate": 8.860106935845473e-05, + "loss": 0.4153, + "step": 17468 + }, + { + "epoch": 2.33, + "grad_norm": 0.53125, + "learning_rate": 8.858950032138175e-05, + "loss": 0.32, + "step": 17469 + }, + { + "epoch": 2.33, + "grad_norm": 0.478515625, + "learning_rate": 8.857793143904247e-05, + "loss": 0.3355, + "step": 17470 + }, + { + "epoch": 2.33, + "grad_norm": 0.498046875, + "learning_rate": 8.856636271159377e-05, + "loss": 0.2207, + "step": 17471 + }, + { + "epoch": 2.33, + "grad_norm": 0.890625, + "learning_rate": 8.855479413919254e-05, + "loss": 0.3518, + "step": 17472 + }, + { + "epoch": 2.33, + "grad_norm": 0.4921875, + "learning_rate": 8.854322572199564e-05, + "loss": 0.3465, + "step": 17473 + }, + { + "epoch": 2.33, + "grad_norm": 0.423828125, + "learning_rate": 8.853165746015997e-05, + "loss": 0.1969, + "step": 17474 + }, + { + "epoch": 2.33, + "grad_norm": 0.734375, + "learning_rate": 8.852008935384235e-05, + "loss": 0.3661, + "step": 17475 + }, + { + "epoch": 2.33, + "grad_norm": 0.609375, + "learning_rate": 8.850852140319972e-05, + "loss": 0.4757, + "step": 17476 + }, + { + "epoch": 2.33, + "grad_norm": 0.625, + "learning_rate": 8.849695360838894e-05, + "loss": 0.3926, + "step": 17477 + }, + { + "epoch": 2.33, + "grad_norm": 0.71484375, + "learning_rate": 8.848538596956679e-05, + "loss": 0.4376, + "step": 17478 + }, + { + "epoch": 2.33, + "grad_norm": 0.70703125, + "learning_rate": 8.847381848689022e-05, + "loss": 0.2005, + "step": 17479 + }, + { + "epoch": 2.33, + "grad_norm": 0.625, + "learning_rate": 8.846225116051606e-05, + "loss": 0.4536, + "step": 17480 + }, + { + "epoch": 2.33, + "grad_norm": 0.421875, + "learning_rate": 8.84506839906012e-05, + "loss": 0.2337, + "step": 17481 + }, + { + "epoch": 2.33, + "grad_norm": 0.515625, + "learning_rate": 8.843911697730246e-05, + "loss": 0.2338, + "step": 17482 + }, + { + "epoch": 2.33, + "grad_norm": 0.54296875, + "learning_rate": 8.842755012077673e-05, + "loss": 0.3119, + "step": 17483 + }, + { + "epoch": 2.33, + "grad_norm": 0.45703125, + "learning_rate": 8.841598342118082e-05, + "loss": 0.2993, + "step": 17484 + }, + { + "epoch": 2.33, + "grad_norm": 0.69140625, + "learning_rate": 8.840441687867161e-05, + "loss": 0.3016, + "step": 17485 + }, + { + "epoch": 2.33, + "grad_norm": 0.640625, + "learning_rate": 8.839285049340597e-05, + "loss": 0.3985, + "step": 17486 + }, + { + "epoch": 2.33, + "grad_norm": 0.734375, + "learning_rate": 8.83812842655407e-05, + "loss": 0.4878, + "step": 17487 + }, + { + "epoch": 2.33, + "grad_norm": 0.60546875, + "learning_rate": 8.836971819523272e-05, + "loss": 0.3406, + "step": 17488 + }, + { + "epoch": 2.33, + "grad_norm": 0.546875, + "learning_rate": 8.835815228263883e-05, + "loss": 0.3757, + "step": 17489 + }, + { + "epoch": 2.33, + "grad_norm": 0.625, + "learning_rate": 8.834658652791582e-05, + "loss": 0.4826, + "step": 17490 + }, + { + "epoch": 2.33, + "grad_norm": 0.59375, + "learning_rate": 8.83350209312206e-05, + "loss": 0.3656, + "step": 17491 + }, + { + "epoch": 2.33, + "grad_norm": 0.578125, + "learning_rate": 8.832345549270996e-05, + "loss": 0.3678, + "step": 17492 + }, + { + "epoch": 2.33, + "grad_norm": 0.447265625, + "learning_rate": 8.831189021254078e-05, + "loss": 0.2686, + "step": 17493 + }, + { + "epoch": 2.33, + "grad_norm": 0.55078125, + "learning_rate": 8.830032509086985e-05, + "loss": 0.3137, + "step": 17494 + }, + { + "epoch": 2.33, + "grad_norm": 0.59765625, + "learning_rate": 8.828876012785404e-05, + "loss": 0.307, + "step": 17495 + }, + { + "epoch": 2.33, + "grad_norm": 0.58203125, + "learning_rate": 8.827719532365014e-05, + "loss": 0.2932, + "step": 17496 + }, + { + "epoch": 2.33, + "grad_norm": 0.6015625, + "learning_rate": 8.826563067841502e-05, + "loss": 0.2624, + "step": 17497 + }, + { + "epoch": 2.33, + "grad_norm": 0.6171875, + "learning_rate": 8.825406619230545e-05, + "loss": 0.2854, + "step": 17498 + }, + { + "epoch": 2.34, + "grad_norm": 0.64453125, + "learning_rate": 8.82425018654783e-05, + "loss": 0.5454, + "step": 17499 + }, + { + "epoch": 2.34, + "grad_norm": 0.953125, + "learning_rate": 8.823093769809036e-05, + "loss": 0.4109, + "step": 17500 + }, + { + "epoch": 2.34, + "grad_norm": 0.546875, + "learning_rate": 8.82193736902985e-05, + "loss": 0.5107, + "step": 17501 + }, + { + "epoch": 2.34, + "grad_norm": 0.5078125, + "learning_rate": 8.820780984225947e-05, + "loss": 0.4097, + "step": 17502 + }, + { + "epoch": 2.34, + "grad_norm": 0.75, + "learning_rate": 8.819624615413009e-05, + "loss": 0.6797, + "step": 17503 + }, + { + "epoch": 2.34, + "grad_norm": 0.51953125, + "learning_rate": 8.81846826260672e-05, + "loss": 0.272, + "step": 17504 + }, + { + "epoch": 2.34, + "grad_norm": 0.5703125, + "learning_rate": 8.817311925822757e-05, + "loss": 0.3713, + "step": 17505 + }, + { + "epoch": 2.34, + "grad_norm": 1.09375, + "learning_rate": 8.816155605076805e-05, + "loss": 0.5283, + "step": 17506 + }, + { + "epoch": 2.34, + "grad_norm": 0.55078125, + "learning_rate": 8.814999300384544e-05, + "loss": 0.4313, + "step": 17507 + }, + { + "epoch": 2.34, + "grad_norm": 0.54296875, + "learning_rate": 8.813843011761652e-05, + "loss": 0.2681, + "step": 17508 + }, + { + "epoch": 2.34, + "grad_norm": 0.494140625, + "learning_rate": 8.812686739223809e-05, + "loss": 0.1999, + "step": 17509 + }, + { + "epoch": 2.34, + "grad_norm": 0.7265625, + "learning_rate": 8.811530482786696e-05, + "loss": 0.5858, + "step": 17510 + }, + { + "epoch": 2.34, + "grad_norm": 0.671875, + "learning_rate": 8.810374242465994e-05, + "loss": 0.5964, + "step": 17511 + }, + { + "epoch": 2.34, + "grad_norm": 0.4765625, + "learning_rate": 8.809218018277378e-05, + "loss": 0.3113, + "step": 17512 + }, + { + "epoch": 2.34, + "grad_norm": 0.59375, + "learning_rate": 8.808061810236536e-05, + "loss": 0.5415, + "step": 17513 + }, + { + "epoch": 2.34, + "grad_norm": 0.73828125, + "learning_rate": 8.806905618359138e-05, + "loss": 0.2759, + "step": 17514 + }, + { + "epoch": 2.34, + "grad_norm": 0.609375, + "learning_rate": 8.805749442660864e-05, + "loss": 0.3715, + "step": 17515 + }, + { + "epoch": 2.34, + "grad_norm": 0.94921875, + "learning_rate": 8.804593283157394e-05, + "loss": 0.346, + "step": 17516 + }, + { + "epoch": 2.34, + "grad_norm": 0.6484375, + "learning_rate": 8.803437139864406e-05, + "loss": 0.2847, + "step": 17517 + }, + { + "epoch": 2.34, + "grad_norm": 0.376953125, + "learning_rate": 8.80228101279758e-05, + "loss": 0.2066, + "step": 17518 + }, + { + "epoch": 2.34, + "grad_norm": 0.68359375, + "learning_rate": 8.80112490197259e-05, + "loss": 0.3364, + "step": 17519 + }, + { + "epoch": 2.34, + "grad_norm": 0.515625, + "learning_rate": 8.799968807405118e-05, + "loss": 0.1181, + "step": 17520 + }, + { + "epoch": 2.34, + "grad_norm": 0.60546875, + "learning_rate": 8.798812729110837e-05, + "loss": 0.2708, + "step": 17521 + }, + { + "epoch": 2.34, + "grad_norm": 0.66796875, + "learning_rate": 8.797656667105428e-05, + "loss": 0.3794, + "step": 17522 + }, + { + "epoch": 2.34, + "grad_norm": 0.3984375, + "learning_rate": 8.796500621404564e-05, + "loss": 0.1493, + "step": 17523 + }, + { + "epoch": 2.34, + "grad_norm": 0.58984375, + "learning_rate": 8.795344592023928e-05, + "loss": 0.2098, + "step": 17524 + }, + { + "epoch": 2.34, + "grad_norm": 0.55859375, + "learning_rate": 8.794188578979187e-05, + "loss": 0.5538, + "step": 17525 + }, + { + "epoch": 2.34, + "grad_norm": 0.54296875, + "learning_rate": 8.793032582286029e-05, + "loss": 0.2327, + "step": 17526 + }, + { + "epoch": 2.34, + "grad_norm": 0.52734375, + "learning_rate": 8.791876601960123e-05, + "loss": 0.3766, + "step": 17527 + }, + { + "epoch": 2.34, + "grad_norm": 0.80078125, + "learning_rate": 8.790720638017141e-05, + "loss": 0.6804, + "step": 17528 + }, + { + "epoch": 2.34, + "grad_norm": 0.73046875, + "learning_rate": 8.789564690472767e-05, + "loss": 0.3567, + "step": 17529 + }, + { + "epoch": 2.34, + "grad_norm": 0.609375, + "learning_rate": 8.78840875934267e-05, + "loss": 0.2746, + "step": 17530 + }, + { + "epoch": 2.34, + "grad_norm": 0.46875, + "learning_rate": 8.78725284464253e-05, + "loss": 0.3127, + "step": 17531 + }, + { + "epoch": 2.34, + "grad_norm": 0.494140625, + "learning_rate": 8.786096946388019e-05, + "loss": 0.4366, + "step": 17532 + }, + { + "epoch": 2.34, + "grad_norm": 0.64453125, + "learning_rate": 8.784941064594812e-05, + "loss": 0.6527, + "step": 17533 + }, + { + "epoch": 2.34, + "grad_norm": 0.58984375, + "learning_rate": 8.783785199278583e-05, + "loss": 0.2607, + "step": 17534 + }, + { + "epoch": 2.34, + "grad_norm": 0.4765625, + "learning_rate": 8.782629350455007e-05, + "loss": 0.4255, + "step": 17535 + }, + { + "epoch": 2.34, + "grad_norm": 0.69921875, + "learning_rate": 8.781473518139761e-05, + "loss": 0.3142, + "step": 17536 + }, + { + "epoch": 2.34, + "grad_norm": 0.63671875, + "learning_rate": 8.780317702348514e-05, + "loss": 0.1651, + "step": 17537 + }, + { + "epoch": 2.34, + "grad_norm": 0.5078125, + "learning_rate": 8.779161903096947e-05, + "loss": 0.1733, + "step": 17538 + }, + { + "epoch": 2.34, + "grad_norm": 0.55078125, + "learning_rate": 8.778006120400724e-05, + "loss": 0.2973, + "step": 17539 + }, + { + "epoch": 2.34, + "grad_norm": 0.60546875, + "learning_rate": 8.776850354275521e-05, + "loss": 0.4693, + "step": 17540 + }, + { + "epoch": 2.34, + "grad_norm": 0.5390625, + "learning_rate": 8.775694604737014e-05, + "loss": 0.2079, + "step": 17541 + }, + { + "epoch": 2.34, + "grad_norm": 0.6796875, + "learning_rate": 8.774538871800872e-05, + "loss": 0.283, + "step": 17542 + }, + { + "epoch": 2.34, + "grad_norm": 0.59765625, + "learning_rate": 8.773383155482772e-05, + "loss": 0.4703, + "step": 17543 + }, + { + "epoch": 2.34, + "grad_norm": 0.578125, + "learning_rate": 8.772227455798381e-05, + "loss": 0.5212, + "step": 17544 + }, + { + "epoch": 2.34, + "grad_norm": 0.5234375, + "learning_rate": 8.771071772763375e-05, + "loss": 0.4078, + "step": 17545 + }, + { + "epoch": 2.34, + "grad_norm": 0.51171875, + "learning_rate": 8.769916106393423e-05, + "loss": 0.2084, + "step": 17546 + }, + { + "epoch": 2.34, + "grad_norm": 0.5390625, + "learning_rate": 8.768760456704202e-05, + "loss": 0.3189, + "step": 17547 + }, + { + "epoch": 2.34, + "grad_norm": 0.486328125, + "learning_rate": 8.767604823711374e-05, + "loss": 0.3198, + "step": 17548 + }, + { + "epoch": 2.34, + "grad_norm": 0.6796875, + "learning_rate": 8.766449207430621e-05, + "loss": 0.3594, + "step": 17549 + }, + { + "epoch": 2.34, + "grad_norm": 0.494140625, + "learning_rate": 8.765293607877608e-05, + "loss": 0.349, + "step": 17550 + }, + { + "epoch": 2.34, + "grad_norm": 0.66015625, + "learning_rate": 8.764138025068003e-05, + "loss": 0.4739, + "step": 17551 + }, + { + "epoch": 2.34, + "grad_norm": 0.41015625, + "learning_rate": 8.762982459017481e-05, + "loss": 0.2357, + "step": 17552 + }, + { + "epoch": 2.34, + "grad_norm": 0.6171875, + "learning_rate": 8.761826909741709e-05, + "loss": 0.359, + "step": 17553 + }, + { + "epoch": 2.34, + "grad_norm": 0.62890625, + "learning_rate": 8.760671377256362e-05, + "loss": 0.2916, + "step": 17554 + }, + { + "epoch": 2.34, + "grad_norm": 0.44140625, + "learning_rate": 8.759515861577103e-05, + "loss": 0.2456, + "step": 17555 + }, + { + "epoch": 2.34, + "grad_norm": 0.625, + "learning_rate": 8.758360362719607e-05, + "loss": 0.4365, + "step": 17556 + }, + { + "epoch": 2.34, + "grad_norm": 0.578125, + "learning_rate": 8.757204880699541e-05, + "loss": 0.3704, + "step": 17557 + }, + { + "epoch": 2.34, + "grad_norm": 0.5390625, + "learning_rate": 8.756049415532572e-05, + "loss": 0.1847, + "step": 17558 + }, + { + "epoch": 2.34, + "grad_norm": 0.60546875, + "learning_rate": 8.754893967234376e-05, + "loss": 0.4242, + "step": 17559 + }, + { + "epoch": 2.34, + "grad_norm": 0.42578125, + "learning_rate": 8.753738535820612e-05, + "loss": 0.3087, + "step": 17560 + }, + { + "epoch": 2.34, + "grad_norm": 0.466796875, + "learning_rate": 8.752583121306958e-05, + "loss": 0.1927, + "step": 17561 + }, + { + "epoch": 2.34, + "grad_norm": 0.48046875, + "learning_rate": 8.751427723709078e-05, + "loss": 0.2154, + "step": 17562 + }, + { + "epoch": 2.34, + "grad_norm": 0.625, + "learning_rate": 8.750272343042635e-05, + "loss": 0.3088, + "step": 17563 + }, + { + "epoch": 2.34, + "grad_norm": 0.40234375, + "learning_rate": 8.749116979323306e-05, + "loss": 0.1739, + "step": 17564 + }, + { + "epoch": 2.34, + "grad_norm": 0.703125, + "learning_rate": 8.747961632566748e-05, + "loss": 0.4125, + "step": 17565 + }, + { + "epoch": 2.34, + "grad_norm": 0.50390625, + "learning_rate": 8.746806302788637e-05, + "loss": 0.4484, + "step": 17566 + }, + { + "epoch": 2.34, + "grad_norm": 0.58984375, + "learning_rate": 8.745650990004635e-05, + "loss": 0.3757, + "step": 17567 + }, + { + "epoch": 2.34, + "grad_norm": 0.58203125, + "learning_rate": 8.744495694230412e-05, + "loss": 0.4812, + "step": 17568 + }, + { + "epoch": 2.34, + "grad_norm": 0.5546875, + "learning_rate": 8.743340415481632e-05, + "loss": 0.4402, + "step": 17569 + }, + { + "epoch": 2.34, + "grad_norm": 0.486328125, + "learning_rate": 8.742185153773966e-05, + "loss": 0.1473, + "step": 17570 + }, + { + "epoch": 2.34, + "grad_norm": 0.5390625, + "learning_rate": 8.741029909123072e-05, + "loss": 0.2833, + "step": 17571 + }, + { + "epoch": 2.34, + "grad_norm": 0.6796875, + "learning_rate": 8.739874681544623e-05, + "loss": 0.1881, + "step": 17572 + }, + { + "epoch": 2.34, + "grad_norm": 0.494140625, + "learning_rate": 8.73871947105428e-05, + "loss": 0.3362, + "step": 17573 + }, + { + "epoch": 2.35, + "grad_norm": 0.71875, + "learning_rate": 8.737564277667716e-05, + "loss": 0.5267, + "step": 17574 + }, + { + "epoch": 2.35, + "grad_norm": 0.494140625, + "learning_rate": 8.736409101400586e-05, + "loss": 0.2851, + "step": 17575 + }, + { + "epoch": 2.35, + "grad_norm": 0.427734375, + "learning_rate": 8.73525394226856e-05, + "loss": 0.275, + "step": 17576 + }, + { + "epoch": 2.35, + "grad_norm": 0.396484375, + "learning_rate": 8.734098800287302e-05, + "loss": 0.2846, + "step": 17577 + }, + { + "epoch": 2.35, + "grad_norm": 0.75390625, + "learning_rate": 8.732943675472477e-05, + "loss": 0.3135, + "step": 17578 + }, + { + "epoch": 2.35, + "grad_norm": 0.609375, + "learning_rate": 8.731788567839749e-05, + "loss": 0.2743, + "step": 17579 + }, + { + "epoch": 2.35, + "grad_norm": 0.78125, + "learning_rate": 8.730633477404782e-05, + "loss": 0.4798, + "step": 17580 + }, + { + "epoch": 2.35, + "grad_norm": 0.578125, + "learning_rate": 8.72947840418324e-05, + "loss": 0.3341, + "step": 17581 + }, + { + "epoch": 2.35, + "grad_norm": 0.5625, + "learning_rate": 8.728323348190785e-05, + "loss": 0.5232, + "step": 17582 + }, + { + "epoch": 2.35, + "grad_norm": 0.474609375, + "learning_rate": 8.727168309443082e-05, + "loss": 0.2963, + "step": 17583 + }, + { + "epoch": 2.35, + "grad_norm": 0.515625, + "learning_rate": 8.726013287955794e-05, + "loss": 0.3861, + "step": 17584 + }, + { + "epoch": 2.35, + "grad_norm": 0.8046875, + "learning_rate": 8.724858283744583e-05, + "loss": 0.2398, + "step": 17585 + }, + { + "epoch": 2.35, + "grad_norm": 0.455078125, + "learning_rate": 8.723703296825115e-05, + "loss": 0.1225, + "step": 17586 + }, + { + "epoch": 2.35, + "grad_norm": 0.5234375, + "learning_rate": 8.722548327213048e-05, + "loss": 0.346, + "step": 17587 + }, + { + "epoch": 2.35, + "grad_norm": 0.52734375, + "learning_rate": 8.721393374924042e-05, + "loss": 0.3812, + "step": 17588 + }, + { + "epoch": 2.35, + "grad_norm": 0.71875, + "learning_rate": 8.720238439973766e-05, + "loss": 0.464, + "step": 17589 + }, + { + "epoch": 2.35, + "grad_norm": 0.60546875, + "learning_rate": 8.719083522377878e-05, + "loss": 0.2291, + "step": 17590 + }, + { + "epoch": 2.35, + "grad_norm": 0.484375, + "learning_rate": 8.717928622152039e-05, + "loss": 0.1428, + "step": 17591 + }, + { + "epoch": 2.35, + "grad_norm": 0.609375, + "learning_rate": 8.71677373931191e-05, + "loss": 0.3049, + "step": 17592 + }, + { + "epoch": 2.35, + "grad_norm": 0.58984375, + "learning_rate": 8.715618873873153e-05, + "loss": 0.372, + "step": 17593 + }, + { + "epoch": 2.35, + "grad_norm": 0.65234375, + "learning_rate": 8.714464025851427e-05, + "loss": 0.4846, + "step": 17594 + }, + { + "epoch": 2.35, + "grad_norm": 0.6171875, + "learning_rate": 8.713309195262396e-05, + "loss": 0.4751, + "step": 17595 + }, + { + "epoch": 2.35, + "grad_norm": 0.53515625, + "learning_rate": 8.712154382121717e-05, + "loss": 0.3291, + "step": 17596 + }, + { + "epoch": 2.35, + "grad_norm": 0.6484375, + "learning_rate": 8.710999586445052e-05, + "loss": 0.2015, + "step": 17597 + }, + { + "epoch": 2.35, + "grad_norm": 0.65625, + "learning_rate": 8.709844808248062e-05, + "loss": 0.3847, + "step": 17598 + }, + { + "epoch": 2.35, + "grad_norm": 0.4921875, + "learning_rate": 8.708690047546402e-05, + "loss": 0.3415, + "step": 17599 + }, + { + "epoch": 2.35, + "grad_norm": 0.4296875, + "learning_rate": 8.707535304355734e-05, + "loss": 0.1698, + "step": 17600 + }, + { + "epoch": 2.35, + "grad_norm": 0.75, + "learning_rate": 8.706380578691715e-05, + "loss": 0.4739, + "step": 17601 + }, + { + "epoch": 2.35, + "grad_norm": 0.439453125, + "learning_rate": 8.705225870570007e-05, + "loss": 0.2424, + "step": 17602 + }, + { + "epoch": 2.35, + "grad_norm": 0.69140625, + "learning_rate": 8.704071180006265e-05, + "loss": 0.6539, + "step": 17603 + }, + { + "epoch": 2.35, + "grad_norm": 0.64453125, + "learning_rate": 8.702916507016153e-05, + "loss": 0.3844, + "step": 17604 + }, + { + "epoch": 2.35, + "grad_norm": 0.66015625, + "learning_rate": 8.701761851615324e-05, + "loss": 0.3099, + "step": 17605 + }, + { + "epoch": 2.35, + "grad_norm": 0.474609375, + "learning_rate": 8.700607213819438e-05, + "loss": 0.3273, + "step": 17606 + }, + { + "epoch": 2.35, + "grad_norm": 0.5703125, + "learning_rate": 8.699452593644151e-05, + "loss": 0.487, + "step": 17607 + }, + { + "epoch": 2.35, + "grad_norm": 0.421875, + "learning_rate": 8.698297991105122e-05, + "loss": 0.1858, + "step": 17608 + }, + { + "epoch": 2.35, + "grad_norm": 0.67578125, + "learning_rate": 8.697143406218008e-05, + "loss": 0.4353, + "step": 17609 + }, + { + "epoch": 2.35, + "grad_norm": 0.6328125, + "learning_rate": 8.695988838998464e-05, + "loss": 0.3256, + "step": 17610 + }, + { + "epoch": 2.35, + "grad_norm": 0.466796875, + "learning_rate": 8.694834289462153e-05, + "loss": 0.1534, + "step": 17611 + }, + { + "epoch": 2.35, + "grad_norm": 0.66796875, + "learning_rate": 8.693679757624724e-05, + "loss": 0.2075, + "step": 17612 + }, + { + "epoch": 2.35, + "grad_norm": 0.68359375, + "learning_rate": 8.692525243501835e-05, + "loss": 0.4777, + "step": 17613 + }, + { + "epoch": 2.35, + "grad_norm": 0.671875, + "learning_rate": 8.691370747109145e-05, + "loss": 0.3252, + "step": 17614 + }, + { + "epoch": 2.35, + "grad_norm": 0.65234375, + "learning_rate": 8.690216268462303e-05, + "loss": 0.2959, + "step": 17615 + }, + { + "epoch": 2.35, + "grad_norm": 0.5703125, + "learning_rate": 8.689061807576973e-05, + "loss": 0.4089, + "step": 17616 + }, + { + "epoch": 2.35, + "grad_norm": 0.59375, + "learning_rate": 8.687907364468804e-05, + "loss": 0.5843, + "step": 17617 + }, + { + "epoch": 2.35, + "grad_norm": 0.52734375, + "learning_rate": 8.686752939153455e-05, + "loss": 0.2082, + "step": 17618 + }, + { + "epoch": 2.35, + "grad_norm": 0.44140625, + "learning_rate": 8.685598531646577e-05, + "loss": 0.1101, + "step": 17619 + }, + { + "epoch": 2.35, + "grad_norm": 0.625, + "learning_rate": 8.684444141963829e-05, + "loss": 0.3059, + "step": 17620 + }, + { + "epoch": 2.35, + "grad_norm": 0.68359375, + "learning_rate": 8.683289770120859e-05, + "loss": 0.3184, + "step": 17621 + }, + { + "epoch": 2.35, + "grad_norm": 0.7109375, + "learning_rate": 8.682135416133329e-05, + "loss": 0.4037, + "step": 17622 + }, + { + "epoch": 2.35, + "grad_norm": 0.5546875, + "learning_rate": 8.68098108001689e-05, + "loss": 0.2953, + "step": 17623 + }, + { + "epoch": 2.35, + "grad_norm": 0.52734375, + "learning_rate": 8.679826761787189e-05, + "loss": 0.1225, + "step": 17624 + }, + { + "epoch": 2.35, + "grad_norm": 0.5703125, + "learning_rate": 8.678672461459888e-05, + "loss": 0.4445, + "step": 17625 + }, + { + "epoch": 2.35, + "grad_norm": 0.515625, + "learning_rate": 8.677518179050634e-05, + "loss": 0.385, + "step": 17626 + }, + { + "epoch": 2.35, + "grad_norm": 0.609375, + "learning_rate": 8.676363914575083e-05, + "loss": 0.6853, + "step": 17627 + }, + { + "epoch": 2.35, + "grad_norm": 0.70703125, + "learning_rate": 8.675209668048886e-05, + "loss": 0.2793, + "step": 17628 + }, + { + "epoch": 2.35, + "grad_norm": 0.52734375, + "learning_rate": 8.674055439487698e-05, + "loss": 0.3848, + "step": 17629 + }, + { + "epoch": 2.35, + "grad_norm": 0.66796875, + "learning_rate": 8.672901228907166e-05, + "loss": 0.3945, + "step": 17630 + }, + { + "epoch": 2.35, + "grad_norm": 0.5, + "learning_rate": 8.671747036322948e-05, + "loss": 0.2302, + "step": 17631 + }, + { + "epoch": 2.35, + "grad_norm": 0.486328125, + "learning_rate": 8.670592861750691e-05, + "loss": 0.2533, + "step": 17632 + }, + { + "epoch": 2.35, + "grad_norm": 0.490234375, + "learning_rate": 8.669438705206048e-05, + "loss": 0.2152, + "step": 17633 + }, + { + "epoch": 2.35, + "grad_norm": 0.703125, + "learning_rate": 8.668284566704672e-05, + "loss": 0.7132, + "step": 17634 + }, + { + "epoch": 2.35, + "grad_norm": 0.73828125, + "learning_rate": 8.667130446262214e-05, + "loss": 0.4441, + "step": 17635 + }, + { + "epoch": 2.35, + "grad_norm": 0.54296875, + "learning_rate": 8.665976343894319e-05, + "loss": 0.336, + "step": 17636 + }, + { + "epoch": 2.35, + "grad_norm": 0.640625, + "learning_rate": 8.664822259616642e-05, + "loss": 0.4178, + "step": 17637 + }, + { + "epoch": 2.35, + "grad_norm": 0.5546875, + "learning_rate": 8.663668193444831e-05, + "loss": 0.3913, + "step": 17638 + }, + { + "epoch": 2.35, + "grad_norm": 0.859375, + "learning_rate": 8.662514145394538e-05, + "loss": 0.248, + "step": 17639 + }, + { + "epoch": 2.35, + "grad_norm": 0.5234375, + "learning_rate": 8.66136011548141e-05, + "loss": 0.3058, + "step": 17640 + }, + { + "epoch": 2.35, + "grad_norm": 0.4921875, + "learning_rate": 8.6602061037211e-05, + "loss": 0.2915, + "step": 17641 + }, + { + "epoch": 2.35, + "grad_norm": 0.58203125, + "learning_rate": 8.659052110129254e-05, + "loss": 0.4721, + "step": 17642 + }, + { + "epoch": 2.35, + "grad_norm": 0.52734375, + "learning_rate": 8.657898134721521e-05, + "loss": 0.247, + "step": 17643 + }, + { + "epoch": 2.35, + "grad_norm": 0.76953125, + "learning_rate": 8.656744177513552e-05, + "loss": 0.4092, + "step": 17644 + }, + { + "epoch": 2.35, + "grad_norm": 0.56640625, + "learning_rate": 8.655590238520995e-05, + "loss": 0.3606, + "step": 17645 + }, + { + "epoch": 2.35, + "grad_norm": 0.59375, + "learning_rate": 8.654436317759493e-05, + "loss": 0.4079, + "step": 17646 + }, + { + "epoch": 2.35, + "grad_norm": 0.56640625, + "learning_rate": 8.653282415244706e-05, + "loss": 0.4433, + "step": 17647 + }, + { + "epoch": 2.35, + "grad_norm": 0.73828125, + "learning_rate": 8.65212853099227e-05, + "loss": 0.3252, + "step": 17648 + }, + { + "epoch": 2.36, + "grad_norm": 0.6796875, + "learning_rate": 8.650974665017832e-05, + "loss": 0.2676, + "step": 17649 + }, + { + "epoch": 2.36, + "grad_norm": 0.4765625, + "learning_rate": 8.64982081733705e-05, + "loss": 0.1964, + "step": 17650 + }, + { + "epoch": 2.36, + "grad_norm": 0.68359375, + "learning_rate": 8.648666987965558e-05, + "loss": 0.1836, + "step": 17651 + }, + { + "epoch": 2.36, + "grad_norm": 0.62109375, + "learning_rate": 8.647513176919014e-05, + "loss": 0.4176, + "step": 17652 + }, + { + "epoch": 2.36, + "grad_norm": 0.50390625, + "learning_rate": 8.646359384213054e-05, + "loss": 0.314, + "step": 17653 + }, + { + "epoch": 2.36, + "grad_norm": 0.6484375, + "learning_rate": 8.645205609863334e-05, + "loss": 0.3773, + "step": 17654 + }, + { + "epoch": 2.36, + "grad_norm": 0.63671875, + "learning_rate": 8.644051853885491e-05, + "loss": 0.2082, + "step": 17655 + }, + { + "epoch": 2.36, + "grad_norm": 0.6875, + "learning_rate": 8.642898116295178e-05, + "loss": 0.3251, + "step": 17656 + }, + { + "epoch": 2.36, + "grad_norm": 0.65234375, + "learning_rate": 8.641744397108038e-05, + "loss": 0.2748, + "step": 17657 + }, + { + "epoch": 2.36, + "grad_norm": 0.60546875, + "learning_rate": 8.640590696339715e-05, + "loss": 0.3265, + "step": 17658 + }, + { + "epoch": 2.36, + "grad_norm": 0.89453125, + "learning_rate": 8.639437014005858e-05, + "loss": 0.3154, + "step": 17659 + }, + { + "epoch": 2.36, + "grad_norm": 0.5625, + "learning_rate": 8.638283350122106e-05, + "loss": 0.5472, + "step": 17660 + }, + { + "epoch": 2.36, + "grad_norm": 0.71875, + "learning_rate": 8.637129704704103e-05, + "loss": 0.2225, + "step": 17661 + }, + { + "epoch": 2.36, + "grad_norm": 0.51953125, + "learning_rate": 8.635976077767499e-05, + "loss": 0.2386, + "step": 17662 + }, + { + "epoch": 2.36, + "grad_norm": 0.53515625, + "learning_rate": 8.634822469327932e-05, + "loss": 0.3119, + "step": 17663 + }, + { + "epoch": 2.36, + "grad_norm": 0.58203125, + "learning_rate": 8.633668879401051e-05, + "loss": 0.3622, + "step": 17664 + }, + { + "epoch": 2.36, + "grad_norm": 0.61328125, + "learning_rate": 8.632515308002496e-05, + "loss": 0.3544, + "step": 17665 + }, + { + "epoch": 2.36, + "grad_norm": 0.5625, + "learning_rate": 8.631361755147911e-05, + "loss": 0.4573, + "step": 17666 + }, + { + "epoch": 2.36, + "grad_norm": 0.625, + "learning_rate": 8.630208220852937e-05, + "loss": 0.4831, + "step": 17667 + }, + { + "epoch": 2.36, + "grad_norm": 0.6328125, + "learning_rate": 8.629054705133223e-05, + "loss": 0.226, + "step": 17668 + }, + { + "epoch": 2.36, + "grad_norm": 0.486328125, + "learning_rate": 8.627901208004403e-05, + "loss": 0.3183, + "step": 17669 + }, + { + "epoch": 2.36, + "grad_norm": 0.4375, + "learning_rate": 8.626747729482127e-05, + "loss": 0.212, + "step": 17670 + }, + { + "epoch": 2.36, + "grad_norm": 0.578125, + "learning_rate": 8.625594269582034e-05, + "loss": 0.255, + "step": 17671 + }, + { + "epoch": 2.36, + "grad_norm": 0.4453125, + "learning_rate": 8.62444082831976e-05, + "loss": 0.2731, + "step": 17672 + }, + { + "epoch": 2.36, + "grad_norm": 0.5625, + "learning_rate": 8.623287405710954e-05, + "loss": 0.4186, + "step": 17673 + }, + { + "epoch": 2.36, + "grad_norm": 0.55078125, + "learning_rate": 8.622134001771251e-05, + "loss": 0.334, + "step": 17674 + }, + { + "epoch": 2.36, + "grad_norm": 0.6015625, + "learning_rate": 8.6209806165163e-05, + "loss": 0.4453, + "step": 17675 + }, + { + "epoch": 2.36, + "grad_norm": 0.57421875, + "learning_rate": 8.619827249961733e-05, + "loss": 0.2575, + "step": 17676 + }, + { + "epoch": 2.36, + "grad_norm": 0.494140625, + "learning_rate": 8.618673902123196e-05, + "loss": 0.5307, + "step": 17677 + }, + { + "epoch": 2.36, + "grad_norm": 0.515625, + "learning_rate": 8.617520573016327e-05, + "loss": 0.2629, + "step": 17678 + }, + { + "epoch": 2.36, + "grad_norm": 0.5546875, + "learning_rate": 8.616367262656767e-05, + "loss": 0.3906, + "step": 17679 + }, + { + "epoch": 2.36, + "grad_norm": 0.875, + "learning_rate": 8.615213971060154e-05, + "loss": 0.4758, + "step": 17680 + }, + { + "epoch": 2.36, + "grad_norm": 0.60546875, + "learning_rate": 8.614060698242127e-05, + "loss": 0.3025, + "step": 17681 + }, + { + "epoch": 2.36, + "grad_norm": 0.765625, + "learning_rate": 8.61290744421833e-05, + "loss": 0.5953, + "step": 17682 + }, + { + "epoch": 2.36, + "grad_norm": 0.4921875, + "learning_rate": 8.611754209004398e-05, + "loss": 0.2103, + "step": 17683 + }, + { + "epoch": 2.36, + "grad_norm": 0.412109375, + "learning_rate": 8.610600992615969e-05, + "loss": 0.307, + "step": 17684 + }, + { + "epoch": 2.36, + "grad_norm": 0.6171875, + "learning_rate": 8.609447795068682e-05, + "loss": 0.3957, + "step": 17685 + }, + { + "epoch": 2.36, + "grad_norm": 0.6015625, + "learning_rate": 8.608294616378174e-05, + "loss": 0.286, + "step": 17686 + }, + { + "epoch": 2.36, + "grad_norm": 0.57421875, + "learning_rate": 8.607141456560087e-05, + "loss": 0.3711, + "step": 17687 + }, + { + "epoch": 2.36, + "grad_norm": 0.67578125, + "learning_rate": 8.605988315630052e-05, + "loss": 0.2956, + "step": 17688 + }, + { + "epoch": 2.36, + "grad_norm": 0.71484375, + "learning_rate": 8.604835193603713e-05, + "loss": 0.4544, + "step": 17689 + }, + { + "epoch": 2.36, + "grad_norm": 0.361328125, + "learning_rate": 8.603682090496701e-05, + "loss": 0.1574, + "step": 17690 + }, + { + "epoch": 2.36, + "grad_norm": 0.55859375, + "learning_rate": 8.60252900632466e-05, + "loss": 0.4315, + "step": 17691 + }, + { + "epoch": 2.36, + "grad_norm": 0.6640625, + "learning_rate": 8.601375941103219e-05, + "loss": 0.3094, + "step": 17692 + }, + { + "epoch": 2.36, + "grad_norm": 0.78515625, + "learning_rate": 8.600222894848022e-05, + "loss": 0.2305, + "step": 17693 + }, + { + "epoch": 2.36, + "grad_norm": 0.5234375, + "learning_rate": 8.599069867574696e-05, + "loss": 0.1942, + "step": 17694 + }, + { + "epoch": 2.36, + "grad_norm": 0.52734375, + "learning_rate": 8.597916859298885e-05, + "loss": 0.2397, + "step": 17695 + }, + { + "epoch": 2.36, + "grad_norm": 0.42578125, + "learning_rate": 8.596763870036224e-05, + "loss": 0.1655, + "step": 17696 + }, + { + "epoch": 2.36, + "grad_norm": 0.5546875, + "learning_rate": 8.59561089980234e-05, + "loss": 0.1863, + "step": 17697 + }, + { + "epoch": 2.36, + "grad_norm": 0.48828125, + "learning_rate": 8.594457948612878e-05, + "loss": 0.2007, + "step": 17698 + }, + { + "epoch": 2.36, + "grad_norm": 0.84375, + "learning_rate": 8.593305016483464e-05, + "loss": 0.324, + "step": 17699 + }, + { + "epoch": 2.36, + "grad_norm": 0.78125, + "learning_rate": 8.59215210342974e-05, + "loss": 0.4193, + "step": 17700 + }, + { + "epoch": 2.36, + "grad_norm": 0.640625, + "learning_rate": 8.590999209467335e-05, + "loss": 0.2864, + "step": 17701 + }, + { + "epoch": 2.36, + "grad_norm": 0.4140625, + "learning_rate": 8.589846334611886e-05, + "loss": 0.1429, + "step": 17702 + }, + { + "epoch": 2.36, + "grad_norm": 0.57421875, + "learning_rate": 8.588693478879024e-05, + "loss": 0.1682, + "step": 17703 + }, + { + "epoch": 2.36, + "grad_norm": 0.59375, + "learning_rate": 8.587540642284384e-05, + "loss": 0.4439, + "step": 17704 + }, + { + "epoch": 2.36, + "grad_norm": 0.63671875, + "learning_rate": 8.586387824843603e-05, + "loss": 0.3895, + "step": 17705 + }, + { + "epoch": 2.36, + "grad_norm": 0.451171875, + "learning_rate": 8.585235026572307e-05, + "loss": 0.1519, + "step": 17706 + }, + { + "epoch": 2.36, + "grad_norm": 0.7109375, + "learning_rate": 8.584082247486133e-05, + "loss": 0.3882, + "step": 17707 + }, + { + "epoch": 2.36, + "grad_norm": 0.53125, + "learning_rate": 8.582929487600716e-05, + "loss": 0.288, + "step": 17708 + }, + { + "epoch": 2.36, + "grad_norm": 0.458984375, + "learning_rate": 8.581776746931681e-05, + "loss": 0.2975, + "step": 17709 + }, + { + "epoch": 2.36, + "grad_norm": 0.51171875, + "learning_rate": 8.580624025494666e-05, + "loss": 0.3086, + "step": 17710 + }, + { + "epoch": 2.36, + "grad_norm": 0.69140625, + "learning_rate": 8.579471323305295e-05, + "loss": 0.4267, + "step": 17711 + }, + { + "epoch": 2.36, + "grad_norm": 0.5234375, + "learning_rate": 8.57831864037921e-05, + "loss": 0.2141, + "step": 17712 + }, + { + "epoch": 2.36, + "grad_norm": 0.515625, + "learning_rate": 8.577165976732033e-05, + "loss": 0.2962, + "step": 17713 + }, + { + "epoch": 2.36, + "grad_norm": 0.44921875, + "learning_rate": 8.576013332379401e-05, + "loss": 0.1942, + "step": 17714 + }, + { + "epoch": 2.36, + "grad_norm": 0.58203125, + "learning_rate": 8.57486070733694e-05, + "loss": 0.352, + "step": 17715 + }, + { + "epoch": 2.36, + "grad_norm": 0.58984375, + "learning_rate": 8.573708101620283e-05, + "loss": 0.2644, + "step": 17716 + }, + { + "epoch": 2.36, + "grad_norm": 0.455078125, + "learning_rate": 8.57255551524506e-05, + "loss": 0.1954, + "step": 17717 + }, + { + "epoch": 2.36, + "grad_norm": 0.54296875, + "learning_rate": 8.571402948226902e-05, + "loss": 0.3025, + "step": 17718 + }, + { + "epoch": 2.36, + "grad_norm": 0.578125, + "learning_rate": 8.570250400581434e-05, + "loss": 0.3844, + "step": 17719 + }, + { + "epoch": 2.36, + "grad_norm": 0.7734375, + "learning_rate": 8.569097872324293e-05, + "loss": 0.6136, + "step": 17720 + }, + { + "epoch": 2.36, + "grad_norm": 0.515625, + "learning_rate": 8.567945363471099e-05, + "loss": 0.3476, + "step": 17721 + }, + { + "epoch": 2.36, + "grad_norm": 0.59375, + "learning_rate": 8.566792874037485e-05, + "loss": 0.2743, + "step": 17722 + }, + { + "epoch": 2.36, + "grad_norm": 0.51171875, + "learning_rate": 8.565640404039082e-05, + "loss": 0.2717, + "step": 17723 + }, + { + "epoch": 2.37, + "grad_norm": 0.50390625, + "learning_rate": 8.564487953491511e-05, + "loss": 0.2692, + "step": 17724 + }, + { + "epoch": 2.37, + "grad_norm": 0.67578125, + "learning_rate": 8.563335522410409e-05, + "loss": 0.4577, + "step": 17725 + }, + { + "epoch": 2.37, + "grad_norm": 0.427734375, + "learning_rate": 8.562183110811396e-05, + "loss": 0.1504, + "step": 17726 + }, + { + "epoch": 2.37, + "grad_norm": 0.6484375, + "learning_rate": 8.561030718710106e-05, + "loss": 0.3314, + "step": 17727 + }, + { + "epoch": 2.37, + "grad_norm": 0.498046875, + "learning_rate": 8.55987834612216e-05, + "loss": 0.3034, + "step": 17728 + }, + { + "epoch": 2.37, + "grad_norm": 0.64453125, + "learning_rate": 8.558725993063187e-05, + "loss": 0.5342, + "step": 17729 + }, + { + "epoch": 2.37, + "grad_norm": 0.484375, + "learning_rate": 8.557573659548818e-05, + "loss": 0.2745, + "step": 17730 + }, + { + "epoch": 2.37, + "grad_norm": 0.6640625, + "learning_rate": 8.556421345594673e-05, + "loss": 0.3988, + "step": 17731 + }, + { + "epoch": 2.37, + "grad_norm": 0.6875, + "learning_rate": 8.555269051216388e-05, + "loss": 0.3596, + "step": 17732 + }, + { + "epoch": 2.37, + "grad_norm": 0.6875, + "learning_rate": 8.554116776429577e-05, + "loss": 0.5024, + "step": 17733 + }, + { + "epoch": 2.37, + "grad_norm": 0.67578125, + "learning_rate": 8.552964521249869e-05, + "loss": 0.5155, + "step": 17734 + }, + { + "epoch": 2.37, + "grad_norm": 0.56640625, + "learning_rate": 8.551812285692893e-05, + "loss": 0.1739, + "step": 17735 + }, + { + "epoch": 2.37, + "grad_norm": 0.59765625, + "learning_rate": 8.55066006977427e-05, + "loss": 0.2313, + "step": 17736 + }, + { + "epoch": 2.37, + "grad_norm": 0.5234375, + "learning_rate": 8.549507873509628e-05, + "loss": 0.2646, + "step": 17737 + }, + { + "epoch": 2.37, + "grad_norm": 0.6015625, + "learning_rate": 8.54835569691459e-05, + "loss": 0.3158, + "step": 17738 + }, + { + "epoch": 2.37, + "grad_norm": 0.69921875, + "learning_rate": 8.54720354000478e-05, + "loss": 0.5545, + "step": 17739 + }, + { + "epoch": 2.37, + "grad_norm": 0.5390625, + "learning_rate": 8.546051402795822e-05, + "loss": 0.3116, + "step": 17740 + }, + { + "epoch": 2.37, + "grad_norm": 0.53515625, + "learning_rate": 8.54489928530334e-05, + "loss": 0.5066, + "step": 17741 + }, + { + "epoch": 2.37, + "grad_norm": 0.69140625, + "learning_rate": 8.543747187542957e-05, + "loss": 0.4144, + "step": 17742 + }, + { + "epoch": 2.37, + "grad_norm": 0.56640625, + "learning_rate": 8.542595109530301e-05, + "loss": 0.2818, + "step": 17743 + }, + { + "epoch": 2.37, + "grad_norm": 0.3515625, + "learning_rate": 8.54144305128099e-05, + "loss": 0.14, + "step": 17744 + }, + { + "epoch": 2.37, + "grad_norm": 0.486328125, + "learning_rate": 8.540291012810643e-05, + "loss": 0.3901, + "step": 17745 + }, + { + "epoch": 2.37, + "grad_norm": 0.455078125, + "learning_rate": 8.53913899413489e-05, + "loss": 0.1329, + "step": 17746 + }, + { + "epoch": 2.37, + "grad_norm": 0.65234375, + "learning_rate": 8.537986995269347e-05, + "loss": 0.2166, + "step": 17747 + }, + { + "epoch": 2.37, + "grad_norm": 0.63671875, + "learning_rate": 8.53683501622964e-05, + "loss": 0.4373, + "step": 17748 + }, + { + "epoch": 2.37, + "grad_norm": 0.54296875, + "learning_rate": 8.53568305703139e-05, + "loss": 0.2923, + "step": 17749 + }, + { + "epoch": 2.37, + "grad_norm": 0.5, + "learning_rate": 8.534531117690216e-05, + "loss": 0.1853, + "step": 17750 + }, + { + "epoch": 2.37, + "grad_norm": 0.62890625, + "learning_rate": 8.533379198221741e-05, + "loss": 0.5772, + "step": 17751 + }, + { + "epoch": 2.37, + "grad_norm": 0.58203125, + "learning_rate": 8.532227298641587e-05, + "loss": 0.4187, + "step": 17752 + }, + { + "epoch": 2.37, + "grad_norm": 0.6328125, + "learning_rate": 8.53107541896537e-05, + "loss": 0.3258, + "step": 17753 + }, + { + "epoch": 2.37, + "grad_norm": 0.6875, + "learning_rate": 8.529923559208714e-05, + "loss": 0.5231, + "step": 17754 + }, + { + "epoch": 2.37, + "grad_norm": 0.498046875, + "learning_rate": 8.528771719387239e-05, + "loss": 0.3538, + "step": 17755 + }, + { + "epoch": 2.37, + "grad_norm": 0.44140625, + "learning_rate": 8.527619899516567e-05, + "loss": 0.4505, + "step": 17756 + }, + { + "epoch": 2.37, + "grad_norm": 0.73046875, + "learning_rate": 8.526468099612309e-05, + "loss": 0.4035, + "step": 17757 + }, + { + "epoch": 2.37, + "grad_norm": 0.4609375, + "learning_rate": 8.525316319690092e-05, + "loss": 0.1226, + "step": 17758 + }, + { + "epoch": 2.37, + "grad_norm": 0.453125, + "learning_rate": 8.52416455976553e-05, + "loss": 0.1506, + "step": 17759 + }, + { + "epoch": 2.37, + "grad_norm": 0.796875, + "learning_rate": 8.523012819854246e-05, + "loss": 0.5627, + "step": 17760 + }, + { + "epoch": 2.37, + "grad_norm": 0.47265625, + "learning_rate": 8.521861099971854e-05, + "loss": 0.2016, + "step": 17761 + }, + { + "epoch": 2.37, + "grad_norm": 0.5390625, + "learning_rate": 8.520709400133975e-05, + "loss": 0.2449, + "step": 17762 + }, + { + "epoch": 2.37, + "grad_norm": 0.91796875, + "learning_rate": 8.519557720356225e-05, + "loss": 0.548, + "step": 17763 + }, + { + "epoch": 2.37, + "grad_norm": 0.443359375, + "learning_rate": 8.518406060654224e-05, + "loss": 0.2484, + "step": 17764 + }, + { + "epoch": 2.37, + "grad_norm": 0.40625, + "learning_rate": 8.517254421043586e-05, + "loss": 0.1795, + "step": 17765 + }, + { + "epoch": 2.37, + "grad_norm": 0.84375, + "learning_rate": 8.516102801539932e-05, + "loss": 0.3884, + "step": 17766 + }, + { + "epoch": 2.37, + "grad_norm": 0.337890625, + "learning_rate": 8.514951202158874e-05, + "loss": 0.1315, + "step": 17767 + }, + { + "epoch": 2.37, + "grad_norm": 0.447265625, + "learning_rate": 8.513799622916035e-05, + "loss": 0.2521, + "step": 17768 + }, + { + "epoch": 2.37, + "grad_norm": 0.57421875, + "learning_rate": 8.512648063827024e-05, + "loss": 0.3345, + "step": 17769 + }, + { + "epoch": 2.37, + "grad_norm": 0.6171875, + "learning_rate": 8.511496524907458e-05, + "loss": 0.4833, + "step": 17770 + }, + { + "epoch": 2.37, + "grad_norm": 0.4921875, + "learning_rate": 8.510345006172957e-05, + "loss": 0.3271, + "step": 17771 + }, + { + "epoch": 2.37, + "grad_norm": 0.62109375, + "learning_rate": 8.509193507639133e-05, + "loss": 0.3641, + "step": 17772 + }, + { + "epoch": 2.37, + "grad_norm": 0.91015625, + "learning_rate": 8.508042029321602e-05, + "loss": 0.6409, + "step": 17773 + }, + { + "epoch": 2.37, + "grad_norm": 0.56640625, + "learning_rate": 8.506890571235977e-05, + "loss": 0.4343, + "step": 17774 + }, + { + "epoch": 2.37, + "grad_norm": 0.6875, + "learning_rate": 8.505739133397877e-05, + "loss": 0.1948, + "step": 17775 + }, + { + "epoch": 2.37, + "grad_norm": 0.66015625, + "learning_rate": 8.50458771582291e-05, + "loss": 0.4571, + "step": 17776 + }, + { + "epoch": 2.37, + "grad_norm": 0.6640625, + "learning_rate": 8.503436318526693e-05, + "loss": 0.4394, + "step": 17777 + }, + { + "epoch": 2.37, + "grad_norm": 0.71875, + "learning_rate": 8.502284941524844e-05, + "loss": 0.5342, + "step": 17778 + }, + { + "epoch": 2.37, + "grad_norm": 0.474609375, + "learning_rate": 8.50113358483297e-05, + "loss": 0.3809, + "step": 17779 + }, + { + "epoch": 2.37, + "grad_norm": 0.4296875, + "learning_rate": 8.499982248466688e-05, + "loss": 0.266, + "step": 17780 + }, + { + "epoch": 2.37, + "grad_norm": 0.86328125, + "learning_rate": 8.498830932441612e-05, + "loss": 0.3011, + "step": 17781 + }, + { + "epoch": 2.37, + "grad_norm": 0.8046875, + "learning_rate": 8.497679636773347e-05, + "loss": 0.4818, + "step": 17782 + }, + { + "epoch": 2.37, + "grad_norm": 0.546875, + "learning_rate": 8.496528361477512e-05, + "loss": 0.398, + "step": 17783 + }, + { + "epoch": 2.37, + "grad_norm": 0.609375, + "learning_rate": 8.495377106569716e-05, + "loss": 0.4363, + "step": 17784 + }, + { + "epoch": 2.37, + "grad_norm": 0.76171875, + "learning_rate": 8.494225872065575e-05, + "loss": 0.2909, + "step": 17785 + }, + { + "epoch": 2.37, + "grad_norm": 0.46875, + "learning_rate": 8.493074657980694e-05, + "loss": 0.1996, + "step": 17786 + }, + { + "epoch": 2.37, + "grad_norm": 0.5625, + "learning_rate": 8.49192346433069e-05, + "loss": 0.4182, + "step": 17787 + }, + { + "epoch": 2.37, + "grad_norm": 0.65234375, + "learning_rate": 8.490772291131171e-05, + "loss": 0.4124, + "step": 17788 + }, + { + "epoch": 2.37, + "grad_norm": 0.50390625, + "learning_rate": 8.48962113839775e-05, + "loss": 0.4366, + "step": 17789 + }, + { + "epoch": 2.37, + "grad_norm": 0.40625, + "learning_rate": 8.488470006146034e-05, + "loss": 0.1534, + "step": 17790 + }, + { + "epoch": 2.37, + "grad_norm": 0.55859375, + "learning_rate": 8.487318894391635e-05, + "loss": 0.3705, + "step": 17791 + }, + { + "epoch": 2.37, + "grad_norm": 0.625, + "learning_rate": 8.486167803150162e-05, + "loss": 0.3351, + "step": 17792 + }, + { + "epoch": 2.37, + "grad_norm": 0.58984375, + "learning_rate": 8.485016732437228e-05, + "loss": 0.2594, + "step": 17793 + }, + { + "epoch": 2.37, + "grad_norm": 0.486328125, + "learning_rate": 8.48386568226844e-05, + "loss": 0.3385, + "step": 17794 + }, + { + "epoch": 2.37, + "grad_norm": 0.55078125, + "learning_rate": 8.482714652659402e-05, + "loss": 0.2108, + "step": 17795 + }, + { + "epoch": 2.37, + "grad_norm": 0.65625, + "learning_rate": 8.48156364362573e-05, + "loss": 0.4352, + "step": 17796 + }, + { + "epoch": 2.37, + "grad_norm": 0.53515625, + "learning_rate": 8.480412655183026e-05, + "loss": 0.3148, + "step": 17797 + }, + { + "epoch": 2.37, + "grad_norm": 0.4609375, + "learning_rate": 8.479261687346905e-05, + "loss": 0.1975, + "step": 17798 + }, + { + "epoch": 2.38, + "grad_norm": 0.4609375, + "learning_rate": 8.47811074013297e-05, + "loss": 0.2177, + "step": 17799 + }, + { + "epoch": 2.38, + "grad_norm": 0.54296875, + "learning_rate": 8.476959813556831e-05, + "loss": 0.1606, + "step": 17800 + }, + { + "epoch": 2.38, + "grad_norm": 0.5546875, + "learning_rate": 8.475808907634094e-05, + "loss": 0.2878, + "step": 17801 + }, + { + "epoch": 2.38, + "grad_norm": 0.515625, + "learning_rate": 8.474658022380365e-05, + "loss": 0.3085, + "step": 17802 + }, + { + "epoch": 2.38, + "grad_norm": 0.61328125, + "learning_rate": 8.473507157811254e-05, + "loss": 0.5437, + "step": 17803 + }, + { + "epoch": 2.38, + "grad_norm": 0.5859375, + "learning_rate": 8.472356313942365e-05, + "loss": 0.5286, + "step": 17804 + }, + { + "epoch": 2.38, + "grad_norm": 0.50390625, + "learning_rate": 8.471205490789308e-05, + "loss": 0.3071, + "step": 17805 + }, + { + "epoch": 2.38, + "grad_norm": 0.435546875, + "learning_rate": 8.470054688367685e-05, + "loss": 0.327, + "step": 17806 + }, + { + "epoch": 2.38, + "grad_norm": 0.5546875, + "learning_rate": 8.468903906693098e-05, + "loss": 0.2398, + "step": 17807 + }, + { + "epoch": 2.38, + "grad_norm": 0.66015625, + "learning_rate": 8.46775314578116e-05, + "loss": 0.4084, + "step": 17808 + }, + { + "epoch": 2.38, + "grad_norm": 0.474609375, + "learning_rate": 8.46660240564747e-05, + "loss": 0.4328, + "step": 17809 + }, + { + "epoch": 2.38, + "grad_norm": 0.361328125, + "learning_rate": 8.465451686307638e-05, + "loss": 0.1948, + "step": 17810 + }, + { + "epoch": 2.38, + "grad_norm": 0.5625, + "learning_rate": 8.464300987777265e-05, + "loss": 0.2027, + "step": 17811 + }, + { + "epoch": 2.38, + "grad_norm": 0.435546875, + "learning_rate": 8.463150310071957e-05, + "loss": 0.2226, + "step": 17812 + }, + { + "epoch": 2.38, + "grad_norm": 0.60546875, + "learning_rate": 8.461999653207315e-05, + "loss": 0.332, + "step": 17813 + }, + { + "epoch": 2.38, + "grad_norm": 0.345703125, + "learning_rate": 8.460849017198948e-05, + "loss": 0.1356, + "step": 17814 + }, + { + "epoch": 2.38, + "grad_norm": 0.65625, + "learning_rate": 8.459698402062452e-05, + "loss": 0.5643, + "step": 17815 + }, + { + "epoch": 2.38, + "grad_norm": 0.6484375, + "learning_rate": 8.458547807813439e-05, + "loss": 0.3652, + "step": 17816 + }, + { + "epoch": 2.38, + "grad_norm": 0.8046875, + "learning_rate": 8.457397234467506e-05, + "loss": 0.3288, + "step": 17817 + }, + { + "epoch": 2.38, + "grad_norm": 0.62109375, + "learning_rate": 8.456246682040256e-05, + "loss": 0.3556, + "step": 17818 + }, + { + "epoch": 2.38, + "grad_norm": 0.546875, + "learning_rate": 8.45509615054729e-05, + "loss": 0.3535, + "step": 17819 + }, + { + "epoch": 2.38, + "grad_norm": 0.71484375, + "learning_rate": 8.453945640004213e-05, + "loss": 0.4669, + "step": 17820 + }, + { + "epoch": 2.38, + "grad_norm": 0.46484375, + "learning_rate": 8.452795150426626e-05, + "loss": 0.179, + "step": 17821 + }, + { + "epoch": 2.38, + "grad_norm": 0.51953125, + "learning_rate": 8.451644681830126e-05, + "loss": 0.3001, + "step": 17822 + }, + { + "epoch": 2.38, + "grad_norm": 0.734375, + "learning_rate": 8.450494234230323e-05, + "loss": 0.3607, + "step": 17823 + }, + { + "epoch": 2.38, + "grad_norm": 0.83203125, + "learning_rate": 8.449343807642808e-05, + "loss": 0.3127, + "step": 17824 + }, + { + "epoch": 2.38, + "grad_norm": 0.53515625, + "learning_rate": 8.44819340208319e-05, + "loss": 0.347, + "step": 17825 + }, + { + "epoch": 2.38, + "grad_norm": 0.6875, + "learning_rate": 8.447043017567062e-05, + "loss": 0.4433, + "step": 17826 + }, + { + "epoch": 2.38, + "grad_norm": 0.51953125, + "learning_rate": 8.44589265411003e-05, + "loss": 0.4878, + "step": 17827 + }, + { + "epoch": 2.38, + "grad_norm": 0.50390625, + "learning_rate": 8.44474231172769e-05, + "loss": 0.1852, + "step": 17828 + }, + { + "epoch": 2.38, + "grad_norm": 0.5703125, + "learning_rate": 8.443591990435647e-05, + "loss": 0.4597, + "step": 17829 + }, + { + "epoch": 2.38, + "grad_norm": 0.6640625, + "learning_rate": 8.44244169024949e-05, + "loss": 0.4382, + "step": 17830 + }, + { + "epoch": 2.38, + "grad_norm": 0.56640625, + "learning_rate": 8.441291411184825e-05, + "loss": 0.5416, + "step": 17831 + }, + { + "epoch": 2.38, + "grad_norm": 0.5390625, + "learning_rate": 8.440141153257246e-05, + "loss": 0.2257, + "step": 17832 + }, + { + "epoch": 2.38, + "grad_norm": 0.61328125, + "learning_rate": 8.438990916482358e-05, + "loss": 0.3004, + "step": 17833 + }, + { + "epoch": 2.38, + "grad_norm": 0.74609375, + "learning_rate": 8.437840700875752e-05, + "loss": 0.2849, + "step": 17834 + }, + { + "epoch": 2.38, + "grad_norm": 0.57421875, + "learning_rate": 8.43669050645303e-05, + "loss": 0.1894, + "step": 17835 + }, + { + "epoch": 2.38, + "grad_norm": 0.4453125, + "learning_rate": 8.435540333229787e-05, + "loss": 0.2637, + "step": 17836 + }, + { + "epoch": 2.38, + "grad_norm": 0.61328125, + "learning_rate": 8.434390181221625e-05, + "loss": 0.6736, + "step": 17837 + }, + { + "epoch": 2.38, + "grad_norm": 0.58984375, + "learning_rate": 8.433240050444133e-05, + "loss": 0.284, + "step": 17838 + }, + { + "epoch": 2.38, + "grad_norm": 0.56640625, + "learning_rate": 8.432089940912913e-05, + "loss": 0.3329, + "step": 17839 + }, + { + "epoch": 2.38, + "grad_norm": 0.72265625, + "learning_rate": 8.430939852643558e-05, + "loss": 0.3834, + "step": 17840 + }, + { + "epoch": 2.38, + "grad_norm": 0.4921875, + "learning_rate": 8.429789785651671e-05, + "loss": 0.1982, + "step": 17841 + }, + { + "epoch": 2.38, + "grad_norm": 0.53125, + "learning_rate": 8.428639739952839e-05, + "loss": 0.4281, + "step": 17842 + }, + { + "epoch": 2.38, + "grad_norm": 0.6015625, + "learning_rate": 8.427489715562659e-05, + "loss": 0.3078, + "step": 17843 + }, + { + "epoch": 2.38, + "grad_norm": 0.6796875, + "learning_rate": 8.426339712496729e-05, + "loss": 0.3499, + "step": 17844 + }, + { + "epoch": 2.38, + "grad_norm": 0.734375, + "learning_rate": 8.425189730770641e-05, + "loss": 0.5775, + "step": 17845 + }, + { + "epoch": 2.38, + "grad_norm": 0.625, + "learning_rate": 8.424039770399993e-05, + "loss": 0.3095, + "step": 17846 + }, + { + "epoch": 2.38, + "grad_norm": 0.60546875, + "learning_rate": 8.422889831400376e-05, + "loss": 0.1991, + "step": 17847 + }, + { + "epoch": 2.38, + "grad_norm": 0.52734375, + "learning_rate": 8.421739913787386e-05, + "loss": 0.1953, + "step": 17848 + }, + { + "epoch": 2.38, + "grad_norm": 0.6796875, + "learning_rate": 8.420590017576613e-05, + "loss": 0.5751, + "step": 17849 + }, + { + "epoch": 2.38, + "grad_norm": 0.50390625, + "learning_rate": 8.419440142783653e-05, + "loss": 0.3859, + "step": 17850 + }, + { + "epoch": 2.38, + "grad_norm": 0.46484375, + "learning_rate": 8.418290289424102e-05, + "loss": 0.2739, + "step": 17851 + }, + { + "epoch": 2.38, + "grad_norm": 0.65234375, + "learning_rate": 8.417140457513548e-05, + "loss": 0.5578, + "step": 17852 + }, + { + "epoch": 2.38, + "grad_norm": 0.578125, + "learning_rate": 8.41599064706759e-05, + "loss": 0.1901, + "step": 17853 + }, + { + "epoch": 2.38, + "grad_norm": 0.6484375, + "learning_rate": 8.414840858101812e-05, + "loss": 0.6107, + "step": 17854 + }, + { + "epoch": 2.38, + "grad_norm": 0.404296875, + "learning_rate": 8.413691090631808e-05, + "loss": 0.1675, + "step": 17855 + }, + { + "epoch": 2.38, + "grad_norm": 0.66015625, + "learning_rate": 8.412541344673172e-05, + "loss": 0.519, + "step": 17856 + }, + { + "epoch": 2.38, + "grad_norm": 0.609375, + "learning_rate": 8.411391620241493e-05, + "loss": 0.2803, + "step": 17857 + }, + { + "epoch": 2.38, + "grad_norm": 0.59375, + "learning_rate": 8.410241917352365e-05, + "loss": 0.356, + "step": 17858 + }, + { + "epoch": 2.38, + "grad_norm": 0.69921875, + "learning_rate": 8.409092236021374e-05, + "loss": 0.5917, + "step": 17859 + }, + { + "epoch": 2.38, + "grad_norm": 0.65625, + "learning_rate": 8.407942576264117e-05, + "loss": 0.2702, + "step": 17860 + }, + { + "epoch": 2.38, + "grad_norm": 0.5390625, + "learning_rate": 8.406792938096178e-05, + "loss": 0.1885, + "step": 17861 + }, + { + "epoch": 2.38, + "grad_norm": 0.5703125, + "learning_rate": 8.405643321533151e-05, + "loss": 0.3874, + "step": 17862 + }, + { + "epoch": 2.38, + "grad_norm": 0.55078125, + "learning_rate": 8.404493726590621e-05, + "loss": 0.1959, + "step": 17863 + }, + { + "epoch": 2.38, + "grad_norm": 0.447265625, + "learning_rate": 8.403344153284182e-05, + "loss": 0.2075, + "step": 17864 + }, + { + "epoch": 2.38, + "grad_norm": 0.61328125, + "learning_rate": 8.40219460162942e-05, + "loss": 0.1801, + "step": 17865 + }, + { + "epoch": 2.38, + "grad_norm": 0.66015625, + "learning_rate": 8.40104507164193e-05, + "loss": 0.2721, + "step": 17866 + }, + { + "epoch": 2.38, + "grad_norm": 0.609375, + "learning_rate": 8.399895563337291e-05, + "loss": 0.3603, + "step": 17867 + }, + { + "epoch": 2.38, + "grad_norm": 0.51953125, + "learning_rate": 8.398746076731094e-05, + "loss": 0.2085, + "step": 17868 + }, + { + "epoch": 2.38, + "grad_norm": 0.486328125, + "learning_rate": 8.397596611838929e-05, + "loss": 0.1587, + "step": 17869 + }, + { + "epoch": 2.38, + "grad_norm": 0.6484375, + "learning_rate": 8.396447168676381e-05, + "loss": 0.5408, + "step": 17870 + }, + { + "epoch": 2.38, + "grad_norm": 0.466796875, + "learning_rate": 8.395297747259042e-05, + "loss": 0.2004, + "step": 17871 + }, + { + "epoch": 2.38, + "grad_norm": 0.5625, + "learning_rate": 8.394148347602491e-05, + "loss": 0.212, + "step": 17872 + }, + { + "epoch": 2.38, + "grad_norm": 0.66015625, + "learning_rate": 8.392998969722321e-05, + "loss": 0.3355, + "step": 17873 + }, + { + "epoch": 2.39, + "grad_norm": 0.451171875, + "learning_rate": 8.391849613634115e-05, + "loss": 0.2218, + "step": 17874 + }, + { + "epoch": 2.39, + "grad_norm": 0.72265625, + "learning_rate": 8.39070027935346e-05, + "loss": 0.2514, + "step": 17875 + }, + { + "epoch": 2.39, + "grad_norm": 0.62109375, + "learning_rate": 8.389550966895944e-05, + "loss": 0.3085, + "step": 17876 + }, + { + "epoch": 2.39, + "grad_norm": 0.47265625, + "learning_rate": 8.38840167627715e-05, + "loss": 0.2172, + "step": 17877 + }, + { + "epoch": 2.39, + "grad_norm": 0.66796875, + "learning_rate": 8.387252407512667e-05, + "loss": 0.2898, + "step": 17878 + }, + { + "epoch": 2.39, + "grad_norm": 0.65234375, + "learning_rate": 8.386103160618076e-05, + "loss": 0.2983, + "step": 17879 + }, + { + "epoch": 2.39, + "grad_norm": 0.625, + "learning_rate": 8.384953935608957e-05, + "loss": 0.6316, + "step": 17880 + }, + { + "epoch": 2.39, + "grad_norm": 0.375, + "learning_rate": 8.383804732500902e-05, + "loss": 0.1437, + "step": 17881 + }, + { + "epoch": 2.39, + "grad_norm": 0.54296875, + "learning_rate": 8.38265555130949e-05, + "loss": 0.5029, + "step": 17882 + }, + { + "epoch": 2.39, + "grad_norm": 0.65234375, + "learning_rate": 8.38150639205031e-05, + "loss": 0.4457, + "step": 17883 + }, + { + "epoch": 2.39, + "grad_norm": 0.4609375, + "learning_rate": 8.380357254738938e-05, + "loss": 0.1605, + "step": 17884 + }, + { + "epoch": 2.39, + "grad_norm": 0.6328125, + "learning_rate": 8.379208139390964e-05, + "loss": 0.5517, + "step": 17885 + }, + { + "epoch": 2.39, + "grad_norm": 0.5625, + "learning_rate": 8.378059046021966e-05, + "loss": 0.1532, + "step": 17886 + }, + { + "epoch": 2.39, + "grad_norm": 0.6015625, + "learning_rate": 8.37690997464753e-05, + "loss": 0.4782, + "step": 17887 + }, + { + "epoch": 2.39, + "grad_norm": 0.59375, + "learning_rate": 8.375760925283235e-05, + "loss": 0.5057, + "step": 17888 + }, + { + "epoch": 2.39, + "grad_norm": 0.9296875, + "learning_rate": 8.374611897944666e-05, + "loss": 0.5213, + "step": 17889 + }, + { + "epoch": 2.39, + "grad_norm": 0.828125, + "learning_rate": 8.373462892647406e-05, + "loss": 0.4044, + "step": 17890 + }, + { + "epoch": 2.39, + "grad_norm": 0.6328125, + "learning_rate": 8.372313909407027e-05, + "loss": 0.2971, + "step": 17891 + }, + { + "epoch": 2.39, + "grad_norm": 0.55078125, + "learning_rate": 8.371164948239118e-05, + "loss": 0.3185, + "step": 17892 + }, + { + "epoch": 2.39, + "grad_norm": 0.64453125, + "learning_rate": 8.370016009159257e-05, + "loss": 0.2754, + "step": 17893 + }, + { + "epoch": 2.39, + "grad_norm": 0.6328125, + "learning_rate": 8.368867092183027e-05, + "loss": 0.2609, + "step": 17894 + }, + { + "epoch": 2.39, + "grad_norm": 0.55859375, + "learning_rate": 8.367718197326005e-05, + "loss": 0.5404, + "step": 17895 + }, + { + "epoch": 2.39, + "grad_norm": 0.515625, + "learning_rate": 8.366569324603772e-05, + "loss": 0.3416, + "step": 17896 + }, + { + "epoch": 2.39, + "grad_norm": 0.7421875, + "learning_rate": 8.365420474031905e-05, + "loss": 0.3313, + "step": 17897 + }, + { + "epoch": 2.39, + "grad_norm": 0.47265625, + "learning_rate": 8.364271645625989e-05, + "loss": 0.2049, + "step": 17898 + }, + { + "epoch": 2.39, + "grad_norm": 0.6640625, + "learning_rate": 8.363122839401597e-05, + "loss": 0.703, + "step": 17899 + }, + { + "epoch": 2.39, + "grad_norm": 0.67578125, + "learning_rate": 8.361974055374308e-05, + "loss": 0.3155, + "step": 17900 + }, + { + "epoch": 2.39, + "grad_norm": 0.609375, + "learning_rate": 8.360825293559706e-05, + "loss": 0.6068, + "step": 17901 + }, + { + "epoch": 2.39, + "grad_norm": 0.734375, + "learning_rate": 8.359676553973367e-05, + "loss": 0.5051, + "step": 17902 + }, + { + "epoch": 2.39, + "grad_norm": 0.392578125, + "learning_rate": 8.358527836630862e-05, + "loss": 0.1398, + "step": 17903 + }, + { + "epoch": 2.39, + "grad_norm": 0.46875, + "learning_rate": 8.357379141547776e-05, + "loss": 0.2215, + "step": 17904 + }, + { + "epoch": 2.39, + "grad_norm": 0.796875, + "learning_rate": 8.35623046873968e-05, + "loss": 0.6201, + "step": 17905 + }, + { + "epoch": 2.39, + "grad_norm": 0.50390625, + "learning_rate": 8.355081818222156e-05, + "loss": 0.2371, + "step": 17906 + }, + { + "epoch": 2.39, + "grad_norm": 0.703125, + "learning_rate": 8.353933190010777e-05, + "loss": 0.5222, + "step": 17907 + }, + { + "epoch": 2.39, + "grad_norm": 0.54296875, + "learning_rate": 8.35278458412112e-05, + "loss": 0.3349, + "step": 17908 + }, + { + "epoch": 2.39, + "grad_norm": 0.58984375, + "learning_rate": 8.351636000568762e-05, + "loss": 0.4468, + "step": 17909 + }, + { + "epoch": 2.39, + "grad_norm": 0.392578125, + "learning_rate": 8.350487439369279e-05, + "loss": 0.2354, + "step": 17910 + }, + { + "epoch": 2.39, + "grad_norm": 0.57421875, + "learning_rate": 8.349338900538241e-05, + "loss": 0.2757, + "step": 17911 + }, + { + "epoch": 2.39, + "grad_norm": 0.51953125, + "learning_rate": 8.348190384091232e-05, + "loss": 0.484, + "step": 17912 + }, + { + "epoch": 2.39, + "grad_norm": 0.5859375, + "learning_rate": 8.347041890043818e-05, + "loss": 0.2036, + "step": 17913 + }, + { + "epoch": 2.39, + "grad_norm": 0.625, + "learning_rate": 8.34589341841158e-05, + "loss": 0.3368, + "step": 17914 + }, + { + "epoch": 2.39, + "grad_norm": 0.58984375, + "learning_rate": 8.344744969210089e-05, + "loss": 0.247, + "step": 17915 + }, + { + "epoch": 2.39, + "grad_norm": 0.5078125, + "learning_rate": 8.343596542454915e-05, + "loss": 0.4012, + "step": 17916 + }, + { + "epoch": 2.39, + "grad_norm": 0.48828125, + "learning_rate": 8.342448138161638e-05, + "loss": 0.2249, + "step": 17917 + }, + { + "epoch": 2.39, + "grad_norm": 0.58203125, + "learning_rate": 8.341299756345825e-05, + "loss": 0.2941, + "step": 17918 + }, + { + "epoch": 2.39, + "grad_norm": 0.5859375, + "learning_rate": 8.340151397023054e-05, + "loss": 0.6435, + "step": 17919 + }, + { + "epoch": 2.39, + "grad_norm": 0.5625, + "learning_rate": 8.339003060208894e-05, + "loss": 0.5243, + "step": 17920 + }, + { + "epoch": 2.39, + "grad_norm": 0.51953125, + "learning_rate": 8.337854745918919e-05, + "loss": 0.5131, + "step": 17921 + }, + { + "epoch": 2.39, + "grad_norm": 0.51953125, + "learning_rate": 8.336706454168701e-05, + "loss": 0.1943, + "step": 17922 + }, + { + "epoch": 2.39, + "grad_norm": 0.69921875, + "learning_rate": 8.335558184973809e-05, + "loss": 0.539, + "step": 17923 + }, + { + "epoch": 2.39, + "grad_norm": 0.63671875, + "learning_rate": 8.334409938349818e-05, + "loss": 0.5155, + "step": 17924 + }, + { + "epoch": 2.39, + "grad_norm": 0.80859375, + "learning_rate": 8.333261714312296e-05, + "loss": 0.3576, + "step": 17925 + }, + { + "epoch": 2.39, + "grad_norm": 0.45703125, + "learning_rate": 8.332113512876818e-05, + "loss": 0.1959, + "step": 17926 + }, + { + "epoch": 2.39, + "grad_norm": 0.79296875, + "learning_rate": 8.330965334058951e-05, + "loss": 0.1839, + "step": 17927 + }, + { + "epoch": 2.39, + "grad_norm": 0.59375, + "learning_rate": 8.329817177874261e-05, + "loss": 0.2386, + "step": 17928 + }, + { + "epoch": 2.39, + "grad_norm": 0.50390625, + "learning_rate": 8.328669044338327e-05, + "loss": 0.1949, + "step": 17929 + }, + { + "epoch": 2.39, + "grad_norm": 0.3984375, + "learning_rate": 8.327520933466709e-05, + "loss": 0.2339, + "step": 17930 + }, + { + "epoch": 2.39, + "grad_norm": 0.5078125, + "learning_rate": 8.326372845274983e-05, + "loss": 0.2708, + "step": 17931 + }, + { + "epoch": 2.39, + "grad_norm": 0.52734375, + "learning_rate": 8.325224779778713e-05, + "loss": 0.3055, + "step": 17932 + }, + { + "epoch": 2.39, + "grad_norm": 0.578125, + "learning_rate": 8.324076736993474e-05, + "loss": 0.2893, + "step": 17933 + }, + { + "epoch": 2.39, + "grad_norm": 0.5078125, + "learning_rate": 8.322928716934826e-05, + "loss": 0.2446, + "step": 17934 + }, + { + "epoch": 2.39, + "grad_norm": 0.44921875, + "learning_rate": 8.321780719618343e-05, + "loss": 0.2031, + "step": 17935 + }, + { + "epoch": 2.39, + "grad_norm": 0.5546875, + "learning_rate": 8.320632745059588e-05, + "loss": 0.2704, + "step": 17936 + }, + { + "epoch": 2.39, + "grad_norm": 0.435546875, + "learning_rate": 8.319484793274133e-05, + "loss": 0.1451, + "step": 17937 + }, + { + "epoch": 2.39, + "grad_norm": 0.59375, + "learning_rate": 8.318336864277548e-05, + "loss": 0.327, + "step": 17938 + }, + { + "epoch": 2.39, + "grad_norm": 0.65234375, + "learning_rate": 8.317188958085387e-05, + "loss": 0.2652, + "step": 17939 + }, + { + "epoch": 2.39, + "grad_norm": 0.57421875, + "learning_rate": 8.316041074713225e-05, + "loss": 0.2853, + "step": 17940 + }, + { + "epoch": 2.39, + "grad_norm": 0.98828125, + "learning_rate": 8.314893214176626e-05, + "loss": 0.4723, + "step": 17941 + }, + { + "epoch": 2.39, + "grad_norm": 0.67578125, + "learning_rate": 8.31374537649116e-05, + "loss": 0.4747, + "step": 17942 + }, + { + "epoch": 2.39, + "grad_norm": 0.55078125, + "learning_rate": 8.312597561672386e-05, + "loss": 0.3806, + "step": 17943 + }, + { + "epoch": 2.39, + "grad_norm": 0.66015625, + "learning_rate": 8.311449769735873e-05, + "loss": 0.2614, + "step": 17944 + }, + { + "epoch": 2.39, + "grad_norm": 0.486328125, + "learning_rate": 8.310302000697185e-05, + "loss": 0.3415, + "step": 17945 + }, + { + "epoch": 2.39, + "grad_norm": 0.6640625, + "learning_rate": 8.309154254571886e-05, + "loss": 0.5206, + "step": 17946 + }, + { + "epoch": 2.39, + "grad_norm": 0.7421875, + "learning_rate": 8.308006531375539e-05, + "loss": 0.4706, + "step": 17947 + }, + { + "epoch": 2.39, + "grad_norm": 0.53515625, + "learning_rate": 8.30685883112371e-05, + "loss": 0.2665, + "step": 17948 + }, + { + "epoch": 2.4, + "grad_norm": 0.546875, + "learning_rate": 8.305711153831964e-05, + "loss": 0.3002, + "step": 17949 + }, + { + "epoch": 2.4, + "grad_norm": 0.458984375, + "learning_rate": 8.304563499515859e-05, + "loss": 0.301, + "step": 17950 + }, + { + "epoch": 2.4, + "grad_norm": 0.4921875, + "learning_rate": 8.303415868190967e-05, + "loss": 0.3102, + "step": 17951 + }, + { + "epoch": 2.4, + "grad_norm": 0.4453125, + "learning_rate": 8.30226825987284e-05, + "loss": 0.3127, + "step": 17952 + }, + { + "epoch": 2.4, + "grad_norm": 0.494140625, + "learning_rate": 8.301120674577046e-05, + "loss": 0.3916, + "step": 17953 + }, + { + "epoch": 2.4, + "grad_norm": 0.5390625, + "learning_rate": 8.299973112319146e-05, + "loss": 0.4075, + "step": 17954 + }, + { + "epoch": 2.4, + "grad_norm": 0.76171875, + "learning_rate": 8.2988255731147e-05, + "loss": 0.3693, + "step": 17955 + }, + { + "epoch": 2.4, + "grad_norm": 0.5234375, + "learning_rate": 8.297678056979273e-05, + "loss": 0.4847, + "step": 17956 + }, + { + "epoch": 2.4, + "grad_norm": 0.55859375, + "learning_rate": 8.296530563928423e-05, + "loss": 0.2903, + "step": 17957 + }, + { + "epoch": 2.4, + "grad_norm": 0.515625, + "learning_rate": 8.295383093977713e-05, + "loss": 0.3745, + "step": 17958 + }, + { + "epoch": 2.4, + "grad_norm": 0.63671875, + "learning_rate": 8.294235647142701e-05, + "loss": 0.2818, + "step": 17959 + }, + { + "epoch": 2.4, + "grad_norm": 0.69921875, + "learning_rate": 8.29308822343895e-05, + "loss": 0.3974, + "step": 17960 + }, + { + "epoch": 2.4, + "grad_norm": 0.546875, + "learning_rate": 8.291940822882016e-05, + "loss": 0.1782, + "step": 17961 + }, + { + "epoch": 2.4, + "grad_norm": 0.4375, + "learning_rate": 8.290793445487462e-05, + "loss": 0.2247, + "step": 17962 + }, + { + "epoch": 2.4, + "grad_norm": 0.65234375, + "learning_rate": 8.289646091270849e-05, + "loss": 0.628, + "step": 17963 + }, + { + "epoch": 2.4, + "grad_norm": 0.578125, + "learning_rate": 8.288498760247727e-05, + "loss": 0.3504, + "step": 17964 + }, + { + "epoch": 2.4, + "grad_norm": 0.59765625, + "learning_rate": 8.287351452433663e-05, + "loss": 0.2633, + "step": 17965 + }, + { + "epoch": 2.4, + "grad_norm": 0.494140625, + "learning_rate": 8.286204167844212e-05, + "loss": 0.2857, + "step": 17966 + }, + { + "epoch": 2.4, + "grad_norm": 0.478515625, + "learning_rate": 8.285056906494933e-05, + "loss": 0.3379, + "step": 17967 + }, + { + "epoch": 2.4, + "grad_norm": 0.70703125, + "learning_rate": 8.283909668401382e-05, + "loss": 0.358, + "step": 17968 + }, + { + "epoch": 2.4, + "grad_norm": 0.51171875, + "learning_rate": 8.282762453579118e-05, + "loss": 0.2578, + "step": 17969 + }, + { + "epoch": 2.4, + "grad_norm": 0.71875, + "learning_rate": 8.281615262043696e-05, + "loss": 0.334, + "step": 17970 + }, + { + "epoch": 2.4, + "grad_norm": 0.388671875, + "learning_rate": 8.280468093810677e-05, + "loss": 0.2672, + "step": 17971 + }, + { + "epoch": 2.4, + "grad_norm": 0.6328125, + "learning_rate": 8.27932094889561e-05, + "loss": 0.399, + "step": 17972 + }, + { + "epoch": 2.4, + "grad_norm": 0.5859375, + "learning_rate": 8.278173827314056e-05, + "loss": 0.2189, + "step": 17973 + }, + { + "epoch": 2.4, + "grad_norm": 0.435546875, + "learning_rate": 8.277026729081572e-05, + "loss": 0.2621, + "step": 17974 + }, + { + "epoch": 2.4, + "grad_norm": 0.8359375, + "learning_rate": 8.275879654213714e-05, + "loss": 0.4126, + "step": 17975 + }, + { + "epoch": 2.4, + "grad_norm": 0.484375, + "learning_rate": 8.27473260272603e-05, + "loss": 0.1379, + "step": 17976 + }, + { + "epoch": 2.4, + "grad_norm": 0.59765625, + "learning_rate": 8.273585574634081e-05, + "loss": 0.5074, + "step": 17977 + }, + { + "epoch": 2.4, + "grad_norm": 0.60546875, + "learning_rate": 8.272438569953416e-05, + "loss": 0.5431, + "step": 17978 + }, + { + "epoch": 2.4, + "grad_norm": 0.75390625, + "learning_rate": 8.271291588699598e-05, + "loss": 0.3821, + "step": 17979 + }, + { + "epoch": 2.4, + "grad_norm": 0.51953125, + "learning_rate": 8.270144630888172e-05, + "loss": 0.5259, + "step": 17980 + }, + { + "epoch": 2.4, + "grad_norm": 0.53515625, + "learning_rate": 8.268997696534697e-05, + "loss": 0.2362, + "step": 17981 + }, + { + "epoch": 2.4, + "grad_norm": 0.69140625, + "learning_rate": 8.267850785654722e-05, + "loss": 0.3667, + "step": 17982 + }, + { + "epoch": 2.4, + "grad_norm": 0.77734375, + "learning_rate": 8.266703898263804e-05, + "loss": 0.4645, + "step": 17983 + }, + { + "epoch": 2.4, + "grad_norm": 0.625, + "learning_rate": 8.265557034377491e-05, + "loss": 0.3254, + "step": 17984 + }, + { + "epoch": 2.4, + "grad_norm": 0.55078125, + "learning_rate": 8.26441019401134e-05, + "loss": 0.1863, + "step": 17985 + }, + { + "epoch": 2.4, + "grad_norm": 0.61328125, + "learning_rate": 8.2632633771809e-05, + "loss": 0.2335, + "step": 17986 + }, + { + "epoch": 2.4, + "grad_norm": 0.62890625, + "learning_rate": 8.262116583901726e-05, + "loss": 0.4069, + "step": 17987 + }, + { + "epoch": 2.4, + "grad_norm": 0.54296875, + "learning_rate": 8.260969814189366e-05, + "loss": 0.3989, + "step": 17988 + }, + { + "epoch": 2.4, + "grad_norm": 0.51171875, + "learning_rate": 8.259823068059368e-05, + "loss": 0.2858, + "step": 17989 + }, + { + "epoch": 2.4, + "grad_norm": 0.59375, + "learning_rate": 8.258676345527289e-05, + "loss": 0.3683, + "step": 17990 + }, + { + "epoch": 2.4, + "grad_norm": 0.55078125, + "learning_rate": 8.257529646608672e-05, + "loss": 0.2233, + "step": 17991 + }, + { + "epoch": 2.4, + "grad_norm": 0.67578125, + "learning_rate": 8.256382971319076e-05, + "loss": 0.3127, + "step": 17992 + }, + { + "epoch": 2.4, + "grad_norm": 0.8671875, + "learning_rate": 8.255236319674043e-05, + "loss": 0.347, + "step": 17993 + }, + { + "epoch": 2.4, + "grad_norm": 0.4453125, + "learning_rate": 8.254089691689128e-05, + "loss": 0.3588, + "step": 17994 + }, + { + "epoch": 2.4, + "grad_norm": 0.6796875, + "learning_rate": 8.252943087379874e-05, + "loss": 0.4262, + "step": 17995 + }, + { + "epoch": 2.4, + "grad_norm": 0.48828125, + "learning_rate": 8.251796506761833e-05, + "loss": 0.3711, + "step": 17996 + }, + { + "epoch": 2.4, + "grad_norm": 0.400390625, + "learning_rate": 8.250649949850556e-05, + "loss": 0.1596, + "step": 17997 + }, + { + "epoch": 2.4, + "grad_norm": 0.609375, + "learning_rate": 8.249503416661587e-05, + "loss": 0.4574, + "step": 17998 + }, + { + "epoch": 2.4, + "grad_norm": 0.65234375, + "learning_rate": 8.248356907210478e-05, + "loss": 0.2073, + "step": 17999 + }, + { + "epoch": 2.4, + "grad_norm": 0.546875, + "learning_rate": 8.247210421512772e-05, + "loss": 0.4243, + "step": 18000 + }, + { + "epoch": 2.4, + "grad_norm": 0.6171875, + "learning_rate": 8.246063959584016e-05, + "loss": 0.3449, + "step": 18001 + }, + { + "epoch": 2.4, + "grad_norm": 0.73828125, + "learning_rate": 8.24491752143976e-05, + "loss": 0.3808, + "step": 18002 + }, + { + "epoch": 2.4, + "grad_norm": 0.6640625, + "learning_rate": 8.243771107095546e-05, + "loss": 0.396, + "step": 18003 + }, + { + "epoch": 2.4, + "grad_norm": 0.7734375, + "learning_rate": 8.242624716566927e-05, + "loss": 0.4339, + "step": 18004 + }, + { + "epoch": 2.4, + "grad_norm": 0.87109375, + "learning_rate": 8.24147834986944e-05, + "loss": 0.4414, + "step": 18005 + }, + { + "epoch": 2.4, + "grad_norm": 0.94921875, + "learning_rate": 8.24033200701864e-05, + "loss": 0.4568, + "step": 18006 + }, + { + "epoch": 2.4, + "grad_norm": 0.6171875, + "learning_rate": 8.239185688030063e-05, + "loss": 0.4934, + "step": 18007 + }, + { + "epoch": 2.4, + "grad_norm": 0.53125, + "learning_rate": 8.238039392919262e-05, + "loss": 0.2275, + "step": 18008 + }, + { + "epoch": 2.4, + "grad_norm": 0.6796875, + "learning_rate": 8.236893121701774e-05, + "loss": 0.5174, + "step": 18009 + }, + { + "epoch": 2.4, + "grad_norm": 0.447265625, + "learning_rate": 8.23574687439315e-05, + "loss": 0.2955, + "step": 18010 + }, + { + "epoch": 2.4, + "grad_norm": 0.578125, + "learning_rate": 8.234600651008933e-05, + "loss": 0.3503, + "step": 18011 + }, + { + "epoch": 2.4, + "grad_norm": 0.51171875, + "learning_rate": 8.233454451564659e-05, + "loss": 0.3308, + "step": 18012 + }, + { + "epoch": 2.4, + "grad_norm": 0.53515625, + "learning_rate": 8.232308276075878e-05, + "loss": 0.4961, + "step": 18013 + }, + { + "epoch": 2.4, + "grad_norm": 0.578125, + "learning_rate": 8.231162124558131e-05, + "loss": 0.3655, + "step": 18014 + }, + { + "epoch": 2.4, + "grad_norm": 0.6484375, + "learning_rate": 8.230015997026962e-05, + "loss": 0.3117, + "step": 18015 + }, + { + "epoch": 2.4, + "grad_norm": 0.6015625, + "learning_rate": 8.22886989349791e-05, + "loss": 0.3152, + "step": 18016 + }, + { + "epoch": 2.4, + "grad_norm": 0.60546875, + "learning_rate": 8.227723813986521e-05, + "loss": 0.2194, + "step": 18017 + }, + { + "epoch": 2.4, + "grad_norm": 0.470703125, + "learning_rate": 8.226577758508333e-05, + "loss": 0.368, + "step": 18018 + }, + { + "epoch": 2.4, + "grad_norm": 0.6328125, + "learning_rate": 8.22543172707889e-05, + "loss": 0.2621, + "step": 18019 + }, + { + "epoch": 2.4, + "grad_norm": 0.515625, + "learning_rate": 8.224285719713729e-05, + "loss": 0.4553, + "step": 18020 + }, + { + "epoch": 2.4, + "grad_norm": 0.62109375, + "learning_rate": 8.223139736428395e-05, + "loss": 0.2691, + "step": 18021 + }, + { + "epoch": 2.4, + "grad_norm": 0.50390625, + "learning_rate": 8.221993777238428e-05, + "loss": 0.2245, + "step": 18022 + }, + { + "epoch": 2.4, + "grad_norm": 0.59765625, + "learning_rate": 8.220847842159368e-05, + "loss": 0.4389, + "step": 18023 + }, + { + "epoch": 2.41, + "grad_norm": 0.734375, + "learning_rate": 8.219701931206749e-05, + "loss": 0.2875, + "step": 18024 + }, + { + "epoch": 2.41, + "grad_norm": 0.5390625, + "learning_rate": 8.218556044396118e-05, + "loss": 0.1761, + "step": 18025 + }, + { + "epoch": 2.41, + "grad_norm": 0.4296875, + "learning_rate": 8.217410181743006e-05, + "loss": 0.2067, + "step": 18026 + }, + { + "epoch": 2.41, + "grad_norm": 0.54296875, + "learning_rate": 8.21626434326296e-05, + "loss": 0.2891, + "step": 18027 + }, + { + "epoch": 2.41, + "grad_norm": 0.58984375, + "learning_rate": 8.215118528971511e-05, + "loss": 0.5016, + "step": 18028 + }, + { + "epoch": 2.41, + "grad_norm": 0.478515625, + "learning_rate": 8.213972738884202e-05, + "loss": 0.2087, + "step": 18029 + }, + { + "epoch": 2.41, + "grad_norm": 0.75390625, + "learning_rate": 8.212826973016568e-05, + "loss": 0.3955, + "step": 18030 + }, + { + "epoch": 2.41, + "grad_norm": 0.640625, + "learning_rate": 8.211681231384149e-05, + "loss": 0.4241, + "step": 18031 + }, + { + "epoch": 2.41, + "grad_norm": 0.55859375, + "learning_rate": 8.210535514002477e-05, + "loss": 0.359, + "step": 18032 + }, + { + "epoch": 2.41, + "grad_norm": 0.498046875, + "learning_rate": 8.209389820887096e-05, + "loss": 0.3086, + "step": 18033 + }, + { + "epoch": 2.41, + "grad_norm": 0.68359375, + "learning_rate": 8.208244152053535e-05, + "loss": 0.4725, + "step": 18034 + }, + { + "epoch": 2.41, + "grad_norm": 0.54296875, + "learning_rate": 8.207098507517336e-05, + "loss": 0.3716, + "step": 18035 + }, + { + "epoch": 2.41, + "grad_norm": 0.474609375, + "learning_rate": 8.205952887294032e-05, + "loss": 0.3237, + "step": 18036 + }, + { + "epoch": 2.41, + "grad_norm": 0.65234375, + "learning_rate": 8.204807291399157e-05, + "loss": 0.4775, + "step": 18037 + }, + { + "epoch": 2.41, + "grad_norm": 0.55078125, + "learning_rate": 8.203661719848248e-05, + "loss": 0.2697, + "step": 18038 + }, + { + "epoch": 2.41, + "grad_norm": 0.6015625, + "learning_rate": 8.202516172656836e-05, + "loss": 0.3588, + "step": 18039 + }, + { + "epoch": 2.41, + "grad_norm": 0.56640625, + "learning_rate": 8.201370649840462e-05, + "loss": 0.4004, + "step": 18040 + }, + { + "epoch": 2.41, + "grad_norm": 0.546875, + "learning_rate": 8.200225151414654e-05, + "loss": 0.2687, + "step": 18041 + }, + { + "epoch": 2.41, + "grad_norm": 0.51171875, + "learning_rate": 8.199079677394949e-05, + "loss": 0.2242, + "step": 18042 + }, + { + "epoch": 2.41, + "grad_norm": 0.66796875, + "learning_rate": 8.197934227796877e-05, + "loss": 0.384, + "step": 18043 + }, + { + "epoch": 2.41, + "grad_norm": 0.5546875, + "learning_rate": 8.196788802635977e-05, + "loss": 0.5177, + "step": 18044 + }, + { + "epoch": 2.41, + "grad_norm": 0.5859375, + "learning_rate": 8.195643401927777e-05, + "loss": 0.3627, + "step": 18045 + }, + { + "epoch": 2.41, + "grad_norm": 0.486328125, + "learning_rate": 8.194498025687808e-05, + "loss": 0.2171, + "step": 18046 + }, + { + "epoch": 2.41, + "grad_norm": 0.46875, + "learning_rate": 8.193352673931608e-05, + "loss": 0.3241, + "step": 18047 + }, + { + "epoch": 2.41, + "grad_norm": 0.57421875, + "learning_rate": 8.192207346674707e-05, + "loss": 0.2153, + "step": 18048 + }, + { + "epoch": 2.41, + "grad_norm": 0.6328125, + "learning_rate": 8.19106204393263e-05, + "loss": 0.4745, + "step": 18049 + }, + { + "epoch": 2.41, + "grad_norm": 0.625, + "learning_rate": 8.189916765720916e-05, + "loss": 0.3297, + "step": 18050 + }, + { + "epoch": 2.41, + "grad_norm": 0.61328125, + "learning_rate": 8.18877151205509e-05, + "loss": 0.2155, + "step": 18051 + }, + { + "epoch": 2.41, + "grad_norm": 0.46484375, + "learning_rate": 8.187626282950684e-05, + "loss": 0.472, + "step": 18052 + }, + { + "epoch": 2.41, + "grad_norm": 0.62890625, + "learning_rate": 8.186481078423229e-05, + "loss": 0.4897, + "step": 18053 + }, + { + "epoch": 2.41, + "grad_norm": 0.578125, + "learning_rate": 8.185335898488258e-05, + "loss": 0.3893, + "step": 18054 + }, + { + "epoch": 2.41, + "grad_norm": 0.73046875, + "learning_rate": 8.184190743161293e-05, + "loss": 0.4559, + "step": 18055 + }, + { + "epoch": 2.41, + "grad_norm": 0.494140625, + "learning_rate": 8.18304561245787e-05, + "loss": 0.3354, + "step": 18056 + }, + { + "epoch": 2.41, + "grad_norm": 0.6328125, + "learning_rate": 8.181900506393511e-05, + "loss": 0.2855, + "step": 18057 + }, + { + "epoch": 2.41, + "grad_norm": 0.453125, + "learning_rate": 8.18075542498375e-05, + "loss": 0.2094, + "step": 18058 + }, + { + "epoch": 2.41, + "grad_norm": 0.5078125, + "learning_rate": 8.179610368244112e-05, + "loss": 0.3439, + "step": 18059 + }, + { + "epoch": 2.41, + "grad_norm": 0.6796875, + "learning_rate": 8.17846533619013e-05, + "loss": 0.396, + "step": 18060 + }, + { + "epoch": 2.41, + "grad_norm": 0.59765625, + "learning_rate": 8.177320328837324e-05, + "loss": 0.3074, + "step": 18061 + }, + { + "epoch": 2.41, + "grad_norm": 0.66015625, + "learning_rate": 8.176175346201224e-05, + "loss": 0.3558, + "step": 18062 + }, + { + "epoch": 2.41, + "grad_norm": 0.55078125, + "learning_rate": 8.175030388297356e-05, + "loss": 0.2102, + "step": 18063 + }, + { + "epoch": 2.41, + "grad_norm": 0.51171875, + "learning_rate": 8.173885455141248e-05, + "loss": 0.4016, + "step": 18064 + }, + { + "epoch": 2.41, + "grad_norm": 0.49609375, + "learning_rate": 8.172740546748425e-05, + "loss": 0.3352, + "step": 18065 + }, + { + "epoch": 2.41, + "grad_norm": 0.5703125, + "learning_rate": 8.171595663134413e-05, + "loss": 0.4889, + "step": 18066 + }, + { + "epoch": 2.41, + "grad_norm": 0.80078125, + "learning_rate": 8.170450804314738e-05, + "loss": 0.2801, + "step": 18067 + }, + { + "epoch": 2.41, + "grad_norm": 0.77734375, + "learning_rate": 8.169305970304922e-05, + "loss": 0.2546, + "step": 18068 + }, + { + "epoch": 2.41, + "grad_norm": 0.65625, + "learning_rate": 8.168161161120492e-05, + "loss": 0.2339, + "step": 18069 + }, + { + "epoch": 2.41, + "grad_norm": 0.63671875, + "learning_rate": 8.167016376776975e-05, + "loss": 0.3051, + "step": 18070 + }, + { + "epoch": 2.41, + "grad_norm": 0.69140625, + "learning_rate": 8.165871617289891e-05, + "loss": 0.4815, + "step": 18071 + }, + { + "epoch": 2.41, + "grad_norm": 0.69140625, + "learning_rate": 8.164726882674769e-05, + "loss": 0.3359, + "step": 18072 + }, + { + "epoch": 2.41, + "grad_norm": 0.6796875, + "learning_rate": 8.163582172947124e-05, + "loss": 0.4199, + "step": 18073 + }, + { + "epoch": 2.41, + "grad_norm": 0.48046875, + "learning_rate": 8.162437488122483e-05, + "loss": 0.1892, + "step": 18074 + }, + { + "epoch": 2.41, + "grad_norm": 0.69140625, + "learning_rate": 8.161292828216372e-05, + "loss": 0.3824, + "step": 18075 + }, + { + "epoch": 2.41, + "grad_norm": 0.5234375, + "learning_rate": 8.160148193244306e-05, + "loss": 0.3215, + "step": 18076 + }, + { + "epoch": 2.41, + "grad_norm": 0.404296875, + "learning_rate": 8.159003583221814e-05, + "loss": 0.2434, + "step": 18077 + }, + { + "epoch": 2.41, + "grad_norm": 0.64453125, + "learning_rate": 8.157858998164414e-05, + "loss": 0.3046, + "step": 18078 + }, + { + "epoch": 2.41, + "grad_norm": 0.703125, + "learning_rate": 8.156714438087629e-05, + "loss": 0.364, + "step": 18079 + }, + { + "epoch": 2.41, + "grad_norm": 0.6328125, + "learning_rate": 8.155569903006979e-05, + "loss": 0.5526, + "step": 18080 + }, + { + "epoch": 2.41, + "grad_norm": 0.640625, + "learning_rate": 8.154425392937986e-05, + "loss": 0.4689, + "step": 18081 + }, + { + "epoch": 2.41, + "grad_norm": 0.69140625, + "learning_rate": 8.153280907896166e-05, + "loss": 0.673, + "step": 18082 + }, + { + "epoch": 2.41, + "grad_norm": 0.3984375, + "learning_rate": 8.152136447897045e-05, + "loss": 0.1883, + "step": 18083 + }, + { + "epoch": 2.41, + "grad_norm": 0.61328125, + "learning_rate": 8.150992012956142e-05, + "loss": 0.5648, + "step": 18084 + }, + { + "epoch": 2.41, + "grad_norm": 0.46875, + "learning_rate": 8.149847603088969e-05, + "loss": 0.3139, + "step": 18085 + }, + { + "epoch": 2.41, + "grad_norm": 0.70703125, + "learning_rate": 8.148703218311053e-05, + "loss": 0.5085, + "step": 18086 + }, + { + "epoch": 2.41, + "grad_norm": 0.5546875, + "learning_rate": 8.147558858637907e-05, + "loss": 0.7813, + "step": 18087 + }, + { + "epoch": 2.41, + "grad_norm": 0.5390625, + "learning_rate": 8.146414524085055e-05, + "loss": 0.2831, + "step": 18088 + }, + { + "epoch": 2.41, + "grad_norm": 0.52734375, + "learning_rate": 8.145270214668008e-05, + "loss": 0.2156, + "step": 18089 + }, + { + "epoch": 2.41, + "grad_norm": 0.6953125, + "learning_rate": 8.14412593040229e-05, + "loss": 0.1587, + "step": 18090 + }, + { + "epoch": 2.41, + "grad_norm": 0.5859375, + "learning_rate": 8.142981671303413e-05, + "loss": 0.6813, + "step": 18091 + }, + { + "epoch": 2.41, + "grad_norm": 0.484375, + "learning_rate": 8.1418374373869e-05, + "loss": 0.3079, + "step": 18092 + }, + { + "epoch": 2.41, + "grad_norm": 0.53125, + "learning_rate": 8.140693228668259e-05, + "loss": 0.2664, + "step": 18093 + }, + { + "epoch": 2.41, + "grad_norm": 0.58203125, + "learning_rate": 8.139549045163013e-05, + "loss": 0.3088, + "step": 18094 + }, + { + "epoch": 2.41, + "grad_norm": 0.60546875, + "learning_rate": 8.138404886886676e-05, + "loss": 0.2837, + "step": 18095 + }, + { + "epoch": 2.41, + "grad_norm": 0.5859375, + "learning_rate": 8.137260753854769e-05, + "loss": 0.2349, + "step": 18096 + }, + { + "epoch": 2.41, + "grad_norm": 0.66796875, + "learning_rate": 8.136116646082796e-05, + "loss": 0.2847, + "step": 18097 + }, + { + "epoch": 2.41, + "grad_norm": 0.6171875, + "learning_rate": 8.134972563586278e-05, + "loss": 0.3379, + "step": 18098 + }, + { + "epoch": 2.42, + "grad_norm": 0.44140625, + "learning_rate": 8.133828506380729e-05, + "loss": 0.3031, + "step": 18099 + }, + { + "epoch": 2.42, + "grad_norm": 0.515625, + "learning_rate": 8.132684474481665e-05, + "loss": 0.343, + "step": 18100 + }, + { + "epoch": 2.42, + "grad_norm": 0.48046875, + "learning_rate": 8.131540467904595e-05, + "loss": 0.1605, + "step": 18101 + }, + { + "epoch": 2.42, + "grad_norm": 0.62109375, + "learning_rate": 8.130396486665038e-05, + "loss": 0.193, + "step": 18102 + }, + { + "epoch": 2.42, + "grad_norm": 0.5, + "learning_rate": 8.129252530778503e-05, + "loss": 0.2503, + "step": 18103 + }, + { + "epoch": 2.42, + "grad_norm": 0.4375, + "learning_rate": 8.128108600260506e-05, + "loss": 0.1994, + "step": 18104 + }, + { + "epoch": 2.42, + "grad_norm": 0.474609375, + "learning_rate": 8.126964695126555e-05, + "loss": 0.2873, + "step": 18105 + }, + { + "epoch": 2.42, + "grad_norm": 0.59375, + "learning_rate": 8.125820815392167e-05, + "loss": 0.4231, + "step": 18106 + }, + { + "epoch": 2.42, + "grad_norm": 0.5859375, + "learning_rate": 8.12467696107285e-05, + "loss": 0.1634, + "step": 18107 + }, + { + "epoch": 2.42, + "grad_norm": 0.484375, + "learning_rate": 8.123533132184121e-05, + "loss": 0.2167, + "step": 18108 + }, + { + "epoch": 2.42, + "grad_norm": 0.55078125, + "learning_rate": 8.122389328741485e-05, + "loss": 0.1589, + "step": 18109 + }, + { + "epoch": 2.42, + "grad_norm": 0.427734375, + "learning_rate": 8.121245550760453e-05, + "loss": 0.2478, + "step": 18110 + }, + { + "epoch": 2.42, + "grad_norm": 0.73046875, + "learning_rate": 8.120101798256539e-05, + "loss": 0.2994, + "step": 18111 + }, + { + "epoch": 2.42, + "grad_norm": 0.74609375, + "learning_rate": 8.118958071245249e-05, + "loss": 0.2375, + "step": 18112 + }, + { + "epoch": 2.42, + "grad_norm": 0.53515625, + "learning_rate": 8.117814369742096e-05, + "loss": 0.2438, + "step": 18113 + }, + { + "epoch": 2.42, + "grad_norm": 0.5234375, + "learning_rate": 8.116670693762587e-05, + "loss": 0.3351, + "step": 18114 + }, + { + "epoch": 2.42, + "grad_norm": 0.57421875, + "learning_rate": 8.115527043322234e-05, + "loss": 0.2857, + "step": 18115 + }, + { + "epoch": 2.42, + "grad_norm": 0.58203125, + "learning_rate": 8.114383418436542e-05, + "loss": 0.3787, + "step": 18116 + }, + { + "epoch": 2.42, + "grad_norm": 0.57421875, + "learning_rate": 8.113239819121022e-05, + "loss": 0.3578, + "step": 18117 + }, + { + "epoch": 2.42, + "grad_norm": 0.5546875, + "learning_rate": 8.112096245391181e-05, + "loss": 0.2526, + "step": 18118 + }, + { + "epoch": 2.42, + "grad_norm": 0.59765625, + "learning_rate": 8.110952697262524e-05, + "loss": 0.6233, + "step": 18119 + }, + { + "epoch": 2.42, + "grad_norm": 0.71484375, + "learning_rate": 8.109809174750564e-05, + "loss": 0.2847, + "step": 18120 + }, + { + "epoch": 2.42, + "grad_norm": 0.58984375, + "learning_rate": 8.108665677870808e-05, + "loss": 0.4183, + "step": 18121 + }, + { + "epoch": 2.42, + "grad_norm": 0.82421875, + "learning_rate": 8.107522206638754e-05, + "loss": 0.3667, + "step": 18122 + }, + { + "epoch": 2.42, + "grad_norm": 0.6953125, + "learning_rate": 8.106378761069915e-05, + "loss": 0.3688, + "step": 18123 + }, + { + "epoch": 2.42, + "grad_norm": 0.75, + "learning_rate": 8.105235341179794e-05, + "loss": 0.3409, + "step": 18124 + }, + { + "epoch": 2.42, + "grad_norm": 0.71875, + "learning_rate": 8.1040919469839e-05, + "loss": 0.2885, + "step": 18125 + }, + { + "epoch": 2.42, + "grad_norm": 0.50390625, + "learning_rate": 8.102948578497733e-05, + "loss": 0.2213, + "step": 18126 + }, + { + "epoch": 2.42, + "grad_norm": 0.62890625, + "learning_rate": 8.101805235736804e-05, + "loss": 0.4652, + "step": 18127 + }, + { + "epoch": 2.42, + "grad_norm": 0.796875, + "learning_rate": 8.100661918716612e-05, + "loss": 0.2823, + "step": 18128 + }, + { + "epoch": 2.42, + "grad_norm": 0.5625, + "learning_rate": 8.099518627452665e-05, + "loss": 0.3934, + "step": 18129 + }, + { + "epoch": 2.42, + "grad_norm": 0.5078125, + "learning_rate": 8.098375361960462e-05, + "loss": 0.3544, + "step": 18130 + }, + { + "epoch": 2.42, + "grad_norm": 0.73046875, + "learning_rate": 8.097232122255513e-05, + "loss": 0.4295, + "step": 18131 + }, + { + "epoch": 2.42, + "grad_norm": 0.72265625, + "learning_rate": 8.096088908353315e-05, + "loss": 0.5715, + "step": 18132 + }, + { + "epoch": 2.42, + "grad_norm": 0.55859375, + "learning_rate": 8.094945720269378e-05, + "loss": 0.356, + "step": 18133 + }, + { + "epoch": 2.42, + "grad_norm": 0.59375, + "learning_rate": 8.093802558019198e-05, + "loss": 0.2603, + "step": 18134 + }, + { + "epoch": 2.42, + "grad_norm": 0.5078125, + "learning_rate": 8.092659421618276e-05, + "loss": 0.289, + "step": 18135 + }, + { + "epoch": 2.42, + "grad_norm": 0.57421875, + "learning_rate": 8.091516311082118e-05, + "loss": 0.4358, + "step": 18136 + }, + { + "epoch": 2.42, + "grad_norm": 0.53125, + "learning_rate": 8.090373226426223e-05, + "loss": 0.3811, + "step": 18137 + }, + { + "epoch": 2.42, + "grad_norm": 0.5078125, + "learning_rate": 8.089230167666093e-05, + "loss": 0.2995, + "step": 18138 + }, + { + "epoch": 2.42, + "grad_norm": 0.59375, + "learning_rate": 8.088087134817227e-05, + "loss": 0.3038, + "step": 18139 + }, + { + "epoch": 2.42, + "grad_norm": 0.453125, + "learning_rate": 8.086944127895129e-05, + "loss": 0.3115, + "step": 18140 + }, + { + "epoch": 2.42, + "grad_norm": 0.6484375, + "learning_rate": 8.085801146915293e-05, + "loss": 0.2394, + "step": 18141 + }, + { + "epoch": 2.42, + "grad_norm": 0.73046875, + "learning_rate": 8.084658191893224e-05, + "loss": 0.6294, + "step": 18142 + }, + { + "epoch": 2.42, + "grad_norm": 0.65625, + "learning_rate": 8.083515262844419e-05, + "loss": 0.54, + "step": 18143 + }, + { + "epoch": 2.42, + "grad_norm": 0.48828125, + "learning_rate": 8.082372359784376e-05, + "loss": 0.3266, + "step": 18144 + }, + { + "epoch": 2.42, + "grad_norm": 0.421875, + "learning_rate": 8.081229482728598e-05, + "loss": 0.1592, + "step": 18145 + }, + { + "epoch": 2.42, + "grad_norm": 0.55078125, + "learning_rate": 8.080086631692579e-05, + "loss": 0.256, + "step": 18146 + }, + { + "epoch": 2.42, + "grad_norm": 0.546875, + "learning_rate": 8.078943806691814e-05, + "loss": 0.4509, + "step": 18147 + }, + { + "epoch": 2.42, + "grad_norm": 0.5, + "learning_rate": 8.077801007741807e-05, + "loss": 0.302, + "step": 18148 + }, + { + "epoch": 2.42, + "grad_norm": 0.609375, + "learning_rate": 8.07665823485805e-05, + "loss": 0.4401, + "step": 18149 + }, + { + "epoch": 2.42, + "grad_norm": 0.5, + "learning_rate": 8.075515488056044e-05, + "loss": 0.2283, + "step": 18150 + }, + { + "epoch": 2.42, + "grad_norm": 0.7109375, + "learning_rate": 8.07437276735128e-05, + "loss": 0.4284, + "step": 18151 + }, + { + "epoch": 2.42, + "grad_norm": 0.470703125, + "learning_rate": 8.07323007275926e-05, + "loss": 0.331, + "step": 18152 + }, + { + "epoch": 2.42, + "grad_norm": 0.53515625, + "learning_rate": 8.072087404295474e-05, + "loss": 0.4115, + "step": 18153 + }, + { + "epoch": 2.42, + "grad_norm": 0.50390625, + "learning_rate": 8.070944761975423e-05, + "loss": 0.263, + "step": 18154 + }, + { + "epoch": 2.42, + "grad_norm": 0.64453125, + "learning_rate": 8.069802145814596e-05, + "loss": 0.443, + "step": 18155 + }, + { + "epoch": 2.42, + "grad_norm": 0.494140625, + "learning_rate": 8.068659555828493e-05, + "loss": 0.2289, + "step": 18156 + }, + { + "epoch": 2.42, + "grad_norm": 0.6796875, + "learning_rate": 8.067516992032608e-05, + "loss": 0.4572, + "step": 18157 + }, + { + "epoch": 2.42, + "grad_norm": 0.640625, + "learning_rate": 8.06637445444243e-05, + "loss": 0.5438, + "step": 18158 + }, + { + "epoch": 2.42, + "grad_norm": 0.46875, + "learning_rate": 8.065231943073456e-05, + "loss": 0.2981, + "step": 18159 + }, + { + "epoch": 2.42, + "grad_norm": 0.546875, + "learning_rate": 8.064089457941177e-05, + "loss": 0.2277, + "step": 18160 + }, + { + "epoch": 2.42, + "grad_norm": 0.5625, + "learning_rate": 8.06294699906109e-05, + "loss": 0.1779, + "step": 18161 + }, + { + "epoch": 2.42, + "grad_norm": 0.5234375, + "learning_rate": 8.061804566448683e-05, + "loss": 0.4439, + "step": 18162 + }, + { + "epoch": 2.42, + "grad_norm": 0.5703125, + "learning_rate": 8.060662160119452e-05, + "loss": 0.2561, + "step": 18163 + }, + { + "epoch": 2.42, + "grad_norm": 0.59375, + "learning_rate": 8.059519780088884e-05, + "loss": 0.3204, + "step": 18164 + }, + { + "epoch": 2.42, + "grad_norm": 0.56640625, + "learning_rate": 8.058377426372475e-05, + "loss": 0.3048, + "step": 18165 + }, + { + "epoch": 2.42, + "grad_norm": 0.357421875, + "learning_rate": 8.057235098985712e-05, + "loss": 0.0923, + "step": 18166 + }, + { + "epoch": 2.42, + "grad_norm": 0.5390625, + "learning_rate": 8.056092797944088e-05, + "loss": 0.3633, + "step": 18167 + }, + { + "epoch": 2.42, + "grad_norm": 0.5390625, + "learning_rate": 8.054950523263096e-05, + "loss": 0.5266, + "step": 18168 + }, + { + "epoch": 2.42, + "grad_norm": 0.57421875, + "learning_rate": 8.053808274958226e-05, + "loss": 0.3031, + "step": 18169 + }, + { + "epoch": 2.42, + "grad_norm": 0.78125, + "learning_rate": 8.05266605304496e-05, + "loss": 0.2331, + "step": 18170 + }, + { + "epoch": 2.42, + "grad_norm": 0.67578125, + "learning_rate": 8.051523857538793e-05, + "loss": 0.2879, + "step": 18171 + }, + { + "epoch": 2.42, + "grad_norm": 0.40625, + "learning_rate": 8.050381688455212e-05, + "loss": 0.1457, + "step": 18172 + }, + { + "epoch": 2.43, + "grad_norm": 0.515625, + "learning_rate": 8.049239545809708e-05, + "loss": 0.3602, + "step": 18173 + }, + { + "epoch": 2.43, + "grad_norm": 0.63671875, + "learning_rate": 8.048097429617767e-05, + "loss": 0.4449, + "step": 18174 + }, + { + "epoch": 2.43, + "grad_norm": 0.54296875, + "learning_rate": 8.046955339894878e-05, + "loss": 0.5584, + "step": 18175 + }, + { + "epoch": 2.43, + "grad_norm": 0.69921875, + "learning_rate": 8.045813276656527e-05, + "loss": 0.3419, + "step": 18176 + }, + { + "epoch": 2.43, + "grad_norm": 0.65234375, + "learning_rate": 8.044671239918204e-05, + "loss": 0.2722, + "step": 18177 + }, + { + "epoch": 2.43, + "grad_norm": 0.640625, + "learning_rate": 8.043529229695392e-05, + "loss": 0.3539, + "step": 18178 + }, + { + "epoch": 2.43, + "grad_norm": 0.5546875, + "learning_rate": 8.042387246003582e-05, + "loss": 0.2272, + "step": 18179 + }, + { + "epoch": 2.43, + "grad_norm": 0.59765625, + "learning_rate": 8.041245288858254e-05, + "loss": 0.5208, + "step": 18180 + }, + { + "epoch": 2.43, + "grad_norm": 0.6328125, + "learning_rate": 8.040103358274901e-05, + "loss": 0.3558, + "step": 18181 + }, + { + "epoch": 2.43, + "grad_norm": 0.57421875, + "learning_rate": 8.038961454269004e-05, + "loss": 0.3333, + "step": 18182 + }, + { + "epoch": 2.43, + "grad_norm": 0.59765625, + "learning_rate": 8.037819576856044e-05, + "loss": 0.4385, + "step": 18183 + }, + { + "epoch": 2.43, + "grad_norm": 0.73046875, + "learning_rate": 8.036677726051514e-05, + "loss": 0.6975, + "step": 18184 + }, + { + "epoch": 2.43, + "grad_norm": 0.578125, + "learning_rate": 8.035535901870891e-05, + "loss": 0.4887, + "step": 18185 + }, + { + "epoch": 2.43, + "grad_norm": 0.578125, + "learning_rate": 8.034394104329665e-05, + "loss": 0.4263, + "step": 18186 + }, + { + "epoch": 2.43, + "grad_norm": 0.671875, + "learning_rate": 8.033252333443312e-05, + "loss": 0.3703, + "step": 18187 + }, + { + "epoch": 2.43, + "grad_norm": 0.7578125, + "learning_rate": 8.032110589227324e-05, + "loss": 0.3495, + "step": 18188 + }, + { + "epoch": 2.43, + "grad_norm": 0.609375, + "learning_rate": 8.030968871697175e-05, + "loss": 0.3549, + "step": 18189 + }, + { + "epoch": 2.43, + "grad_norm": 0.78515625, + "learning_rate": 8.029827180868356e-05, + "loss": 0.3962, + "step": 18190 + }, + { + "epoch": 2.43, + "grad_norm": 0.54296875, + "learning_rate": 8.028685516756343e-05, + "loss": 0.1924, + "step": 18191 + }, + { + "epoch": 2.43, + "grad_norm": 0.47265625, + "learning_rate": 8.027543879376617e-05, + "loss": 0.4335, + "step": 18192 + }, + { + "epoch": 2.43, + "grad_norm": 0.55078125, + "learning_rate": 8.026402268744672e-05, + "loss": 0.24, + "step": 18193 + }, + { + "epoch": 2.43, + "grad_norm": 0.45703125, + "learning_rate": 8.025260684875971e-05, + "loss": 0.3231, + "step": 18194 + }, + { + "epoch": 2.43, + "grad_norm": 0.498046875, + "learning_rate": 8.024119127786001e-05, + "loss": 0.2604, + "step": 18195 + }, + { + "epoch": 2.43, + "grad_norm": 0.625, + "learning_rate": 8.022977597490246e-05, + "loss": 0.3411, + "step": 18196 + }, + { + "epoch": 2.43, + "grad_norm": 0.53515625, + "learning_rate": 8.021836094004182e-05, + "loss": 0.3936, + "step": 18197 + }, + { + "epoch": 2.43, + "grad_norm": 0.6015625, + "learning_rate": 8.020694617343294e-05, + "loss": 0.2945, + "step": 18198 + }, + { + "epoch": 2.43, + "grad_norm": 0.4765625, + "learning_rate": 8.019553167523053e-05, + "loss": 0.4429, + "step": 18199 + }, + { + "epoch": 2.43, + "grad_norm": 0.61328125, + "learning_rate": 8.018411744558946e-05, + "loss": 0.3061, + "step": 18200 + }, + { + "epoch": 2.43, + "grad_norm": 0.5390625, + "learning_rate": 8.017270348466443e-05, + "loss": 0.2666, + "step": 18201 + }, + { + "epoch": 2.43, + "grad_norm": 0.71484375, + "learning_rate": 8.016128979261031e-05, + "loss": 0.4611, + "step": 18202 + }, + { + "epoch": 2.43, + "grad_norm": 0.671875, + "learning_rate": 8.01498763695818e-05, + "loss": 0.5617, + "step": 18203 + }, + { + "epoch": 2.43, + "grad_norm": 0.4296875, + "learning_rate": 8.013846321573373e-05, + "loss": 0.2327, + "step": 18204 + }, + { + "epoch": 2.43, + "grad_norm": 0.58984375, + "learning_rate": 8.012705033122087e-05, + "loss": 0.3603, + "step": 18205 + }, + { + "epoch": 2.43, + "grad_norm": 0.51953125, + "learning_rate": 8.011563771619792e-05, + "loss": 0.1508, + "step": 18206 + }, + { + "epoch": 2.43, + "grad_norm": 0.57421875, + "learning_rate": 8.01042253708197e-05, + "loss": 0.3742, + "step": 18207 + }, + { + "epoch": 2.43, + "grad_norm": 0.46484375, + "learning_rate": 8.009281329524093e-05, + "loss": 0.3002, + "step": 18208 + }, + { + "epoch": 2.43, + "grad_norm": 0.6171875, + "learning_rate": 8.008140148961641e-05, + "loss": 0.3666, + "step": 18209 + }, + { + "epoch": 2.43, + "grad_norm": 0.58203125, + "learning_rate": 8.006998995410085e-05, + "loss": 0.4846, + "step": 18210 + }, + { + "epoch": 2.43, + "grad_norm": 0.66015625, + "learning_rate": 8.005857868884905e-05, + "loss": 0.3478, + "step": 18211 + }, + { + "epoch": 2.43, + "grad_norm": 0.6875, + "learning_rate": 8.004716769401568e-05, + "loss": 0.3638, + "step": 18212 + }, + { + "epoch": 2.43, + "grad_norm": 0.73828125, + "learning_rate": 8.003575696975556e-05, + "loss": 0.4906, + "step": 18213 + }, + { + "epoch": 2.43, + "grad_norm": 0.392578125, + "learning_rate": 8.002434651622335e-05, + "loss": 0.1606, + "step": 18214 + }, + { + "epoch": 2.43, + "grad_norm": 0.57421875, + "learning_rate": 8.001293633357385e-05, + "loss": 0.2153, + "step": 18215 + }, + { + "epoch": 2.43, + "grad_norm": 0.53125, + "learning_rate": 8.000152642196175e-05, + "loss": 0.4048, + "step": 18216 + }, + { + "epoch": 2.43, + "grad_norm": 0.70703125, + "learning_rate": 7.999011678154178e-05, + "loss": 0.3709, + "step": 18217 + }, + { + "epoch": 2.43, + "grad_norm": 0.5859375, + "learning_rate": 7.997870741246872e-05, + "loss": 0.3938, + "step": 18218 + }, + { + "epoch": 2.43, + "grad_norm": 0.423828125, + "learning_rate": 7.99672983148972e-05, + "loss": 0.1984, + "step": 18219 + }, + { + "epoch": 2.43, + "grad_norm": 0.53125, + "learning_rate": 7.995588948898196e-05, + "loss": 0.1917, + "step": 18220 + }, + { + "epoch": 2.43, + "grad_norm": 0.66015625, + "learning_rate": 7.994448093487774e-05, + "loss": 0.4819, + "step": 18221 + }, + { + "epoch": 2.43, + "grad_norm": 0.71484375, + "learning_rate": 7.993307265273921e-05, + "loss": 0.3976, + "step": 18222 + }, + { + "epoch": 2.43, + "grad_norm": 0.515625, + "learning_rate": 7.99216646427211e-05, + "loss": 0.5294, + "step": 18223 + }, + { + "epoch": 2.43, + "grad_norm": 0.5234375, + "learning_rate": 7.991025690497811e-05, + "loss": 0.2604, + "step": 18224 + }, + { + "epoch": 2.43, + "grad_norm": 0.68359375, + "learning_rate": 7.989884943966494e-05, + "loss": 0.2609, + "step": 18225 + }, + { + "epoch": 2.43, + "grad_norm": 0.66015625, + "learning_rate": 7.988744224693625e-05, + "loss": 0.3989, + "step": 18226 + }, + { + "epoch": 2.43, + "grad_norm": 0.6953125, + "learning_rate": 7.987603532694676e-05, + "loss": 0.1745, + "step": 18227 + }, + { + "epoch": 2.43, + "grad_norm": 0.578125, + "learning_rate": 7.986462867985115e-05, + "loss": 0.4772, + "step": 18228 + }, + { + "epoch": 2.43, + "grad_norm": 0.58984375, + "learning_rate": 7.98532223058041e-05, + "loss": 0.4015, + "step": 18229 + }, + { + "epoch": 2.43, + "grad_norm": 0.353515625, + "learning_rate": 7.98418162049603e-05, + "loss": 0.1407, + "step": 18230 + }, + { + "epoch": 2.43, + "grad_norm": 0.6015625, + "learning_rate": 7.983041037747438e-05, + "loss": 0.5449, + "step": 18231 + }, + { + "epoch": 2.43, + "grad_norm": 0.60546875, + "learning_rate": 7.981900482350107e-05, + "loss": 0.2845, + "step": 18232 + }, + { + "epoch": 2.43, + "grad_norm": 0.515625, + "learning_rate": 7.980759954319497e-05, + "loss": 0.3934, + "step": 18233 + }, + { + "epoch": 2.43, + "grad_norm": 0.6875, + "learning_rate": 7.979619453671082e-05, + "loss": 0.3346, + "step": 18234 + }, + { + "epoch": 2.43, + "grad_norm": 0.52734375, + "learning_rate": 7.978478980420319e-05, + "loss": 0.2538, + "step": 18235 + }, + { + "epoch": 2.43, + "grad_norm": 0.5859375, + "learning_rate": 7.977338534582683e-05, + "loss": 0.3318, + "step": 18236 + }, + { + "epoch": 2.43, + "grad_norm": 0.59375, + "learning_rate": 7.976198116173633e-05, + "loss": 0.4987, + "step": 18237 + }, + { + "epoch": 2.43, + "grad_norm": 0.416015625, + "learning_rate": 7.975057725208634e-05, + "loss": 0.2307, + "step": 18238 + }, + { + "epoch": 2.43, + "grad_norm": 0.671875, + "learning_rate": 7.973917361703152e-05, + "loss": 0.543, + "step": 18239 + }, + { + "epoch": 2.43, + "grad_norm": 0.6484375, + "learning_rate": 7.97277702567265e-05, + "loss": 0.2757, + "step": 18240 + }, + { + "epoch": 2.43, + "grad_norm": 0.515625, + "learning_rate": 7.971636717132597e-05, + "loss": 0.3176, + "step": 18241 + }, + { + "epoch": 2.43, + "grad_norm": 0.546875, + "learning_rate": 7.970496436098452e-05, + "loss": 0.3023, + "step": 18242 + }, + { + "epoch": 2.43, + "grad_norm": 0.4140625, + "learning_rate": 7.969356182585675e-05, + "loss": 0.2279, + "step": 18243 + }, + { + "epoch": 2.43, + "grad_norm": 0.54296875, + "learning_rate": 7.968215956609733e-05, + "loss": 0.2659, + "step": 18244 + }, + { + "epoch": 2.43, + "grad_norm": 0.6640625, + "learning_rate": 7.967075758186084e-05, + "loss": 0.4142, + "step": 18245 + }, + { + "epoch": 2.43, + "grad_norm": 0.81640625, + "learning_rate": 7.965935587330193e-05, + "loss": 0.5633, + "step": 18246 + }, + { + "epoch": 2.43, + "grad_norm": 0.55078125, + "learning_rate": 7.964795444057522e-05, + "loss": 0.2161, + "step": 18247 + }, + { + "epoch": 2.44, + "grad_norm": 0.7265625, + "learning_rate": 7.96365532838353e-05, + "loss": 0.334, + "step": 18248 + }, + { + "epoch": 2.44, + "grad_norm": 0.451171875, + "learning_rate": 7.962515240323678e-05, + "loss": 0.1346, + "step": 18249 + }, + { + "epoch": 2.44, + "grad_norm": 0.486328125, + "learning_rate": 7.96137517989343e-05, + "loss": 0.1623, + "step": 18250 + }, + { + "epoch": 2.44, + "grad_norm": 0.74609375, + "learning_rate": 7.96023514710824e-05, + "loss": 0.3572, + "step": 18251 + }, + { + "epoch": 2.44, + "grad_norm": 0.82421875, + "learning_rate": 7.959095141983573e-05, + "loss": 0.4329, + "step": 18252 + }, + { + "epoch": 2.44, + "grad_norm": 0.60546875, + "learning_rate": 7.957955164534883e-05, + "loss": 0.4505, + "step": 18253 + }, + { + "epoch": 2.44, + "grad_norm": 0.65625, + "learning_rate": 7.956815214777636e-05, + "loss": 0.3649, + "step": 18254 + }, + { + "epoch": 2.44, + "grad_norm": 0.388671875, + "learning_rate": 7.955675292727284e-05, + "loss": 0.152, + "step": 18255 + }, + { + "epoch": 2.44, + "grad_norm": 0.59375, + "learning_rate": 7.954535398399285e-05, + "loss": 0.4352, + "step": 18256 + }, + { + "epoch": 2.44, + "grad_norm": 0.60546875, + "learning_rate": 7.9533955318091e-05, + "loss": 0.4514, + "step": 18257 + }, + { + "epoch": 2.44, + "grad_norm": 0.77734375, + "learning_rate": 7.952255692972184e-05, + "loss": 0.3771, + "step": 18258 + }, + { + "epoch": 2.44, + "grad_norm": 0.498046875, + "learning_rate": 7.951115881903997e-05, + "loss": 0.2227, + "step": 18259 + }, + { + "epoch": 2.44, + "grad_norm": 0.6328125, + "learning_rate": 7.949976098619991e-05, + "loss": 0.2607, + "step": 18260 + }, + { + "epoch": 2.44, + "grad_norm": 0.51953125, + "learning_rate": 7.948836343135627e-05, + "loss": 0.3562, + "step": 18261 + }, + { + "epoch": 2.44, + "grad_norm": 0.640625, + "learning_rate": 7.947696615466357e-05, + "loss": 0.4463, + "step": 18262 + }, + { + "epoch": 2.44, + "grad_norm": 0.66796875, + "learning_rate": 7.94655691562764e-05, + "loss": 0.3431, + "step": 18263 + }, + { + "epoch": 2.44, + "grad_norm": 0.423828125, + "learning_rate": 7.945417243634927e-05, + "loss": 0.2155, + "step": 18264 + }, + { + "epoch": 2.44, + "grad_norm": 0.72265625, + "learning_rate": 7.944277599503674e-05, + "loss": 0.6511, + "step": 18265 + }, + { + "epoch": 2.44, + "grad_norm": 0.486328125, + "learning_rate": 7.943137983249345e-05, + "loss": 0.2406, + "step": 18266 + }, + { + "epoch": 2.44, + "grad_norm": 0.482421875, + "learning_rate": 7.941998394887376e-05, + "loss": 0.2093, + "step": 18267 + }, + { + "epoch": 2.44, + "grad_norm": 0.55078125, + "learning_rate": 7.94085883443323e-05, + "loss": 0.2783, + "step": 18268 + }, + { + "epoch": 2.44, + "grad_norm": 0.494140625, + "learning_rate": 7.939719301902363e-05, + "loss": 0.3466, + "step": 18269 + }, + { + "epoch": 2.44, + "grad_norm": 0.82421875, + "learning_rate": 7.938579797310221e-05, + "loss": 0.3176, + "step": 18270 + }, + { + "epoch": 2.44, + "grad_norm": 0.484375, + "learning_rate": 7.937440320672262e-05, + "loss": 0.4055, + "step": 18271 + }, + { + "epoch": 2.44, + "grad_norm": 0.625, + "learning_rate": 7.936300872003936e-05, + "loss": 0.3405, + "step": 18272 + }, + { + "epoch": 2.44, + "grad_norm": 0.57421875, + "learning_rate": 7.935161451320696e-05, + "loss": 0.1947, + "step": 18273 + }, + { + "epoch": 2.44, + "grad_norm": 0.828125, + "learning_rate": 7.934022058637989e-05, + "loss": 0.4838, + "step": 18274 + }, + { + "epoch": 2.44, + "grad_norm": 0.72265625, + "learning_rate": 7.932882693971271e-05, + "loss": 0.2854, + "step": 18275 + }, + { + "epoch": 2.44, + "grad_norm": 0.515625, + "learning_rate": 7.931743357335987e-05, + "loss": 0.2588, + "step": 18276 + }, + { + "epoch": 2.44, + "grad_norm": 0.38671875, + "learning_rate": 7.930604048747595e-05, + "loss": 0.1311, + "step": 18277 + }, + { + "epoch": 2.44, + "grad_norm": 0.52734375, + "learning_rate": 7.929464768221541e-05, + "loss": 0.2804, + "step": 18278 + }, + { + "epoch": 2.44, + "grad_norm": 0.6328125, + "learning_rate": 7.92832551577327e-05, + "loss": 0.3298, + "step": 18279 + }, + { + "epoch": 2.44, + "grad_norm": 0.5078125, + "learning_rate": 7.927186291418238e-05, + "loss": 0.21, + "step": 18280 + }, + { + "epoch": 2.44, + "grad_norm": 0.458984375, + "learning_rate": 7.926047095171886e-05, + "loss": 0.1926, + "step": 18281 + }, + { + "epoch": 2.44, + "grad_norm": 0.65234375, + "learning_rate": 7.924907927049669e-05, + "loss": 0.3492, + "step": 18282 + }, + { + "epoch": 2.44, + "grad_norm": 0.51171875, + "learning_rate": 7.92376878706703e-05, + "loss": 0.1973, + "step": 18283 + }, + { + "epoch": 2.44, + "grad_norm": 0.64453125, + "learning_rate": 7.922629675239421e-05, + "loss": 0.3356, + "step": 18284 + }, + { + "epoch": 2.44, + "grad_norm": 0.7109375, + "learning_rate": 7.921490591582284e-05, + "loss": 0.248, + "step": 18285 + }, + { + "epoch": 2.44, + "grad_norm": 0.625, + "learning_rate": 7.920351536111073e-05, + "loss": 0.1665, + "step": 18286 + }, + { + "epoch": 2.44, + "grad_norm": 0.6171875, + "learning_rate": 7.919212508841225e-05, + "loss": 0.2501, + "step": 18287 + }, + { + "epoch": 2.44, + "grad_norm": 0.61328125, + "learning_rate": 7.918073509788194e-05, + "loss": 0.2762, + "step": 18288 + }, + { + "epoch": 2.44, + "grad_norm": 0.51171875, + "learning_rate": 7.916934538967422e-05, + "loss": 0.2628, + "step": 18289 + }, + { + "epoch": 2.44, + "grad_norm": 0.58984375, + "learning_rate": 7.915795596394357e-05, + "loss": 0.2939, + "step": 18290 + }, + { + "epoch": 2.44, + "grad_norm": 0.56640625, + "learning_rate": 7.914656682084437e-05, + "loss": 0.245, + "step": 18291 + }, + { + "epoch": 2.44, + "grad_norm": 0.73828125, + "learning_rate": 7.913517796053114e-05, + "loss": 0.4617, + "step": 18292 + }, + { + "epoch": 2.44, + "grad_norm": 0.6953125, + "learning_rate": 7.912378938315825e-05, + "loss": 0.4948, + "step": 18293 + }, + { + "epoch": 2.44, + "grad_norm": 0.58984375, + "learning_rate": 7.91124010888802e-05, + "loss": 0.2905, + "step": 18294 + }, + { + "epoch": 2.44, + "grad_norm": 0.63671875, + "learning_rate": 7.91010130778514e-05, + "loss": 0.2779, + "step": 18295 + }, + { + "epoch": 2.44, + "grad_norm": 0.63671875, + "learning_rate": 7.908962535022627e-05, + "loss": 0.366, + "step": 18296 + }, + { + "epoch": 2.44, + "grad_norm": 0.55859375, + "learning_rate": 7.907823790615923e-05, + "loss": 0.2833, + "step": 18297 + }, + { + "epoch": 2.44, + "grad_norm": 0.486328125, + "learning_rate": 7.906685074580473e-05, + "loss": 0.2218, + "step": 18298 + }, + { + "epoch": 2.44, + "grad_norm": 0.6484375, + "learning_rate": 7.905546386931715e-05, + "loss": 0.375, + "step": 18299 + }, + { + "epoch": 2.44, + "grad_norm": 0.6796875, + "learning_rate": 7.904407727685093e-05, + "loss": 0.3493, + "step": 18300 + }, + { + "epoch": 2.44, + "grad_norm": 0.625, + "learning_rate": 7.903269096856047e-05, + "loss": 0.5128, + "step": 18301 + }, + { + "epoch": 2.44, + "grad_norm": 0.74609375, + "learning_rate": 7.902130494460018e-05, + "loss": 0.2485, + "step": 18302 + }, + { + "epoch": 2.44, + "grad_norm": 0.6875, + "learning_rate": 7.90099192051245e-05, + "loss": 0.4152, + "step": 18303 + }, + { + "epoch": 2.44, + "grad_norm": 0.380859375, + "learning_rate": 7.899853375028772e-05, + "loss": 0.2383, + "step": 18304 + }, + { + "epoch": 2.44, + "grad_norm": 0.578125, + "learning_rate": 7.898714858024433e-05, + "loss": 0.4577, + "step": 18305 + }, + { + "epoch": 2.44, + "grad_norm": 0.3671875, + "learning_rate": 7.897576369514869e-05, + "loss": 0.1338, + "step": 18306 + }, + { + "epoch": 2.44, + "grad_norm": 0.58984375, + "learning_rate": 7.89643790951552e-05, + "loss": 0.3323, + "step": 18307 + }, + { + "epoch": 2.44, + "grad_norm": 0.59375, + "learning_rate": 7.89529947804182e-05, + "loss": 0.2969, + "step": 18308 + }, + { + "epoch": 2.44, + "grad_norm": 0.4609375, + "learning_rate": 7.894161075109212e-05, + "loss": 0.2681, + "step": 18309 + }, + { + "epoch": 2.44, + "grad_norm": 0.61328125, + "learning_rate": 7.893022700733129e-05, + "loss": 0.457, + "step": 18310 + }, + { + "epoch": 2.44, + "grad_norm": 0.453125, + "learning_rate": 7.891884354929013e-05, + "loss": 0.2832, + "step": 18311 + }, + { + "epoch": 2.44, + "grad_norm": 0.431640625, + "learning_rate": 7.890746037712296e-05, + "loss": 0.2569, + "step": 18312 + }, + { + "epoch": 2.44, + "grad_norm": 0.62890625, + "learning_rate": 7.889607749098417e-05, + "loss": 0.6003, + "step": 18313 + }, + { + "epoch": 2.44, + "grad_norm": 0.439453125, + "learning_rate": 7.888469489102813e-05, + "loss": 0.2552, + "step": 18314 + }, + { + "epoch": 2.44, + "grad_norm": 0.5078125, + "learning_rate": 7.88733125774092e-05, + "loss": 0.48, + "step": 18315 + }, + { + "epoch": 2.44, + "grad_norm": 0.486328125, + "learning_rate": 7.886193055028168e-05, + "loss": 0.2102, + "step": 18316 + }, + { + "epoch": 2.44, + "grad_norm": 0.4765625, + "learning_rate": 7.885054880979994e-05, + "loss": 0.3428, + "step": 18317 + }, + { + "epoch": 2.44, + "grad_norm": 0.64453125, + "learning_rate": 7.883916735611834e-05, + "loss": 0.3439, + "step": 18318 + }, + { + "epoch": 2.44, + "grad_norm": 0.50390625, + "learning_rate": 7.882778618939121e-05, + "loss": 0.2778, + "step": 18319 + }, + { + "epoch": 2.44, + "grad_norm": 0.71875, + "learning_rate": 7.881640530977288e-05, + "loss": 0.4729, + "step": 18320 + }, + { + "epoch": 2.44, + "grad_norm": 0.66015625, + "learning_rate": 7.88050247174177e-05, + "loss": 0.366, + "step": 18321 + }, + { + "epoch": 2.44, + "grad_norm": 0.578125, + "learning_rate": 7.879364441247998e-05, + "loss": 0.1915, + "step": 18322 + }, + { + "epoch": 2.45, + "grad_norm": 0.455078125, + "learning_rate": 7.878226439511407e-05, + "loss": 0.1929, + "step": 18323 + }, + { + "epoch": 2.45, + "grad_norm": 0.44140625, + "learning_rate": 7.877088466547425e-05, + "loss": 0.2282, + "step": 18324 + }, + { + "epoch": 2.45, + "grad_norm": 0.7265625, + "learning_rate": 7.875950522371488e-05, + "loss": 0.3871, + "step": 18325 + }, + { + "epoch": 2.45, + "grad_norm": 0.34375, + "learning_rate": 7.874812606999022e-05, + "loss": 0.2476, + "step": 18326 + }, + { + "epoch": 2.45, + "grad_norm": 0.58203125, + "learning_rate": 7.873674720445468e-05, + "loss": 0.1866, + "step": 18327 + }, + { + "epoch": 2.45, + "grad_norm": 0.50390625, + "learning_rate": 7.872536862726244e-05, + "loss": 0.1835, + "step": 18328 + }, + { + "epoch": 2.45, + "grad_norm": 0.65625, + "learning_rate": 7.871399033856786e-05, + "loss": 0.2267, + "step": 18329 + }, + { + "epoch": 2.45, + "grad_norm": 0.43359375, + "learning_rate": 7.870261233852523e-05, + "loss": 0.2272, + "step": 18330 + }, + { + "epoch": 2.45, + "grad_norm": 0.66015625, + "learning_rate": 7.869123462728884e-05, + "loss": 0.4059, + "step": 18331 + }, + { + "epoch": 2.45, + "grad_norm": 0.65234375, + "learning_rate": 7.867985720501301e-05, + "loss": 0.2316, + "step": 18332 + }, + { + "epoch": 2.45, + "grad_norm": 0.33984375, + "learning_rate": 7.866848007185195e-05, + "loss": 0.1474, + "step": 18333 + }, + { + "epoch": 2.45, + "grad_norm": 0.5234375, + "learning_rate": 7.865710322796005e-05, + "loss": 0.2286, + "step": 18334 + }, + { + "epoch": 2.45, + "grad_norm": 0.71875, + "learning_rate": 7.864572667349148e-05, + "loss": 0.4448, + "step": 18335 + }, + { + "epoch": 2.45, + "grad_norm": 0.59765625, + "learning_rate": 7.863435040860059e-05, + "loss": 0.2911, + "step": 18336 + }, + { + "epoch": 2.45, + "grad_norm": 0.4375, + "learning_rate": 7.86229744334416e-05, + "loss": 0.2007, + "step": 18337 + }, + { + "epoch": 2.45, + "grad_norm": 0.625, + "learning_rate": 7.86115987481688e-05, + "loss": 0.4092, + "step": 18338 + }, + { + "epoch": 2.45, + "grad_norm": 0.498046875, + "learning_rate": 7.860022335293651e-05, + "loss": 0.1777, + "step": 18339 + }, + { + "epoch": 2.45, + "grad_norm": 0.6875, + "learning_rate": 7.858884824789886e-05, + "loss": 0.4663, + "step": 18340 + }, + { + "epoch": 2.45, + "grad_norm": 0.671875, + "learning_rate": 7.857747343321017e-05, + "loss": 0.436, + "step": 18341 + }, + { + "epoch": 2.45, + "grad_norm": 0.578125, + "learning_rate": 7.856609890902471e-05, + "loss": 0.2909, + "step": 18342 + }, + { + "epoch": 2.45, + "grad_norm": 0.59765625, + "learning_rate": 7.855472467549669e-05, + "loss": 0.2351, + "step": 18343 + }, + { + "epoch": 2.45, + "grad_norm": 0.60546875, + "learning_rate": 7.854335073278038e-05, + "loss": 0.2726, + "step": 18344 + }, + { + "epoch": 2.45, + "grad_norm": 0.443359375, + "learning_rate": 7.853197708102999e-05, + "loss": 0.2766, + "step": 18345 + }, + { + "epoch": 2.45, + "grad_norm": 0.55859375, + "learning_rate": 7.852060372039978e-05, + "loss": 0.426, + "step": 18346 + }, + { + "epoch": 2.45, + "grad_norm": 0.4453125, + "learning_rate": 7.850923065104395e-05, + "loss": 0.2141, + "step": 18347 + }, + { + "epoch": 2.45, + "grad_norm": 0.431640625, + "learning_rate": 7.849785787311676e-05, + "loss": 0.3503, + "step": 18348 + }, + { + "epoch": 2.45, + "grad_norm": 0.70703125, + "learning_rate": 7.848648538677241e-05, + "loss": 0.4541, + "step": 18349 + }, + { + "epoch": 2.45, + "grad_norm": 0.56640625, + "learning_rate": 7.847511319216515e-05, + "loss": 0.3002, + "step": 18350 + }, + { + "epoch": 2.45, + "grad_norm": 0.61328125, + "learning_rate": 7.846374128944917e-05, + "loss": 0.4638, + "step": 18351 + }, + { + "epoch": 2.45, + "grad_norm": 0.40625, + "learning_rate": 7.845236967877866e-05, + "loss": 0.1348, + "step": 18352 + }, + { + "epoch": 2.45, + "grad_norm": 0.51953125, + "learning_rate": 7.844099836030786e-05, + "loss": 0.3442, + "step": 18353 + }, + { + "epoch": 2.45, + "grad_norm": 0.65234375, + "learning_rate": 7.842962733419093e-05, + "loss": 0.4164, + "step": 18354 + }, + { + "epoch": 2.45, + "grad_norm": 0.578125, + "learning_rate": 7.841825660058213e-05, + "loss": 0.2685, + "step": 18355 + }, + { + "epoch": 2.45, + "grad_norm": 0.427734375, + "learning_rate": 7.84068861596356e-05, + "loss": 0.2355, + "step": 18356 + }, + { + "epoch": 2.45, + "grad_norm": 0.55859375, + "learning_rate": 7.839551601150556e-05, + "loss": 0.4214, + "step": 18357 + }, + { + "epoch": 2.45, + "grad_norm": 0.625, + "learning_rate": 7.838414615634618e-05, + "loss": 0.2668, + "step": 18358 + }, + { + "epoch": 2.45, + "grad_norm": 0.458984375, + "learning_rate": 7.837277659431167e-05, + "loss": 0.1501, + "step": 18359 + }, + { + "epoch": 2.45, + "grad_norm": 0.671875, + "learning_rate": 7.836140732555617e-05, + "loss": 0.2773, + "step": 18360 + }, + { + "epoch": 2.45, + "grad_norm": 0.703125, + "learning_rate": 7.835003835023387e-05, + "loss": 0.4251, + "step": 18361 + }, + { + "epoch": 2.45, + "grad_norm": 0.63671875, + "learning_rate": 7.833866966849897e-05, + "loss": 0.3511, + "step": 18362 + }, + { + "epoch": 2.45, + "grad_norm": 0.56640625, + "learning_rate": 7.832730128050563e-05, + "loss": 0.2011, + "step": 18363 + }, + { + "epoch": 2.45, + "grad_norm": 0.490234375, + "learning_rate": 7.831593318640795e-05, + "loss": 0.3056, + "step": 18364 + }, + { + "epoch": 2.45, + "grad_norm": 0.56640625, + "learning_rate": 7.830456538636015e-05, + "loss": 0.3174, + "step": 18365 + }, + { + "epoch": 2.45, + "grad_norm": 0.51171875, + "learning_rate": 7.829319788051634e-05, + "loss": 0.2979, + "step": 18366 + }, + { + "epoch": 2.45, + "grad_norm": 0.44140625, + "learning_rate": 7.828183066903074e-05, + "loss": 0.1378, + "step": 18367 + }, + { + "epoch": 2.45, + "grad_norm": 0.53515625, + "learning_rate": 7.827046375205742e-05, + "loss": 0.2408, + "step": 18368 + }, + { + "epoch": 2.45, + "grad_norm": 0.7578125, + "learning_rate": 7.825909712975058e-05, + "loss": 0.1963, + "step": 18369 + }, + { + "epoch": 2.45, + "grad_norm": 0.4140625, + "learning_rate": 7.824773080226432e-05, + "loss": 0.2435, + "step": 18370 + }, + { + "epoch": 2.45, + "grad_norm": 0.5703125, + "learning_rate": 7.823636476975282e-05, + "loss": 0.4367, + "step": 18371 + }, + { + "epoch": 2.45, + "grad_norm": 0.5546875, + "learning_rate": 7.822499903237014e-05, + "loss": 0.192, + "step": 18372 + }, + { + "epoch": 2.45, + "grad_norm": 0.7265625, + "learning_rate": 7.821363359027048e-05, + "loss": 0.52, + "step": 18373 + }, + { + "epoch": 2.45, + "grad_norm": 0.4375, + "learning_rate": 7.820226844360791e-05, + "loss": 0.1514, + "step": 18374 + }, + { + "epoch": 2.45, + "grad_norm": 0.46875, + "learning_rate": 7.81909035925366e-05, + "loss": 0.2146, + "step": 18375 + }, + { + "epoch": 2.45, + "grad_norm": 0.82421875, + "learning_rate": 7.817953903721063e-05, + "loss": 0.4542, + "step": 18376 + }, + { + "epoch": 2.45, + "grad_norm": 0.6640625, + "learning_rate": 7.816817477778408e-05, + "loss": 0.5267, + "step": 18377 + }, + { + "epoch": 2.45, + "grad_norm": 0.5390625, + "learning_rate": 7.815681081441112e-05, + "loss": 0.2963, + "step": 18378 + }, + { + "epoch": 2.45, + "grad_norm": 0.65625, + "learning_rate": 7.81454471472458e-05, + "loss": 0.1691, + "step": 18379 + }, + { + "epoch": 2.45, + "grad_norm": 0.54296875, + "learning_rate": 7.813408377644228e-05, + "loss": 0.2932, + "step": 18380 + }, + { + "epoch": 2.45, + "grad_norm": 0.62890625, + "learning_rate": 7.812272070215458e-05, + "loss": 0.4682, + "step": 18381 + }, + { + "epoch": 2.45, + "grad_norm": 0.5703125, + "learning_rate": 7.811135792453685e-05, + "loss": 0.4447, + "step": 18382 + }, + { + "epoch": 2.45, + "grad_norm": 0.78125, + "learning_rate": 7.809999544374314e-05, + "loss": 0.5197, + "step": 18383 + }, + { + "epoch": 2.45, + "grad_norm": 0.5234375, + "learning_rate": 7.808863325992757e-05, + "loss": 0.3561, + "step": 18384 + }, + { + "epoch": 2.45, + "grad_norm": 0.56640625, + "learning_rate": 7.807727137324417e-05, + "loss": 0.3041, + "step": 18385 + }, + { + "epoch": 2.45, + "grad_norm": 0.8046875, + "learning_rate": 7.806590978384703e-05, + "loss": 0.443, + "step": 18386 + }, + { + "epoch": 2.45, + "grad_norm": 0.53125, + "learning_rate": 7.805454849189026e-05, + "loss": 0.3414, + "step": 18387 + }, + { + "epoch": 2.45, + "grad_norm": 0.48828125, + "learning_rate": 7.804318749752793e-05, + "loss": 0.2954, + "step": 18388 + }, + { + "epoch": 2.45, + "grad_norm": 0.494140625, + "learning_rate": 7.803182680091401e-05, + "loss": 0.207, + "step": 18389 + }, + { + "epoch": 2.45, + "grad_norm": 0.42578125, + "learning_rate": 7.802046640220264e-05, + "loss": 0.1312, + "step": 18390 + }, + { + "epoch": 2.45, + "grad_norm": 0.69140625, + "learning_rate": 7.800910630154784e-05, + "loss": 0.5168, + "step": 18391 + }, + { + "epoch": 2.45, + "grad_norm": 0.44921875, + "learning_rate": 7.799774649910368e-05, + "loss": 0.1534, + "step": 18392 + }, + { + "epoch": 2.45, + "grad_norm": 0.5, + "learning_rate": 7.79863869950242e-05, + "loss": 0.1812, + "step": 18393 + }, + { + "epoch": 2.45, + "grad_norm": 0.5859375, + "learning_rate": 7.797502778946344e-05, + "loss": 0.428, + "step": 18394 + }, + { + "epoch": 2.45, + "grad_norm": 0.5703125, + "learning_rate": 7.796366888257542e-05, + "loss": 0.3108, + "step": 18395 + }, + { + "epoch": 2.45, + "grad_norm": 0.76171875, + "learning_rate": 7.795231027451423e-05, + "loss": 0.241, + "step": 18396 + }, + { + "epoch": 2.45, + "grad_norm": 0.47265625, + "learning_rate": 7.794095196543383e-05, + "loss": 0.2245, + "step": 18397 + }, + { + "epoch": 2.46, + "grad_norm": 0.4609375, + "learning_rate": 7.79295939554883e-05, + "loss": 0.357, + "step": 18398 + }, + { + "epoch": 2.46, + "grad_norm": 0.671875, + "learning_rate": 7.791823624483161e-05, + "loss": 0.2127, + "step": 18399 + }, + { + "epoch": 2.46, + "grad_norm": 0.58203125, + "learning_rate": 7.790687883361787e-05, + "loss": 0.2085, + "step": 18400 + }, + { + "epoch": 2.46, + "grad_norm": 0.48828125, + "learning_rate": 7.789552172200101e-05, + "loss": 0.25, + "step": 18401 + }, + { + "epoch": 2.46, + "grad_norm": 0.57421875, + "learning_rate": 7.788416491013503e-05, + "loss": 0.404, + "step": 18402 + }, + { + "epoch": 2.46, + "grad_norm": 0.4921875, + "learning_rate": 7.787280839817399e-05, + "loss": 0.2469, + "step": 18403 + }, + { + "epoch": 2.46, + "grad_norm": 0.8515625, + "learning_rate": 7.786145218627184e-05, + "loss": 0.4835, + "step": 18404 + }, + { + "epoch": 2.46, + "grad_norm": 0.466796875, + "learning_rate": 7.785009627458263e-05, + "loss": 0.1821, + "step": 18405 + }, + { + "epoch": 2.46, + "grad_norm": 0.49609375, + "learning_rate": 7.783874066326033e-05, + "loss": 0.3723, + "step": 18406 + }, + { + "epoch": 2.46, + "grad_norm": 0.55078125, + "learning_rate": 7.782738535245894e-05, + "loss": 0.2787, + "step": 18407 + }, + { + "epoch": 2.46, + "grad_norm": 0.65234375, + "learning_rate": 7.781603034233241e-05, + "loss": 0.4468, + "step": 18408 + }, + { + "epoch": 2.46, + "grad_norm": 0.6953125, + "learning_rate": 7.780467563303478e-05, + "loss": 0.4271, + "step": 18409 + }, + { + "epoch": 2.46, + "grad_norm": 0.447265625, + "learning_rate": 7.779332122471995e-05, + "loss": 0.2946, + "step": 18410 + }, + { + "epoch": 2.46, + "grad_norm": 0.5078125, + "learning_rate": 7.778196711754194e-05, + "loss": 0.2948, + "step": 18411 + }, + { + "epoch": 2.46, + "grad_norm": 0.64453125, + "learning_rate": 7.77706133116548e-05, + "loss": 0.2948, + "step": 18412 + }, + { + "epoch": 2.46, + "grad_norm": 0.6484375, + "learning_rate": 7.775925980721232e-05, + "loss": 0.4834, + "step": 18413 + }, + { + "epoch": 2.46, + "grad_norm": 0.7265625, + "learning_rate": 7.774790660436858e-05, + "loss": 0.4165, + "step": 18414 + }, + { + "epoch": 2.46, + "grad_norm": 0.466796875, + "learning_rate": 7.77365537032775e-05, + "loss": 0.2683, + "step": 18415 + }, + { + "epoch": 2.46, + "grad_norm": 0.69140625, + "learning_rate": 7.772520110409303e-05, + "loss": 0.4895, + "step": 18416 + }, + { + "epoch": 2.46, + "grad_norm": 0.3984375, + "learning_rate": 7.771384880696916e-05, + "loss": 0.2377, + "step": 18417 + }, + { + "epoch": 2.46, + "grad_norm": 0.287109375, + "learning_rate": 7.770249681205976e-05, + "loss": 0.0738, + "step": 18418 + }, + { + "epoch": 2.46, + "grad_norm": 0.6015625, + "learning_rate": 7.769114511951886e-05, + "loss": 0.3127, + "step": 18419 + }, + { + "epoch": 2.46, + "grad_norm": 0.4765625, + "learning_rate": 7.767979372950031e-05, + "loss": 0.2693, + "step": 18420 + }, + { + "epoch": 2.46, + "grad_norm": 0.5546875, + "learning_rate": 7.766844264215812e-05, + "loss": 0.5308, + "step": 18421 + }, + { + "epoch": 2.46, + "grad_norm": 0.61328125, + "learning_rate": 7.765709185764614e-05, + "loss": 0.3997, + "step": 18422 + }, + { + "epoch": 2.46, + "grad_norm": 0.58203125, + "learning_rate": 7.764574137611838e-05, + "loss": 0.3964, + "step": 18423 + }, + { + "epoch": 2.46, + "grad_norm": 0.6015625, + "learning_rate": 7.763439119772872e-05, + "loss": 0.1945, + "step": 18424 + }, + { + "epoch": 2.46, + "grad_norm": 0.60546875, + "learning_rate": 7.762304132263101e-05, + "loss": 0.5778, + "step": 18425 + }, + { + "epoch": 2.46, + "grad_norm": 0.7421875, + "learning_rate": 7.761169175097927e-05, + "loss": 0.7042, + "step": 18426 + }, + { + "epoch": 2.46, + "grad_norm": 0.494140625, + "learning_rate": 7.760034248292733e-05, + "loss": 0.2764, + "step": 18427 + }, + { + "epoch": 2.46, + "grad_norm": 0.8359375, + "learning_rate": 7.758899351862913e-05, + "loss": 0.54, + "step": 18428 + }, + { + "epoch": 2.46, + "grad_norm": 0.466796875, + "learning_rate": 7.757764485823855e-05, + "loss": 0.2498, + "step": 18429 + }, + { + "epoch": 2.46, + "grad_norm": 0.69140625, + "learning_rate": 7.756629650190951e-05, + "loss": 0.1924, + "step": 18430 + }, + { + "epoch": 2.46, + "grad_norm": 0.60546875, + "learning_rate": 7.75549484497959e-05, + "loss": 0.4658, + "step": 18431 + }, + { + "epoch": 2.46, + "grad_norm": 0.421875, + "learning_rate": 7.754360070205158e-05, + "loss": 0.2171, + "step": 18432 + }, + { + "epoch": 2.46, + "grad_norm": 0.5703125, + "learning_rate": 7.753225325883044e-05, + "loss": 0.3474, + "step": 18433 + }, + { + "epoch": 2.46, + "grad_norm": 0.470703125, + "learning_rate": 7.752090612028635e-05, + "loss": 0.331, + "step": 18434 + }, + { + "epoch": 2.46, + "grad_norm": 0.59375, + "learning_rate": 7.750955928657322e-05, + "loss": 0.2191, + "step": 18435 + }, + { + "epoch": 2.46, + "grad_norm": 0.5859375, + "learning_rate": 7.749821275784495e-05, + "loss": 0.1572, + "step": 18436 + }, + { + "epoch": 2.46, + "grad_norm": 0.54296875, + "learning_rate": 7.74868665342553e-05, + "loss": 0.2216, + "step": 18437 + }, + { + "epoch": 2.46, + "grad_norm": 0.455078125, + "learning_rate": 7.74755206159582e-05, + "loss": 0.332, + "step": 18438 + }, + { + "epoch": 2.46, + "grad_norm": 0.67578125, + "learning_rate": 7.746417500310748e-05, + "loss": 0.427, + "step": 18439 + }, + { + "epoch": 2.46, + "grad_norm": 0.427734375, + "learning_rate": 7.745282969585704e-05, + "loss": 0.2884, + "step": 18440 + }, + { + "epoch": 2.46, + "grad_norm": 0.6015625, + "learning_rate": 7.744148469436066e-05, + "loss": 0.2189, + "step": 18441 + }, + { + "epoch": 2.46, + "grad_norm": 0.474609375, + "learning_rate": 7.743013999877227e-05, + "loss": 0.4476, + "step": 18442 + }, + { + "epoch": 2.46, + "grad_norm": 0.52734375, + "learning_rate": 7.741879560924564e-05, + "loss": 0.3623, + "step": 18443 + }, + { + "epoch": 2.46, + "grad_norm": 0.59765625, + "learning_rate": 7.740745152593464e-05, + "loss": 0.2791, + "step": 18444 + }, + { + "epoch": 2.46, + "grad_norm": 0.69921875, + "learning_rate": 7.739610774899308e-05, + "loss": 0.4712, + "step": 18445 + }, + { + "epoch": 2.46, + "grad_norm": 0.5390625, + "learning_rate": 7.738476427857484e-05, + "loss": 0.2284, + "step": 18446 + }, + { + "epoch": 2.46, + "grad_norm": 0.62890625, + "learning_rate": 7.737342111483368e-05, + "loss": 0.3645, + "step": 18447 + }, + { + "epoch": 2.46, + "grad_norm": 0.65625, + "learning_rate": 7.736207825792349e-05, + "loss": 0.5797, + "step": 18448 + }, + { + "epoch": 2.46, + "grad_norm": 0.53515625, + "learning_rate": 7.735073570799805e-05, + "loss": 0.248, + "step": 18449 + }, + { + "epoch": 2.46, + "grad_norm": 0.435546875, + "learning_rate": 7.733939346521114e-05, + "loss": 0.2524, + "step": 18450 + }, + { + "epoch": 2.46, + "grad_norm": 0.56640625, + "learning_rate": 7.73280515297166e-05, + "loss": 0.3638, + "step": 18451 + }, + { + "epoch": 2.46, + "grad_norm": 0.296875, + "learning_rate": 7.731670990166821e-05, + "loss": 0.1081, + "step": 18452 + }, + { + "epoch": 2.46, + "grad_norm": 0.8046875, + "learning_rate": 7.730536858121985e-05, + "loss": 0.6135, + "step": 18453 + }, + { + "epoch": 2.46, + "grad_norm": 0.7578125, + "learning_rate": 7.729402756852519e-05, + "loss": 0.195, + "step": 18454 + }, + { + "epoch": 2.46, + "grad_norm": 0.5546875, + "learning_rate": 7.728268686373814e-05, + "loss": 0.3288, + "step": 18455 + }, + { + "epoch": 2.46, + "grad_norm": 0.52734375, + "learning_rate": 7.72713464670124e-05, + "loss": 0.3483, + "step": 18456 + }, + { + "epoch": 2.46, + "grad_norm": 0.6484375, + "learning_rate": 7.726000637850181e-05, + "loss": 0.3396, + "step": 18457 + }, + { + "epoch": 2.46, + "grad_norm": 0.419921875, + "learning_rate": 7.724866659836012e-05, + "loss": 0.2364, + "step": 18458 + }, + { + "epoch": 2.46, + "grad_norm": 0.423828125, + "learning_rate": 7.723732712674109e-05, + "loss": 0.1999, + "step": 18459 + }, + { + "epoch": 2.46, + "grad_norm": 0.7578125, + "learning_rate": 7.72259879637986e-05, + "loss": 0.4192, + "step": 18460 + }, + { + "epoch": 2.46, + "grad_norm": 0.7734375, + "learning_rate": 7.721464910968627e-05, + "loss": 0.3121, + "step": 18461 + }, + { + "epoch": 2.46, + "grad_norm": 0.609375, + "learning_rate": 7.72033105645579e-05, + "loss": 0.3809, + "step": 18462 + }, + { + "epoch": 2.46, + "grad_norm": 0.50390625, + "learning_rate": 7.719197232856731e-05, + "loss": 0.1997, + "step": 18463 + }, + { + "epoch": 2.46, + "grad_norm": 0.6875, + "learning_rate": 7.718063440186819e-05, + "loss": 0.6112, + "step": 18464 + }, + { + "epoch": 2.46, + "grad_norm": 0.796875, + "learning_rate": 7.716929678461432e-05, + "loss": 0.2179, + "step": 18465 + }, + { + "epoch": 2.46, + "grad_norm": 0.5703125, + "learning_rate": 7.715795947695942e-05, + "loss": 0.2695, + "step": 18466 + }, + { + "epoch": 2.46, + "grad_norm": 0.474609375, + "learning_rate": 7.714662247905727e-05, + "loss": 0.2227, + "step": 18467 + }, + { + "epoch": 2.46, + "grad_norm": 0.59765625, + "learning_rate": 7.713528579106158e-05, + "loss": 0.3501, + "step": 18468 + }, + { + "epoch": 2.46, + "grad_norm": 0.59765625, + "learning_rate": 7.712394941312609e-05, + "loss": 0.397, + "step": 18469 + }, + { + "epoch": 2.46, + "grad_norm": 0.5703125, + "learning_rate": 7.711261334540451e-05, + "loss": 0.5009, + "step": 18470 + }, + { + "epoch": 2.46, + "grad_norm": 0.890625, + "learning_rate": 7.710127758805061e-05, + "loss": 0.327, + "step": 18471 + }, + { + "epoch": 2.46, + "grad_norm": 0.640625, + "learning_rate": 7.708994214121808e-05, + "loss": 0.2346, + "step": 18472 + }, + { + "epoch": 2.47, + "grad_norm": 0.5234375, + "learning_rate": 7.707860700506066e-05, + "loss": 0.3795, + "step": 18473 + }, + { + "epoch": 2.47, + "grad_norm": 0.431640625, + "learning_rate": 7.706727217973201e-05, + "loss": 0.1347, + "step": 18474 + }, + { + "epoch": 2.47, + "grad_norm": 0.6796875, + "learning_rate": 7.705593766538586e-05, + "loss": 0.2803, + "step": 18475 + }, + { + "epoch": 2.47, + "grad_norm": 0.57421875, + "learning_rate": 7.704460346217593e-05, + "loss": 0.1951, + "step": 18476 + }, + { + "epoch": 2.47, + "grad_norm": 0.703125, + "learning_rate": 7.703326957025591e-05, + "loss": 0.3965, + "step": 18477 + }, + { + "epoch": 2.47, + "grad_norm": 0.625, + "learning_rate": 7.702193598977949e-05, + "loss": 0.2195, + "step": 18478 + }, + { + "epoch": 2.47, + "grad_norm": 0.671875, + "learning_rate": 7.701060272090037e-05, + "loss": 0.3416, + "step": 18479 + }, + { + "epoch": 2.47, + "grad_norm": 0.609375, + "learning_rate": 7.699926976377223e-05, + "loss": 0.2766, + "step": 18480 + }, + { + "epoch": 2.47, + "grad_norm": 0.484375, + "learning_rate": 7.698793711854875e-05, + "loss": 0.2516, + "step": 18481 + }, + { + "epoch": 2.47, + "grad_norm": 0.51953125, + "learning_rate": 7.697660478538363e-05, + "loss": 0.3314, + "step": 18482 + }, + { + "epoch": 2.47, + "grad_norm": 0.58203125, + "learning_rate": 7.696527276443049e-05, + "loss": 0.4441, + "step": 18483 + }, + { + "epoch": 2.47, + "grad_norm": 0.63671875, + "learning_rate": 7.695394105584304e-05, + "loss": 0.4406, + "step": 18484 + }, + { + "epoch": 2.47, + "grad_norm": 0.47265625, + "learning_rate": 7.6942609659775e-05, + "loss": 0.2484, + "step": 18485 + }, + { + "epoch": 2.47, + "grad_norm": 0.64453125, + "learning_rate": 7.693127857637991e-05, + "loss": 0.1768, + "step": 18486 + }, + { + "epoch": 2.47, + "grad_norm": 0.62890625, + "learning_rate": 7.69199478058115e-05, + "loss": 0.3174, + "step": 18487 + }, + { + "epoch": 2.47, + "grad_norm": 0.6171875, + "learning_rate": 7.690861734822343e-05, + "loss": 0.3285, + "step": 18488 + }, + { + "epoch": 2.47, + "grad_norm": 0.5625, + "learning_rate": 7.689728720376929e-05, + "loss": 0.4075, + "step": 18489 + }, + { + "epoch": 2.47, + "grad_norm": 1.0078125, + "learning_rate": 7.68859573726028e-05, + "loss": 0.5218, + "step": 18490 + }, + { + "epoch": 2.47, + "grad_norm": 0.64453125, + "learning_rate": 7.687462785487754e-05, + "loss": 0.366, + "step": 18491 + }, + { + "epoch": 2.47, + "grad_norm": 0.51953125, + "learning_rate": 7.68632986507472e-05, + "loss": 0.3948, + "step": 18492 + }, + { + "epoch": 2.47, + "grad_norm": 0.578125, + "learning_rate": 7.685196976036533e-05, + "loss": 0.3028, + "step": 18493 + }, + { + "epoch": 2.47, + "grad_norm": 0.5078125, + "learning_rate": 7.684064118388564e-05, + "loss": 0.3225, + "step": 18494 + }, + { + "epoch": 2.47, + "grad_norm": 0.703125, + "learning_rate": 7.682931292146171e-05, + "loss": 0.2624, + "step": 18495 + }, + { + "epoch": 2.47, + "grad_norm": 0.474609375, + "learning_rate": 7.681798497324716e-05, + "loss": 0.2562, + "step": 18496 + }, + { + "epoch": 2.47, + "grad_norm": 0.52734375, + "learning_rate": 7.680665733939567e-05, + "loss": 0.3253, + "step": 18497 + }, + { + "epoch": 2.47, + "grad_norm": 0.68359375, + "learning_rate": 7.679533002006073e-05, + "loss": 0.5126, + "step": 18498 + }, + { + "epoch": 2.47, + "grad_norm": 0.65625, + "learning_rate": 7.678400301539603e-05, + "loss": 0.2999, + "step": 18499 + }, + { + "epoch": 2.47, + "grad_norm": 0.515625, + "learning_rate": 7.677267632555512e-05, + "loss": 0.3019, + "step": 18500 + }, + { + "epoch": 2.47, + "grad_norm": 0.5625, + "learning_rate": 7.676134995069167e-05, + "loss": 0.3088, + "step": 18501 + }, + { + "epoch": 2.47, + "grad_norm": 0.69140625, + "learning_rate": 7.67500238909592e-05, + "loss": 0.3898, + "step": 18502 + }, + { + "epoch": 2.47, + "grad_norm": 0.59375, + "learning_rate": 7.673869814651133e-05, + "loss": 0.4617, + "step": 18503 + }, + { + "epoch": 2.47, + "grad_norm": 0.6484375, + "learning_rate": 7.672737271750164e-05, + "loss": 0.4092, + "step": 18504 + }, + { + "epoch": 2.47, + "grad_norm": 0.73046875, + "learning_rate": 7.671604760408372e-05, + "loss": 0.5169, + "step": 18505 + }, + { + "epoch": 2.47, + "grad_norm": 0.55078125, + "learning_rate": 7.670472280641112e-05, + "loss": 0.5011, + "step": 18506 + }, + { + "epoch": 2.47, + "grad_norm": 0.6171875, + "learning_rate": 7.669339832463744e-05, + "loss": 0.2495, + "step": 18507 + }, + { + "epoch": 2.47, + "grad_norm": 0.451171875, + "learning_rate": 7.668207415891624e-05, + "loss": 0.3593, + "step": 18508 + }, + { + "epoch": 2.47, + "grad_norm": 0.640625, + "learning_rate": 7.667075030940111e-05, + "loss": 0.3182, + "step": 18509 + }, + { + "epoch": 2.47, + "grad_norm": 0.78515625, + "learning_rate": 7.665942677624554e-05, + "loss": 0.4924, + "step": 18510 + }, + { + "epoch": 2.47, + "grad_norm": 0.8125, + "learning_rate": 7.664810355960314e-05, + "loss": 0.4805, + "step": 18511 + }, + { + "epoch": 2.47, + "grad_norm": 0.640625, + "learning_rate": 7.663678065962743e-05, + "loss": 0.3973, + "step": 18512 + }, + { + "epoch": 2.47, + "grad_norm": 0.482421875, + "learning_rate": 7.6625458076472e-05, + "loss": 0.2071, + "step": 18513 + }, + { + "epoch": 2.47, + "grad_norm": 0.490234375, + "learning_rate": 7.661413581029031e-05, + "loss": 0.3337, + "step": 18514 + }, + { + "epoch": 2.47, + "grad_norm": 0.54296875, + "learning_rate": 7.660281386123598e-05, + "loss": 0.3324, + "step": 18515 + }, + { + "epoch": 2.47, + "grad_norm": 0.494140625, + "learning_rate": 7.65914922294625e-05, + "loss": 0.2993, + "step": 18516 + }, + { + "epoch": 2.47, + "grad_norm": 0.498046875, + "learning_rate": 7.65801709151234e-05, + "loss": 0.3871, + "step": 18517 + }, + { + "epoch": 2.47, + "grad_norm": 0.54296875, + "learning_rate": 7.656884991837222e-05, + "loss": 0.4047, + "step": 18518 + }, + { + "epoch": 2.47, + "grad_norm": 0.88671875, + "learning_rate": 7.655752923936249e-05, + "loss": 0.5936, + "step": 18519 + }, + { + "epoch": 2.47, + "grad_norm": 0.490234375, + "learning_rate": 7.654620887824769e-05, + "loss": 0.2474, + "step": 18520 + }, + { + "epoch": 2.47, + "grad_norm": 0.65234375, + "learning_rate": 7.653488883518138e-05, + "loss": 0.2487, + "step": 18521 + }, + { + "epoch": 2.47, + "grad_norm": 0.5625, + "learning_rate": 7.652356911031702e-05, + "loss": 0.4138, + "step": 18522 + }, + { + "epoch": 2.47, + "grad_norm": 0.56640625, + "learning_rate": 7.651224970380812e-05, + "loss": 0.4367, + "step": 18523 + }, + { + "epoch": 2.47, + "grad_norm": 0.6796875, + "learning_rate": 7.650093061580819e-05, + "loss": 0.3752, + "step": 18524 + }, + { + "epoch": 2.47, + "grad_norm": 0.98828125, + "learning_rate": 7.648961184647072e-05, + "loss": 0.4682, + "step": 18525 + }, + { + "epoch": 2.47, + "grad_norm": 0.4765625, + "learning_rate": 7.647829339594921e-05, + "loss": 0.3703, + "step": 18526 + }, + { + "epoch": 2.47, + "grad_norm": 0.546875, + "learning_rate": 7.646697526439712e-05, + "loss": 0.4276, + "step": 18527 + }, + { + "epoch": 2.47, + "grad_norm": 0.52734375, + "learning_rate": 7.645565745196797e-05, + "loss": 0.3595, + "step": 18528 + }, + { + "epoch": 2.47, + "grad_norm": 0.578125, + "learning_rate": 7.64443399588152e-05, + "loss": 0.4818, + "step": 18529 + }, + { + "epoch": 2.47, + "grad_norm": 0.75390625, + "learning_rate": 7.643302278509233e-05, + "loss": 0.3585, + "step": 18530 + }, + { + "epoch": 2.47, + "grad_norm": 0.46875, + "learning_rate": 7.642170593095275e-05, + "loss": 0.146, + "step": 18531 + }, + { + "epoch": 2.47, + "grad_norm": 0.61328125, + "learning_rate": 7.641038939654998e-05, + "loss": 0.4385, + "step": 18532 + }, + { + "epoch": 2.47, + "grad_norm": 0.734375, + "learning_rate": 7.639907318203754e-05, + "loss": 0.3472, + "step": 18533 + }, + { + "epoch": 2.47, + "grad_norm": 0.482421875, + "learning_rate": 7.638775728756874e-05, + "loss": 0.2291, + "step": 18534 + }, + { + "epoch": 2.47, + "grad_norm": 0.5546875, + "learning_rate": 7.637644171329711e-05, + "loss": 0.4576, + "step": 18535 + }, + { + "epoch": 2.47, + "grad_norm": 0.8046875, + "learning_rate": 7.636512645937611e-05, + "loss": 0.4809, + "step": 18536 + }, + { + "epoch": 2.47, + "grad_norm": 0.59765625, + "learning_rate": 7.635381152595915e-05, + "loss": 0.3027, + "step": 18537 + }, + { + "epoch": 2.47, + "grad_norm": 0.7265625, + "learning_rate": 7.63424969131997e-05, + "loss": 0.4729, + "step": 18538 + }, + { + "epoch": 2.47, + "grad_norm": 0.6953125, + "learning_rate": 7.633118262125115e-05, + "loss": 0.327, + "step": 18539 + }, + { + "epoch": 2.47, + "grad_norm": 0.421875, + "learning_rate": 7.631986865026698e-05, + "loss": 0.2165, + "step": 18540 + }, + { + "epoch": 2.47, + "grad_norm": 0.6171875, + "learning_rate": 7.630855500040057e-05, + "loss": 0.493, + "step": 18541 + }, + { + "epoch": 2.47, + "grad_norm": 0.5234375, + "learning_rate": 7.629724167180536e-05, + "loss": 0.3529, + "step": 18542 + }, + { + "epoch": 2.47, + "grad_norm": 0.53125, + "learning_rate": 7.628592866463478e-05, + "loss": 0.2791, + "step": 18543 + }, + { + "epoch": 2.47, + "grad_norm": 0.412109375, + "learning_rate": 7.627461597904222e-05, + "loss": 0.207, + "step": 18544 + }, + { + "epoch": 2.47, + "grad_norm": 0.65625, + "learning_rate": 7.626330361518113e-05, + "loss": 0.407, + "step": 18545 + }, + { + "epoch": 2.47, + "grad_norm": 0.70703125, + "learning_rate": 7.625199157320482e-05, + "loss": 0.4029, + "step": 18546 + }, + { + "epoch": 2.47, + "grad_norm": 0.5078125, + "learning_rate": 7.624067985326678e-05, + "loss": 0.391, + "step": 18547 + }, + { + "epoch": 2.48, + "grad_norm": 0.59375, + "learning_rate": 7.622936845552034e-05, + "loss": 0.3005, + "step": 18548 + }, + { + "epoch": 2.48, + "grad_norm": 0.69140625, + "learning_rate": 7.621805738011895e-05, + "loss": 0.2557, + "step": 18549 + }, + { + "epoch": 2.48, + "grad_norm": 0.478515625, + "learning_rate": 7.620674662721594e-05, + "loss": 0.3475, + "step": 18550 + }, + { + "epoch": 2.48, + "grad_norm": 0.474609375, + "learning_rate": 7.619543619696474e-05, + "loss": 0.2492, + "step": 18551 + }, + { + "epoch": 2.48, + "grad_norm": 0.58984375, + "learning_rate": 7.618412608951867e-05, + "loss": 0.3521, + "step": 18552 + }, + { + "epoch": 2.48, + "grad_norm": 0.57421875, + "learning_rate": 7.617281630503117e-05, + "loss": 0.4865, + "step": 18553 + }, + { + "epoch": 2.48, + "grad_norm": 0.66015625, + "learning_rate": 7.616150684365554e-05, + "loss": 0.2997, + "step": 18554 + }, + { + "epoch": 2.48, + "grad_norm": 0.5703125, + "learning_rate": 7.615019770554521e-05, + "loss": 0.4125, + "step": 18555 + }, + { + "epoch": 2.48, + "grad_norm": 0.63671875, + "learning_rate": 7.613888889085348e-05, + "loss": 0.3459, + "step": 18556 + }, + { + "epoch": 2.48, + "grad_norm": 0.52734375, + "learning_rate": 7.612758039973372e-05, + "loss": 0.1965, + "step": 18557 + }, + { + "epoch": 2.48, + "grad_norm": 0.625, + "learning_rate": 7.611627223233937e-05, + "loss": 0.324, + "step": 18558 + }, + { + "epoch": 2.48, + "grad_norm": 0.68359375, + "learning_rate": 7.610496438882362e-05, + "loss": 0.2262, + "step": 18559 + }, + { + "epoch": 2.48, + "grad_norm": 0.435546875, + "learning_rate": 7.60936568693399e-05, + "loss": 0.2486, + "step": 18560 + }, + { + "epoch": 2.48, + "grad_norm": 0.53515625, + "learning_rate": 7.608234967404155e-05, + "loss": 0.2997, + "step": 18561 + }, + { + "epoch": 2.48, + "grad_norm": 0.443359375, + "learning_rate": 7.607104280308188e-05, + "loss": 0.186, + "step": 18562 + }, + { + "epoch": 2.48, + "grad_norm": 0.73046875, + "learning_rate": 7.605973625661424e-05, + "loss": 0.3199, + "step": 18563 + }, + { + "epoch": 2.48, + "grad_norm": 0.466796875, + "learning_rate": 7.604843003479192e-05, + "loss": 0.2013, + "step": 18564 + }, + { + "epoch": 2.48, + "grad_norm": 0.50390625, + "learning_rate": 7.603712413776827e-05, + "loss": 0.3367, + "step": 18565 + }, + { + "epoch": 2.48, + "grad_norm": 0.60546875, + "learning_rate": 7.60258185656966e-05, + "loss": 0.5262, + "step": 18566 + }, + { + "epoch": 2.48, + "grad_norm": 0.66796875, + "learning_rate": 7.601451331873021e-05, + "loss": 0.409, + "step": 18567 + }, + { + "epoch": 2.48, + "grad_norm": 0.58203125, + "learning_rate": 7.60032083970224e-05, + "loss": 0.4181, + "step": 18568 + }, + { + "epoch": 2.48, + "grad_norm": 0.69921875, + "learning_rate": 7.599190380072652e-05, + "loss": 0.5118, + "step": 18569 + }, + { + "epoch": 2.48, + "grad_norm": 0.5, + "learning_rate": 7.598059952999584e-05, + "loss": 0.3964, + "step": 18570 + }, + { + "epoch": 2.48, + "grad_norm": 0.61328125, + "learning_rate": 7.596929558498362e-05, + "loss": 0.1943, + "step": 18571 + }, + { + "epoch": 2.48, + "grad_norm": 0.62109375, + "learning_rate": 7.595799196584318e-05, + "loss": 0.2194, + "step": 18572 + }, + { + "epoch": 2.48, + "grad_norm": 0.71875, + "learning_rate": 7.594668867272778e-05, + "loss": 0.5194, + "step": 18573 + }, + { + "epoch": 2.48, + "grad_norm": 0.76171875, + "learning_rate": 7.593538570579074e-05, + "loss": 0.4477, + "step": 18574 + }, + { + "epoch": 2.48, + "grad_norm": 0.58203125, + "learning_rate": 7.59240830651853e-05, + "loss": 0.3696, + "step": 18575 + }, + { + "epoch": 2.48, + "grad_norm": 0.6796875, + "learning_rate": 7.591278075106477e-05, + "loss": 0.307, + "step": 18576 + }, + { + "epoch": 2.48, + "grad_norm": 0.6015625, + "learning_rate": 7.590147876358236e-05, + "loss": 0.3604, + "step": 18577 + }, + { + "epoch": 2.48, + "grad_norm": 0.6328125, + "learning_rate": 7.589017710289139e-05, + "loss": 0.5346, + "step": 18578 + }, + { + "epoch": 2.48, + "grad_norm": 0.53515625, + "learning_rate": 7.587887576914506e-05, + "loss": 0.4277, + "step": 18579 + }, + { + "epoch": 2.48, + "grad_norm": 0.443359375, + "learning_rate": 7.586757476249667e-05, + "loss": 0.2359, + "step": 18580 + }, + { + "epoch": 2.48, + "grad_norm": 0.427734375, + "learning_rate": 7.585627408309948e-05, + "loss": 0.1281, + "step": 18581 + }, + { + "epoch": 2.48, + "grad_norm": 0.828125, + "learning_rate": 7.58449737311067e-05, + "loss": 0.3709, + "step": 18582 + }, + { + "epoch": 2.48, + "grad_norm": 0.546875, + "learning_rate": 7.583367370667158e-05, + "loss": 0.392, + "step": 18583 + }, + { + "epoch": 2.48, + "grad_norm": 0.392578125, + "learning_rate": 7.582237400994734e-05, + "loss": 0.2265, + "step": 18584 + }, + { + "epoch": 2.48, + "grad_norm": 0.494140625, + "learning_rate": 7.581107464108723e-05, + "loss": 0.2187, + "step": 18585 + }, + { + "epoch": 2.48, + "grad_norm": 0.46484375, + "learning_rate": 7.579977560024447e-05, + "loss": 0.2099, + "step": 18586 + }, + { + "epoch": 2.48, + "grad_norm": 0.431640625, + "learning_rate": 7.578847688757226e-05, + "loss": 0.2533, + "step": 18587 + }, + { + "epoch": 2.48, + "grad_norm": 0.5546875, + "learning_rate": 7.577717850322388e-05, + "loss": 0.5731, + "step": 18588 + }, + { + "epoch": 2.48, + "grad_norm": 0.8125, + "learning_rate": 7.57658804473525e-05, + "loss": 0.361, + "step": 18589 + }, + { + "epoch": 2.48, + "grad_norm": 0.51171875, + "learning_rate": 7.575458272011133e-05, + "loss": 0.2684, + "step": 18590 + }, + { + "epoch": 2.48, + "grad_norm": 0.61328125, + "learning_rate": 7.574328532165356e-05, + "loss": 0.3716, + "step": 18591 + }, + { + "epoch": 2.48, + "grad_norm": 0.9375, + "learning_rate": 7.573198825213244e-05, + "loss": 0.3606, + "step": 18592 + }, + { + "epoch": 2.48, + "grad_norm": 0.625, + "learning_rate": 7.572069151170112e-05, + "loss": 0.247, + "step": 18593 + }, + { + "epoch": 2.48, + "grad_norm": 0.6640625, + "learning_rate": 7.570939510051283e-05, + "loss": 0.4805, + "step": 18594 + }, + { + "epoch": 2.48, + "grad_norm": 0.671875, + "learning_rate": 7.569809901872072e-05, + "loss": 0.306, + "step": 18595 + }, + { + "epoch": 2.48, + "grad_norm": 0.42578125, + "learning_rate": 7.568680326647794e-05, + "loss": 0.2537, + "step": 18596 + }, + { + "epoch": 2.48, + "grad_norm": 0.62890625, + "learning_rate": 7.567550784393777e-05, + "loss": 0.4076, + "step": 18597 + }, + { + "epoch": 2.48, + "grad_norm": 0.62109375, + "learning_rate": 7.566421275125329e-05, + "loss": 0.3241, + "step": 18598 + }, + { + "epoch": 2.48, + "grad_norm": 0.56640625, + "learning_rate": 7.56529179885777e-05, + "loss": 0.4535, + "step": 18599 + }, + { + "epoch": 2.48, + "grad_norm": 0.7109375, + "learning_rate": 7.564162355606418e-05, + "loss": 0.2632, + "step": 18600 + }, + { + "epoch": 2.48, + "grad_norm": 0.59765625, + "learning_rate": 7.563032945386589e-05, + "loss": 0.2364, + "step": 18601 + }, + { + "epoch": 2.48, + "grad_norm": 0.66796875, + "learning_rate": 7.561903568213595e-05, + "loss": 0.3095, + "step": 18602 + }, + { + "epoch": 2.48, + "grad_norm": 0.6328125, + "learning_rate": 7.560774224102756e-05, + "loss": 0.3501, + "step": 18603 + }, + { + "epoch": 2.48, + "grad_norm": 0.70703125, + "learning_rate": 7.55964491306938e-05, + "loss": 0.469, + "step": 18604 + }, + { + "epoch": 2.48, + "grad_norm": 0.486328125, + "learning_rate": 7.558515635128786e-05, + "loss": 0.1726, + "step": 18605 + }, + { + "epoch": 2.48, + "grad_norm": 0.384765625, + "learning_rate": 7.557386390296295e-05, + "loss": 0.1724, + "step": 18606 + }, + { + "epoch": 2.48, + "grad_norm": 0.71484375, + "learning_rate": 7.556257178587205e-05, + "loss": 0.4589, + "step": 18607 + }, + { + "epoch": 2.48, + "grad_norm": 0.55859375, + "learning_rate": 7.555128000016835e-05, + "loss": 0.3099, + "step": 18608 + }, + { + "epoch": 2.48, + "grad_norm": 0.72265625, + "learning_rate": 7.553998854600502e-05, + "loss": 0.6253, + "step": 18609 + }, + { + "epoch": 2.48, + "grad_norm": 0.52734375, + "learning_rate": 7.55286974235351e-05, + "loss": 0.4975, + "step": 18610 + }, + { + "epoch": 2.48, + "grad_norm": 0.5078125, + "learning_rate": 7.551740663291179e-05, + "loss": 0.2277, + "step": 18611 + }, + { + "epoch": 2.48, + "grad_norm": 0.5234375, + "learning_rate": 7.550611617428813e-05, + "loss": 0.2444, + "step": 18612 + }, + { + "epoch": 2.48, + "grad_norm": 0.67578125, + "learning_rate": 7.549482604781729e-05, + "loss": 0.3036, + "step": 18613 + }, + { + "epoch": 2.48, + "grad_norm": 0.640625, + "learning_rate": 7.548353625365229e-05, + "loss": 0.116, + "step": 18614 + }, + { + "epoch": 2.48, + "grad_norm": 0.703125, + "learning_rate": 7.54722467919463e-05, + "loss": 0.4594, + "step": 18615 + }, + { + "epoch": 2.48, + "grad_norm": 0.59765625, + "learning_rate": 7.546095766285238e-05, + "loss": 0.2598, + "step": 18616 + }, + { + "epoch": 2.48, + "grad_norm": 0.52734375, + "learning_rate": 7.544966886652364e-05, + "loss": 0.2091, + "step": 18617 + }, + { + "epoch": 2.48, + "grad_norm": 0.53125, + "learning_rate": 7.543838040311316e-05, + "loss": 0.2573, + "step": 18618 + }, + { + "epoch": 2.48, + "grad_norm": 0.6796875, + "learning_rate": 7.542709227277396e-05, + "loss": 0.2004, + "step": 18619 + }, + { + "epoch": 2.48, + "grad_norm": 0.625, + "learning_rate": 7.541580447565919e-05, + "loss": 0.4764, + "step": 18620 + }, + { + "epoch": 2.48, + "grad_norm": 0.71484375, + "learning_rate": 7.540451701192187e-05, + "loss": 0.5525, + "step": 18621 + }, + { + "epoch": 2.48, + "grad_norm": 0.66796875, + "learning_rate": 7.53932298817151e-05, + "loss": 0.5618, + "step": 18622 + }, + { + "epoch": 2.49, + "grad_norm": 0.5, + "learning_rate": 7.538194308519189e-05, + "loss": 0.2699, + "step": 18623 + }, + { + "epoch": 2.49, + "grad_norm": 0.48828125, + "learning_rate": 7.537065662250537e-05, + "loss": 0.4259, + "step": 18624 + }, + { + "epoch": 2.49, + "grad_norm": 0.46875, + "learning_rate": 7.535937049380853e-05, + "loss": 0.2141, + "step": 18625 + }, + { + "epoch": 2.49, + "grad_norm": 0.53125, + "learning_rate": 7.534808469925447e-05, + "loss": 0.2321, + "step": 18626 + }, + { + "epoch": 2.49, + "grad_norm": 0.5625, + "learning_rate": 7.533679923899616e-05, + "loss": 0.267, + "step": 18627 + }, + { + "epoch": 2.49, + "grad_norm": 0.5078125, + "learning_rate": 7.532551411318673e-05, + "loss": 0.2399, + "step": 18628 + }, + { + "epoch": 2.49, + "grad_norm": 0.76171875, + "learning_rate": 7.531422932197912e-05, + "loss": 0.3858, + "step": 18629 + }, + { + "epoch": 2.49, + "grad_norm": 0.515625, + "learning_rate": 7.530294486552646e-05, + "loss": 0.5731, + "step": 18630 + }, + { + "epoch": 2.49, + "grad_norm": 0.70703125, + "learning_rate": 7.529166074398169e-05, + "loss": 0.3662, + "step": 18631 + }, + { + "epoch": 2.49, + "grad_norm": 0.494140625, + "learning_rate": 7.528037695749783e-05, + "loss": 0.3751, + "step": 18632 + }, + { + "epoch": 2.49, + "grad_norm": 0.58203125, + "learning_rate": 7.526909350622793e-05, + "loss": 0.2354, + "step": 18633 + }, + { + "epoch": 2.49, + "grad_norm": 0.447265625, + "learning_rate": 7.525781039032502e-05, + "loss": 0.2578, + "step": 18634 + }, + { + "epoch": 2.49, + "grad_norm": 0.42578125, + "learning_rate": 7.524652760994207e-05, + "loss": 0.2221, + "step": 18635 + }, + { + "epoch": 2.49, + "grad_norm": 0.55859375, + "learning_rate": 7.523524516523209e-05, + "loss": 0.4686, + "step": 18636 + }, + { + "epoch": 2.49, + "grad_norm": 0.5859375, + "learning_rate": 7.522396305634807e-05, + "loss": 0.2341, + "step": 18637 + }, + { + "epoch": 2.49, + "grad_norm": 0.6953125, + "learning_rate": 7.521268128344303e-05, + "loss": 0.5469, + "step": 18638 + }, + { + "epoch": 2.49, + "grad_norm": 0.734375, + "learning_rate": 7.520139984666992e-05, + "loss": 0.6202, + "step": 18639 + }, + { + "epoch": 2.49, + "grad_norm": 0.578125, + "learning_rate": 7.519011874618177e-05, + "loss": 0.3158, + "step": 18640 + }, + { + "epoch": 2.49, + "grad_norm": 0.40234375, + "learning_rate": 7.51788379821315e-05, + "loss": 0.1215, + "step": 18641 + }, + { + "epoch": 2.49, + "grad_norm": 0.53125, + "learning_rate": 7.516755755467216e-05, + "loss": 0.2021, + "step": 18642 + }, + { + "epoch": 2.49, + "grad_norm": 0.7265625, + "learning_rate": 7.515627746395667e-05, + "loss": 0.3576, + "step": 18643 + }, + { + "epoch": 2.49, + "grad_norm": 0.6171875, + "learning_rate": 7.514499771013797e-05, + "loss": 0.3367, + "step": 18644 + }, + { + "epoch": 2.49, + "grad_norm": 0.5078125, + "learning_rate": 7.513371829336908e-05, + "loss": 0.4119, + "step": 18645 + }, + { + "epoch": 2.49, + "grad_norm": 0.57421875, + "learning_rate": 7.51224392138029e-05, + "loss": 0.21, + "step": 18646 + }, + { + "epoch": 2.49, + "grad_norm": 0.625, + "learning_rate": 7.511116047159244e-05, + "loss": 0.2818, + "step": 18647 + }, + { + "epoch": 2.49, + "grad_norm": 0.5703125, + "learning_rate": 7.509988206689059e-05, + "loss": 0.2926, + "step": 18648 + }, + { + "epoch": 2.49, + "grad_norm": 0.55078125, + "learning_rate": 7.508860399985035e-05, + "loss": 0.4205, + "step": 18649 + }, + { + "epoch": 2.49, + "grad_norm": 0.4921875, + "learning_rate": 7.50773262706246e-05, + "loss": 0.2604, + "step": 18650 + }, + { + "epoch": 2.49, + "grad_norm": 0.5, + "learning_rate": 7.506604887936632e-05, + "loss": 0.4864, + "step": 18651 + }, + { + "epoch": 2.49, + "grad_norm": 0.59765625, + "learning_rate": 7.50547718262284e-05, + "loss": 0.3989, + "step": 18652 + }, + { + "epoch": 2.49, + "grad_norm": 0.74609375, + "learning_rate": 7.504349511136379e-05, + "loss": 0.6602, + "step": 18653 + }, + { + "epoch": 2.49, + "grad_norm": 0.44921875, + "learning_rate": 7.503221873492542e-05, + "loss": 0.3458, + "step": 18654 + }, + { + "epoch": 2.49, + "grad_norm": 0.47265625, + "learning_rate": 7.502094269706621e-05, + "loss": 0.3169, + "step": 18655 + }, + { + "epoch": 2.49, + "grad_norm": 0.57421875, + "learning_rate": 7.5009666997939e-05, + "loss": 0.2504, + "step": 18656 + }, + { + "epoch": 2.49, + "grad_norm": 0.54296875, + "learning_rate": 7.499839163769678e-05, + "loss": 0.5106, + "step": 18657 + }, + { + "epoch": 2.49, + "grad_norm": 0.353515625, + "learning_rate": 7.498711661649237e-05, + "loss": 0.1919, + "step": 18658 + }, + { + "epoch": 2.49, + "grad_norm": 0.58203125, + "learning_rate": 7.497584193447875e-05, + "loss": 0.4062, + "step": 18659 + }, + { + "epoch": 2.49, + "grad_norm": 0.59765625, + "learning_rate": 7.496456759180875e-05, + "loss": 0.281, + "step": 18660 + }, + { + "epoch": 2.49, + "grad_norm": 0.53125, + "learning_rate": 7.495329358863531e-05, + "loss": 0.3219, + "step": 18661 + }, + { + "epoch": 2.49, + "grad_norm": 0.498046875, + "learning_rate": 7.494201992511125e-05, + "loss": 0.4401, + "step": 18662 + }, + { + "epoch": 2.49, + "grad_norm": 0.6875, + "learning_rate": 7.49307466013895e-05, + "loss": 0.3924, + "step": 18663 + }, + { + "epoch": 2.49, + "grad_norm": 0.8046875, + "learning_rate": 7.491947361762291e-05, + "loss": 0.4517, + "step": 18664 + }, + { + "epoch": 2.49, + "grad_norm": 1.03125, + "learning_rate": 7.490820097396437e-05, + "loss": 0.254, + "step": 18665 + }, + { + "epoch": 2.49, + "grad_norm": 0.421875, + "learning_rate": 7.48969286705667e-05, + "loss": 0.1628, + "step": 18666 + }, + { + "epoch": 2.49, + "grad_norm": 0.62109375, + "learning_rate": 7.488565670758284e-05, + "loss": 0.5193, + "step": 18667 + }, + { + "epoch": 2.49, + "grad_norm": 0.5078125, + "learning_rate": 7.487438508516558e-05, + "loss": 0.1793, + "step": 18668 + }, + { + "epoch": 2.49, + "grad_norm": 0.625, + "learning_rate": 7.486311380346774e-05, + "loss": 0.4321, + "step": 18669 + }, + { + "epoch": 2.49, + "grad_norm": 0.51171875, + "learning_rate": 7.485184286264224e-05, + "loss": 0.2987, + "step": 18670 + }, + { + "epoch": 2.49, + "grad_norm": 0.625, + "learning_rate": 7.484057226284187e-05, + "loss": 0.283, + "step": 18671 + }, + { + "epoch": 2.49, + "grad_norm": 0.5, + "learning_rate": 7.482930200421951e-05, + "loss": 0.1627, + "step": 18672 + }, + { + "epoch": 2.49, + "grad_norm": 0.6328125, + "learning_rate": 7.481803208692795e-05, + "loss": 0.4647, + "step": 18673 + }, + { + "epoch": 2.49, + "grad_norm": 0.53125, + "learning_rate": 7.480676251112006e-05, + "loss": 0.375, + "step": 18674 + }, + { + "epoch": 2.49, + "grad_norm": 0.58203125, + "learning_rate": 7.479549327694862e-05, + "loss": 0.4128, + "step": 18675 + }, + { + "epoch": 2.49, + "grad_norm": 0.6015625, + "learning_rate": 7.478422438456648e-05, + "loss": 0.2958, + "step": 18676 + }, + { + "epoch": 2.49, + "grad_norm": 0.5390625, + "learning_rate": 7.477295583412642e-05, + "loss": 0.3946, + "step": 18677 + }, + { + "epoch": 2.49, + "grad_norm": 0.671875, + "learning_rate": 7.476168762578126e-05, + "loss": 0.1897, + "step": 18678 + }, + { + "epoch": 2.49, + "grad_norm": 0.56640625, + "learning_rate": 7.475041975968388e-05, + "loss": 0.4067, + "step": 18679 + }, + { + "epoch": 2.49, + "grad_norm": 0.6171875, + "learning_rate": 7.473915223598698e-05, + "loss": 0.401, + "step": 18680 + }, + { + "epoch": 2.49, + "grad_norm": 0.4609375, + "learning_rate": 7.472788505484336e-05, + "loss": 0.3143, + "step": 18681 + }, + { + "epoch": 2.49, + "grad_norm": 0.58984375, + "learning_rate": 7.471661821640585e-05, + "loss": 0.3404, + "step": 18682 + }, + { + "epoch": 2.49, + "grad_norm": 0.484375, + "learning_rate": 7.470535172082722e-05, + "loss": 0.2943, + "step": 18683 + }, + { + "epoch": 2.49, + "grad_norm": 0.76171875, + "learning_rate": 7.469408556826027e-05, + "loss": 0.6359, + "step": 18684 + }, + { + "epoch": 2.49, + "grad_norm": 0.5078125, + "learning_rate": 7.468281975885776e-05, + "loss": 0.4505, + "step": 18685 + }, + { + "epoch": 2.49, + "grad_norm": 0.6796875, + "learning_rate": 7.467155429277247e-05, + "loss": 0.4303, + "step": 18686 + }, + { + "epoch": 2.49, + "grad_norm": 0.6328125, + "learning_rate": 7.466028917015713e-05, + "loss": 0.4451, + "step": 18687 + }, + { + "epoch": 2.49, + "grad_norm": 0.51171875, + "learning_rate": 7.464902439116455e-05, + "loss": 0.3129, + "step": 18688 + }, + { + "epoch": 2.49, + "grad_norm": 0.71484375, + "learning_rate": 7.463775995594747e-05, + "loss": 0.4379, + "step": 18689 + }, + { + "epoch": 2.49, + "grad_norm": 0.640625, + "learning_rate": 7.462649586465865e-05, + "loss": 0.3333, + "step": 18690 + }, + { + "epoch": 2.49, + "grad_norm": 0.5, + "learning_rate": 7.461523211745085e-05, + "loss": 0.3267, + "step": 18691 + }, + { + "epoch": 2.49, + "grad_norm": 0.60546875, + "learning_rate": 7.460396871447675e-05, + "loss": 0.597, + "step": 18692 + }, + { + "epoch": 2.49, + "grad_norm": 0.51171875, + "learning_rate": 7.459270565588916e-05, + "loss": 0.2462, + "step": 18693 + }, + { + "epoch": 2.49, + "grad_norm": 0.447265625, + "learning_rate": 7.458144294184077e-05, + "loss": 0.1911, + "step": 18694 + }, + { + "epoch": 2.49, + "grad_norm": 0.5625, + "learning_rate": 7.457018057248433e-05, + "loss": 0.3995, + "step": 18695 + }, + { + "epoch": 2.49, + "grad_norm": 0.60546875, + "learning_rate": 7.455891854797256e-05, + "loss": 0.4781, + "step": 18696 + }, + { + "epoch": 2.49, + "grad_norm": 0.68359375, + "learning_rate": 7.454765686845819e-05, + "loss": 0.5144, + "step": 18697 + }, + { + "epoch": 2.5, + "grad_norm": 0.6328125, + "learning_rate": 7.453639553409393e-05, + "loss": 0.383, + "step": 18698 + }, + { + "epoch": 2.5, + "grad_norm": 0.427734375, + "learning_rate": 7.452513454503249e-05, + "loss": 0.2947, + "step": 18699 + }, + { + "epoch": 2.5, + "grad_norm": 0.58984375, + "learning_rate": 7.451387390142655e-05, + "loss": 0.4315, + "step": 18700 + }, + { + "epoch": 2.5, + "grad_norm": 0.59765625, + "learning_rate": 7.450261360342888e-05, + "loss": 0.3456, + "step": 18701 + }, + { + "epoch": 2.5, + "grad_norm": 0.66796875, + "learning_rate": 7.449135365119209e-05, + "loss": 0.2185, + "step": 18702 + }, + { + "epoch": 2.5, + "grad_norm": 0.52734375, + "learning_rate": 7.448009404486897e-05, + "loss": 0.3269, + "step": 18703 + }, + { + "epoch": 2.5, + "grad_norm": 0.6328125, + "learning_rate": 7.446883478461213e-05, + "loss": 0.4892, + "step": 18704 + }, + { + "epoch": 2.5, + "grad_norm": 0.55078125, + "learning_rate": 7.445757587057424e-05, + "loss": 0.2885, + "step": 18705 + }, + { + "epoch": 2.5, + "grad_norm": 0.9375, + "learning_rate": 7.444631730290804e-05, + "loss": 0.1817, + "step": 18706 + }, + { + "epoch": 2.5, + "grad_norm": 0.78125, + "learning_rate": 7.443505908176619e-05, + "loss": 0.4151, + "step": 18707 + }, + { + "epoch": 2.5, + "grad_norm": 0.76953125, + "learning_rate": 7.442380120730133e-05, + "loss": 0.5443, + "step": 18708 + }, + { + "epoch": 2.5, + "grad_norm": 0.7734375, + "learning_rate": 7.441254367966615e-05, + "loss": 0.2676, + "step": 18709 + }, + { + "epoch": 2.5, + "grad_norm": 0.69921875, + "learning_rate": 7.440128649901328e-05, + "loss": 0.3903, + "step": 18710 + }, + { + "epoch": 2.5, + "grad_norm": 0.578125, + "learning_rate": 7.439002966549541e-05, + "loss": 0.3308, + "step": 18711 + }, + { + "epoch": 2.5, + "grad_norm": 0.6640625, + "learning_rate": 7.437877317926516e-05, + "loss": 0.2302, + "step": 18712 + }, + { + "epoch": 2.5, + "grad_norm": 0.5703125, + "learning_rate": 7.436751704047521e-05, + "loss": 0.3995, + "step": 18713 + }, + { + "epoch": 2.5, + "grad_norm": 0.83203125, + "learning_rate": 7.435626124927817e-05, + "loss": 0.4307, + "step": 18714 + }, + { + "epoch": 2.5, + "grad_norm": 0.5390625, + "learning_rate": 7.434500580582672e-05, + "loss": 0.1849, + "step": 18715 + }, + { + "epoch": 2.5, + "grad_norm": 0.5234375, + "learning_rate": 7.433375071027343e-05, + "loss": 0.1871, + "step": 18716 + }, + { + "epoch": 2.5, + "grad_norm": 0.68359375, + "learning_rate": 7.432249596277093e-05, + "loss": 0.3339, + "step": 18717 + }, + { + "epoch": 2.5, + "grad_norm": 0.61328125, + "learning_rate": 7.431124156347189e-05, + "loss": 0.3404, + "step": 18718 + }, + { + "epoch": 2.5, + "grad_norm": 0.6328125, + "learning_rate": 7.429998751252889e-05, + "loss": 0.4823, + "step": 18719 + }, + { + "epoch": 2.5, + "grad_norm": 0.6640625, + "learning_rate": 7.428873381009456e-05, + "loss": 0.3027, + "step": 18720 + }, + { + "epoch": 2.5, + "grad_norm": 0.73828125, + "learning_rate": 7.427748045632148e-05, + "loss": 0.6001, + "step": 18721 + }, + { + "epoch": 2.5, + "grad_norm": 0.578125, + "learning_rate": 7.42662274513623e-05, + "loss": 0.1564, + "step": 18722 + }, + { + "epoch": 2.5, + "grad_norm": 0.6640625, + "learning_rate": 7.425497479536958e-05, + "loss": 0.2717, + "step": 18723 + }, + { + "epoch": 2.5, + "grad_norm": 0.51171875, + "learning_rate": 7.424372248849594e-05, + "loss": 0.4833, + "step": 18724 + }, + { + "epoch": 2.5, + "grad_norm": 0.5859375, + "learning_rate": 7.423247053089393e-05, + "loss": 0.1377, + "step": 18725 + }, + { + "epoch": 2.5, + "grad_norm": 0.55078125, + "learning_rate": 7.422121892271614e-05, + "loss": 0.2472, + "step": 18726 + }, + { + "epoch": 2.5, + "grad_norm": 0.578125, + "learning_rate": 7.420996766411521e-05, + "loss": 0.287, + "step": 18727 + }, + { + "epoch": 2.5, + "grad_norm": 0.72265625, + "learning_rate": 7.419871675524368e-05, + "loss": 0.6894, + "step": 18728 + }, + { + "epoch": 2.5, + "grad_norm": 0.60546875, + "learning_rate": 7.418746619625407e-05, + "loss": 0.3572, + "step": 18729 + }, + { + "epoch": 2.5, + "grad_norm": 0.57421875, + "learning_rate": 7.417621598729901e-05, + "loss": 0.388, + "step": 18730 + }, + { + "epoch": 2.5, + "grad_norm": 0.5390625, + "learning_rate": 7.416496612853101e-05, + "loss": 0.2882, + "step": 18731 + }, + { + "epoch": 2.5, + "grad_norm": 0.57421875, + "learning_rate": 7.415371662010267e-05, + "loss": 0.6479, + "step": 18732 + }, + { + "epoch": 2.5, + "grad_norm": 0.60546875, + "learning_rate": 7.41424674621665e-05, + "loss": 0.2749, + "step": 18733 + }, + { + "epoch": 2.5, + "grad_norm": 0.6015625, + "learning_rate": 7.413121865487509e-05, + "loss": 0.2987, + "step": 18734 + }, + { + "epoch": 2.5, + "grad_norm": 0.54296875, + "learning_rate": 7.411997019838094e-05, + "loss": 0.2695, + "step": 18735 + }, + { + "epoch": 2.5, + "grad_norm": 0.58203125, + "learning_rate": 7.410872209283662e-05, + "loss": 0.3714, + "step": 18736 + }, + { + "epoch": 2.5, + "grad_norm": 0.455078125, + "learning_rate": 7.409747433839462e-05, + "loss": 0.2242, + "step": 18737 + }, + { + "epoch": 2.5, + "grad_norm": 0.61328125, + "learning_rate": 7.408622693520753e-05, + "loss": 0.2135, + "step": 18738 + }, + { + "epoch": 2.5, + "grad_norm": 0.60546875, + "learning_rate": 7.40749798834278e-05, + "loss": 0.3565, + "step": 18739 + }, + { + "epoch": 2.5, + "grad_norm": 0.90234375, + "learning_rate": 7.406373318320803e-05, + "loss": 0.6592, + "step": 18740 + }, + { + "epoch": 2.5, + "grad_norm": 0.546875, + "learning_rate": 7.405248683470065e-05, + "loss": 0.3511, + "step": 18741 + }, + { + "epoch": 2.5, + "grad_norm": 0.53515625, + "learning_rate": 7.404124083805819e-05, + "loss": 0.2597, + "step": 18742 + }, + { + "epoch": 2.5, + "grad_norm": 0.91015625, + "learning_rate": 7.402999519343319e-05, + "loss": 0.3107, + "step": 18743 + }, + { + "epoch": 2.5, + "grad_norm": 0.494140625, + "learning_rate": 7.401874990097809e-05, + "loss": 0.2121, + "step": 18744 + }, + { + "epoch": 2.5, + "grad_norm": 0.5234375, + "learning_rate": 7.400750496084545e-05, + "loss": 0.1955, + "step": 18745 + }, + { + "epoch": 2.5, + "grad_norm": 0.90625, + "learning_rate": 7.399626037318769e-05, + "loss": 0.6327, + "step": 18746 + }, + { + "epoch": 2.5, + "grad_norm": 0.47265625, + "learning_rate": 7.398501613815736e-05, + "loss": 0.1459, + "step": 18747 + }, + { + "epoch": 2.5, + "grad_norm": 0.6875, + "learning_rate": 7.397377225590686e-05, + "loss": 0.4162, + "step": 18748 + }, + { + "epoch": 2.5, + "grad_norm": 0.6328125, + "learning_rate": 7.396252872658878e-05, + "loss": 0.293, + "step": 18749 + }, + { + "epoch": 2.5, + "grad_norm": 0.58203125, + "learning_rate": 7.395128555035547e-05, + "loss": 0.2767, + "step": 18750 + }, + { + "epoch": 2.5, + "grad_norm": 0.7109375, + "learning_rate": 7.394004272735945e-05, + "loss": 0.4239, + "step": 18751 + }, + { + "epoch": 2.5, + "grad_norm": 0.7578125, + "learning_rate": 7.392880025775324e-05, + "loss": 0.4999, + "step": 18752 + }, + { + "epoch": 2.5, + "grad_norm": 0.578125, + "learning_rate": 7.391755814168917e-05, + "loss": 0.3207, + "step": 18753 + }, + { + "epoch": 2.5, + "grad_norm": 0.6953125, + "learning_rate": 7.390631637931975e-05, + "loss": 0.2056, + "step": 18754 + }, + { + "epoch": 2.5, + "grad_norm": 0.5234375, + "learning_rate": 7.389507497079744e-05, + "loss": 0.338, + "step": 18755 + }, + { + "epoch": 2.5, + "grad_norm": 0.75390625, + "learning_rate": 7.388383391627464e-05, + "loss": 0.3814, + "step": 18756 + }, + { + "epoch": 2.5, + "grad_norm": 0.5859375, + "learning_rate": 7.387259321590386e-05, + "loss": 0.3529, + "step": 18757 + }, + { + "epoch": 2.5, + "grad_norm": 0.478515625, + "learning_rate": 7.386135286983745e-05, + "loss": 0.3354, + "step": 18758 + }, + { + "epoch": 2.5, + "grad_norm": 0.48046875, + "learning_rate": 7.385011287822789e-05, + "loss": 0.264, + "step": 18759 + }, + { + "epoch": 2.5, + "grad_norm": 0.64453125, + "learning_rate": 7.383887324122757e-05, + "loss": 0.2434, + "step": 18760 + }, + { + "epoch": 2.5, + "grad_norm": 0.50390625, + "learning_rate": 7.382763395898893e-05, + "loss": 0.2279, + "step": 18761 + }, + { + "epoch": 2.5, + "grad_norm": 0.59765625, + "learning_rate": 7.381639503166435e-05, + "loss": 0.4461, + "step": 18762 + }, + { + "epoch": 2.5, + "grad_norm": 0.5625, + "learning_rate": 7.380515645940629e-05, + "loss": 0.3416, + "step": 18763 + }, + { + "epoch": 2.5, + "grad_norm": 0.6015625, + "learning_rate": 7.379391824236711e-05, + "loss": 0.6463, + "step": 18764 + }, + { + "epoch": 2.5, + "grad_norm": 0.57421875, + "learning_rate": 7.378268038069922e-05, + "loss": 0.4195, + "step": 18765 + }, + { + "epoch": 2.5, + "grad_norm": 0.365234375, + "learning_rate": 7.3771442874555e-05, + "loss": 0.1918, + "step": 18766 + }, + { + "epoch": 2.5, + "grad_norm": 0.76171875, + "learning_rate": 7.376020572408684e-05, + "loss": 0.23, + "step": 18767 + }, + { + "epoch": 2.5, + "grad_norm": 0.375, + "learning_rate": 7.374896892944716e-05, + "loss": 0.1611, + "step": 18768 + }, + { + "epoch": 2.5, + "grad_norm": 0.5234375, + "learning_rate": 7.373773249078827e-05, + "loss": 0.3836, + "step": 18769 + }, + { + "epoch": 2.5, + "grad_norm": 0.51953125, + "learning_rate": 7.372649640826262e-05, + "loss": 0.4236, + "step": 18770 + }, + { + "epoch": 2.5, + "grad_norm": 0.72265625, + "learning_rate": 7.37152606820225e-05, + "loss": 0.3505, + "step": 18771 + }, + { + "epoch": 2.5, + "grad_norm": 0.6171875, + "learning_rate": 7.370402531222035e-05, + "loss": 0.2661, + "step": 18772 + }, + { + "epoch": 2.51, + "grad_norm": 0.6875, + "learning_rate": 7.369279029900846e-05, + "loss": 0.4161, + "step": 18773 + }, + { + "epoch": 2.51, + "grad_norm": 0.60546875, + "learning_rate": 7.368155564253925e-05, + "loss": 0.2533, + "step": 18774 + }, + { + "epoch": 2.51, + "grad_norm": 0.45703125, + "learning_rate": 7.3670321342965e-05, + "loss": 0.2097, + "step": 18775 + }, + { + "epoch": 2.51, + "grad_norm": 0.515625, + "learning_rate": 7.365908740043816e-05, + "loss": 0.3407, + "step": 18776 + }, + { + "epoch": 2.51, + "grad_norm": 0.451171875, + "learning_rate": 7.364785381511097e-05, + "loss": 0.351, + "step": 18777 + }, + { + "epoch": 2.51, + "grad_norm": 0.4765625, + "learning_rate": 7.363662058713577e-05, + "loss": 0.1701, + "step": 18778 + }, + { + "epoch": 2.51, + "grad_norm": 0.6015625, + "learning_rate": 7.362538771666491e-05, + "loss": 0.3451, + "step": 18779 + }, + { + "epoch": 2.51, + "grad_norm": 0.96875, + "learning_rate": 7.361415520385074e-05, + "loss": 0.4417, + "step": 18780 + }, + { + "epoch": 2.51, + "grad_norm": 0.6015625, + "learning_rate": 7.360292304884555e-05, + "loss": 0.4853, + "step": 18781 + }, + { + "epoch": 2.51, + "grad_norm": 0.5234375, + "learning_rate": 7.359169125180167e-05, + "loss": 0.2639, + "step": 18782 + }, + { + "epoch": 2.51, + "grad_norm": 0.5546875, + "learning_rate": 7.358045981287141e-05, + "loss": 0.2744, + "step": 18783 + }, + { + "epoch": 2.51, + "grad_norm": 0.7109375, + "learning_rate": 7.356922873220706e-05, + "loss": 0.2955, + "step": 18784 + }, + { + "epoch": 2.51, + "grad_norm": 0.46875, + "learning_rate": 7.355799800996093e-05, + "loss": 0.1896, + "step": 18785 + }, + { + "epoch": 2.51, + "grad_norm": 0.5625, + "learning_rate": 7.354676764628534e-05, + "loss": 0.2097, + "step": 18786 + }, + { + "epoch": 2.51, + "grad_norm": 0.58984375, + "learning_rate": 7.353553764133252e-05, + "loss": 0.366, + "step": 18787 + }, + { + "epoch": 2.51, + "grad_norm": 0.48828125, + "learning_rate": 7.352430799525485e-05, + "loss": 0.1382, + "step": 18788 + }, + { + "epoch": 2.51, + "grad_norm": 0.61328125, + "learning_rate": 7.351307870820454e-05, + "loss": 0.2549, + "step": 18789 + }, + { + "epoch": 2.51, + "grad_norm": 0.8515625, + "learning_rate": 7.350184978033386e-05, + "loss": 0.2033, + "step": 18790 + }, + { + "epoch": 2.51, + "grad_norm": 0.578125, + "learning_rate": 7.34906212117951e-05, + "loss": 0.3616, + "step": 18791 + }, + { + "epoch": 2.51, + "grad_norm": 0.71484375, + "learning_rate": 7.347939300274053e-05, + "loss": 0.3218, + "step": 18792 + }, + { + "epoch": 2.51, + "grad_norm": 0.73046875, + "learning_rate": 7.346816515332244e-05, + "loss": 0.459, + "step": 18793 + }, + { + "epoch": 2.51, + "grad_norm": 0.5859375, + "learning_rate": 7.345693766369301e-05, + "loss": 0.5928, + "step": 18794 + }, + { + "epoch": 2.51, + "grad_norm": 0.76171875, + "learning_rate": 7.344571053400458e-05, + "loss": 0.4932, + "step": 18795 + }, + { + "epoch": 2.51, + "grad_norm": 0.6640625, + "learning_rate": 7.343448376440934e-05, + "loss": 0.4217, + "step": 18796 + }, + { + "epoch": 2.51, + "grad_norm": 0.6953125, + "learning_rate": 7.342325735505958e-05, + "loss": 0.24, + "step": 18797 + }, + { + "epoch": 2.51, + "grad_norm": 0.51171875, + "learning_rate": 7.341203130610745e-05, + "loss": 0.4062, + "step": 18798 + }, + { + "epoch": 2.51, + "grad_norm": 0.67578125, + "learning_rate": 7.340080561770527e-05, + "loss": 0.2973, + "step": 18799 + }, + { + "epoch": 2.51, + "grad_norm": 0.609375, + "learning_rate": 7.33895802900053e-05, + "loss": 0.177, + "step": 18800 + }, + { + "epoch": 2.51, + "grad_norm": 0.75390625, + "learning_rate": 7.337835532315962e-05, + "loss": 0.3825, + "step": 18801 + }, + { + "epoch": 2.51, + "grad_norm": 0.53515625, + "learning_rate": 7.336713071732053e-05, + "loss": 0.2106, + "step": 18802 + }, + { + "epoch": 2.51, + "grad_norm": 0.59765625, + "learning_rate": 7.335590647264028e-05, + "loss": 0.335, + "step": 18803 + }, + { + "epoch": 2.51, + "grad_norm": 0.91796875, + "learning_rate": 7.3344682589271e-05, + "loss": 0.4198, + "step": 18804 + }, + { + "epoch": 2.51, + "grad_norm": 0.5, + "learning_rate": 7.333345906736496e-05, + "loss": 0.24, + "step": 18805 + }, + { + "epoch": 2.51, + "grad_norm": 0.44921875, + "learning_rate": 7.33222359070743e-05, + "loss": 0.4049, + "step": 18806 + }, + { + "epoch": 2.51, + "grad_norm": 0.69140625, + "learning_rate": 7.331101310855126e-05, + "loss": 0.5764, + "step": 18807 + }, + { + "epoch": 2.51, + "grad_norm": 0.53125, + "learning_rate": 7.3299790671948e-05, + "loss": 0.3177, + "step": 18808 + }, + { + "epoch": 2.51, + "grad_norm": 0.470703125, + "learning_rate": 7.328856859741674e-05, + "loss": 0.2197, + "step": 18809 + }, + { + "epoch": 2.51, + "grad_norm": 0.6640625, + "learning_rate": 7.327734688510961e-05, + "loss": 0.3617, + "step": 18810 + }, + { + "epoch": 2.51, + "grad_norm": 0.462890625, + "learning_rate": 7.326612553517882e-05, + "loss": 0.23, + "step": 18811 + }, + { + "epoch": 2.51, + "grad_norm": 0.54296875, + "learning_rate": 7.32549045477765e-05, + "loss": 0.3917, + "step": 18812 + }, + { + "epoch": 2.51, + "grad_norm": 0.62890625, + "learning_rate": 7.32436839230549e-05, + "loss": 0.3545, + "step": 18813 + }, + { + "epoch": 2.51, + "grad_norm": 0.734375, + "learning_rate": 7.323246366116608e-05, + "loss": 0.5412, + "step": 18814 + }, + { + "epoch": 2.51, + "grad_norm": 0.4765625, + "learning_rate": 7.322124376226223e-05, + "loss": 0.3757, + "step": 18815 + }, + { + "epoch": 2.51, + "grad_norm": 0.6640625, + "learning_rate": 7.32100242264955e-05, + "loss": 0.3933, + "step": 18816 + }, + { + "epoch": 2.51, + "grad_norm": 0.474609375, + "learning_rate": 7.319880505401802e-05, + "loss": 0.1791, + "step": 18817 + }, + { + "epoch": 2.51, + "grad_norm": 0.58984375, + "learning_rate": 7.318758624498198e-05, + "loss": 0.2212, + "step": 18818 + }, + { + "epoch": 2.51, + "grad_norm": 0.57421875, + "learning_rate": 7.317636779953945e-05, + "loss": 0.2256, + "step": 18819 + }, + { + "epoch": 2.51, + "grad_norm": 0.5859375, + "learning_rate": 7.316514971784261e-05, + "loss": 0.2306, + "step": 18820 + }, + { + "epoch": 2.51, + "grad_norm": 0.439453125, + "learning_rate": 7.315393200004353e-05, + "loss": 0.3644, + "step": 18821 + }, + { + "epoch": 2.51, + "grad_norm": 0.478515625, + "learning_rate": 7.314271464629438e-05, + "loss": 0.2888, + "step": 18822 + }, + { + "epoch": 2.51, + "grad_norm": 0.66015625, + "learning_rate": 7.313149765674725e-05, + "loss": 0.4884, + "step": 18823 + }, + { + "epoch": 2.51, + "grad_norm": 0.84765625, + "learning_rate": 7.312028103155426e-05, + "loss": 0.2864, + "step": 18824 + }, + { + "epoch": 2.51, + "grad_norm": 0.62109375, + "learning_rate": 7.310906477086756e-05, + "loss": 0.2141, + "step": 18825 + }, + { + "epoch": 2.51, + "grad_norm": 0.6171875, + "learning_rate": 7.309784887483915e-05, + "loss": 0.3524, + "step": 18826 + }, + { + "epoch": 2.51, + "grad_norm": 0.75, + "learning_rate": 7.308663334362116e-05, + "loss": 0.6545, + "step": 18827 + }, + { + "epoch": 2.51, + "grad_norm": 0.53515625, + "learning_rate": 7.307541817736572e-05, + "loss": 0.2524, + "step": 18828 + }, + { + "epoch": 2.51, + "grad_norm": 0.498046875, + "learning_rate": 7.306420337622487e-05, + "loss": 0.3947, + "step": 18829 + }, + { + "epoch": 2.51, + "grad_norm": 0.5390625, + "learning_rate": 7.305298894035073e-05, + "loss": 0.3104, + "step": 18830 + }, + { + "epoch": 2.51, + "grad_norm": 0.76171875, + "learning_rate": 7.304177486989535e-05, + "loss": 0.4966, + "step": 18831 + }, + { + "epoch": 2.51, + "grad_norm": 0.3828125, + "learning_rate": 7.30305611650108e-05, + "loss": 0.1774, + "step": 18832 + }, + { + "epoch": 2.51, + "grad_norm": 0.494140625, + "learning_rate": 7.301934782584914e-05, + "loss": 0.168, + "step": 18833 + }, + { + "epoch": 2.51, + "grad_norm": 0.5390625, + "learning_rate": 7.300813485256246e-05, + "loss": 0.3629, + "step": 18834 + }, + { + "epoch": 2.51, + "grad_norm": 0.6640625, + "learning_rate": 7.299692224530277e-05, + "loss": 0.3049, + "step": 18835 + }, + { + "epoch": 2.51, + "grad_norm": 0.66796875, + "learning_rate": 7.298571000422217e-05, + "loss": 0.4379, + "step": 18836 + }, + { + "epoch": 2.51, + "grad_norm": 0.53125, + "learning_rate": 7.29744981294727e-05, + "loss": 0.2927, + "step": 18837 + }, + { + "epoch": 2.51, + "grad_norm": 0.671875, + "learning_rate": 7.296328662120636e-05, + "loss": 0.2473, + "step": 18838 + }, + { + "epoch": 2.51, + "grad_norm": 0.53125, + "learning_rate": 7.295207547957519e-05, + "loss": 0.3142, + "step": 18839 + }, + { + "epoch": 2.51, + "grad_norm": 0.65234375, + "learning_rate": 7.294086470473123e-05, + "loss": 0.5021, + "step": 18840 + }, + { + "epoch": 2.51, + "grad_norm": 0.7578125, + "learning_rate": 7.292965429682652e-05, + "loss": 0.4401, + "step": 18841 + }, + { + "epoch": 2.51, + "grad_norm": 0.671875, + "learning_rate": 7.291844425601306e-05, + "loss": 0.233, + "step": 18842 + }, + { + "epoch": 2.51, + "grad_norm": 0.65234375, + "learning_rate": 7.29072345824429e-05, + "loss": 0.5776, + "step": 18843 + }, + { + "epoch": 2.51, + "grad_norm": 0.53515625, + "learning_rate": 7.289602527626802e-05, + "loss": 0.4851, + "step": 18844 + }, + { + "epoch": 2.51, + "grad_norm": 0.515625, + "learning_rate": 7.288481633764043e-05, + "loss": 0.3332, + "step": 18845 + }, + { + "epoch": 2.51, + "grad_norm": 0.625, + "learning_rate": 7.287360776671213e-05, + "loss": 0.3452, + "step": 18846 + }, + { + "epoch": 2.51, + "grad_norm": 0.70703125, + "learning_rate": 7.286239956363512e-05, + "loss": 0.7066, + "step": 18847 + }, + { + "epoch": 2.52, + "grad_norm": 0.49609375, + "learning_rate": 7.285119172856138e-05, + "loss": 0.3343, + "step": 18848 + }, + { + "epoch": 2.52, + "grad_norm": 0.5625, + "learning_rate": 7.283998426164296e-05, + "loss": 0.4683, + "step": 18849 + }, + { + "epoch": 2.52, + "grad_norm": 0.486328125, + "learning_rate": 7.282877716303174e-05, + "loss": 0.3294, + "step": 18850 + }, + { + "epoch": 2.52, + "grad_norm": 0.7890625, + "learning_rate": 7.281757043287973e-05, + "loss": 0.5356, + "step": 18851 + }, + { + "epoch": 2.52, + "grad_norm": 0.451171875, + "learning_rate": 7.280636407133889e-05, + "loss": 0.1631, + "step": 18852 + }, + { + "epoch": 2.52, + "grad_norm": 0.443359375, + "learning_rate": 7.279515807856125e-05, + "loss": 0.2407, + "step": 18853 + }, + { + "epoch": 2.52, + "grad_norm": 0.7421875, + "learning_rate": 7.278395245469871e-05, + "loss": 0.289, + "step": 18854 + }, + { + "epoch": 2.52, + "grad_norm": 0.498046875, + "learning_rate": 7.277274719990326e-05, + "loss": 0.3129, + "step": 18855 + }, + { + "epoch": 2.52, + "grad_norm": 0.62890625, + "learning_rate": 7.276154231432679e-05, + "loss": 0.3442, + "step": 18856 + }, + { + "epoch": 2.52, + "grad_norm": 0.625, + "learning_rate": 7.275033779812132e-05, + "loss": 0.2465, + "step": 18857 + }, + { + "epoch": 2.52, + "grad_norm": 0.49609375, + "learning_rate": 7.273913365143873e-05, + "loss": 0.5166, + "step": 18858 + }, + { + "epoch": 2.52, + "grad_norm": 0.625, + "learning_rate": 7.272792987443103e-05, + "loss": 0.1824, + "step": 18859 + }, + { + "epoch": 2.52, + "grad_norm": 0.55078125, + "learning_rate": 7.271672646725007e-05, + "loss": 0.389, + "step": 18860 + }, + { + "epoch": 2.52, + "grad_norm": 0.65234375, + "learning_rate": 7.270552343004784e-05, + "loss": 0.4953, + "step": 18861 + }, + { + "epoch": 2.52, + "grad_norm": 0.65625, + "learning_rate": 7.269432076297622e-05, + "loss": 0.6146, + "step": 18862 + }, + { + "epoch": 2.52, + "grad_norm": 0.5625, + "learning_rate": 7.268311846618712e-05, + "loss": 0.4206, + "step": 18863 + }, + { + "epoch": 2.52, + "grad_norm": 0.9375, + "learning_rate": 7.267191653983247e-05, + "loss": 0.2515, + "step": 18864 + }, + { + "epoch": 2.52, + "grad_norm": 0.48046875, + "learning_rate": 7.266071498406417e-05, + "loss": 0.2215, + "step": 18865 + }, + { + "epoch": 2.52, + "grad_norm": 0.51953125, + "learning_rate": 7.264951379903414e-05, + "loss": 0.3289, + "step": 18866 + }, + { + "epoch": 2.52, + "grad_norm": 0.578125, + "learning_rate": 7.26383129848942e-05, + "loss": 0.4424, + "step": 18867 + }, + { + "epoch": 2.52, + "grad_norm": 0.435546875, + "learning_rate": 7.262711254179635e-05, + "loss": 0.1797, + "step": 18868 + }, + { + "epoch": 2.52, + "grad_norm": 0.65234375, + "learning_rate": 7.261591246989239e-05, + "loss": 0.4248, + "step": 18869 + }, + { + "epoch": 2.52, + "grad_norm": 0.54296875, + "learning_rate": 7.260471276933426e-05, + "loss": 0.3116, + "step": 18870 + }, + { + "epoch": 2.52, + "grad_norm": 0.431640625, + "learning_rate": 7.259351344027379e-05, + "loss": 0.2535, + "step": 18871 + }, + { + "epoch": 2.52, + "grad_norm": 0.640625, + "learning_rate": 7.258231448286285e-05, + "loss": 0.4495, + "step": 18872 + }, + { + "epoch": 2.52, + "grad_norm": 0.61328125, + "learning_rate": 7.25711158972534e-05, + "loss": 0.4942, + "step": 18873 + }, + { + "epoch": 2.52, + "grad_norm": 0.5625, + "learning_rate": 7.255991768359716e-05, + "loss": 0.3114, + "step": 18874 + }, + { + "epoch": 2.52, + "grad_norm": 0.453125, + "learning_rate": 7.254871984204603e-05, + "loss": 0.1613, + "step": 18875 + }, + { + "epoch": 2.52, + "grad_norm": 0.37109375, + "learning_rate": 7.253752237275191e-05, + "loss": 0.1346, + "step": 18876 + }, + { + "epoch": 2.52, + "grad_norm": 0.73046875, + "learning_rate": 7.252632527586659e-05, + "loss": 0.2337, + "step": 18877 + }, + { + "epoch": 2.52, + "grad_norm": 0.53515625, + "learning_rate": 7.251512855154195e-05, + "loss": 0.3191, + "step": 18878 + }, + { + "epoch": 2.52, + "grad_norm": 0.51953125, + "learning_rate": 7.250393219992979e-05, + "loss": 0.2119, + "step": 18879 + }, + { + "epoch": 2.52, + "grad_norm": 0.58984375, + "learning_rate": 7.249273622118198e-05, + "loss": 0.3154, + "step": 18880 + }, + { + "epoch": 2.52, + "grad_norm": 0.7109375, + "learning_rate": 7.24815406154503e-05, + "loss": 0.3786, + "step": 18881 + }, + { + "epoch": 2.52, + "grad_norm": 0.5546875, + "learning_rate": 7.24703453828866e-05, + "loss": 0.3344, + "step": 18882 + }, + { + "epoch": 2.52, + "grad_norm": 0.609375, + "learning_rate": 7.245915052364267e-05, + "loss": 0.3438, + "step": 18883 + }, + { + "epoch": 2.52, + "grad_norm": 0.5234375, + "learning_rate": 7.244795603787036e-05, + "loss": 0.351, + "step": 18884 + }, + { + "epoch": 2.52, + "grad_norm": 0.60546875, + "learning_rate": 7.243676192572146e-05, + "loss": 0.6125, + "step": 18885 + }, + { + "epoch": 2.52, + "grad_norm": 0.546875, + "learning_rate": 7.242556818734774e-05, + "loss": 0.3158, + "step": 18886 + }, + { + "epoch": 2.52, + "grad_norm": 0.69140625, + "learning_rate": 7.2414374822901e-05, + "loss": 0.594, + "step": 18887 + }, + { + "epoch": 2.52, + "grad_norm": 0.453125, + "learning_rate": 7.240318183253304e-05, + "loss": 0.208, + "step": 18888 + }, + { + "epoch": 2.52, + "grad_norm": 0.66796875, + "learning_rate": 7.239198921639568e-05, + "loss": 0.2911, + "step": 18889 + }, + { + "epoch": 2.52, + "grad_norm": 0.51953125, + "learning_rate": 7.238079697464061e-05, + "loss": 0.1971, + "step": 18890 + }, + { + "epoch": 2.52, + "grad_norm": 0.55078125, + "learning_rate": 7.236960510741971e-05, + "loss": 0.402, + "step": 18891 + }, + { + "epoch": 2.52, + "grad_norm": 0.64453125, + "learning_rate": 7.235841361488465e-05, + "loss": 0.3918, + "step": 18892 + }, + { + "epoch": 2.52, + "grad_norm": 0.546875, + "learning_rate": 7.234722249718728e-05, + "loss": 0.2626, + "step": 18893 + }, + { + "epoch": 2.52, + "grad_norm": 0.5546875, + "learning_rate": 7.233603175447929e-05, + "loss": 0.1734, + "step": 18894 + }, + { + "epoch": 2.52, + "grad_norm": 0.365234375, + "learning_rate": 7.23248413869125e-05, + "loss": 0.221, + "step": 18895 + }, + { + "epoch": 2.52, + "grad_norm": 0.82421875, + "learning_rate": 7.23136513946386e-05, + "loss": 0.3175, + "step": 18896 + }, + { + "epoch": 2.52, + "grad_norm": 0.408203125, + "learning_rate": 7.230246177780935e-05, + "loss": 0.1686, + "step": 18897 + }, + { + "epoch": 2.52, + "grad_norm": 0.357421875, + "learning_rate": 7.229127253657655e-05, + "loss": 0.1154, + "step": 18898 + }, + { + "epoch": 2.52, + "grad_norm": 0.53125, + "learning_rate": 7.228008367109183e-05, + "loss": 0.4316, + "step": 18899 + }, + { + "epoch": 2.52, + "grad_norm": 0.5078125, + "learning_rate": 7.226889518150694e-05, + "loss": 0.3805, + "step": 18900 + }, + { + "epoch": 2.52, + "grad_norm": 0.8125, + "learning_rate": 7.225770706797368e-05, + "loss": 0.3687, + "step": 18901 + }, + { + "epoch": 2.52, + "grad_norm": 0.53125, + "learning_rate": 7.224651933064369e-05, + "loss": 0.3411, + "step": 18902 + }, + { + "epoch": 2.52, + "grad_norm": 0.59375, + "learning_rate": 7.223533196966872e-05, + "loss": 0.3059, + "step": 18903 + }, + { + "epoch": 2.52, + "grad_norm": 0.53515625, + "learning_rate": 7.222414498520045e-05, + "loss": 0.487, + "step": 18904 + }, + { + "epoch": 2.52, + "grad_norm": 0.609375, + "learning_rate": 7.221295837739061e-05, + "loss": 0.4327, + "step": 18905 + }, + { + "epoch": 2.52, + "grad_norm": 0.62109375, + "learning_rate": 7.220177214639088e-05, + "loss": 0.6542, + "step": 18906 + }, + { + "epoch": 2.52, + "grad_norm": 0.484375, + "learning_rate": 7.219058629235298e-05, + "loss": 0.2903, + "step": 18907 + }, + { + "epoch": 2.52, + "grad_norm": 0.7421875, + "learning_rate": 7.217940081542856e-05, + "loss": 0.5016, + "step": 18908 + }, + { + "epoch": 2.52, + "grad_norm": 0.3671875, + "learning_rate": 7.216821571576935e-05, + "loss": 0.183, + "step": 18909 + }, + { + "epoch": 2.52, + "grad_norm": 0.80859375, + "learning_rate": 7.215703099352699e-05, + "loss": 0.3643, + "step": 18910 + }, + { + "epoch": 2.52, + "grad_norm": 0.6015625, + "learning_rate": 7.214584664885314e-05, + "loss": 0.4208, + "step": 18911 + }, + { + "epoch": 2.52, + "grad_norm": 0.6875, + "learning_rate": 7.21346626818995e-05, + "loss": 0.3643, + "step": 18912 + }, + { + "epoch": 2.52, + "grad_norm": 0.59765625, + "learning_rate": 7.21234790928177e-05, + "loss": 0.3635, + "step": 18913 + }, + { + "epoch": 2.52, + "grad_norm": 0.9375, + "learning_rate": 7.211229588175945e-05, + "loss": 0.2888, + "step": 18914 + }, + { + "epoch": 2.52, + "grad_norm": 0.66015625, + "learning_rate": 7.210111304887632e-05, + "loss": 0.4671, + "step": 18915 + }, + { + "epoch": 2.52, + "grad_norm": 0.494140625, + "learning_rate": 7.208993059432004e-05, + "loss": 0.2119, + "step": 18916 + }, + { + "epoch": 2.52, + "grad_norm": 0.69140625, + "learning_rate": 7.207874851824216e-05, + "loss": 0.4975, + "step": 18917 + }, + { + "epoch": 2.52, + "grad_norm": 0.58203125, + "learning_rate": 7.206756682079443e-05, + "loss": 0.1956, + "step": 18918 + }, + { + "epoch": 2.52, + "grad_norm": 0.578125, + "learning_rate": 7.205638550212839e-05, + "loss": 0.5809, + "step": 18919 + }, + { + "epoch": 2.52, + "grad_norm": 0.5234375, + "learning_rate": 7.20452045623957e-05, + "loss": 0.1416, + "step": 18920 + }, + { + "epoch": 2.52, + "grad_norm": 0.54296875, + "learning_rate": 7.203402400174799e-05, + "loss": 0.3454, + "step": 18921 + }, + { + "epoch": 2.52, + "grad_norm": 0.7734375, + "learning_rate": 7.202284382033687e-05, + "loss": 0.6284, + "step": 18922 + }, + { + "epoch": 2.53, + "grad_norm": 0.5078125, + "learning_rate": 7.201166401831394e-05, + "loss": 0.301, + "step": 18923 + }, + { + "epoch": 2.53, + "grad_norm": 0.66796875, + "learning_rate": 7.200048459583079e-05, + "loss": 0.37, + "step": 18924 + }, + { + "epoch": 2.53, + "grad_norm": 0.5390625, + "learning_rate": 7.198930555303903e-05, + "loss": 0.45, + "step": 18925 + }, + { + "epoch": 2.53, + "grad_norm": 0.66015625, + "learning_rate": 7.19781268900903e-05, + "loss": 0.3007, + "step": 18926 + }, + { + "epoch": 2.53, + "grad_norm": 0.486328125, + "learning_rate": 7.196694860713612e-05, + "loss": 0.1694, + "step": 18927 + }, + { + "epoch": 2.53, + "grad_norm": 0.53125, + "learning_rate": 7.195577070432811e-05, + "loss": 0.2872, + "step": 18928 + }, + { + "epoch": 2.53, + "grad_norm": 0.59375, + "learning_rate": 7.194459318181785e-05, + "loss": 0.4559, + "step": 18929 + }, + { + "epoch": 2.53, + "grad_norm": 0.5859375, + "learning_rate": 7.193341603975694e-05, + "loss": 0.3969, + "step": 18930 + }, + { + "epoch": 2.53, + "grad_norm": 0.51953125, + "learning_rate": 7.192223927829689e-05, + "loss": 0.2091, + "step": 18931 + }, + { + "epoch": 2.53, + "grad_norm": 0.6484375, + "learning_rate": 7.191106289758931e-05, + "loss": 0.2877, + "step": 18932 + }, + { + "epoch": 2.53, + "grad_norm": 0.5390625, + "learning_rate": 7.189988689778575e-05, + "loss": 0.3066, + "step": 18933 + }, + { + "epoch": 2.53, + "grad_norm": 0.65234375, + "learning_rate": 7.188871127903777e-05, + "loss": 0.45, + "step": 18934 + }, + { + "epoch": 2.53, + "grad_norm": 0.75390625, + "learning_rate": 7.187753604149689e-05, + "loss": 0.22, + "step": 18935 + }, + { + "epoch": 2.53, + "grad_norm": 0.8359375, + "learning_rate": 7.186636118531467e-05, + "loss": 1.0168, + "step": 18936 + }, + { + "epoch": 2.53, + "grad_norm": 0.59375, + "learning_rate": 7.185518671064266e-05, + "loss": 0.2548, + "step": 18937 + }, + { + "epoch": 2.53, + "grad_norm": 0.466796875, + "learning_rate": 7.184401261763233e-05, + "loss": 0.3196, + "step": 18938 + }, + { + "epoch": 2.53, + "grad_norm": 0.5390625, + "learning_rate": 7.183283890643533e-05, + "loss": 0.1856, + "step": 18939 + }, + { + "epoch": 2.53, + "grad_norm": 0.5625, + "learning_rate": 7.182166557720307e-05, + "loss": 0.2853, + "step": 18940 + }, + { + "epoch": 2.53, + "grad_norm": 0.578125, + "learning_rate": 7.181049263008712e-05, + "loss": 0.347, + "step": 18941 + }, + { + "epoch": 2.53, + "grad_norm": 0.546875, + "learning_rate": 7.179932006523897e-05, + "loss": 0.2814, + "step": 18942 + }, + { + "epoch": 2.53, + "grad_norm": 0.53515625, + "learning_rate": 7.178814788281016e-05, + "loss": 0.3146, + "step": 18943 + }, + { + "epoch": 2.53, + "grad_norm": 0.6640625, + "learning_rate": 7.177697608295215e-05, + "loss": 0.5393, + "step": 18944 + }, + { + "epoch": 2.53, + "grad_norm": 0.55859375, + "learning_rate": 7.176580466581646e-05, + "loss": 0.2402, + "step": 18945 + }, + { + "epoch": 2.53, + "grad_norm": 0.71484375, + "learning_rate": 7.175463363155463e-05, + "loss": 0.3926, + "step": 18946 + }, + { + "epoch": 2.53, + "grad_norm": 0.59375, + "learning_rate": 7.174346298031804e-05, + "loss": 0.3929, + "step": 18947 + }, + { + "epoch": 2.53, + "grad_norm": 0.55078125, + "learning_rate": 7.173229271225824e-05, + "loss": 0.2556, + "step": 18948 + }, + { + "epoch": 2.53, + "grad_norm": 0.55859375, + "learning_rate": 7.17211228275267e-05, + "loss": 0.4407, + "step": 18949 + }, + { + "epoch": 2.53, + "grad_norm": 0.86328125, + "learning_rate": 7.170995332627486e-05, + "loss": 0.6193, + "step": 18950 + }, + { + "epoch": 2.53, + "grad_norm": 0.6015625, + "learning_rate": 7.169878420865424e-05, + "loss": 0.3522, + "step": 18951 + }, + { + "epoch": 2.53, + "grad_norm": 0.66796875, + "learning_rate": 7.168761547481623e-05, + "loss": 0.4062, + "step": 18952 + }, + { + "epoch": 2.53, + "grad_norm": 0.546875, + "learning_rate": 7.167644712491237e-05, + "loss": 0.3808, + "step": 18953 + }, + { + "epoch": 2.53, + "grad_norm": 0.66796875, + "learning_rate": 7.166527915909403e-05, + "loss": 0.3863, + "step": 18954 + }, + { + "epoch": 2.53, + "grad_norm": 0.65234375, + "learning_rate": 7.165411157751272e-05, + "loss": 0.394, + "step": 18955 + }, + { + "epoch": 2.53, + "grad_norm": 0.59765625, + "learning_rate": 7.164294438031982e-05, + "loss": 0.4864, + "step": 18956 + }, + { + "epoch": 2.53, + "grad_norm": 0.6171875, + "learning_rate": 7.163177756766681e-05, + "loss": 0.232, + "step": 18957 + }, + { + "epoch": 2.53, + "grad_norm": 0.65234375, + "learning_rate": 7.162061113970514e-05, + "loss": 0.346, + "step": 18958 + }, + { + "epoch": 2.53, + "grad_norm": 0.66796875, + "learning_rate": 7.160944509658615e-05, + "loss": 0.5534, + "step": 18959 + }, + { + "epoch": 2.53, + "grad_norm": 0.62890625, + "learning_rate": 7.159827943846132e-05, + "loss": 0.4489, + "step": 18960 + }, + { + "epoch": 2.53, + "grad_norm": 0.8125, + "learning_rate": 7.158711416548203e-05, + "loss": 0.5687, + "step": 18961 + }, + { + "epoch": 2.53, + "grad_norm": 0.61328125, + "learning_rate": 7.157594927779972e-05, + "loss": 0.2009, + "step": 18962 + }, + { + "epoch": 2.53, + "grad_norm": 0.474609375, + "learning_rate": 7.156478477556577e-05, + "loss": 0.3204, + "step": 18963 + }, + { + "epoch": 2.53, + "grad_norm": 0.419921875, + "learning_rate": 7.155362065893161e-05, + "loss": 0.2192, + "step": 18964 + }, + { + "epoch": 2.53, + "grad_norm": 0.48046875, + "learning_rate": 7.15424569280486e-05, + "loss": 0.2444, + "step": 18965 + }, + { + "epoch": 2.53, + "grad_norm": 1.0703125, + "learning_rate": 7.153129358306814e-05, + "loss": 0.2695, + "step": 18966 + }, + { + "epoch": 2.53, + "grad_norm": 0.5859375, + "learning_rate": 7.15201306241416e-05, + "loss": 0.4391, + "step": 18967 + }, + { + "epoch": 2.53, + "grad_norm": 0.51171875, + "learning_rate": 7.150896805142038e-05, + "loss": 0.2669, + "step": 18968 + }, + { + "epoch": 2.53, + "grad_norm": 0.470703125, + "learning_rate": 7.149780586505581e-05, + "loss": 0.3288, + "step": 18969 + }, + { + "epoch": 2.53, + "grad_norm": 0.5703125, + "learning_rate": 7.148664406519934e-05, + "loss": 0.3693, + "step": 18970 + }, + { + "epoch": 2.53, + "grad_norm": 0.76953125, + "learning_rate": 7.147548265200226e-05, + "loss": 0.2908, + "step": 18971 + }, + { + "epoch": 2.53, + "grad_norm": 0.70703125, + "learning_rate": 7.14643216256159e-05, + "loss": 0.2986, + "step": 18972 + }, + { + "epoch": 2.53, + "grad_norm": 0.515625, + "learning_rate": 7.145316098619167e-05, + "loss": 0.3212, + "step": 18973 + }, + { + "epoch": 2.53, + "grad_norm": 0.46875, + "learning_rate": 7.144200073388092e-05, + "loss": 0.1501, + "step": 18974 + }, + { + "epoch": 2.53, + "grad_norm": 0.59375, + "learning_rate": 7.143084086883494e-05, + "loss": 0.1789, + "step": 18975 + }, + { + "epoch": 2.53, + "grad_norm": 0.578125, + "learning_rate": 7.141968139120511e-05, + "loss": 0.4587, + "step": 18976 + }, + { + "epoch": 2.53, + "grad_norm": 0.47265625, + "learning_rate": 7.140852230114274e-05, + "loss": 0.1423, + "step": 18977 + }, + { + "epoch": 2.53, + "grad_norm": 0.6796875, + "learning_rate": 7.139736359879916e-05, + "loss": 0.4755, + "step": 18978 + }, + { + "epoch": 2.53, + "grad_norm": 0.578125, + "learning_rate": 7.138620528432568e-05, + "loss": 0.2788, + "step": 18979 + }, + { + "epoch": 2.53, + "grad_norm": 0.46484375, + "learning_rate": 7.137504735787363e-05, + "loss": 0.4361, + "step": 18980 + }, + { + "epoch": 2.53, + "grad_norm": 0.64453125, + "learning_rate": 7.13638898195943e-05, + "loss": 0.4979, + "step": 18981 + }, + { + "epoch": 2.53, + "grad_norm": 0.54296875, + "learning_rate": 7.135273266963902e-05, + "loss": 0.2512, + "step": 18982 + }, + { + "epoch": 2.53, + "grad_norm": 0.46484375, + "learning_rate": 7.134157590815909e-05, + "loss": 0.3007, + "step": 18983 + }, + { + "epoch": 2.53, + "grad_norm": 0.7421875, + "learning_rate": 7.133041953530574e-05, + "loss": 0.2764, + "step": 18984 + }, + { + "epoch": 2.53, + "grad_norm": 0.5546875, + "learning_rate": 7.131926355123032e-05, + "loss": 0.2025, + "step": 18985 + }, + { + "epoch": 2.53, + "grad_norm": 0.5546875, + "learning_rate": 7.130810795608407e-05, + "loss": 0.387, + "step": 18986 + }, + { + "epoch": 2.53, + "grad_norm": 1.0390625, + "learning_rate": 7.129695275001831e-05, + "loss": 0.2753, + "step": 18987 + }, + { + "epoch": 2.53, + "grad_norm": 0.7734375, + "learning_rate": 7.128579793318428e-05, + "loss": 0.4692, + "step": 18988 + }, + { + "epoch": 2.53, + "grad_norm": 0.63671875, + "learning_rate": 7.127464350573329e-05, + "loss": 0.4364, + "step": 18989 + }, + { + "epoch": 2.53, + "grad_norm": 0.5859375, + "learning_rate": 7.126348946781654e-05, + "loss": 0.5774, + "step": 18990 + }, + { + "epoch": 2.53, + "grad_norm": 0.52734375, + "learning_rate": 7.125233581958533e-05, + "loss": 0.3784, + "step": 18991 + }, + { + "epoch": 2.53, + "grad_norm": 0.54296875, + "learning_rate": 7.124118256119087e-05, + "loss": 0.1957, + "step": 18992 + }, + { + "epoch": 2.53, + "grad_norm": 0.5546875, + "learning_rate": 7.123002969278447e-05, + "loss": 0.3095, + "step": 18993 + }, + { + "epoch": 2.53, + "grad_norm": 0.6015625, + "learning_rate": 7.12188772145173e-05, + "loss": 0.3437, + "step": 18994 + }, + { + "epoch": 2.53, + "grad_norm": 0.44921875, + "learning_rate": 7.120772512654067e-05, + "loss": 0.2257, + "step": 18995 + }, + { + "epoch": 2.53, + "grad_norm": 0.6328125, + "learning_rate": 7.119657342900574e-05, + "loss": 0.3843, + "step": 18996 + }, + { + "epoch": 2.53, + "grad_norm": 0.5703125, + "learning_rate": 7.118542212206373e-05, + "loss": 0.2262, + "step": 18997 + }, + { + "epoch": 2.54, + "grad_norm": 0.59375, + "learning_rate": 7.117427120586592e-05, + "loss": 0.1896, + "step": 18998 + }, + { + "epoch": 2.54, + "grad_norm": 0.76171875, + "learning_rate": 7.116312068056349e-05, + "loss": 0.2652, + "step": 18999 + }, + { + "epoch": 2.54, + "grad_norm": 0.478515625, + "learning_rate": 7.115197054630764e-05, + "loss": 0.5142, + "step": 19000 + }, + { + "epoch": 2.54, + "grad_norm": 0.57421875, + "learning_rate": 7.114082080324961e-05, + "loss": 0.3353, + "step": 19001 + }, + { + "epoch": 2.54, + "grad_norm": 0.458984375, + "learning_rate": 7.112967145154054e-05, + "loss": 0.2516, + "step": 19002 + }, + { + "epoch": 2.54, + "grad_norm": 0.8203125, + "learning_rate": 7.111852249133169e-05, + "loss": 0.447, + "step": 19003 + }, + { + "epoch": 2.54, + "grad_norm": 0.443359375, + "learning_rate": 7.110737392277417e-05, + "loss": 0.2717, + "step": 19004 + }, + { + "epoch": 2.54, + "grad_norm": 0.640625, + "learning_rate": 7.109622574601923e-05, + "loss": 0.4776, + "step": 19005 + }, + { + "epoch": 2.54, + "grad_norm": 0.58203125, + "learning_rate": 7.1085077961218e-05, + "loss": 0.3778, + "step": 19006 + }, + { + "epoch": 2.54, + "grad_norm": 0.4453125, + "learning_rate": 7.107393056852171e-05, + "loss": 0.3069, + "step": 19007 + }, + { + "epoch": 2.54, + "grad_norm": 0.435546875, + "learning_rate": 7.106278356808146e-05, + "loss": 0.2212, + "step": 19008 + }, + { + "epoch": 2.54, + "grad_norm": 0.66796875, + "learning_rate": 7.105163696004843e-05, + "loss": 0.1923, + "step": 19009 + }, + { + "epoch": 2.54, + "grad_norm": 0.45703125, + "learning_rate": 7.10404907445738e-05, + "loss": 0.2801, + "step": 19010 + }, + { + "epoch": 2.54, + "grad_norm": 0.4453125, + "learning_rate": 7.102934492180867e-05, + "loss": 0.2441, + "step": 19011 + }, + { + "epoch": 2.54, + "grad_norm": 0.5625, + "learning_rate": 7.101819949190424e-05, + "loss": 0.3113, + "step": 19012 + }, + { + "epoch": 2.54, + "grad_norm": 0.78125, + "learning_rate": 7.100705445501162e-05, + "loss": 0.3892, + "step": 19013 + }, + { + "epoch": 2.54, + "grad_norm": 0.55859375, + "learning_rate": 7.099590981128194e-05, + "loss": 0.2448, + "step": 19014 + }, + { + "epoch": 2.54, + "grad_norm": 0.60546875, + "learning_rate": 7.098476556086635e-05, + "loss": 0.3245, + "step": 19015 + }, + { + "epoch": 2.54, + "grad_norm": 0.62890625, + "learning_rate": 7.097362170391595e-05, + "loss": 0.2763, + "step": 19016 + }, + { + "epoch": 2.54, + "grad_norm": 0.6484375, + "learning_rate": 7.096247824058188e-05, + "loss": 0.2862, + "step": 19017 + }, + { + "epoch": 2.54, + "grad_norm": 0.5625, + "learning_rate": 7.095133517101522e-05, + "loss": 0.2881, + "step": 19018 + }, + { + "epoch": 2.54, + "grad_norm": 0.57421875, + "learning_rate": 7.094019249536716e-05, + "loss": 0.5131, + "step": 19019 + }, + { + "epoch": 2.54, + "grad_norm": 0.6015625, + "learning_rate": 7.092905021378868e-05, + "loss": 0.201, + "step": 19020 + }, + { + "epoch": 2.54, + "grad_norm": 0.51171875, + "learning_rate": 7.091790832643094e-05, + "loss": 0.1751, + "step": 19021 + }, + { + "epoch": 2.54, + "grad_norm": 0.6015625, + "learning_rate": 7.090676683344507e-05, + "loss": 0.2753, + "step": 19022 + }, + { + "epoch": 2.54, + "grad_norm": 0.75, + "learning_rate": 7.089562573498206e-05, + "loss": 0.2873, + "step": 19023 + }, + { + "epoch": 2.54, + "grad_norm": 0.44921875, + "learning_rate": 7.088448503119309e-05, + "loss": 0.3868, + "step": 19024 + }, + { + "epoch": 2.54, + "grad_norm": 0.388671875, + "learning_rate": 7.087334472222915e-05, + "loss": 0.2187, + "step": 19025 + }, + { + "epoch": 2.54, + "grad_norm": 0.64453125, + "learning_rate": 7.086220480824137e-05, + "loss": 0.4102, + "step": 19026 + }, + { + "epoch": 2.54, + "grad_norm": 0.447265625, + "learning_rate": 7.085106528938078e-05, + "loss": 0.2672, + "step": 19027 + }, + { + "epoch": 2.54, + "grad_norm": 0.54296875, + "learning_rate": 7.083992616579846e-05, + "loss": 0.2911, + "step": 19028 + }, + { + "epoch": 2.54, + "grad_norm": 0.86328125, + "learning_rate": 7.082878743764545e-05, + "loss": 0.44, + "step": 19029 + }, + { + "epoch": 2.54, + "grad_norm": 0.443359375, + "learning_rate": 7.081764910507283e-05, + "loss": 0.3012, + "step": 19030 + }, + { + "epoch": 2.54, + "grad_norm": 0.48046875, + "learning_rate": 7.080651116823163e-05, + "loss": 0.3487, + "step": 19031 + }, + { + "epoch": 2.54, + "grad_norm": 0.57421875, + "learning_rate": 7.079537362727283e-05, + "loss": 0.427, + "step": 19032 + }, + { + "epoch": 2.54, + "grad_norm": 0.5625, + "learning_rate": 7.078423648234753e-05, + "loss": 0.2083, + "step": 19033 + }, + { + "epoch": 2.54, + "grad_norm": 0.58984375, + "learning_rate": 7.07730997336067e-05, + "loss": 0.4109, + "step": 19034 + }, + { + "epoch": 2.54, + "grad_norm": 0.5703125, + "learning_rate": 7.076196338120144e-05, + "loss": 0.2028, + "step": 19035 + }, + { + "epoch": 2.54, + "grad_norm": 0.69140625, + "learning_rate": 7.07508274252827e-05, + "loss": 0.1429, + "step": 19036 + }, + { + "epoch": 2.54, + "grad_norm": 0.64453125, + "learning_rate": 7.073969186600152e-05, + "loss": 0.5922, + "step": 19037 + }, + { + "epoch": 2.54, + "grad_norm": 0.55859375, + "learning_rate": 7.072855670350887e-05, + "loss": 0.227, + "step": 19038 + }, + { + "epoch": 2.54, + "grad_norm": 0.64453125, + "learning_rate": 7.071742193795581e-05, + "loss": 0.4221, + "step": 19039 + }, + { + "epoch": 2.54, + "grad_norm": 0.6484375, + "learning_rate": 7.070628756949328e-05, + "loss": 0.2542, + "step": 19040 + }, + { + "epoch": 2.54, + "grad_norm": 0.67578125, + "learning_rate": 7.069515359827231e-05, + "loss": 0.4827, + "step": 19041 + }, + { + "epoch": 2.54, + "grad_norm": 0.53515625, + "learning_rate": 7.068402002444386e-05, + "loss": 0.3329, + "step": 19042 + }, + { + "epoch": 2.54, + "grad_norm": 0.63671875, + "learning_rate": 7.067288684815892e-05, + "loss": 0.4823, + "step": 19043 + }, + { + "epoch": 2.54, + "grad_norm": 0.6953125, + "learning_rate": 7.066175406956846e-05, + "loss": 0.5453, + "step": 19044 + }, + { + "epoch": 2.54, + "grad_norm": 0.5625, + "learning_rate": 7.065062168882341e-05, + "loss": 0.2703, + "step": 19045 + }, + { + "epoch": 2.54, + "grad_norm": 0.55078125, + "learning_rate": 7.063948970607479e-05, + "loss": 0.3391, + "step": 19046 + }, + { + "epoch": 2.54, + "grad_norm": 0.453125, + "learning_rate": 7.062835812147353e-05, + "loss": 0.3392, + "step": 19047 + }, + { + "epoch": 2.54, + "grad_norm": 0.400390625, + "learning_rate": 7.061722693517057e-05, + "loss": 0.1571, + "step": 19048 + }, + { + "epoch": 2.54, + "grad_norm": 0.4375, + "learning_rate": 7.060609614731691e-05, + "loss": 0.4211, + "step": 19049 + }, + { + "epoch": 2.54, + "grad_norm": 0.5546875, + "learning_rate": 7.05949657580634e-05, + "loss": 0.2255, + "step": 19050 + }, + { + "epoch": 2.54, + "grad_norm": 0.68359375, + "learning_rate": 7.058383576756107e-05, + "loss": 0.2526, + "step": 19051 + }, + { + "epoch": 2.54, + "grad_norm": 0.5625, + "learning_rate": 7.057270617596077e-05, + "loss": 0.2466, + "step": 19052 + }, + { + "epoch": 2.54, + "grad_norm": 0.373046875, + "learning_rate": 7.056157698341348e-05, + "loss": 0.0786, + "step": 19053 + }, + { + "epoch": 2.54, + "grad_norm": 0.61328125, + "learning_rate": 7.055044819007009e-05, + "loss": 0.4845, + "step": 19054 + }, + { + "epoch": 2.54, + "grad_norm": 0.462890625, + "learning_rate": 7.053931979608157e-05, + "loss": 0.3048, + "step": 19055 + }, + { + "epoch": 2.54, + "grad_norm": 0.58984375, + "learning_rate": 7.052819180159876e-05, + "loss": 0.3743, + "step": 19056 + }, + { + "epoch": 2.54, + "grad_norm": 0.68359375, + "learning_rate": 7.051706420677254e-05, + "loss": 0.1717, + "step": 19057 + }, + { + "epoch": 2.54, + "grad_norm": 0.52734375, + "learning_rate": 7.05059370117539e-05, + "loss": 0.1544, + "step": 19058 + }, + { + "epoch": 2.54, + "grad_norm": 0.53515625, + "learning_rate": 7.049481021669364e-05, + "loss": 0.1425, + "step": 19059 + }, + { + "epoch": 2.54, + "grad_norm": 0.55078125, + "learning_rate": 7.048368382174272e-05, + "loss": 0.2747, + "step": 19060 + }, + { + "epoch": 2.54, + "grad_norm": 0.64453125, + "learning_rate": 7.047255782705198e-05, + "loss": 0.4238, + "step": 19061 + }, + { + "epoch": 2.54, + "grad_norm": 0.466796875, + "learning_rate": 7.046143223277231e-05, + "loss": 0.2581, + "step": 19062 + }, + { + "epoch": 2.54, + "grad_norm": 0.71484375, + "learning_rate": 7.045030703905456e-05, + "loss": 0.5057, + "step": 19063 + }, + { + "epoch": 2.54, + "grad_norm": 1.015625, + "learning_rate": 7.043918224604964e-05, + "loss": 0.343, + "step": 19064 + }, + { + "epoch": 2.54, + "grad_norm": 0.63671875, + "learning_rate": 7.042805785390836e-05, + "loss": 0.5365, + "step": 19065 + }, + { + "epoch": 2.54, + "grad_norm": 0.40625, + "learning_rate": 7.04169338627816e-05, + "loss": 0.1601, + "step": 19066 + }, + { + "epoch": 2.54, + "grad_norm": 0.796875, + "learning_rate": 7.040581027282021e-05, + "loss": 0.4811, + "step": 19067 + }, + { + "epoch": 2.54, + "grad_norm": 0.734375, + "learning_rate": 7.039468708417504e-05, + "loss": 0.6025, + "step": 19068 + }, + { + "epoch": 2.54, + "grad_norm": 0.6875, + "learning_rate": 7.03835642969969e-05, + "loss": 0.2771, + "step": 19069 + }, + { + "epoch": 2.54, + "grad_norm": 0.67578125, + "learning_rate": 7.037244191143661e-05, + "loss": 0.2626, + "step": 19070 + }, + { + "epoch": 2.54, + "grad_norm": 0.55859375, + "learning_rate": 7.036131992764504e-05, + "loss": 0.3373, + "step": 19071 + }, + { + "epoch": 2.54, + "grad_norm": 0.458984375, + "learning_rate": 7.035019834577301e-05, + "loss": 0.2491, + "step": 19072 + }, + { + "epoch": 2.55, + "grad_norm": 0.453125, + "learning_rate": 7.03390771659713e-05, + "loss": 0.1359, + "step": 19073 + }, + { + "epoch": 2.55, + "grad_norm": 0.6484375, + "learning_rate": 7.032795638839074e-05, + "loss": 0.1925, + "step": 19074 + }, + { + "epoch": 2.55, + "grad_norm": 0.41796875, + "learning_rate": 7.031683601318213e-05, + "loss": 0.2525, + "step": 19075 + }, + { + "epoch": 2.55, + "grad_norm": 0.796875, + "learning_rate": 7.03057160404963e-05, + "loss": 0.4659, + "step": 19076 + }, + { + "epoch": 2.55, + "grad_norm": 0.7578125, + "learning_rate": 7.0294596470484e-05, + "loss": 0.5224, + "step": 19077 + }, + { + "epoch": 2.55, + "grad_norm": 0.494140625, + "learning_rate": 7.028347730329604e-05, + "loss": 0.5086, + "step": 19078 + }, + { + "epoch": 2.55, + "grad_norm": 0.6171875, + "learning_rate": 7.02723585390832e-05, + "loss": 0.4105, + "step": 19079 + }, + { + "epoch": 2.55, + "grad_norm": 0.5546875, + "learning_rate": 7.026124017799628e-05, + "loss": 0.3414, + "step": 19080 + }, + { + "epoch": 2.55, + "grad_norm": 0.51171875, + "learning_rate": 7.025012222018601e-05, + "loss": 0.2761, + "step": 19081 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 7.023900466580316e-05, + "loss": 0.2528, + "step": 19082 + }, + { + "epoch": 2.55, + "grad_norm": 0.64453125, + "learning_rate": 7.022788751499854e-05, + "loss": 0.3559, + "step": 19083 + }, + { + "epoch": 2.55, + "grad_norm": 0.6875, + "learning_rate": 7.021677076792284e-05, + "loss": 0.546, + "step": 19084 + }, + { + "epoch": 2.55, + "grad_norm": 0.68359375, + "learning_rate": 7.020565442472686e-05, + "loss": 0.2898, + "step": 19085 + }, + { + "epoch": 2.55, + "grad_norm": 0.50390625, + "learning_rate": 7.019453848556132e-05, + "loss": 0.1813, + "step": 19086 + }, + { + "epoch": 2.55, + "grad_norm": 0.421875, + "learning_rate": 7.018342295057698e-05, + "loss": 0.1731, + "step": 19087 + }, + { + "epoch": 2.55, + "grad_norm": 0.37109375, + "learning_rate": 7.017230781992453e-05, + "loss": 0.1981, + "step": 19088 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 7.016119309375477e-05, + "loss": 0.4957, + "step": 19089 + }, + { + "epoch": 2.55, + "grad_norm": 0.6015625, + "learning_rate": 7.015007877221835e-05, + "loss": 0.4025, + "step": 19090 + }, + { + "epoch": 2.55, + "grad_norm": 0.7265625, + "learning_rate": 7.013896485546602e-05, + "loss": 0.3511, + "step": 19091 + }, + { + "epoch": 2.55, + "grad_norm": 0.6171875, + "learning_rate": 7.012785134364855e-05, + "loss": 0.3269, + "step": 19092 + }, + { + "epoch": 2.55, + "grad_norm": 0.7890625, + "learning_rate": 7.011673823691653e-05, + "loss": 0.4501, + "step": 19093 + }, + { + "epoch": 2.55, + "grad_norm": 0.6484375, + "learning_rate": 7.010562553542071e-05, + "loss": 0.2928, + "step": 19094 + }, + { + "epoch": 2.55, + "grad_norm": 0.48828125, + "learning_rate": 7.009451323931183e-05, + "loss": 0.238, + "step": 19095 + }, + { + "epoch": 2.55, + "grad_norm": 0.46484375, + "learning_rate": 7.008340134874052e-05, + "loss": 0.1632, + "step": 19096 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 7.00722898638575e-05, + "loss": 0.199, + "step": 19097 + }, + { + "epoch": 2.55, + "grad_norm": 0.69921875, + "learning_rate": 7.006117878481342e-05, + "loss": 0.3925, + "step": 19098 + }, + { + "epoch": 2.55, + "grad_norm": 0.4375, + "learning_rate": 7.0050068111759e-05, + "loss": 0.2349, + "step": 19099 + }, + { + "epoch": 2.55, + "grad_norm": 0.64453125, + "learning_rate": 7.003895784484484e-05, + "loss": 0.5091, + "step": 19100 + }, + { + "epoch": 2.55, + "grad_norm": 0.61328125, + "learning_rate": 7.002784798422167e-05, + "loss": 0.2965, + "step": 19101 + }, + { + "epoch": 2.55, + "grad_norm": 0.515625, + "learning_rate": 7.001673853004009e-05, + "loss": 0.326, + "step": 19102 + }, + { + "epoch": 2.55, + "grad_norm": 0.53515625, + "learning_rate": 7.000562948245082e-05, + "loss": 0.3573, + "step": 19103 + }, + { + "epoch": 2.55, + "grad_norm": 0.65625, + "learning_rate": 6.999452084160447e-05, + "loss": 0.4367, + "step": 19104 + }, + { + "epoch": 2.55, + "grad_norm": 0.7890625, + "learning_rate": 6.998341260765163e-05, + "loss": 0.445, + "step": 19105 + }, + { + "epoch": 2.55, + "grad_norm": 0.494140625, + "learning_rate": 6.9972304780743e-05, + "loss": 0.1257, + "step": 19106 + }, + { + "epoch": 2.55, + "grad_norm": 0.69140625, + "learning_rate": 6.996119736102917e-05, + "loss": 0.5287, + "step": 19107 + }, + { + "epoch": 2.55, + "grad_norm": 0.5546875, + "learning_rate": 6.995009034866081e-05, + "loss": 0.517, + "step": 19108 + }, + { + "epoch": 2.55, + "grad_norm": 0.57421875, + "learning_rate": 6.993898374378849e-05, + "loss": 0.5887, + "step": 19109 + }, + { + "epoch": 2.55, + "grad_norm": 0.56640625, + "learning_rate": 6.992787754656287e-05, + "loss": 0.3787, + "step": 19110 + }, + { + "epoch": 2.55, + "grad_norm": 0.59765625, + "learning_rate": 6.991677175713449e-05, + "loss": 0.2755, + "step": 19111 + }, + { + "epoch": 2.55, + "grad_norm": 0.66796875, + "learning_rate": 6.990566637565403e-05, + "loss": 0.4036, + "step": 19112 + }, + { + "epoch": 2.55, + "grad_norm": 0.76953125, + "learning_rate": 6.989456140227202e-05, + "loss": 0.3128, + "step": 19113 + }, + { + "epoch": 2.55, + "grad_norm": 0.68359375, + "learning_rate": 6.98834568371391e-05, + "loss": 0.3593, + "step": 19114 + }, + { + "epoch": 2.55, + "grad_norm": 0.5, + "learning_rate": 6.987235268040582e-05, + "loss": 0.4512, + "step": 19115 + }, + { + "epoch": 2.55, + "grad_norm": 0.5078125, + "learning_rate": 6.98612489322228e-05, + "loss": 0.2844, + "step": 19116 + }, + { + "epoch": 2.55, + "grad_norm": 0.451171875, + "learning_rate": 6.985014559274059e-05, + "loss": 0.2074, + "step": 19117 + }, + { + "epoch": 2.55, + "grad_norm": 0.57421875, + "learning_rate": 6.98390426621097e-05, + "loss": 0.3682, + "step": 19118 + }, + { + "epoch": 2.55, + "grad_norm": 0.6171875, + "learning_rate": 6.982794014048077e-05, + "loss": 0.2823, + "step": 19119 + }, + { + "epoch": 2.55, + "grad_norm": 0.3984375, + "learning_rate": 6.981683802800436e-05, + "loss": 0.1167, + "step": 19120 + }, + { + "epoch": 2.55, + "grad_norm": 0.671875, + "learning_rate": 6.980573632483097e-05, + "loss": 0.2862, + "step": 19121 + }, + { + "epoch": 2.55, + "grad_norm": 0.5859375, + "learning_rate": 6.979463503111121e-05, + "loss": 0.3563, + "step": 19122 + }, + { + "epoch": 2.55, + "grad_norm": 0.59765625, + "learning_rate": 6.978353414699555e-05, + "loss": 0.3132, + "step": 19123 + }, + { + "epoch": 2.55, + "grad_norm": 0.63671875, + "learning_rate": 6.977243367263457e-05, + "loss": 0.2358, + "step": 19124 + }, + { + "epoch": 2.55, + "grad_norm": 0.53515625, + "learning_rate": 6.976133360817879e-05, + "loss": 0.2724, + "step": 19125 + }, + { + "epoch": 2.55, + "grad_norm": 0.57421875, + "learning_rate": 6.975023395377874e-05, + "loss": 0.3603, + "step": 19126 + }, + { + "epoch": 2.55, + "grad_norm": 0.5703125, + "learning_rate": 6.973913470958492e-05, + "loss": 0.4854, + "step": 19127 + }, + { + "epoch": 2.55, + "grad_norm": 0.53515625, + "learning_rate": 6.97280358757479e-05, + "loss": 0.222, + "step": 19128 + }, + { + "epoch": 2.55, + "grad_norm": 0.412109375, + "learning_rate": 6.97169374524181e-05, + "loss": 0.1943, + "step": 19129 + }, + { + "epoch": 2.55, + "grad_norm": 0.81640625, + "learning_rate": 6.970583943974606e-05, + "loss": 0.4151, + "step": 19130 + }, + { + "epoch": 2.55, + "grad_norm": 0.734375, + "learning_rate": 6.969474183788229e-05, + "loss": 0.2727, + "step": 19131 + }, + { + "epoch": 2.55, + "grad_norm": 0.51953125, + "learning_rate": 6.968364464697727e-05, + "loss": 0.2084, + "step": 19132 + }, + { + "epoch": 2.55, + "grad_norm": 0.58203125, + "learning_rate": 6.967254786718149e-05, + "loss": 0.6251, + "step": 19133 + }, + { + "epoch": 2.55, + "grad_norm": 0.58984375, + "learning_rate": 6.96614514986454e-05, + "loss": 0.3479, + "step": 19134 + }, + { + "epoch": 2.55, + "grad_norm": 0.703125, + "learning_rate": 6.965035554151952e-05, + "loss": 0.2216, + "step": 19135 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 6.963925999595429e-05, + "loss": 0.3453, + "step": 19136 + }, + { + "epoch": 2.55, + "grad_norm": 0.462890625, + "learning_rate": 6.96281648621002e-05, + "loss": 0.397, + "step": 19137 + }, + { + "epoch": 2.55, + "grad_norm": 0.455078125, + "learning_rate": 6.961707014010765e-05, + "loss": 0.1813, + "step": 19138 + }, + { + "epoch": 2.55, + "grad_norm": 0.56640625, + "learning_rate": 6.960597583012716e-05, + "loss": 0.4941, + "step": 19139 + }, + { + "epoch": 2.55, + "grad_norm": 0.46484375, + "learning_rate": 6.959488193230916e-05, + "loss": 0.2014, + "step": 19140 + }, + { + "epoch": 2.55, + "grad_norm": 0.64453125, + "learning_rate": 6.958378844680405e-05, + "loss": 0.5507, + "step": 19141 + }, + { + "epoch": 2.55, + "grad_norm": 0.7265625, + "learning_rate": 6.957269537376229e-05, + "loss": 0.1895, + "step": 19142 + }, + { + "epoch": 2.55, + "grad_norm": 0.5859375, + "learning_rate": 6.95616027133343e-05, + "loss": 0.3481, + "step": 19143 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 6.955051046567052e-05, + "loss": 0.4355, + "step": 19144 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 6.953941863092136e-05, + "loss": 0.4028, + "step": 19145 + }, + { + "epoch": 2.55, + "grad_norm": 0.52734375, + "learning_rate": 6.952832720923722e-05, + "loss": 0.2279, + "step": 19146 + }, + { + "epoch": 2.55, + "grad_norm": 0.5546875, + "learning_rate": 6.951723620076854e-05, + "loss": 0.1135, + "step": 19147 + }, + { + "epoch": 2.56, + "grad_norm": 0.65625, + "learning_rate": 6.950614560566569e-05, + "loss": 0.5242, + "step": 19148 + }, + { + "epoch": 2.56, + "grad_norm": 0.5078125, + "learning_rate": 6.949505542407907e-05, + "loss": 0.2212, + "step": 19149 + }, + { + "epoch": 2.56, + "grad_norm": 0.53125, + "learning_rate": 6.948396565615909e-05, + "loss": 0.4256, + "step": 19150 + }, + { + "epoch": 2.56, + "grad_norm": 0.5625, + "learning_rate": 6.947287630205612e-05, + "loss": 0.2231, + "step": 19151 + }, + { + "epoch": 2.56, + "grad_norm": 0.59765625, + "learning_rate": 6.946178736192053e-05, + "loss": 0.2707, + "step": 19152 + }, + { + "epoch": 2.56, + "grad_norm": 0.34765625, + "learning_rate": 6.945069883590274e-05, + "loss": 0.2005, + "step": 19153 + }, + { + "epoch": 2.56, + "grad_norm": 0.44921875, + "learning_rate": 6.943961072415306e-05, + "loss": 0.1468, + "step": 19154 + }, + { + "epoch": 2.56, + "grad_norm": 0.482421875, + "learning_rate": 6.942852302682186e-05, + "loss": 0.2118, + "step": 19155 + }, + { + "epoch": 2.56, + "grad_norm": 0.578125, + "learning_rate": 6.941743574405952e-05, + "loss": 0.3322, + "step": 19156 + }, + { + "epoch": 2.56, + "grad_norm": 0.69140625, + "learning_rate": 6.940634887601635e-05, + "loss": 0.4116, + "step": 19157 + }, + { + "epoch": 2.56, + "grad_norm": 0.47265625, + "learning_rate": 6.939526242284276e-05, + "loss": 0.2903, + "step": 19158 + }, + { + "epoch": 2.56, + "grad_norm": 0.53125, + "learning_rate": 6.938417638468901e-05, + "loss": 0.4677, + "step": 19159 + }, + { + "epoch": 2.56, + "grad_norm": 0.83984375, + "learning_rate": 6.937309076170551e-05, + "loss": 0.4643, + "step": 19160 + }, + { + "epoch": 2.56, + "grad_norm": 0.458984375, + "learning_rate": 6.936200555404255e-05, + "loss": 0.2784, + "step": 19161 + }, + { + "epoch": 2.56, + "grad_norm": 0.45703125, + "learning_rate": 6.935092076185046e-05, + "loss": 0.169, + "step": 19162 + }, + { + "epoch": 2.56, + "grad_norm": 0.5625, + "learning_rate": 6.933983638527954e-05, + "loss": 0.4502, + "step": 19163 + }, + { + "epoch": 2.56, + "grad_norm": 0.53515625, + "learning_rate": 6.932875242448011e-05, + "loss": 0.2839, + "step": 19164 + }, + { + "epoch": 2.56, + "grad_norm": 0.54296875, + "learning_rate": 6.931766887960256e-05, + "loss": 0.2921, + "step": 19165 + }, + { + "epoch": 2.56, + "grad_norm": 0.6953125, + "learning_rate": 6.930658575079705e-05, + "loss": 0.4899, + "step": 19166 + }, + { + "epoch": 2.56, + "grad_norm": 0.640625, + "learning_rate": 6.929550303821393e-05, + "loss": 0.4208, + "step": 19167 + }, + { + "epoch": 2.56, + "grad_norm": 0.8046875, + "learning_rate": 6.92844207420035e-05, + "loss": 0.6416, + "step": 19168 + }, + { + "epoch": 2.56, + "grad_norm": 0.609375, + "learning_rate": 6.927333886231604e-05, + "loss": 0.2577, + "step": 19169 + }, + { + "epoch": 2.56, + "grad_norm": 0.75390625, + "learning_rate": 6.926225739930184e-05, + "loss": 0.6486, + "step": 19170 + }, + { + "epoch": 2.56, + "grad_norm": 0.515625, + "learning_rate": 6.925117635311112e-05, + "loss": 0.2337, + "step": 19171 + }, + { + "epoch": 2.56, + "grad_norm": 0.58203125, + "learning_rate": 6.924009572389422e-05, + "loss": 0.5373, + "step": 19172 + }, + { + "epoch": 2.56, + "grad_norm": 0.4765625, + "learning_rate": 6.922901551180134e-05, + "loss": 0.2328, + "step": 19173 + }, + { + "epoch": 2.56, + "grad_norm": 0.427734375, + "learning_rate": 6.921793571698278e-05, + "loss": 0.2993, + "step": 19174 + }, + { + "epoch": 2.56, + "grad_norm": 0.65625, + "learning_rate": 6.920685633958875e-05, + "loss": 0.2431, + "step": 19175 + }, + { + "epoch": 2.56, + "grad_norm": 0.5, + "learning_rate": 6.919577737976952e-05, + "loss": 0.1974, + "step": 19176 + }, + { + "epoch": 2.56, + "grad_norm": 0.72265625, + "learning_rate": 6.918469883767534e-05, + "loss": 0.572, + "step": 19177 + }, + { + "epoch": 2.56, + "grad_norm": 0.91796875, + "learning_rate": 6.917362071345639e-05, + "loss": 0.5233, + "step": 19178 + }, + { + "epoch": 2.56, + "grad_norm": 0.5625, + "learning_rate": 6.916254300726293e-05, + "loss": 0.2888, + "step": 19179 + }, + { + "epoch": 2.56, + "grad_norm": 0.6875, + "learning_rate": 6.915146571924518e-05, + "loss": 0.499, + "step": 19180 + }, + { + "epoch": 2.56, + "grad_norm": 0.734375, + "learning_rate": 6.914038884955337e-05, + "loss": 0.5009, + "step": 19181 + }, + { + "epoch": 2.56, + "grad_norm": 0.8125, + "learning_rate": 6.912931239833766e-05, + "loss": 0.5501, + "step": 19182 + }, + { + "epoch": 2.56, + "grad_norm": 0.65625, + "learning_rate": 6.911823636574832e-05, + "loss": 0.3908, + "step": 19183 + }, + { + "epoch": 2.56, + "grad_norm": 0.515625, + "learning_rate": 6.910716075193549e-05, + "loss": 0.3075, + "step": 19184 + }, + { + "epoch": 2.56, + "grad_norm": 0.64453125, + "learning_rate": 6.90960855570494e-05, + "loss": 0.2784, + "step": 19185 + }, + { + "epoch": 2.56, + "grad_norm": 0.94921875, + "learning_rate": 6.90850107812402e-05, + "loss": 1.0068, + "step": 19186 + }, + { + "epoch": 2.56, + "grad_norm": 0.625, + "learning_rate": 6.907393642465813e-05, + "loss": 0.2206, + "step": 19187 + }, + { + "epoch": 2.56, + "grad_norm": 0.6171875, + "learning_rate": 6.906286248745331e-05, + "loss": 0.4608, + "step": 19188 + }, + { + "epoch": 2.56, + "grad_norm": 0.435546875, + "learning_rate": 6.905178896977596e-05, + "loss": 0.4419, + "step": 19189 + }, + { + "epoch": 2.56, + "grad_norm": 0.5390625, + "learning_rate": 6.904071587177621e-05, + "loss": 0.4399, + "step": 19190 + }, + { + "epoch": 2.56, + "grad_norm": 0.416015625, + "learning_rate": 6.902964319360419e-05, + "loss": 0.2311, + "step": 19191 + }, + { + "epoch": 2.56, + "grad_norm": 0.56640625, + "learning_rate": 6.901857093541008e-05, + "loss": 0.3289, + "step": 19192 + }, + { + "epoch": 2.56, + "grad_norm": 0.60546875, + "learning_rate": 6.900749909734406e-05, + "loss": 0.6606, + "step": 19193 + }, + { + "epoch": 2.56, + "grad_norm": 0.62109375, + "learning_rate": 6.899642767955621e-05, + "loss": 0.1916, + "step": 19194 + }, + { + "epoch": 2.56, + "grad_norm": 0.57421875, + "learning_rate": 6.898535668219674e-05, + "loss": 0.4097, + "step": 19195 + }, + { + "epoch": 2.56, + "grad_norm": 0.64453125, + "learning_rate": 6.897428610541569e-05, + "loss": 0.5765, + "step": 19196 + }, + { + "epoch": 2.56, + "grad_norm": 0.6796875, + "learning_rate": 6.896321594936327e-05, + "loss": 0.4388, + "step": 19197 + }, + { + "epoch": 2.56, + "grad_norm": 0.6328125, + "learning_rate": 6.895214621418955e-05, + "loss": 0.4422, + "step": 19198 + }, + { + "epoch": 2.56, + "grad_norm": 0.70703125, + "learning_rate": 6.894107690004467e-05, + "loss": 0.413, + "step": 19199 + }, + { + "epoch": 2.56, + "grad_norm": 0.7421875, + "learning_rate": 6.893000800707869e-05, + "loss": 0.2426, + "step": 19200 + }, + { + "epoch": 2.56, + "grad_norm": 0.54296875, + "learning_rate": 6.891893953544179e-05, + "loss": 0.1676, + "step": 19201 + }, + { + "epoch": 2.56, + "grad_norm": 0.69140625, + "learning_rate": 6.8907871485284e-05, + "loss": 0.5251, + "step": 19202 + }, + { + "epoch": 2.56, + "grad_norm": 0.8046875, + "learning_rate": 6.88968038567554e-05, + "loss": 0.527, + "step": 19203 + }, + { + "epoch": 2.56, + "grad_norm": 0.5859375, + "learning_rate": 6.888573665000613e-05, + "loss": 0.3198, + "step": 19204 + }, + { + "epoch": 2.56, + "grad_norm": 0.60546875, + "learning_rate": 6.88746698651862e-05, + "loss": 0.3504, + "step": 19205 + }, + { + "epoch": 2.56, + "grad_norm": 0.58984375, + "learning_rate": 6.886360350244577e-05, + "loss": 0.2462, + "step": 19206 + }, + { + "epoch": 2.56, + "grad_norm": 0.58984375, + "learning_rate": 6.885253756193481e-05, + "loss": 0.2742, + "step": 19207 + }, + { + "epoch": 2.56, + "grad_norm": 0.4375, + "learning_rate": 6.884147204380347e-05, + "loss": 0.3163, + "step": 19208 + }, + { + "epoch": 2.56, + "grad_norm": 0.53515625, + "learning_rate": 6.883040694820176e-05, + "loss": 0.5271, + "step": 19209 + }, + { + "epoch": 2.56, + "grad_norm": 0.494140625, + "learning_rate": 6.881934227527973e-05, + "loss": 0.1668, + "step": 19210 + }, + { + "epoch": 2.56, + "grad_norm": 0.6171875, + "learning_rate": 6.880827802518742e-05, + "loss": 0.3134, + "step": 19211 + }, + { + "epoch": 2.56, + "grad_norm": 0.52734375, + "learning_rate": 6.87972141980749e-05, + "loss": 0.2862, + "step": 19212 + }, + { + "epoch": 2.56, + "grad_norm": 0.455078125, + "learning_rate": 6.878615079409221e-05, + "loss": 0.104, + "step": 19213 + }, + { + "epoch": 2.56, + "grad_norm": 0.5703125, + "learning_rate": 6.87750878133893e-05, + "loss": 0.3439, + "step": 19214 + }, + { + "epoch": 2.56, + "grad_norm": 0.5703125, + "learning_rate": 6.876402525611624e-05, + "loss": 0.5291, + "step": 19215 + }, + { + "epoch": 2.56, + "grad_norm": 0.875, + "learning_rate": 6.875296312242305e-05, + "loss": 0.3651, + "step": 19216 + }, + { + "epoch": 2.56, + "grad_norm": 0.5859375, + "learning_rate": 6.874190141245973e-05, + "loss": 0.1772, + "step": 19217 + }, + { + "epoch": 2.56, + "grad_norm": 0.55859375, + "learning_rate": 6.873084012637632e-05, + "loss": 0.4704, + "step": 19218 + }, + { + "epoch": 2.56, + "grad_norm": 0.56640625, + "learning_rate": 6.871977926432274e-05, + "loss": 0.3529, + "step": 19219 + }, + { + "epoch": 2.56, + "grad_norm": 0.60546875, + "learning_rate": 6.870871882644905e-05, + "loss": 0.4081, + "step": 19220 + }, + { + "epoch": 2.56, + "grad_norm": 0.453125, + "learning_rate": 6.869765881290521e-05, + "loss": 0.1983, + "step": 19221 + }, + { + "epoch": 2.56, + "grad_norm": 0.6015625, + "learning_rate": 6.868659922384123e-05, + "loss": 0.3669, + "step": 19222 + }, + { + "epoch": 2.57, + "grad_norm": 0.5078125, + "learning_rate": 6.867554005940704e-05, + "loss": 0.2669, + "step": 19223 + }, + { + "epoch": 2.57, + "grad_norm": 0.59375, + "learning_rate": 6.866448131975264e-05, + "loss": 0.4124, + "step": 19224 + }, + { + "epoch": 2.57, + "grad_norm": 0.55078125, + "learning_rate": 6.8653423005028e-05, + "loss": 0.4252, + "step": 19225 + }, + { + "epoch": 2.57, + "grad_norm": 0.5703125, + "learning_rate": 6.864236511538303e-05, + "loss": 0.3897, + "step": 19226 + }, + { + "epoch": 2.57, + "grad_norm": 0.55859375, + "learning_rate": 6.863130765096774e-05, + "loss": 0.3123, + "step": 19227 + }, + { + "epoch": 2.57, + "grad_norm": 0.75, + "learning_rate": 6.862025061193202e-05, + "loss": 0.6224, + "step": 19228 + }, + { + "epoch": 2.57, + "grad_norm": 0.435546875, + "learning_rate": 6.860919399842586e-05, + "loss": 0.2545, + "step": 19229 + }, + { + "epoch": 2.57, + "grad_norm": 0.5234375, + "learning_rate": 6.859813781059918e-05, + "loss": 0.3441, + "step": 19230 + }, + { + "epoch": 2.57, + "grad_norm": 0.58984375, + "learning_rate": 6.858708204860191e-05, + "loss": 0.4701, + "step": 19231 + }, + { + "epoch": 2.57, + "grad_norm": 0.5625, + "learning_rate": 6.857602671258394e-05, + "loss": 0.4047, + "step": 19232 + }, + { + "epoch": 2.57, + "grad_norm": 0.439453125, + "learning_rate": 6.856497180269523e-05, + "loss": 0.3627, + "step": 19233 + }, + { + "epoch": 2.57, + "grad_norm": 0.53515625, + "learning_rate": 6.855391731908567e-05, + "loss": 0.3075, + "step": 19234 + }, + { + "epoch": 2.57, + "grad_norm": 0.48828125, + "learning_rate": 6.85428632619052e-05, + "loss": 0.3135, + "step": 19235 + }, + { + "epoch": 2.57, + "grad_norm": 0.55078125, + "learning_rate": 6.853180963130366e-05, + "loss": 0.1423, + "step": 19236 + }, + { + "epoch": 2.57, + "grad_norm": 0.6484375, + "learning_rate": 6.852075642743099e-05, + "loss": 0.375, + "step": 19237 + }, + { + "epoch": 2.57, + "grad_norm": 0.68359375, + "learning_rate": 6.850970365043712e-05, + "loss": 0.5848, + "step": 19238 + }, + { + "epoch": 2.57, + "grad_norm": 0.65625, + "learning_rate": 6.84986513004718e-05, + "loss": 0.3951, + "step": 19239 + }, + { + "epoch": 2.57, + "grad_norm": 0.462890625, + "learning_rate": 6.848759937768501e-05, + "loss": 0.3167, + "step": 19240 + }, + { + "epoch": 2.57, + "grad_norm": 0.578125, + "learning_rate": 6.847654788222662e-05, + "loss": 0.3867, + "step": 19241 + }, + { + "epoch": 2.57, + "grad_norm": 0.466796875, + "learning_rate": 6.846549681424643e-05, + "loss": 0.2126, + "step": 19242 + }, + { + "epoch": 2.57, + "grad_norm": 0.546875, + "learning_rate": 6.845444617389437e-05, + "loss": 0.209, + "step": 19243 + }, + { + "epoch": 2.57, + "grad_norm": 0.7109375, + "learning_rate": 6.844339596132026e-05, + "loss": 0.5626, + "step": 19244 + }, + { + "epoch": 2.57, + "grad_norm": 0.49609375, + "learning_rate": 6.843234617667395e-05, + "loss": 0.2322, + "step": 19245 + }, + { + "epoch": 2.57, + "grad_norm": 0.55859375, + "learning_rate": 6.842129682010528e-05, + "loss": 0.3863, + "step": 19246 + }, + { + "epoch": 2.57, + "grad_norm": 0.380859375, + "learning_rate": 6.841024789176411e-05, + "loss": 0.1819, + "step": 19247 + }, + { + "epoch": 2.57, + "grad_norm": 0.65625, + "learning_rate": 6.839919939180024e-05, + "loss": 0.3419, + "step": 19248 + }, + { + "epoch": 2.57, + "grad_norm": 0.7890625, + "learning_rate": 6.838815132036352e-05, + "loss": 0.3091, + "step": 19249 + }, + { + "epoch": 2.57, + "grad_norm": 0.453125, + "learning_rate": 6.837710367760377e-05, + "loss": 0.2301, + "step": 19250 + }, + { + "epoch": 2.57, + "grad_norm": 0.482421875, + "learning_rate": 6.836605646367076e-05, + "loss": 0.2055, + "step": 19251 + }, + { + "epoch": 2.57, + "grad_norm": 0.50390625, + "learning_rate": 6.835500967871435e-05, + "loss": 0.2927, + "step": 19252 + }, + { + "epoch": 2.57, + "grad_norm": 0.7265625, + "learning_rate": 6.834396332288429e-05, + "loss": 0.3977, + "step": 19253 + }, + { + "epoch": 2.57, + "grad_norm": 0.498046875, + "learning_rate": 6.833291739633043e-05, + "loss": 0.293, + "step": 19254 + }, + { + "epoch": 2.57, + "grad_norm": 0.5, + "learning_rate": 6.832187189920252e-05, + "loss": 0.2968, + "step": 19255 + }, + { + "epoch": 2.57, + "grad_norm": 0.53515625, + "learning_rate": 6.831082683165035e-05, + "loss": 0.3844, + "step": 19256 + }, + { + "epoch": 2.57, + "grad_norm": 0.6171875, + "learning_rate": 6.829978219382371e-05, + "loss": 0.3666, + "step": 19257 + }, + { + "epoch": 2.57, + "grad_norm": 0.51171875, + "learning_rate": 6.828873798587238e-05, + "loss": 0.2767, + "step": 19258 + }, + { + "epoch": 2.57, + "grad_norm": 0.58984375, + "learning_rate": 6.82776942079461e-05, + "loss": 0.1575, + "step": 19259 + }, + { + "epoch": 2.57, + "grad_norm": 0.5390625, + "learning_rate": 6.826665086019466e-05, + "loss": 0.2468, + "step": 19260 + }, + { + "epoch": 2.57, + "grad_norm": 0.470703125, + "learning_rate": 6.82556079427678e-05, + "loss": 0.1904, + "step": 19261 + }, + { + "epoch": 2.57, + "grad_norm": 0.56640625, + "learning_rate": 6.824456545581527e-05, + "loss": 0.4247, + "step": 19262 + }, + { + "epoch": 2.57, + "grad_norm": 0.45703125, + "learning_rate": 6.823352339948683e-05, + "loss": 0.2549, + "step": 19263 + }, + { + "epoch": 2.57, + "grad_norm": 0.68359375, + "learning_rate": 6.822248177393218e-05, + "loss": 0.3721, + "step": 19264 + }, + { + "epoch": 2.57, + "grad_norm": 0.6953125, + "learning_rate": 6.821144057930104e-05, + "loss": 0.3699, + "step": 19265 + }, + { + "epoch": 2.57, + "grad_norm": 0.6015625, + "learning_rate": 6.820039981574322e-05, + "loss": 0.3996, + "step": 19266 + }, + { + "epoch": 2.57, + "grad_norm": 0.59375, + "learning_rate": 6.818935948340834e-05, + "loss": 0.3236, + "step": 19267 + }, + { + "epoch": 2.57, + "grad_norm": 0.60546875, + "learning_rate": 6.817831958244621e-05, + "loss": 0.192, + "step": 19268 + }, + { + "epoch": 2.57, + "grad_norm": 0.76171875, + "learning_rate": 6.816728011300647e-05, + "loss": 0.3493, + "step": 19269 + }, + { + "epoch": 2.57, + "grad_norm": 0.5234375, + "learning_rate": 6.815624107523884e-05, + "loss": 0.2333, + "step": 19270 + }, + { + "epoch": 2.57, + "grad_norm": 0.66015625, + "learning_rate": 6.814520246929299e-05, + "loss": 0.276, + "step": 19271 + }, + { + "epoch": 2.57, + "grad_norm": 0.470703125, + "learning_rate": 6.813416429531866e-05, + "loss": 0.3055, + "step": 19272 + }, + { + "epoch": 2.57, + "grad_norm": 0.4609375, + "learning_rate": 6.812312655346553e-05, + "loss": 0.2843, + "step": 19273 + }, + { + "epoch": 2.57, + "grad_norm": 0.58984375, + "learning_rate": 6.811208924388328e-05, + "loss": 0.2448, + "step": 19274 + }, + { + "epoch": 2.57, + "grad_norm": 0.6015625, + "learning_rate": 6.810105236672155e-05, + "loss": 0.2721, + "step": 19275 + }, + { + "epoch": 2.57, + "grad_norm": 0.43359375, + "learning_rate": 6.809001592212998e-05, + "loss": 0.2058, + "step": 19276 + }, + { + "epoch": 2.57, + "grad_norm": 0.640625, + "learning_rate": 6.807897991025832e-05, + "loss": 0.2704, + "step": 19277 + }, + { + "epoch": 2.57, + "grad_norm": 0.56640625, + "learning_rate": 6.806794433125615e-05, + "loss": 0.4068, + "step": 19278 + }, + { + "epoch": 2.57, + "grad_norm": 0.640625, + "learning_rate": 6.805690918527317e-05, + "loss": 0.2948, + "step": 19279 + }, + { + "epoch": 2.57, + "grad_norm": 0.73046875, + "learning_rate": 6.804587447245898e-05, + "loss": 0.5375, + "step": 19280 + }, + { + "epoch": 2.57, + "grad_norm": 0.57421875, + "learning_rate": 6.803484019296326e-05, + "loss": 0.1622, + "step": 19281 + }, + { + "epoch": 2.57, + "grad_norm": 0.6015625, + "learning_rate": 6.80238063469356e-05, + "loss": 0.2705, + "step": 19282 + }, + { + "epoch": 2.57, + "grad_norm": 0.55859375, + "learning_rate": 6.801277293452568e-05, + "loss": 0.2133, + "step": 19283 + }, + { + "epoch": 2.57, + "grad_norm": 0.4609375, + "learning_rate": 6.800173995588305e-05, + "loss": 0.1433, + "step": 19284 + }, + { + "epoch": 2.57, + "grad_norm": 0.546875, + "learning_rate": 6.799070741115741e-05, + "loss": 0.3878, + "step": 19285 + }, + { + "epoch": 2.57, + "grad_norm": 0.60546875, + "learning_rate": 6.797967530049831e-05, + "loss": 0.4605, + "step": 19286 + }, + { + "epoch": 2.57, + "grad_norm": 0.75390625, + "learning_rate": 6.796864362405533e-05, + "loss": 0.4918, + "step": 19287 + }, + { + "epoch": 2.57, + "grad_norm": 0.396484375, + "learning_rate": 6.795761238197815e-05, + "loss": 0.17, + "step": 19288 + }, + { + "epoch": 2.57, + "grad_norm": 0.52734375, + "learning_rate": 6.794658157441626e-05, + "loss": 0.3175, + "step": 19289 + }, + { + "epoch": 2.57, + "grad_norm": 0.625, + "learning_rate": 6.79355512015193e-05, + "loss": 0.4195, + "step": 19290 + }, + { + "epoch": 2.57, + "grad_norm": 0.625, + "learning_rate": 6.792452126343687e-05, + "loss": 0.4549, + "step": 19291 + }, + { + "epoch": 2.57, + "grad_norm": 0.458984375, + "learning_rate": 6.791349176031849e-05, + "loss": 0.3319, + "step": 19292 + }, + { + "epoch": 2.57, + "grad_norm": 0.73828125, + "learning_rate": 6.790246269231379e-05, + "loss": 0.8308, + "step": 19293 + }, + { + "epoch": 2.57, + "grad_norm": 0.5078125, + "learning_rate": 6.789143405957225e-05, + "loss": 0.3299, + "step": 19294 + }, + { + "epoch": 2.57, + "grad_norm": 0.81640625, + "learning_rate": 6.78804058622435e-05, + "loss": 0.4106, + "step": 19295 + }, + { + "epoch": 2.57, + "grad_norm": 0.55078125, + "learning_rate": 6.786937810047707e-05, + "loss": 0.4172, + "step": 19296 + }, + { + "epoch": 2.57, + "grad_norm": 0.62109375, + "learning_rate": 6.785835077442249e-05, + "loss": 0.4629, + "step": 19297 + }, + { + "epoch": 2.58, + "grad_norm": 0.62890625, + "learning_rate": 6.784732388422932e-05, + "loss": 0.2152, + "step": 19298 + }, + { + "epoch": 2.58, + "grad_norm": 0.5625, + "learning_rate": 6.783629743004704e-05, + "loss": 0.3031, + "step": 19299 + }, + { + "epoch": 2.58, + "grad_norm": 0.57421875, + "learning_rate": 6.782527141202524e-05, + "loss": 0.5738, + "step": 19300 + }, + { + "epoch": 2.58, + "grad_norm": 0.61328125, + "learning_rate": 6.781424583031337e-05, + "loss": 0.189, + "step": 19301 + }, + { + "epoch": 2.58, + "grad_norm": 0.73046875, + "learning_rate": 6.7803220685061e-05, + "loss": 0.5898, + "step": 19302 + }, + { + "epoch": 2.58, + "grad_norm": 0.5234375, + "learning_rate": 6.779219597641762e-05, + "loss": 0.1993, + "step": 19303 + }, + { + "epoch": 2.58, + "grad_norm": 0.546875, + "learning_rate": 6.778117170453275e-05, + "loss": 0.5845, + "step": 19304 + }, + { + "epoch": 2.58, + "grad_norm": 0.64453125, + "learning_rate": 6.777014786955583e-05, + "loss": 0.515, + "step": 19305 + }, + { + "epoch": 2.58, + "grad_norm": 0.56640625, + "learning_rate": 6.775912447163643e-05, + "loss": 0.2998, + "step": 19306 + }, + { + "epoch": 2.58, + "grad_norm": 0.51171875, + "learning_rate": 6.774810151092396e-05, + "loss": 0.2435, + "step": 19307 + }, + { + "epoch": 2.58, + "grad_norm": 0.57421875, + "learning_rate": 6.773707898756797e-05, + "loss": 0.3091, + "step": 19308 + }, + { + "epoch": 2.58, + "grad_norm": 0.55078125, + "learning_rate": 6.772605690171787e-05, + "loss": 0.2176, + "step": 19309 + }, + { + "epoch": 2.58, + "grad_norm": 0.4609375, + "learning_rate": 6.771503525352319e-05, + "loss": 0.1995, + "step": 19310 + }, + { + "epoch": 2.58, + "grad_norm": 0.408203125, + "learning_rate": 6.770401404313333e-05, + "loss": 0.3537, + "step": 19311 + }, + { + "epoch": 2.58, + "grad_norm": 0.60546875, + "learning_rate": 6.769299327069773e-05, + "loss": 0.299, + "step": 19312 + }, + { + "epoch": 2.58, + "grad_norm": 0.419921875, + "learning_rate": 6.76819729363659e-05, + "loss": 0.1815, + "step": 19313 + }, + { + "epoch": 2.58, + "grad_norm": 0.671875, + "learning_rate": 6.767095304028728e-05, + "loss": 0.488, + "step": 19314 + }, + { + "epoch": 2.58, + "grad_norm": 0.5390625, + "learning_rate": 6.765993358261124e-05, + "loss": 0.2953, + "step": 19315 + }, + { + "epoch": 2.58, + "grad_norm": 0.671875, + "learning_rate": 6.764891456348729e-05, + "loss": 0.1845, + "step": 19316 + }, + { + "epoch": 2.58, + "grad_norm": 0.60546875, + "learning_rate": 6.763789598306483e-05, + "loss": 0.3062, + "step": 19317 + }, + { + "epoch": 2.58, + "grad_norm": 0.6328125, + "learning_rate": 6.762687784149326e-05, + "loss": 0.1987, + "step": 19318 + }, + { + "epoch": 2.58, + "grad_norm": 0.78515625, + "learning_rate": 6.761586013892198e-05, + "loss": 0.6034, + "step": 19319 + }, + { + "epoch": 2.58, + "grad_norm": 0.63671875, + "learning_rate": 6.760484287550045e-05, + "loss": 0.4397, + "step": 19320 + }, + { + "epoch": 2.58, + "grad_norm": 0.494140625, + "learning_rate": 6.759382605137802e-05, + "loss": 0.3027, + "step": 19321 + }, + { + "epoch": 2.58, + "grad_norm": 0.65625, + "learning_rate": 6.758280966670414e-05, + "loss": 0.3592, + "step": 19322 + }, + { + "epoch": 2.58, + "grad_norm": 0.83203125, + "learning_rate": 6.757179372162816e-05, + "loss": 0.2195, + "step": 19323 + }, + { + "epoch": 2.58, + "grad_norm": 0.59765625, + "learning_rate": 6.756077821629945e-05, + "loss": 0.318, + "step": 19324 + }, + { + "epoch": 2.58, + "grad_norm": 0.640625, + "learning_rate": 6.754976315086741e-05, + "loss": 0.294, + "step": 19325 + }, + { + "epoch": 2.58, + "grad_norm": 0.5625, + "learning_rate": 6.75387485254814e-05, + "loss": 0.5057, + "step": 19326 + }, + { + "epoch": 2.58, + "grad_norm": 0.53515625, + "learning_rate": 6.752773434029081e-05, + "loss": 0.2745, + "step": 19327 + }, + { + "epoch": 2.58, + "grad_norm": 0.734375, + "learning_rate": 6.751672059544496e-05, + "loss": 0.2787, + "step": 19328 + }, + { + "epoch": 2.58, + "grad_norm": 0.427734375, + "learning_rate": 6.750570729109326e-05, + "loss": 0.3134, + "step": 19329 + }, + { + "epoch": 2.58, + "grad_norm": 0.73828125, + "learning_rate": 6.749469442738498e-05, + "loss": 0.4635, + "step": 19330 + }, + { + "epoch": 2.58, + "grad_norm": 0.6171875, + "learning_rate": 6.748368200446955e-05, + "loss": 0.5505, + "step": 19331 + }, + { + "epoch": 2.58, + "grad_norm": 0.55859375, + "learning_rate": 6.747267002249622e-05, + "loss": 0.3012, + "step": 19332 + }, + { + "epoch": 2.58, + "grad_norm": 0.56640625, + "learning_rate": 6.746165848161439e-05, + "loss": 0.3871, + "step": 19333 + }, + { + "epoch": 2.58, + "grad_norm": 0.53515625, + "learning_rate": 6.745064738197335e-05, + "loss": 0.3222, + "step": 19334 + }, + { + "epoch": 2.58, + "grad_norm": 0.53125, + "learning_rate": 6.743963672372244e-05, + "loss": 0.3275, + "step": 19335 + }, + { + "epoch": 2.58, + "grad_norm": 0.66015625, + "learning_rate": 6.742862650701094e-05, + "loss": 0.6058, + "step": 19336 + }, + { + "epoch": 2.58, + "grad_norm": 0.57421875, + "learning_rate": 6.741761673198815e-05, + "loss": 0.2822, + "step": 19337 + }, + { + "epoch": 2.58, + "grad_norm": 0.65625, + "learning_rate": 6.740660739880338e-05, + "loss": 0.4361, + "step": 19338 + }, + { + "epoch": 2.58, + "grad_norm": 0.62109375, + "learning_rate": 6.739559850760596e-05, + "loss": 0.3793, + "step": 19339 + }, + { + "epoch": 2.58, + "grad_norm": 0.57421875, + "learning_rate": 6.738459005854513e-05, + "loss": 0.3121, + "step": 19340 + }, + { + "epoch": 2.58, + "grad_norm": 0.58203125, + "learning_rate": 6.73735820517702e-05, + "loss": 0.4441, + "step": 19341 + }, + { + "epoch": 2.58, + "grad_norm": 0.5703125, + "learning_rate": 6.736257448743044e-05, + "loss": 0.306, + "step": 19342 + }, + { + "epoch": 2.58, + "grad_norm": 0.640625, + "learning_rate": 6.735156736567511e-05, + "loss": 0.4614, + "step": 19343 + }, + { + "epoch": 2.58, + "grad_norm": 0.72265625, + "learning_rate": 6.734056068665348e-05, + "loss": 0.3127, + "step": 19344 + }, + { + "epoch": 2.58, + "grad_norm": 0.8125, + "learning_rate": 6.732955445051482e-05, + "loss": 0.5781, + "step": 19345 + }, + { + "epoch": 2.58, + "grad_norm": 0.55859375, + "learning_rate": 6.731854865740835e-05, + "loss": 0.3861, + "step": 19346 + }, + { + "epoch": 2.58, + "grad_norm": 0.66015625, + "learning_rate": 6.730754330748338e-05, + "loss": 0.244, + "step": 19347 + }, + { + "epoch": 2.58, + "grad_norm": 1.1875, + "learning_rate": 6.729653840088907e-05, + "loss": 0.4599, + "step": 19348 + }, + { + "epoch": 2.58, + "grad_norm": 0.6640625, + "learning_rate": 6.728553393777468e-05, + "loss": 0.5131, + "step": 19349 + }, + { + "epoch": 2.58, + "grad_norm": 0.72265625, + "learning_rate": 6.727452991828946e-05, + "loss": 0.5758, + "step": 19350 + }, + { + "epoch": 2.58, + "grad_norm": 0.62109375, + "learning_rate": 6.72635263425826e-05, + "loss": 0.2479, + "step": 19351 + }, + { + "epoch": 2.58, + "grad_norm": 0.41796875, + "learning_rate": 6.725252321080336e-05, + "loss": 0.3407, + "step": 19352 + }, + { + "epoch": 2.58, + "grad_norm": 0.6484375, + "learning_rate": 6.72415205231009e-05, + "loss": 0.3197, + "step": 19353 + }, + { + "epoch": 2.58, + "grad_norm": 0.470703125, + "learning_rate": 6.723051827962445e-05, + "loss": 0.2294, + "step": 19354 + }, + { + "epoch": 2.58, + "grad_norm": 0.4453125, + "learning_rate": 6.72195164805232e-05, + "loss": 0.2457, + "step": 19355 + }, + { + "epoch": 2.58, + "grad_norm": 0.7109375, + "learning_rate": 6.720851512594637e-05, + "loss": 0.37, + "step": 19356 + }, + { + "epoch": 2.58, + "grad_norm": 0.48046875, + "learning_rate": 6.719751421604309e-05, + "loss": 0.2919, + "step": 19357 + }, + { + "epoch": 2.58, + "grad_norm": 0.6640625, + "learning_rate": 6.718651375096259e-05, + "loss": 0.1618, + "step": 19358 + }, + { + "epoch": 2.58, + "grad_norm": 0.6953125, + "learning_rate": 6.717551373085405e-05, + "loss": 0.2931, + "step": 19359 + }, + { + "epoch": 2.58, + "grad_norm": 0.578125, + "learning_rate": 6.716451415586656e-05, + "loss": 0.3244, + "step": 19360 + }, + { + "epoch": 2.58, + "grad_norm": 0.6796875, + "learning_rate": 6.715351502614934e-05, + "loss": 0.4923, + "step": 19361 + }, + { + "epoch": 2.58, + "grad_norm": 0.5703125, + "learning_rate": 6.714251634185155e-05, + "loss": 0.3393, + "step": 19362 + }, + { + "epoch": 2.58, + "grad_norm": 0.46484375, + "learning_rate": 6.71315181031223e-05, + "loss": 0.2047, + "step": 19363 + }, + { + "epoch": 2.58, + "grad_norm": 0.87109375, + "learning_rate": 6.712052031011079e-05, + "loss": 0.4154, + "step": 19364 + }, + { + "epoch": 2.58, + "grad_norm": 0.6171875, + "learning_rate": 6.710952296296611e-05, + "loss": 0.2783, + "step": 19365 + }, + { + "epoch": 2.58, + "grad_norm": 0.5546875, + "learning_rate": 6.709852606183742e-05, + "loss": 0.2628, + "step": 19366 + }, + { + "epoch": 2.58, + "grad_norm": 0.361328125, + "learning_rate": 6.708752960687381e-05, + "loss": 0.1287, + "step": 19367 + }, + { + "epoch": 2.58, + "grad_norm": 0.58984375, + "learning_rate": 6.707653359822445e-05, + "loss": 0.3753, + "step": 19368 + }, + { + "epoch": 2.58, + "grad_norm": 0.51953125, + "learning_rate": 6.70655380360384e-05, + "loss": 0.3168, + "step": 19369 + }, + { + "epoch": 2.58, + "grad_norm": 0.50390625, + "learning_rate": 6.705454292046481e-05, + "loss": 0.4134, + "step": 19370 + }, + { + "epoch": 2.58, + "grad_norm": 0.66015625, + "learning_rate": 6.70435482516528e-05, + "loss": 0.4283, + "step": 19371 + }, + { + "epoch": 2.59, + "grad_norm": 0.458984375, + "learning_rate": 6.703255402975138e-05, + "loss": 0.166, + "step": 19372 + }, + { + "epoch": 2.59, + "grad_norm": 0.65234375, + "learning_rate": 6.702156025490969e-05, + "loss": 0.2274, + "step": 19373 + }, + { + "epoch": 2.59, + "grad_norm": 0.388671875, + "learning_rate": 6.701056692727678e-05, + "loss": 0.1011, + "step": 19374 + }, + { + "epoch": 2.59, + "grad_norm": 0.3359375, + "learning_rate": 6.69995740470018e-05, + "loss": 0.1763, + "step": 19375 + }, + { + "epoch": 2.59, + "grad_norm": 0.421875, + "learning_rate": 6.698858161423373e-05, + "loss": 0.3008, + "step": 19376 + }, + { + "epoch": 2.59, + "grad_norm": 0.59375, + "learning_rate": 6.697758962912172e-05, + "loss": 0.4522, + "step": 19377 + }, + { + "epoch": 2.59, + "grad_norm": 0.6875, + "learning_rate": 6.696659809181478e-05, + "loss": 0.5256, + "step": 19378 + }, + { + "epoch": 2.59, + "grad_norm": 0.6640625, + "learning_rate": 6.695560700246198e-05, + "loss": 0.394, + "step": 19379 + }, + { + "epoch": 2.59, + "grad_norm": 0.423828125, + "learning_rate": 6.694461636121231e-05, + "loss": 0.2139, + "step": 19380 + }, + { + "epoch": 2.59, + "grad_norm": 0.71875, + "learning_rate": 6.693362616821491e-05, + "loss": 0.287, + "step": 19381 + }, + { + "epoch": 2.59, + "grad_norm": 0.625, + "learning_rate": 6.692263642361873e-05, + "loss": 0.4758, + "step": 19382 + }, + { + "epoch": 2.59, + "grad_norm": 0.5234375, + "learning_rate": 6.691164712757288e-05, + "loss": 0.2604, + "step": 19383 + }, + { + "epoch": 2.59, + "grad_norm": 0.353515625, + "learning_rate": 6.69006582802263e-05, + "loss": 0.1405, + "step": 19384 + }, + { + "epoch": 2.59, + "grad_norm": 0.58984375, + "learning_rate": 6.6889669881728e-05, + "loss": 0.3295, + "step": 19385 + }, + { + "epoch": 2.59, + "grad_norm": 0.6171875, + "learning_rate": 6.687868193222706e-05, + "loss": 0.2718, + "step": 19386 + }, + { + "epoch": 2.59, + "grad_norm": 0.71484375, + "learning_rate": 6.686769443187245e-05, + "loss": 0.567, + "step": 19387 + }, + { + "epoch": 2.59, + "grad_norm": 0.59375, + "learning_rate": 6.685670738081314e-05, + "loss": 0.3814, + "step": 19388 + }, + { + "epoch": 2.59, + "grad_norm": 0.6953125, + "learning_rate": 6.684572077919818e-05, + "loss": 0.4957, + "step": 19389 + }, + { + "epoch": 2.59, + "grad_norm": 0.703125, + "learning_rate": 6.68347346271765e-05, + "loss": 0.4994, + "step": 19390 + }, + { + "epoch": 2.59, + "grad_norm": 0.56640625, + "learning_rate": 6.682374892489714e-05, + "loss": 0.3495, + "step": 19391 + }, + { + "epoch": 2.59, + "grad_norm": 0.5859375, + "learning_rate": 6.681276367250899e-05, + "loss": 0.4397, + "step": 19392 + }, + { + "epoch": 2.59, + "grad_norm": 0.671875, + "learning_rate": 6.68017788701611e-05, + "loss": 0.3828, + "step": 19393 + }, + { + "epoch": 2.59, + "grad_norm": 0.8359375, + "learning_rate": 6.679079451800239e-05, + "loss": 0.3448, + "step": 19394 + }, + { + "epoch": 2.59, + "grad_norm": 0.7578125, + "learning_rate": 6.677981061618184e-05, + "loss": 0.5777, + "step": 19395 + }, + { + "epoch": 2.59, + "grad_norm": 0.55859375, + "learning_rate": 6.676882716484836e-05, + "loss": 0.3429, + "step": 19396 + }, + { + "epoch": 2.59, + "grad_norm": 0.8984375, + "learning_rate": 6.675784416415089e-05, + "loss": 0.385, + "step": 19397 + }, + { + "epoch": 2.59, + "grad_norm": 0.640625, + "learning_rate": 6.674686161423843e-05, + "loss": 0.405, + "step": 19398 + }, + { + "epoch": 2.59, + "grad_norm": 0.6171875, + "learning_rate": 6.673587951525982e-05, + "loss": 0.2242, + "step": 19399 + }, + { + "epoch": 2.59, + "grad_norm": 0.671875, + "learning_rate": 6.672489786736407e-05, + "loss": 0.5019, + "step": 19400 + }, + { + "epoch": 2.59, + "grad_norm": 0.7265625, + "learning_rate": 6.671391667070002e-05, + "loss": 0.4913, + "step": 19401 + }, + { + "epoch": 2.59, + "grad_norm": 0.65234375, + "learning_rate": 6.670293592541665e-05, + "loss": 0.3347, + "step": 19402 + }, + { + "epoch": 2.59, + "grad_norm": 0.51953125, + "learning_rate": 6.669195563166283e-05, + "loss": 0.2989, + "step": 19403 + }, + { + "epoch": 2.59, + "grad_norm": 0.60546875, + "learning_rate": 6.66809757895875e-05, + "loss": 0.3786, + "step": 19404 + }, + { + "epoch": 2.59, + "grad_norm": 0.578125, + "learning_rate": 6.666999639933947e-05, + "loss": 0.3809, + "step": 19405 + }, + { + "epoch": 2.59, + "grad_norm": 0.7265625, + "learning_rate": 6.665901746106772e-05, + "loss": 0.3519, + "step": 19406 + }, + { + "epoch": 2.59, + "grad_norm": 0.63671875, + "learning_rate": 6.664803897492111e-05, + "loss": 0.2961, + "step": 19407 + }, + { + "epoch": 2.59, + "grad_norm": 0.6875, + "learning_rate": 6.663706094104845e-05, + "loss": 0.242, + "step": 19408 + }, + { + "epoch": 2.59, + "grad_norm": 0.77734375, + "learning_rate": 6.662608335959868e-05, + "loss": 0.3806, + "step": 19409 + }, + { + "epoch": 2.59, + "grad_norm": 0.66015625, + "learning_rate": 6.661510623072063e-05, + "loss": 0.3932, + "step": 19410 + }, + { + "epoch": 2.59, + "grad_norm": 0.5390625, + "learning_rate": 6.660412955456314e-05, + "loss": 0.2943, + "step": 19411 + }, + { + "epoch": 2.59, + "grad_norm": 0.6171875, + "learning_rate": 6.659315333127512e-05, + "loss": 0.43, + "step": 19412 + }, + { + "epoch": 2.59, + "grad_norm": 0.59375, + "learning_rate": 6.658217756100536e-05, + "loss": 0.1705, + "step": 19413 + }, + { + "epoch": 2.59, + "grad_norm": 0.5859375, + "learning_rate": 6.657120224390275e-05, + "loss": 0.2135, + "step": 19414 + }, + { + "epoch": 2.59, + "grad_norm": 0.546875, + "learning_rate": 6.656022738011606e-05, + "loss": 0.3987, + "step": 19415 + }, + { + "epoch": 2.59, + "grad_norm": 0.490234375, + "learning_rate": 6.654925296979418e-05, + "loss": 0.2798, + "step": 19416 + }, + { + "epoch": 2.59, + "grad_norm": 0.83984375, + "learning_rate": 6.653827901308588e-05, + "loss": 0.5717, + "step": 19417 + }, + { + "epoch": 2.59, + "grad_norm": 0.625, + "learning_rate": 6.652730551014e-05, + "loss": 0.3315, + "step": 19418 + }, + { + "epoch": 2.59, + "grad_norm": 0.515625, + "learning_rate": 6.651633246110532e-05, + "loss": 0.3042, + "step": 19419 + }, + { + "epoch": 2.59, + "grad_norm": 0.546875, + "learning_rate": 6.650535986613068e-05, + "loss": 0.3765, + "step": 19420 + }, + { + "epoch": 2.59, + "grad_norm": 0.63671875, + "learning_rate": 6.649438772536487e-05, + "loss": 0.3472, + "step": 19421 + }, + { + "epoch": 2.59, + "grad_norm": 0.546875, + "learning_rate": 6.648341603895664e-05, + "loss": 0.2662, + "step": 19422 + }, + { + "epoch": 2.59, + "grad_norm": 0.6171875, + "learning_rate": 6.64724448070548e-05, + "loss": 0.2201, + "step": 19423 + }, + { + "epoch": 2.59, + "grad_norm": 0.71484375, + "learning_rate": 6.646147402980814e-05, + "loss": 0.3192, + "step": 19424 + }, + { + "epoch": 2.59, + "grad_norm": 0.5859375, + "learning_rate": 6.645050370736541e-05, + "loss": 0.4228, + "step": 19425 + }, + { + "epoch": 2.59, + "grad_norm": 0.75, + "learning_rate": 6.643953383987537e-05, + "loss": 0.457, + "step": 19426 + }, + { + "epoch": 2.59, + "grad_norm": 0.48828125, + "learning_rate": 6.642856442748681e-05, + "loss": 0.2802, + "step": 19427 + }, + { + "epoch": 2.59, + "grad_norm": 0.69140625, + "learning_rate": 6.641759547034844e-05, + "loss": 0.3641, + "step": 19428 + }, + { + "epoch": 2.59, + "grad_norm": 0.76953125, + "learning_rate": 6.640662696860907e-05, + "loss": 0.2957, + "step": 19429 + }, + { + "epoch": 2.59, + "grad_norm": 0.73828125, + "learning_rate": 6.639565892241735e-05, + "loss": 0.3442, + "step": 19430 + }, + { + "epoch": 2.59, + "grad_norm": 0.64453125, + "learning_rate": 6.63846913319221e-05, + "loss": 0.5293, + "step": 19431 + }, + { + "epoch": 2.59, + "grad_norm": 0.73046875, + "learning_rate": 6.637372419727201e-05, + "loss": 0.3497, + "step": 19432 + }, + { + "epoch": 2.59, + "grad_norm": 0.75390625, + "learning_rate": 6.636275751861579e-05, + "loss": 0.3371, + "step": 19433 + }, + { + "epoch": 2.59, + "grad_norm": 0.53515625, + "learning_rate": 6.635179129610216e-05, + "loss": 0.238, + "step": 19434 + }, + { + "epoch": 2.59, + "grad_norm": 0.4140625, + "learning_rate": 6.634082552987983e-05, + "loss": 0.2048, + "step": 19435 + }, + { + "epoch": 2.59, + "grad_norm": 0.515625, + "learning_rate": 6.63298602200975e-05, + "loss": 0.2922, + "step": 19436 + }, + { + "epoch": 2.59, + "grad_norm": 0.6328125, + "learning_rate": 6.63188953669039e-05, + "loss": 0.321, + "step": 19437 + }, + { + "epoch": 2.59, + "grad_norm": 0.65234375, + "learning_rate": 6.630793097044768e-05, + "loss": 0.4025, + "step": 19438 + }, + { + "epoch": 2.59, + "grad_norm": 0.56640625, + "learning_rate": 6.629696703087754e-05, + "loss": 0.3495, + "step": 19439 + }, + { + "epoch": 2.59, + "grad_norm": 0.4765625, + "learning_rate": 6.628600354834216e-05, + "loss": 0.3523, + "step": 19440 + }, + { + "epoch": 2.59, + "grad_norm": 0.5703125, + "learning_rate": 6.627504052299021e-05, + "loss": 0.3656, + "step": 19441 + }, + { + "epoch": 2.59, + "grad_norm": 0.65625, + "learning_rate": 6.626407795497033e-05, + "loss": 0.6329, + "step": 19442 + }, + { + "epoch": 2.59, + "grad_norm": 0.53125, + "learning_rate": 6.625311584443126e-05, + "loss": 0.4441, + "step": 19443 + }, + { + "epoch": 2.59, + "grad_norm": 0.625, + "learning_rate": 6.62421541915216e-05, + "loss": 0.5829, + "step": 19444 + }, + { + "epoch": 2.59, + "grad_norm": 0.43359375, + "learning_rate": 6.623119299638994e-05, + "loss": 0.3953, + "step": 19445 + }, + { + "epoch": 2.59, + "grad_norm": 0.6015625, + "learning_rate": 6.622023225918502e-05, + "loss": 0.2962, + "step": 19446 + }, + { + "epoch": 2.6, + "grad_norm": 0.392578125, + "learning_rate": 6.620927198005538e-05, + "loss": 0.2798, + "step": 19447 + }, + { + "epoch": 2.6, + "grad_norm": 0.578125, + "learning_rate": 6.619831215914974e-05, + "loss": 0.3001, + "step": 19448 + }, + { + "epoch": 2.6, + "grad_norm": 0.462890625, + "learning_rate": 6.618735279661665e-05, + "loss": 0.135, + "step": 19449 + }, + { + "epoch": 2.6, + "grad_norm": 0.72265625, + "learning_rate": 6.61763938926048e-05, + "loss": 0.4909, + "step": 19450 + }, + { + "epoch": 2.6, + "grad_norm": 0.57421875, + "learning_rate": 6.616543544726271e-05, + "loss": 0.3377, + "step": 19451 + }, + { + "epoch": 2.6, + "grad_norm": 0.6796875, + "learning_rate": 6.615447746073907e-05, + "loss": 0.3106, + "step": 19452 + }, + { + "epoch": 2.6, + "grad_norm": 0.7734375, + "learning_rate": 6.61435199331824e-05, + "loss": 0.3235, + "step": 19453 + }, + { + "epoch": 2.6, + "grad_norm": 0.515625, + "learning_rate": 6.613256286474137e-05, + "loss": 0.2399, + "step": 19454 + }, + { + "epoch": 2.6, + "grad_norm": 0.52734375, + "learning_rate": 6.612160625556448e-05, + "loss": 0.2743, + "step": 19455 + }, + { + "epoch": 2.6, + "grad_norm": 0.55859375, + "learning_rate": 6.61106501058004e-05, + "loss": 0.4061, + "step": 19456 + }, + { + "epoch": 2.6, + "grad_norm": 0.65234375, + "learning_rate": 6.609969441559765e-05, + "loss": 0.3101, + "step": 19457 + }, + { + "epoch": 2.6, + "grad_norm": 0.408203125, + "learning_rate": 6.608873918510476e-05, + "loss": 0.2328, + "step": 19458 + }, + { + "epoch": 2.6, + "grad_norm": 0.60546875, + "learning_rate": 6.607778441447035e-05, + "loss": 0.2609, + "step": 19459 + }, + { + "epoch": 2.6, + "grad_norm": 0.484375, + "learning_rate": 6.606683010384296e-05, + "loss": 0.2047, + "step": 19460 + }, + { + "epoch": 2.6, + "grad_norm": 0.578125, + "learning_rate": 6.605587625337111e-05, + "loss": 0.4626, + "step": 19461 + }, + { + "epoch": 2.6, + "grad_norm": 0.5703125, + "learning_rate": 6.604492286320338e-05, + "loss": 0.3439, + "step": 19462 + }, + { + "epoch": 2.6, + "grad_norm": 0.57421875, + "learning_rate": 6.603396993348829e-05, + "loss": 0.4429, + "step": 19463 + }, + { + "epoch": 2.6, + "grad_norm": 0.63671875, + "learning_rate": 6.602301746437436e-05, + "loss": 0.4221, + "step": 19464 + }, + { + "epoch": 2.6, + "grad_norm": 0.72265625, + "learning_rate": 6.601206545601012e-05, + "loss": 0.7697, + "step": 19465 + }, + { + "epoch": 2.6, + "grad_norm": 0.5859375, + "learning_rate": 6.60011139085441e-05, + "loss": 0.4843, + "step": 19466 + }, + { + "epoch": 2.6, + "grad_norm": 1.0078125, + "learning_rate": 6.599016282212477e-05, + "loss": 0.2696, + "step": 19467 + }, + { + "epoch": 2.6, + "grad_norm": 0.419921875, + "learning_rate": 6.59792121969007e-05, + "loss": 0.1462, + "step": 19468 + }, + { + "epoch": 2.6, + "grad_norm": 0.890625, + "learning_rate": 6.596826203302034e-05, + "loss": 0.3332, + "step": 19469 + }, + { + "epoch": 2.6, + "grad_norm": 0.63671875, + "learning_rate": 6.595731233063216e-05, + "loss": 0.1556, + "step": 19470 + }, + { + "epoch": 2.6, + "grad_norm": 0.62890625, + "learning_rate": 6.594636308988469e-05, + "loss": 0.5193, + "step": 19471 + }, + { + "epoch": 2.6, + "grad_norm": 0.69140625, + "learning_rate": 6.593541431092638e-05, + "loss": 0.2578, + "step": 19472 + }, + { + "epoch": 2.6, + "grad_norm": 0.6484375, + "learning_rate": 6.592446599390571e-05, + "loss": 0.4279, + "step": 19473 + }, + { + "epoch": 2.6, + "grad_norm": 0.7109375, + "learning_rate": 6.591351813897116e-05, + "loss": 0.222, + "step": 19474 + }, + { + "epoch": 2.6, + "grad_norm": 0.69921875, + "learning_rate": 6.59025707462712e-05, + "loss": 0.5438, + "step": 19475 + }, + { + "epoch": 2.6, + "grad_norm": 0.578125, + "learning_rate": 6.589162381595421e-05, + "loss": 0.3723, + "step": 19476 + }, + { + "epoch": 2.6, + "grad_norm": 0.60546875, + "learning_rate": 6.588067734816875e-05, + "loss": 0.6214, + "step": 19477 + }, + { + "epoch": 2.6, + "grad_norm": 0.6171875, + "learning_rate": 6.586973134306317e-05, + "loss": 0.1784, + "step": 19478 + }, + { + "epoch": 2.6, + "grad_norm": 0.5625, + "learning_rate": 6.585878580078595e-05, + "loss": 0.226, + "step": 19479 + }, + { + "epoch": 2.6, + "grad_norm": 0.7734375, + "learning_rate": 6.584784072148555e-05, + "loss": 0.7668, + "step": 19480 + }, + { + "epoch": 2.6, + "grad_norm": 0.4921875, + "learning_rate": 6.583689610531027e-05, + "loss": 0.3659, + "step": 19481 + }, + { + "epoch": 2.6, + "grad_norm": 0.62109375, + "learning_rate": 6.582595195240867e-05, + "loss": 0.3254, + "step": 19482 + }, + { + "epoch": 2.6, + "grad_norm": 0.67578125, + "learning_rate": 6.581500826292903e-05, + "loss": 0.291, + "step": 19483 + }, + { + "epoch": 2.6, + "grad_norm": 0.47265625, + "learning_rate": 6.580406503701985e-05, + "loss": 0.3061, + "step": 19484 + }, + { + "epoch": 2.6, + "grad_norm": 0.66796875, + "learning_rate": 6.57931222748295e-05, + "loss": 0.5346, + "step": 19485 + }, + { + "epoch": 2.6, + "grad_norm": 0.7578125, + "learning_rate": 6.578217997650636e-05, + "loss": 0.421, + "step": 19486 + }, + { + "epoch": 2.6, + "grad_norm": 0.546875, + "learning_rate": 6.577123814219881e-05, + "loss": 0.2688, + "step": 19487 + }, + { + "epoch": 2.6, + "grad_norm": 0.63671875, + "learning_rate": 6.576029677205523e-05, + "loss": 0.4121, + "step": 19488 + }, + { + "epoch": 2.6, + "grad_norm": 0.859375, + "learning_rate": 6.574935586622403e-05, + "loss": 0.3096, + "step": 19489 + }, + { + "epoch": 2.6, + "grad_norm": 0.47265625, + "learning_rate": 6.573841542485353e-05, + "loss": 0.1973, + "step": 19490 + }, + { + "epoch": 2.6, + "grad_norm": 0.7578125, + "learning_rate": 6.572747544809213e-05, + "loss": 0.2715, + "step": 19491 + }, + { + "epoch": 2.6, + "grad_norm": 0.51953125, + "learning_rate": 6.571653593608816e-05, + "loss": 0.3141, + "step": 19492 + }, + { + "epoch": 2.6, + "grad_norm": 0.94140625, + "learning_rate": 6.570559688898994e-05, + "loss": 0.5311, + "step": 19493 + }, + { + "epoch": 2.6, + "grad_norm": 0.67578125, + "learning_rate": 6.569465830694586e-05, + "loss": 0.3077, + "step": 19494 + }, + { + "epoch": 2.6, + "grad_norm": 0.69140625, + "learning_rate": 6.56837201901042e-05, + "loss": 0.4134, + "step": 19495 + }, + { + "epoch": 2.6, + "grad_norm": 0.6796875, + "learning_rate": 6.567278253861333e-05, + "loss": 0.3847, + "step": 19496 + }, + { + "epoch": 2.6, + "grad_norm": 0.5078125, + "learning_rate": 6.566184535262156e-05, + "loss": 0.2514, + "step": 19497 + }, + { + "epoch": 2.6, + "grad_norm": 0.5703125, + "learning_rate": 6.56509086322772e-05, + "loss": 0.222, + "step": 19498 + }, + { + "epoch": 2.6, + "grad_norm": 0.60546875, + "learning_rate": 6.563997237772856e-05, + "loss": 0.245, + "step": 19499 + }, + { + "epoch": 2.6, + "grad_norm": 0.65625, + "learning_rate": 6.562903658912397e-05, + "loss": 0.4422, + "step": 19500 + }, + { + "epoch": 2.6, + "grad_norm": 0.65625, + "learning_rate": 6.561810126661168e-05, + "loss": 0.4206, + "step": 19501 + }, + { + "epoch": 2.6, + "grad_norm": 0.48046875, + "learning_rate": 6.560716641034001e-05, + "loss": 0.2551, + "step": 19502 + }, + { + "epoch": 2.6, + "grad_norm": 0.609375, + "learning_rate": 6.559623202045724e-05, + "loss": 0.2039, + "step": 19503 + }, + { + "epoch": 2.6, + "grad_norm": 0.59375, + "learning_rate": 6.558529809711164e-05, + "loss": 0.1488, + "step": 19504 + }, + { + "epoch": 2.6, + "grad_norm": 0.65625, + "learning_rate": 6.557436464045152e-05, + "loss": 0.3221, + "step": 19505 + }, + { + "epoch": 2.6, + "grad_norm": 0.58203125, + "learning_rate": 6.556343165062507e-05, + "loss": 0.2105, + "step": 19506 + }, + { + "epoch": 2.6, + "grad_norm": 0.7109375, + "learning_rate": 6.555249912778061e-05, + "loss": 0.4027, + "step": 19507 + }, + { + "epoch": 2.6, + "grad_norm": 0.5859375, + "learning_rate": 6.554156707206635e-05, + "loss": 0.3935, + "step": 19508 + }, + { + "epoch": 2.6, + "grad_norm": 0.57421875, + "learning_rate": 6.553063548363054e-05, + "loss": 0.3441, + "step": 19509 + }, + { + "epoch": 2.6, + "grad_norm": 0.4453125, + "learning_rate": 6.551970436262147e-05, + "loss": 0.1854, + "step": 19510 + }, + { + "epoch": 2.6, + "grad_norm": 0.6484375, + "learning_rate": 6.55087737091873e-05, + "loss": 0.5421, + "step": 19511 + }, + { + "epoch": 2.6, + "grad_norm": 0.6875, + "learning_rate": 6.549784352347634e-05, + "loss": 0.3716, + "step": 19512 + }, + { + "epoch": 2.6, + "grad_norm": 0.443359375, + "learning_rate": 6.548691380563674e-05, + "loss": 0.1272, + "step": 19513 + }, + { + "epoch": 2.6, + "grad_norm": 0.6796875, + "learning_rate": 6.547598455581675e-05, + "loss": 0.4746, + "step": 19514 + }, + { + "epoch": 2.6, + "grad_norm": 0.62109375, + "learning_rate": 6.546505577416455e-05, + "loss": 0.245, + "step": 19515 + }, + { + "epoch": 2.6, + "grad_norm": 0.48046875, + "learning_rate": 6.545412746082838e-05, + "loss": 0.2483, + "step": 19516 + }, + { + "epoch": 2.6, + "grad_norm": 0.640625, + "learning_rate": 6.544319961595643e-05, + "loss": 0.6049, + "step": 19517 + }, + { + "epoch": 2.6, + "grad_norm": 0.59375, + "learning_rate": 6.543227223969683e-05, + "loss": 0.4241, + "step": 19518 + }, + { + "epoch": 2.6, + "grad_norm": 0.7734375, + "learning_rate": 6.542134533219782e-05, + "loss": 0.6436, + "step": 19519 + }, + { + "epoch": 2.6, + "grad_norm": 0.8359375, + "learning_rate": 6.541041889360754e-05, + "loss": 0.612, + "step": 19520 + }, + { + "epoch": 2.6, + "grad_norm": 0.50390625, + "learning_rate": 6.539949292407421e-05, + "loss": 0.2266, + "step": 19521 + }, + { + "epoch": 2.61, + "grad_norm": 0.470703125, + "learning_rate": 6.538856742374595e-05, + "loss": 0.2082, + "step": 19522 + }, + { + "epoch": 2.61, + "grad_norm": 0.55078125, + "learning_rate": 6.537764239277093e-05, + "loss": 0.395, + "step": 19523 + }, + { + "epoch": 2.61, + "grad_norm": 0.59765625, + "learning_rate": 6.536671783129729e-05, + "loss": 0.389, + "step": 19524 + }, + { + "epoch": 2.61, + "grad_norm": 0.55078125, + "learning_rate": 6.535579373947321e-05, + "loss": 0.4115, + "step": 19525 + }, + { + "epoch": 2.61, + "grad_norm": 0.392578125, + "learning_rate": 6.534487011744677e-05, + "loss": 0.2127, + "step": 19526 + }, + { + "epoch": 2.61, + "grad_norm": 0.48828125, + "learning_rate": 6.533394696536614e-05, + "loss": 0.241, + "step": 19527 + }, + { + "epoch": 2.61, + "grad_norm": 0.58203125, + "learning_rate": 6.532302428337944e-05, + "loss": 0.3059, + "step": 19528 + }, + { + "epoch": 2.61, + "grad_norm": 0.6015625, + "learning_rate": 6.531210207163482e-05, + "loss": 0.5239, + "step": 19529 + }, + { + "epoch": 2.61, + "grad_norm": 0.41796875, + "learning_rate": 6.530118033028033e-05, + "loss": 0.288, + "step": 19530 + }, + { + "epoch": 2.61, + "grad_norm": 0.53125, + "learning_rate": 6.529025905946409e-05, + "loss": 0.1959, + "step": 19531 + }, + { + "epoch": 2.61, + "grad_norm": 0.498046875, + "learning_rate": 6.527933825933421e-05, + "loss": 0.2942, + "step": 19532 + }, + { + "epoch": 2.61, + "grad_norm": 0.54296875, + "learning_rate": 6.526841793003881e-05, + "loss": 0.4581, + "step": 19533 + }, + { + "epoch": 2.61, + "grad_norm": 0.58203125, + "learning_rate": 6.525749807172594e-05, + "loss": 0.3525, + "step": 19534 + }, + { + "epoch": 2.61, + "grad_norm": 0.62109375, + "learning_rate": 6.524657868454372e-05, + "loss": 0.2555, + "step": 19535 + }, + { + "epoch": 2.61, + "grad_norm": 0.412109375, + "learning_rate": 6.523565976864016e-05, + "loss": 0.2024, + "step": 19536 + }, + { + "epoch": 2.61, + "grad_norm": 0.6953125, + "learning_rate": 6.522474132416339e-05, + "loss": 0.4615, + "step": 19537 + }, + { + "epoch": 2.61, + "grad_norm": 0.53125, + "learning_rate": 6.521382335126142e-05, + "loss": 0.2212, + "step": 19538 + }, + { + "epoch": 2.61, + "grad_norm": 0.59765625, + "learning_rate": 6.520290585008238e-05, + "loss": 0.1923, + "step": 19539 + }, + { + "epoch": 2.61, + "grad_norm": 0.66796875, + "learning_rate": 6.519198882077422e-05, + "loss": 0.2291, + "step": 19540 + }, + { + "epoch": 2.61, + "grad_norm": 0.703125, + "learning_rate": 6.518107226348507e-05, + "loss": 0.3157, + "step": 19541 + }, + { + "epoch": 2.61, + "grad_norm": 0.76171875, + "learning_rate": 6.517015617836291e-05, + "loss": 0.3078, + "step": 19542 + }, + { + "epoch": 2.61, + "grad_norm": 0.5546875, + "learning_rate": 6.515924056555577e-05, + "loss": 0.3754, + "step": 19543 + }, + { + "epoch": 2.61, + "grad_norm": 0.57421875, + "learning_rate": 6.514832542521171e-05, + "loss": 0.2609, + "step": 19544 + }, + { + "epoch": 2.61, + "grad_norm": 0.443359375, + "learning_rate": 6.513741075747873e-05, + "loss": 0.1843, + "step": 19545 + }, + { + "epoch": 2.61, + "grad_norm": 0.58984375, + "learning_rate": 6.512649656250482e-05, + "loss": 0.3075, + "step": 19546 + }, + { + "epoch": 2.61, + "grad_norm": 0.578125, + "learning_rate": 6.511558284043798e-05, + "loss": 0.2947, + "step": 19547 + }, + { + "epoch": 2.61, + "grad_norm": 0.546875, + "learning_rate": 6.510466959142626e-05, + "loss": 0.3087, + "step": 19548 + }, + { + "epoch": 2.61, + "grad_norm": 0.423828125, + "learning_rate": 6.509375681561759e-05, + "loss": 0.1367, + "step": 19549 + }, + { + "epoch": 2.61, + "grad_norm": 0.51171875, + "learning_rate": 6.508284451316e-05, + "loss": 0.4363, + "step": 19550 + }, + { + "epoch": 2.61, + "grad_norm": 0.53125, + "learning_rate": 6.507193268420142e-05, + "loss": 0.2951, + "step": 19551 + }, + { + "epoch": 2.61, + "grad_norm": 0.5234375, + "learning_rate": 6.506102132888989e-05, + "loss": 0.1407, + "step": 19552 + }, + { + "epoch": 2.61, + "grad_norm": 0.421875, + "learning_rate": 6.505011044737333e-05, + "loss": 0.2389, + "step": 19553 + }, + { + "epoch": 2.61, + "grad_norm": 0.5234375, + "learning_rate": 6.503920003979969e-05, + "loss": 0.3784, + "step": 19554 + }, + { + "epoch": 2.61, + "grad_norm": 0.66015625, + "learning_rate": 6.502829010631693e-05, + "loss": 0.4815, + "step": 19555 + }, + { + "epoch": 2.61, + "grad_norm": 0.6953125, + "learning_rate": 6.501738064707299e-05, + "loss": 0.5304, + "step": 19556 + }, + { + "epoch": 2.61, + "grad_norm": 0.5703125, + "learning_rate": 6.500647166221583e-05, + "loss": 0.3103, + "step": 19557 + }, + { + "epoch": 2.61, + "grad_norm": 0.58984375, + "learning_rate": 6.499556315189337e-05, + "loss": 0.3097, + "step": 19558 + }, + { + "epoch": 2.61, + "grad_norm": 0.66015625, + "learning_rate": 6.498465511625354e-05, + "loss": 0.6464, + "step": 19559 + }, + { + "epoch": 2.61, + "grad_norm": 0.6796875, + "learning_rate": 6.497374755544428e-05, + "loss": 0.4226, + "step": 19560 + }, + { + "epoch": 2.61, + "grad_norm": 0.6953125, + "learning_rate": 6.496284046961345e-05, + "loss": 0.3193, + "step": 19561 + }, + { + "epoch": 2.61, + "grad_norm": 0.65625, + "learning_rate": 6.495193385890901e-05, + "loss": 0.3957, + "step": 19562 + }, + { + "epoch": 2.61, + "grad_norm": 0.64453125, + "learning_rate": 6.494102772347883e-05, + "loss": 0.3191, + "step": 19563 + }, + { + "epoch": 2.61, + "grad_norm": 0.5390625, + "learning_rate": 6.493012206347084e-05, + "loss": 0.3275, + "step": 19564 + }, + { + "epoch": 2.61, + "grad_norm": 0.4921875, + "learning_rate": 6.49192168790329e-05, + "loss": 0.4587, + "step": 19565 + }, + { + "epoch": 2.61, + "grad_norm": 0.70703125, + "learning_rate": 6.490831217031288e-05, + "loss": 0.5632, + "step": 19566 + }, + { + "epoch": 2.61, + "grad_norm": 0.671875, + "learning_rate": 6.489740793745866e-05, + "loss": 0.2674, + "step": 19567 + }, + { + "epoch": 2.61, + "grad_norm": 0.6953125, + "learning_rate": 6.488650418061813e-05, + "loss": 0.302, + "step": 19568 + }, + { + "epoch": 2.61, + "grad_norm": 0.67578125, + "learning_rate": 6.487560089993913e-05, + "loss": 0.7816, + "step": 19569 + }, + { + "epoch": 2.61, + "grad_norm": 0.66015625, + "learning_rate": 6.486469809556952e-05, + "loss": 0.1888, + "step": 19570 + }, + { + "epoch": 2.61, + "grad_norm": 0.55859375, + "learning_rate": 6.485379576765718e-05, + "loss": 0.213, + "step": 19571 + }, + { + "epoch": 2.61, + "grad_norm": 0.40234375, + "learning_rate": 6.484289391634988e-05, + "loss": 0.1464, + "step": 19572 + }, + { + "epoch": 2.61, + "grad_norm": 0.474609375, + "learning_rate": 6.483199254179555e-05, + "loss": 0.1623, + "step": 19573 + }, + { + "epoch": 2.61, + "grad_norm": 0.71875, + "learning_rate": 6.482109164414194e-05, + "loss": 0.3759, + "step": 19574 + }, + { + "epoch": 2.61, + "grad_norm": 0.71484375, + "learning_rate": 6.481019122353692e-05, + "loss": 0.675, + "step": 19575 + }, + { + "epoch": 2.61, + "grad_norm": 0.5, + "learning_rate": 6.479929128012827e-05, + "loss": 0.1495, + "step": 19576 + }, + { + "epoch": 2.61, + "grad_norm": 0.5078125, + "learning_rate": 6.478839181406388e-05, + "loss": 0.3887, + "step": 19577 + }, + { + "epoch": 2.61, + "grad_norm": 0.60546875, + "learning_rate": 6.477749282549146e-05, + "loss": 0.2482, + "step": 19578 + }, + { + "epoch": 2.61, + "grad_norm": 0.5546875, + "learning_rate": 6.476659431455883e-05, + "loss": 0.4261, + "step": 19579 + }, + { + "epoch": 2.61, + "grad_norm": 0.40234375, + "learning_rate": 6.47556962814138e-05, + "loss": 0.1276, + "step": 19580 + }, + { + "epoch": 2.61, + "grad_norm": 0.49609375, + "learning_rate": 6.474479872620414e-05, + "loss": 0.3748, + "step": 19581 + }, + { + "epoch": 2.61, + "grad_norm": 0.458984375, + "learning_rate": 6.473390164907764e-05, + "loss": 0.2221, + "step": 19582 + }, + { + "epoch": 2.61, + "grad_norm": 0.396484375, + "learning_rate": 6.472300505018208e-05, + "loss": 0.2165, + "step": 19583 + }, + { + "epoch": 2.61, + "grad_norm": 0.49609375, + "learning_rate": 6.471210892966519e-05, + "loss": 0.18, + "step": 19584 + }, + { + "epoch": 2.61, + "grad_norm": 0.486328125, + "learning_rate": 6.470121328767478e-05, + "loss": 0.2762, + "step": 19585 + }, + { + "epoch": 2.61, + "grad_norm": 0.50390625, + "learning_rate": 6.469031812435855e-05, + "loss": 0.2933, + "step": 19586 + }, + { + "epoch": 2.61, + "grad_norm": 0.50390625, + "learning_rate": 6.467942343986429e-05, + "loss": 0.1999, + "step": 19587 + }, + { + "epoch": 2.61, + "grad_norm": 0.439453125, + "learning_rate": 6.466852923433969e-05, + "loss": 0.2309, + "step": 19588 + }, + { + "epoch": 2.61, + "grad_norm": 0.71875, + "learning_rate": 6.465763550793252e-05, + "loss": 0.3213, + "step": 19589 + }, + { + "epoch": 2.61, + "grad_norm": 0.49609375, + "learning_rate": 6.464674226079053e-05, + "loss": 0.4372, + "step": 19590 + }, + { + "epoch": 2.61, + "grad_norm": 0.65625, + "learning_rate": 6.463584949306136e-05, + "loss": 0.3071, + "step": 19591 + }, + { + "epoch": 2.61, + "grad_norm": 0.8359375, + "learning_rate": 6.462495720489279e-05, + "loss": 0.4614, + "step": 19592 + }, + { + "epoch": 2.61, + "grad_norm": 0.56640625, + "learning_rate": 6.461406539643249e-05, + "loss": 0.4056, + "step": 19593 + }, + { + "epoch": 2.61, + "grad_norm": 0.62109375, + "learning_rate": 6.460317406782818e-05, + "loss": 0.2895, + "step": 19594 + }, + { + "epoch": 2.61, + "grad_norm": 0.78515625, + "learning_rate": 6.459228321922752e-05, + "loss": 0.3084, + "step": 19595 + }, + { + "epoch": 2.61, + "grad_norm": 0.58984375, + "learning_rate": 6.458139285077827e-05, + "loss": 0.3008, + "step": 19596 + }, + { + "epoch": 2.62, + "grad_norm": 0.69140625, + "learning_rate": 6.457050296262804e-05, + "loss": 0.4357, + "step": 19597 + }, + { + "epoch": 2.62, + "grad_norm": 0.76953125, + "learning_rate": 6.455961355492452e-05, + "loss": 0.294, + "step": 19598 + }, + { + "epoch": 2.62, + "grad_norm": 0.73828125, + "learning_rate": 6.45487246278154e-05, + "loss": 0.3447, + "step": 19599 + }, + { + "epoch": 2.62, + "grad_norm": 0.7421875, + "learning_rate": 6.453783618144832e-05, + "loss": 0.4627, + "step": 19600 + }, + { + "epoch": 2.62, + "grad_norm": 0.5, + "learning_rate": 6.452694821597094e-05, + "loss": 0.2748, + "step": 19601 + }, + { + "epoch": 2.62, + "grad_norm": 0.5625, + "learning_rate": 6.451606073153093e-05, + "loss": 0.3368, + "step": 19602 + }, + { + "epoch": 2.62, + "grad_norm": 0.5546875, + "learning_rate": 6.450517372827591e-05, + "loss": 0.3005, + "step": 19603 + }, + { + "epoch": 2.62, + "grad_norm": 0.75390625, + "learning_rate": 6.449428720635349e-05, + "loss": 0.3739, + "step": 19604 + }, + { + "epoch": 2.62, + "grad_norm": 0.57421875, + "learning_rate": 6.448340116591132e-05, + "loss": 0.1779, + "step": 19605 + }, + { + "epoch": 2.62, + "grad_norm": 0.84375, + "learning_rate": 6.447251560709705e-05, + "loss": 0.6229, + "step": 19606 + }, + { + "epoch": 2.62, + "grad_norm": 0.62109375, + "learning_rate": 6.446163053005822e-05, + "loss": 0.2821, + "step": 19607 + }, + { + "epoch": 2.62, + "grad_norm": 0.578125, + "learning_rate": 6.445074593494254e-05, + "loss": 0.3417, + "step": 19608 + }, + { + "epoch": 2.62, + "grad_norm": 0.5703125, + "learning_rate": 6.443986182189754e-05, + "loss": 0.4599, + "step": 19609 + }, + { + "epoch": 2.62, + "grad_norm": 0.60546875, + "learning_rate": 6.442897819107084e-05, + "loss": 0.1647, + "step": 19610 + }, + { + "epoch": 2.62, + "grad_norm": 0.65234375, + "learning_rate": 6.441809504261002e-05, + "loss": 0.2236, + "step": 19611 + }, + { + "epoch": 2.62, + "grad_norm": 0.66796875, + "learning_rate": 6.440721237666268e-05, + "loss": 0.1917, + "step": 19612 + }, + { + "epoch": 2.62, + "grad_norm": 0.4375, + "learning_rate": 6.439633019337637e-05, + "loss": 0.1322, + "step": 19613 + }, + { + "epoch": 2.62, + "grad_norm": 0.470703125, + "learning_rate": 6.438544849289869e-05, + "loss": 0.3265, + "step": 19614 + }, + { + "epoch": 2.62, + "grad_norm": 0.61328125, + "learning_rate": 6.437456727537719e-05, + "loss": 0.4436, + "step": 19615 + }, + { + "epoch": 2.62, + "grad_norm": 0.78515625, + "learning_rate": 6.436368654095938e-05, + "loss": 0.3982, + "step": 19616 + }, + { + "epoch": 2.62, + "grad_norm": 0.7890625, + "learning_rate": 6.43528062897929e-05, + "loss": 0.4778, + "step": 19617 + }, + { + "epoch": 2.62, + "grad_norm": 0.466796875, + "learning_rate": 6.434192652202523e-05, + "loss": 0.2989, + "step": 19618 + }, + { + "epoch": 2.62, + "grad_norm": 0.6015625, + "learning_rate": 6.43310472378039e-05, + "loss": 0.506, + "step": 19619 + }, + { + "epoch": 2.62, + "grad_norm": 0.64453125, + "learning_rate": 6.432016843727648e-05, + "loss": 0.3979, + "step": 19620 + }, + { + "epoch": 2.62, + "grad_norm": 0.67578125, + "learning_rate": 6.430929012059047e-05, + "loss": 0.3529, + "step": 19621 + }, + { + "epoch": 2.62, + "grad_norm": 0.609375, + "learning_rate": 6.429841228789339e-05, + "loss": 0.572, + "step": 19622 + }, + { + "epoch": 2.62, + "grad_norm": 0.515625, + "learning_rate": 6.428753493933276e-05, + "loss": 0.1427, + "step": 19623 + }, + { + "epoch": 2.62, + "grad_norm": 0.73046875, + "learning_rate": 6.427665807505606e-05, + "loss": 0.2438, + "step": 19624 + }, + { + "epoch": 2.62, + "grad_norm": 0.400390625, + "learning_rate": 6.426578169521082e-05, + "loss": 0.1531, + "step": 19625 + }, + { + "epoch": 2.62, + "grad_norm": 0.6328125, + "learning_rate": 6.425490579994454e-05, + "loss": 0.4817, + "step": 19626 + }, + { + "epoch": 2.62, + "grad_norm": 0.380859375, + "learning_rate": 6.424403038940463e-05, + "loss": 0.1417, + "step": 19627 + }, + { + "epoch": 2.62, + "grad_norm": 0.5390625, + "learning_rate": 6.423315546373865e-05, + "loss": 0.376, + "step": 19628 + }, + { + "epoch": 2.62, + "grad_norm": 0.58203125, + "learning_rate": 6.4222281023094e-05, + "loss": 0.4723, + "step": 19629 + }, + { + "epoch": 2.62, + "grad_norm": 0.625, + "learning_rate": 6.42114070676182e-05, + "loss": 0.5669, + "step": 19630 + }, + { + "epoch": 2.62, + "grad_norm": 0.75, + "learning_rate": 6.42005335974587e-05, + "loss": 0.2716, + "step": 19631 + }, + { + "epoch": 2.62, + "grad_norm": 0.6640625, + "learning_rate": 6.418966061276293e-05, + "loss": 0.3717, + "step": 19632 + }, + { + "epoch": 2.62, + "grad_norm": 0.69921875, + "learning_rate": 6.417878811367834e-05, + "loss": 0.2928, + "step": 19633 + }, + { + "epoch": 2.62, + "grad_norm": 0.53515625, + "learning_rate": 6.416791610035238e-05, + "loss": 0.3439, + "step": 19634 + }, + { + "epoch": 2.62, + "grad_norm": 0.58984375, + "learning_rate": 6.415704457293248e-05, + "loss": 0.3412, + "step": 19635 + }, + { + "epoch": 2.62, + "grad_norm": 0.64453125, + "learning_rate": 6.414617353156605e-05, + "loss": 0.5443, + "step": 19636 + }, + { + "epoch": 2.62, + "grad_norm": 0.6015625, + "learning_rate": 6.413530297640051e-05, + "loss": 0.2125, + "step": 19637 + }, + { + "epoch": 2.62, + "grad_norm": 0.67578125, + "learning_rate": 6.412443290758333e-05, + "loss": 0.4581, + "step": 19638 + }, + { + "epoch": 2.62, + "grad_norm": 0.71875, + "learning_rate": 6.411356332526182e-05, + "loss": 0.3507, + "step": 19639 + }, + { + "epoch": 2.62, + "grad_norm": 0.578125, + "learning_rate": 6.410269422958344e-05, + "loss": 0.4783, + "step": 19640 + }, + { + "epoch": 2.62, + "grad_norm": 0.70703125, + "learning_rate": 6.409182562069553e-05, + "loss": 0.4711, + "step": 19641 + }, + { + "epoch": 2.62, + "grad_norm": 0.51953125, + "learning_rate": 6.408095749874554e-05, + "loss": 0.3034, + "step": 19642 + }, + { + "epoch": 2.62, + "grad_norm": 0.59765625, + "learning_rate": 6.407008986388081e-05, + "loss": 0.3874, + "step": 19643 + }, + { + "epoch": 2.62, + "grad_norm": 0.55859375, + "learning_rate": 6.405922271624874e-05, + "loss": 0.4015, + "step": 19644 + }, + { + "epoch": 2.62, + "grad_norm": 0.46875, + "learning_rate": 6.404835605599664e-05, + "loss": 0.1434, + "step": 19645 + }, + { + "epoch": 2.62, + "grad_norm": 0.48046875, + "learning_rate": 6.403748988327194e-05, + "loss": 0.3085, + "step": 19646 + }, + { + "epoch": 2.62, + "grad_norm": 0.6171875, + "learning_rate": 6.402662419822193e-05, + "loss": 0.5091, + "step": 19647 + }, + { + "epoch": 2.62, + "grad_norm": 0.65625, + "learning_rate": 6.401575900099402e-05, + "loss": 0.3413, + "step": 19648 + }, + { + "epoch": 2.62, + "grad_norm": 0.62109375, + "learning_rate": 6.400489429173547e-05, + "loss": 0.4658, + "step": 19649 + }, + { + "epoch": 2.62, + "grad_norm": 0.490234375, + "learning_rate": 6.39940300705937e-05, + "loss": 0.3397, + "step": 19650 + }, + { + "epoch": 2.62, + "grad_norm": 0.58203125, + "learning_rate": 6.398316633771597e-05, + "loss": 0.3457, + "step": 19651 + }, + { + "epoch": 2.62, + "grad_norm": 0.59765625, + "learning_rate": 6.397230309324961e-05, + "loss": 0.4597, + "step": 19652 + }, + { + "epoch": 2.62, + "grad_norm": 0.53515625, + "learning_rate": 6.396144033734195e-05, + "loss": 0.3891, + "step": 19653 + }, + { + "epoch": 2.62, + "grad_norm": 0.5390625, + "learning_rate": 6.395057807014028e-05, + "loss": 0.3251, + "step": 19654 + }, + { + "epoch": 2.62, + "grad_norm": 0.59375, + "learning_rate": 6.39397162917919e-05, + "loss": 0.2285, + "step": 19655 + }, + { + "epoch": 2.62, + "grad_norm": 0.64453125, + "learning_rate": 6.392885500244413e-05, + "loss": 0.3341, + "step": 19656 + }, + { + "epoch": 2.62, + "grad_norm": 0.578125, + "learning_rate": 6.391799420224423e-05, + "loss": 0.3568, + "step": 19657 + }, + { + "epoch": 2.62, + "grad_norm": 0.5859375, + "learning_rate": 6.390713389133949e-05, + "loss": 0.4207, + "step": 19658 + }, + { + "epoch": 2.62, + "grad_norm": 0.58203125, + "learning_rate": 6.389627406987716e-05, + "loss": 0.3528, + "step": 19659 + }, + { + "epoch": 2.62, + "grad_norm": 0.51953125, + "learning_rate": 6.388541473800456e-05, + "loss": 0.3402, + "step": 19660 + }, + { + "epoch": 2.62, + "grad_norm": 0.625, + "learning_rate": 6.387455589586888e-05, + "loss": 0.3538, + "step": 19661 + }, + { + "epoch": 2.62, + "grad_norm": 0.48046875, + "learning_rate": 6.386369754361745e-05, + "loss": 0.1877, + "step": 19662 + }, + { + "epoch": 2.62, + "grad_norm": 0.71484375, + "learning_rate": 6.385283968139745e-05, + "loss": 0.453, + "step": 19663 + }, + { + "epoch": 2.62, + "grad_norm": 0.50390625, + "learning_rate": 6.384198230935612e-05, + "loss": 0.2315, + "step": 19664 + }, + { + "epoch": 2.62, + "grad_norm": 0.5859375, + "learning_rate": 6.383112542764075e-05, + "loss": 0.1972, + "step": 19665 + }, + { + "epoch": 2.62, + "grad_norm": 0.53515625, + "learning_rate": 6.38202690363985e-05, + "loss": 0.3862, + "step": 19666 + }, + { + "epoch": 2.62, + "grad_norm": 0.5078125, + "learning_rate": 6.380941313577666e-05, + "loss": 0.24, + "step": 19667 + }, + { + "epoch": 2.62, + "grad_norm": 0.484375, + "learning_rate": 6.379855772592236e-05, + "loss": 0.3868, + "step": 19668 + }, + { + "epoch": 2.62, + "grad_norm": 0.61328125, + "learning_rate": 6.378770280698289e-05, + "loss": 0.4455, + "step": 19669 + }, + { + "epoch": 2.62, + "grad_norm": 0.44140625, + "learning_rate": 6.377684837910538e-05, + "loss": 0.1861, + "step": 19670 + }, + { + "epoch": 2.62, + "grad_norm": 0.453125, + "learning_rate": 6.376599444243707e-05, + "loss": 0.2151, + "step": 19671 + }, + { + "epoch": 2.63, + "grad_norm": 0.67578125, + "learning_rate": 6.375514099712512e-05, + "loss": 0.4771, + "step": 19672 + }, + { + "epoch": 2.63, + "grad_norm": 0.68359375, + "learning_rate": 6.374428804331673e-05, + "loss": 0.4224, + "step": 19673 + }, + { + "epoch": 2.63, + "grad_norm": 0.60546875, + "learning_rate": 6.373343558115907e-05, + "loss": 0.3294, + "step": 19674 + }, + { + "epoch": 2.63, + "grad_norm": 0.49609375, + "learning_rate": 6.372258361079932e-05, + "loss": 0.3621, + "step": 19675 + }, + { + "epoch": 2.63, + "grad_norm": 0.53515625, + "learning_rate": 6.37117321323846e-05, + "loss": 0.3158, + "step": 19676 + }, + { + "epoch": 2.63, + "grad_norm": 0.44921875, + "learning_rate": 6.370088114606206e-05, + "loss": 0.1768, + "step": 19677 + }, + { + "epoch": 2.63, + "grad_norm": 0.447265625, + "learning_rate": 6.369003065197886e-05, + "loss": 0.2719, + "step": 19678 + }, + { + "epoch": 2.63, + "grad_norm": 0.60546875, + "learning_rate": 6.367918065028218e-05, + "loss": 0.3168, + "step": 19679 + }, + { + "epoch": 2.63, + "grad_norm": 0.478515625, + "learning_rate": 6.36683311411191e-05, + "loss": 0.4228, + "step": 19680 + }, + { + "epoch": 2.63, + "grad_norm": 0.328125, + "learning_rate": 6.365748212463678e-05, + "loss": 0.1777, + "step": 19681 + }, + { + "epoch": 2.63, + "grad_norm": 0.44921875, + "learning_rate": 6.36466336009823e-05, + "loss": 0.3483, + "step": 19682 + }, + { + "epoch": 2.63, + "grad_norm": 0.46875, + "learning_rate": 6.363578557030286e-05, + "loss": 0.2567, + "step": 19683 + }, + { + "epoch": 2.63, + "grad_norm": 0.7421875, + "learning_rate": 6.362493803274545e-05, + "loss": 0.3723, + "step": 19684 + }, + { + "epoch": 2.63, + "grad_norm": 0.578125, + "learning_rate": 6.361409098845725e-05, + "loss": 0.4264, + "step": 19685 + }, + { + "epoch": 2.63, + "grad_norm": 0.68359375, + "learning_rate": 6.36032444375853e-05, + "loss": 0.4599, + "step": 19686 + }, + { + "epoch": 2.63, + "grad_norm": 0.6171875, + "learning_rate": 6.359239838027678e-05, + "loss": 0.3466, + "step": 19687 + }, + { + "epoch": 2.63, + "grad_norm": 0.421875, + "learning_rate": 6.358155281667866e-05, + "loss": 0.2834, + "step": 19688 + }, + { + "epoch": 2.63, + "grad_norm": 0.55859375, + "learning_rate": 6.357070774693806e-05, + "loss": 0.3205, + "step": 19689 + }, + { + "epoch": 2.63, + "grad_norm": 0.404296875, + "learning_rate": 6.355986317120206e-05, + "loss": 0.2571, + "step": 19690 + }, + { + "epoch": 2.63, + "grad_norm": 0.5703125, + "learning_rate": 6.354901908961766e-05, + "loss": 0.4686, + "step": 19691 + }, + { + "epoch": 2.63, + "grad_norm": 0.58203125, + "learning_rate": 6.353817550233199e-05, + "loss": 0.339, + "step": 19692 + }, + { + "epoch": 2.63, + "grad_norm": 0.9765625, + "learning_rate": 6.352733240949205e-05, + "loss": 0.3584, + "step": 19693 + }, + { + "epoch": 2.63, + "grad_norm": 0.55859375, + "learning_rate": 6.35164898112449e-05, + "loss": 0.3442, + "step": 19694 + }, + { + "epoch": 2.63, + "grad_norm": 0.6171875, + "learning_rate": 6.350564770773752e-05, + "loss": 0.3332, + "step": 19695 + }, + { + "epoch": 2.63, + "grad_norm": 0.5390625, + "learning_rate": 6.349480609911705e-05, + "loss": 0.2376, + "step": 19696 + }, + { + "epoch": 2.63, + "grad_norm": 0.58984375, + "learning_rate": 6.348396498553038e-05, + "loss": 0.1816, + "step": 19697 + }, + { + "epoch": 2.63, + "grad_norm": 0.63671875, + "learning_rate": 6.34731243671246e-05, + "loss": 0.2556, + "step": 19698 + }, + { + "epoch": 2.63, + "grad_norm": 0.75, + "learning_rate": 6.346228424404674e-05, + "loss": 0.2762, + "step": 19699 + }, + { + "epoch": 2.63, + "grad_norm": 0.58984375, + "learning_rate": 6.34514446164437e-05, + "loss": 0.3876, + "step": 19700 + }, + { + "epoch": 2.63, + "grad_norm": 0.53515625, + "learning_rate": 6.344060548446255e-05, + "loss": 0.2876, + "step": 19701 + }, + { + "epoch": 2.63, + "grad_norm": 0.77734375, + "learning_rate": 6.342976684825024e-05, + "loss": 0.2147, + "step": 19702 + }, + { + "epoch": 2.63, + "grad_norm": 0.494140625, + "learning_rate": 6.341892870795376e-05, + "loss": 0.2368, + "step": 19703 + }, + { + "epoch": 2.63, + "grad_norm": 0.353515625, + "learning_rate": 6.34080910637201e-05, + "loss": 0.1485, + "step": 19704 + }, + { + "epoch": 2.63, + "grad_norm": 0.484375, + "learning_rate": 6.339725391569617e-05, + "loss": 0.3322, + "step": 19705 + }, + { + "epoch": 2.63, + "grad_norm": 0.640625, + "learning_rate": 6.338641726402901e-05, + "loss": 0.2989, + "step": 19706 + }, + { + "epoch": 2.63, + "grad_norm": 0.466796875, + "learning_rate": 6.337558110886552e-05, + "loss": 0.1836, + "step": 19707 + }, + { + "epoch": 2.63, + "grad_norm": 0.45703125, + "learning_rate": 6.336474545035268e-05, + "loss": 0.3284, + "step": 19708 + }, + { + "epoch": 2.63, + "grad_norm": 0.63671875, + "learning_rate": 6.335391028863736e-05, + "loss": 0.5446, + "step": 19709 + }, + { + "epoch": 2.63, + "grad_norm": 0.59765625, + "learning_rate": 6.334307562386656e-05, + "loss": 0.3619, + "step": 19710 + }, + { + "epoch": 2.63, + "grad_norm": 0.62109375, + "learning_rate": 6.333224145618721e-05, + "loss": 0.2463, + "step": 19711 + }, + { + "epoch": 2.63, + "grad_norm": 0.59375, + "learning_rate": 6.332140778574614e-05, + "loss": 0.3338, + "step": 19712 + }, + { + "epoch": 2.63, + "grad_norm": 0.546875, + "learning_rate": 6.331057461269037e-05, + "loss": 0.1678, + "step": 19713 + }, + { + "epoch": 2.63, + "grad_norm": 0.5390625, + "learning_rate": 6.329974193716671e-05, + "loss": 0.3986, + "step": 19714 + }, + { + "epoch": 2.63, + "grad_norm": 0.6015625, + "learning_rate": 6.328890975932214e-05, + "loss": 0.326, + "step": 19715 + }, + { + "epoch": 2.63, + "grad_norm": 0.51171875, + "learning_rate": 6.327807807930348e-05, + "loss": 0.2562, + "step": 19716 + }, + { + "epoch": 2.63, + "grad_norm": 0.3671875, + "learning_rate": 6.326724689725769e-05, + "loss": 0.1128, + "step": 19717 + }, + { + "epoch": 2.63, + "grad_norm": 0.49609375, + "learning_rate": 6.325641621333156e-05, + "loss": 0.2756, + "step": 19718 + }, + { + "epoch": 2.63, + "grad_norm": 0.734375, + "learning_rate": 6.324558602767205e-05, + "loss": 0.2584, + "step": 19719 + }, + { + "epoch": 2.63, + "grad_norm": 0.5078125, + "learning_rate": 6.323475634042594e-05, + "loss": 0.328, + "step": 19720 + }, + { + "epoch": 2.63, + "grad_norm": 0.48828125, + "learning_rate": 6.322392715174018e-05, + "loss": 0.4458, + "step": 19721 + }, + { + "epoch": 2.63, + "grad_norm": 0.44140625, + "learning_rate": 6.321309846176154e-05, + "loss": 0.3654, + "step": 19722 + }, + { + "epoch": 2.63, + "grad_norm": 0.5234375, + "learning_rate": 6.320227027063692e-05, + "loss": 0.3833, + "step": 19723 + }, + { + "epoch": 2.63, + "grad_norm": 0.4140625, + "learning_rate": 6.319144257851313e-05, + "loss": 0.213, + "step": 19724 + }, + { + "epoch": 2.63, + "grad_norm": 0.55859375, + "learning_rate": 6.318061538553696e-05, + "loss": 0.3119, + "step": 19725 + }, + { + "epoch": 2.63, + "grad_norm": 0.6796875, + "learning_rate": 6.316978869185532e-05, + "loss": 0.3201, + "step": 19726 + }, + { + "epoch": 2.63, + "grad_norm": 0.5625, + "learning_rate": 6.315896249761494e-05, + "loss": 0.4761, + "step": 19727 + }, + { + "epoch": 2.63, + "grad_norm": 0.3359375, + "learning_rate": 6.31481368029627e-05, + "loss": 0.2086, + "step": 19728 + }, + { + "epoch": 2.63, + "grad_norm": 0.8203125, + "learning_rate": 6.313731160804539e-05, + "loss": 0.5872, + "step": 19729 + }, + { + "epoch": 2.63, + "grad_norm": 0.67578125, + "learning_rate": 6.312648691300975e-05, + "loss": 0.5716, + "step": 19730 + }, + { + "epoch": 2.63, + "grad_norm": 0.41015625, + "learning_rate": 6.311566271800265e-05, + "loss": 0.1959, + "step": 19731 + }, + { + "epoch": 2.63, + "grad_norm": 0.53125, + "learning_rate": 6.310483902317082e-05, + "loss": 0.3281, + "step": 19732 + }, + { + "epoch": 2.63, + "grad_norm": 0.474609375, + "learning_rate": 6.309401582866106e-05, + "loss": 0.3213, + "step": 19733 + }, + { + "epoch": 2.63, + "grad_norm": 0.59375, + "learning_rate": 6.308319313462012e-05, + "loss": 0.3866, + "step": 19734 + }, + { + "epoch": 2.63, + "grad_norm": 0.53125, + "learning_rate": 6.30723709411948e-05, + "loss": 0.1872, + "step": 19735 + }, + { + "epoch": 2.63, + "grad_norm": 0.451171875, + "learning_rate": 6.306154924853183e-05, + "loss": 0.4526, + "step": 19736 + }, + { + "epoch": 2.63, + "grad_norm": 0.478515625, + "learning_rate": 6.305072805677793e-05, + "loss": 0.3373, + "step": 19737 + }, + { + "epoch": 2.63, + "grad_norm": 0.58984375, + "learning_rate": 6.303990736607989e-05, + "loss": 0.3851, + "step": 19738 + }, + { + "epoch": 2.63, + "grad_norm": 0.53125, + "learning_rate": 6.30290871765844e-05, + "loss": 0.3337, + "step": 19739 + }, + { + "epoch": 2.63, + "grad_norm": 0.59765625, + "learning_rate": 6.301826748843824e-05, + "loss": 0.3267, + "step": 19740 + }, + { + "epoch": 2.63, + "grad_norm": 0.59375, + "learning_rate": 6.300744830178811e-05, + "loss": 0.226, + "step": 19741 + }, + { + "epoch": 2.63, + "grad_norm": 0.478515625, + "learning_rate": 6.299662961678073e-05, + "loss": 0.2224, + "step": 19742 + }, + { + "epoch": 2.63, + "grad_norm": 0.640625, + "learning_rate": 6.298581143356275e-05, + "loss": 0.3905, + "step": 19743 + }, + { + "epoch": 2.63, + "grad_norm": 0.52734375, + "learning_rate": 6.297499375228097e-05, + "loss": 0.1907, + "step": 19744 + }, + { + "epoch": 2.63, + "grad_norm": 0.625, + "learning_rate": 6.296417657308202e-05, + "loss": 0.3402, + "step": 19745 + }, + { + "epoch": 2.63, + "grad_norm": 0.51953125, + "learning_rate": 6.295335989611262e-05, + "loss": 0.2554, + "step": 19746 + }, + { + "epoch": 2.64, + "grad_norm": 0.55859375, + "learning_rate": 6.294254372151946e-05, + "loss": 0.4287, + "step": 19747 + }, + { + "epoch": 2.64, + "grad_norm": 0.4921875, + "learning_rate": 6.293172804944915e-05, + "loss": 0.2203, + "step": 19748 + }, + { + "epoch": 2.64, + "grad_norm": 0.51171875, + "learning_rate": 6.292091288004841e-05, + "loss": 0.2686, + "step": 19749 + }, + { + "epoch": 2.64, + "grad_norm": 0.6171875, + "learning_rate": 6.291009821346387e-05, + "loss": 0.447, + "step": 19750 + }, + { + "epoch": 2.64, + "grad_norm": 0.59765625, + "learning_rate": 6.289928404984221e-05, + "loss": 0.516, + "step": 19751 + }, + { + "epoch": 2.64, + "grad_norm": 0.80859375, + "learning_rate": 6.288847038933008e-05, + "loss": 0.3209, + "step": 19752 + }, + { + "epoch": 2.64, + "grad_norm": 0.6640625, + "learning_rate": 6.28776572320741e-05, + "loss": 0.3398, + "step": 19753 + }, + { + "epoch": 2.64, + "grad_norm": 0.498046875, + "learning_rate": 6.286684457822092e-05, + "loss": 0.2582, + "step": 19754 + }, + { + "epoch": 2.64, + "grad_norm": 0.6484375, + "learning_rate": 6.285603242791716e-05, + "loss": 0.4515, + "step": 19755 + }, + { + "epoch": 2.64, + "grad_norm": 0.66015625, + "learning_rate": 6.284522078130944e-05, + "loss": 0.3484, + "step": 19756 + }, + { + "epoch": 2.64, + "grad_norm": 0.70703125, + "learning_rate": 6.283440963854436e-05, + "loss": 0.7072, + "step": 19757 + }, + { + "epoch": 2.64, + "grad_norm": 0.6640625, + "learning_rate": 6.282359899976855e-05, + "loss": 0.4464, + "step": 19758 + }, + { + "epoch": 2.64, + "grad_norm": 0.87109375, + "learning_rate": 6.281278886512858e-05, + "loss": 0.2904, + "step": 19759 + }, + { + "epoch": 2.64, + "grad_norm": 0.6171875, + "learning_rate": 6.280197923477111e-05, + "loss": 0.4135, + "step": 19760 + }, + { + "epoch": 2.64, + "grad_norm": 0.4453125, + "learning_rate": 6.279117010884265e-05, + "loss": 0.2267, + "step": 19761 + }, + { + "epoch": 2.64, + "grad_norm": 0.388671875, + "learning_rate": 6.278036148748978e-05, + "loss": 0.1073, + "step": 19762 + }, + { + "epoch": 2.64, + "grad_norm": 0.5390625, + "learning_rate": 6.276955337085911e-05, + "loss": 0.2318, + "step": 19763 + }, + { + "epoch": 2.64, + "grad_norm": 0.546875, + "learning_rate": 6.275874575909719e-05, + "loss": 0.3167, + "step": 19764 + }, + { + "epoch": 2.64, + "grad_norm": 0.458984375, + "learning_rate": 6.274793865235058e-05, + "loss": 0.124, + "step": 19765 + }, + { + "epoch": 2.64, + "grad_norm": 0.578125, + "learning_rate": 6.273713205076581e-05, + "loss": 0.3415, + "step": 19766 + }, + { + "epoch": 2.64, + "grad_norm": 0.71875, + "learning_rate": 6.272632595448947e-05, + "loss": 0.3529, + "step": 19767 + }, + { + "epoch": 2.64, + "grad_norm": 0.7265625, + "learning_rate": 6.271552036366806e-05, + "loss": 0.4366, + "step": 19768 + }, + { + "epoch": 2.64, + "grad_norm": 0.47265625, + "learning_rate": 6.270471527844814e-05, + "loss": 0.3256, + "step": 19769 + }, + { + "epoch": 2.64, + "grad_norm": 0.5703125, + "learning_rate": 6.269391069897618e-05, + "loss": 0.5152, + "step": 19770 + }, + { + "epoch": 2.64, + "grad_norm": 0.703125, + "learning_rate": 6.268310662539877e-05, + "loss": 0.2851, + "step": 19771 + }, + { + "epoch": 2.64, + "grad_norm": 0.48828125, + "learning_rate": 6.26723030578624e-05, + "loss": 0.2357, + "step": 19772 + }, + { + "epoch": 2.64, + "grad_norm": 0.79296875, + "learning_rate": 6.266149999651351e-05, + "loss": 0.2594, + "step": 19773 + }, + { + "epoch": 2.64, + "grad_norm": 0.578125, + "learning_rate": 6.265069744149868e-05, + "loss": 0.3514, + "step": 19774 + }, + { + "epoch": 2.64, + "grad_norm": 0.38671875, + "learning_rate": 6.263989539296433e-05, + "loss": 0.1491, + "step": 19775 + }, + { + "epoch": 2.64, + "grad_norm": 0.4921875, + "learning_rate": 6.2629093851057e-05, + "loss": 0.2322, + "step": 19776 + }, + { + "epoch": 2.64, + "grad_norm": 0.5390625, + "learning_rate": 6.261829281592314e-05, + "loss": 0.3811, + "step": 19777 + }, + { + "epoch": 2.64, + "grad_norm": 0.58203125, + "learning_rate": 6.260749228770919e-05, + "loss": 0.167, + "step": 19778 + }, + { + "epoch": 2.64, + "grad_norm": 0.59765625, + "learning_rate": 6.259669226656168e-05, + "loss": 0.2919, + "step": 19779 + }, + { + "epoch": 2.64, + "grad_norm": 0.55859375, + "learning_rate": 6.2585892752627e-05, + "loss": 0.4471, + "step": 19780 + }, + { + "epoch": 2.64, + "grad_norm": 0.66796875, + "learning_rate": 6.257509374605166e-05, + "loss": 0.6244, + "step": 19781 + }, + { + "epoch": 2.64, + "grad_norm": 0.609375, + "learning_rate": 6.256429524698205e-05, + "loss": 0.4643, + "step": 19782 + }, + { + "epoch": 2.64, + "grad_norm": 0.57421875, + "learning_rate": 6.255349725556461e-05, + "loss": 0.305, + "step": 19783 + }, + { + "epoch": 2.64, + "grad_norm": 0.546875, + "learning_rate": 6.254269977194583e-05, + "loss": 0.2449, + "step": 19784 + }, + { + "epoch": 2.64, + "grad_norm": 0.5390625, + "learning_rate": 6.253190279627204e-05, + "loss": 0.3696, + "step": 19785 + }, + { + "epoch": 2.64, + "grad_norm": 0.5859375, + "learning_rate": 6.25211063286897e-05, + "loss": 0.3776, + "step": 19786 + }, + { + "epoch": 2.64, + "grad_norm": 0.71875, + "learning_rate": 6.251031036934521e-05, + "loss": 0.2748, + "step": 19787 + }, + { + "epoch": 2.64, + "grad_norm": 0.498046875, + "learning_rate": 6.249951491838498e-05, + "loss": 0.1687, + "step": 19788 + }, + { + "epoch": 2.64, + "grad_norm": 0.59375, + "learning_rate": 6.248871997595538e-05, + "loss": 0.5729, + "step": 19789 + }, + { + "epoch": 2.64, + "grad_norm": 0.490234375, + "learning_rate": 6.247792554220282e-05, + "loss": 0.1927, + "step": 19790 + }, + { + "epoch": 2.64, + "grad_norm": 0.46875, + "learning_rate": 6.246713161727366e-05, + "loss": 0.2427, + "step": 19791 + }, + { + "epoch": 2.64, + "grad_norm": 0.734375, + "learning_rate": 6.245633820131431e-05, + "loss": 0.5392, + "step": 19792 + }, + { + "epoch": 2.64, + "grad_norm": 0.86328125, + "learning_rate": 6.244554529447109e-05, + "loss": 0.2893, + "step": 19793 + }, + { + "epoch": 2.64, + "grad_norm": 0.5703125, + "learning_rate": 6.243475289689039e-05, + "loss": 0.188, + "step": 19794 + }, + { + "epoch": 2.64, + "grad_norm": 0.66796875, + "learning_rate": 6.242396100871852e-05, + "loss": 0.5213, + "step": 19795 + }, + { + "epoch": 2.64, + "grad_norm": 0.6640625, + "learning_rate": 6.24131696301019e-05, + "loss": 0.526, + "step": 19796 + }, + { + "epoch": 2.64, + "grad_norm": 0.451171875, + "learning_rate": 6.240237876118682e-05, + "loss": 0.2088, + "step": 19797 + }, + { + "epoch": 2.64, + "grad_norm": 0.6640625, + "learning_rate": 6.239158840211958e-05, + "loss": 0.2408, + "step": 19798 + }, + { + "epoch": 2.64, + "grad_norm": 0.5703125, + "learning_rate": 6.238079855304656e-05, + "loss": 0.2559, + "step": 19799 + }, + { + "epoch": 2.64, + "grad_norm": 0.6875, + "learning_rate": 6.237000921411405e-05, + "loss": 0.2345, + "step": 19800 + }, + { + "epoch": 2.64, + "grad_norm": 0.70703125, + "learning_rate": 6.235922038546834e-05, + "loss": 0.317, + "step": 19801 + }, + { + "epoch": 2.64, + "grad_norm": 0.66015625, + "learning_rate": 6.234843206725579e-05, + "loss": 0.2354, + "step": 19802 + }, + { + "epoch": 2.64, + "grad_norm": 0.484375, + "learning_rate": 6.233764425962266e-05, + "loss": 0.3229, + "step": 19803 + }, + { + "epoch": 2.64, + "grad_norm": 0.69140625, + "learning_rate": 6.232685696271524e-05, + "loss": 0.3501, + "step": 19804 + }, + { + "epoch": 2.64, + "grad_norm": 0.8359375, + "learning_rate": 6.231607017667982e-05, + "loss": 0.4536, + "step": 19805 + }, + { + "epoch": 2.64, + "grad_norm": 0.58203125, + "learning_rate": 6.230528390166268e-05, + "loss": 0.7054, + "step": 19806 + }, + { + "epoch": 2.64, + "grad_norm": 0.53515625, + "learning_rate": 6.229449813781006e-05, + "loss": 0.3618, + "step": 19807 + }, + { + "epoch": 2.64, + "grad_norm": 0.484375, + "learning_rate": 6.22837128852683e-05, + "loss": 0.1289, + "step": 19808 + }, + { + "epoch": 2.64, + "grad_norm": 0.3828125, + "learning_rate": 6.227292814418357e-05, + "loss": 0.1684, + "step": 19809 + }, + { + "epoch": 2.64, + "grad_norm": 0.5546875, + "learning_rate": 6.226214391470213e-05, + "loss": 0.3316, + "step": 19810 + }, + { + "epoch": 2.64, + "grad_norm": 0.5625, + "learning_rate": 6.225136019697026e-05, + "loss": 0.3904, + "step": 19811 + }, + { + "epoch": 2.64, + "grad_norm": 0.61328125, + "learning_rate": 6.224057699113415e-05, + "loss": 0.3506, + "step": 19812 + }, + { + "epoch": 2.64, + "grad_norm": 0.55078125, + "learning_rate": 6.222979429734006e-05, + "loss": 0.3891, + "step": 19813 + }, + { + "epoch": 2.64, + "grad_norm": 0.62109375, + "learning_rate": 6.221901211573419e-05, + "loss": 0.2073, + "step": 19814 + }, + { + "epoch": 2.64, + "grad_norm": 0.609375, + "learning_rate": 6.220823044646279e-05, + "loss": 0.3499, + "step": 19815 + }, + { + "epoch": 2.64, + "grad_norm": 0.7109375, + "learning_rate": 6.219744928967202e-05, + "loss": 0.3277, + "step": 19816 + }, + { + "epoch": 2.64, + "grad_norm": 0.56640625, + "learning_rate": 6.21866686455081e-05, + "loss": 0.576, + "step": 19817 + }, + { + "epoch": 2.64, + "grad_norm": 0.60546875, + "learning_rate": 6.217588851411721e-05, + "loss": 0.2154, + "step": 19818 + }, + { + "epoch": 2.64, + "grad_norm": 0.62890625, + "learning_rate": 6.216510889564558e-05, + "loss": 0.3956, + "step": 19819 + }, + { + "epoch": 2.64, + "grad_norm": 0.56640625, + "learning_rate": 6.215432979023936e-05, + "loss": 0.3089, + "step": 19820 + }, + { + "epoch": 2.64, + "grad_norm": 0.828125, + "learning_rate": 6.214355119804469e-05, + "loss": 0.5767, + "step": 19821 + }, + { + "epoch": 2.65, + "grad_norm": 0.515625, + "learning_rate": 6.213277311920778e-05, + "loss": 0.2214, + "step": 19822 + }, + { + "epoch": 2.65, + "grad_norm": 0.54296875, + "learning_rate": 6.212199555387474e-05, + "loss": 0.3753, + "step": 19823 + }, + { + "epoch": 2.65, + "grad_norm": 0.51953125, + "learning_rate": 6.211121850219175e-05, + "loss": 0.3763, + "step": 19824 + }, + { + "epoch": 2.65, + "grad_norm": 0.57421875, + "learning_rate": 6.210044196430498e-05, + "loss": 0.1704, + "step": 19825 + }, + { + "epoch": 2.65, + "grad_norm": 0.79296875, + "learning_rate": 6.208966594036052e-05, + "loss": 0.7991, + "step": 19826 + }, + { + "epoch": 2.65, + "grad_norm": 0.66796875, + "learning_rate": 6.207889043050454e-05, + "loss": 0.3406, + "step": 19827 + }, + { + "epoch": 2.65, + "grad_norm": 0.5625, + "learning_rate": 6.206811543488313e-05, + "loss": 0.3767, + "step": 19828 + }, + { + "epoch": 2.65, + "grad_norm": 0.498046875, + "learning_rate": 6.205734095364243e-05, + "loss": 0.3847, + "step": 19829 + }, + { + "epoch": 2.65, + "grad_norm": 0.4609375, + "learning_rate": 6.204656698692853e-05, + "loss": 0.23, + "step": 19830 + }, + { + "epoch": 2.65, + "grad_norm": 0.73046875, + "learning_rate": 6.203579353488754e-05, + "loss": 0.5311, + "step": 19831 + }, + { + "epoch": 2.65, + "grad_norm": 0.5390625, + "learning_rate": 6.20250205976656e-05, + "loss": 0.3048, + "step": 19832 + }, + { + "epoch": 2.65, + "grad_norm": 0.52734375, + "learning_rate": 6.201424817540868e-05, + "loss": 0.2922, + "step": 19833 + }, + { + "epoch": 2.65, + "grad_norm": 0.765625, + "learning_rate": 6.200347626826297e-05, + "loss": 0.5978, + "step": 19834 + }, + { + "epoch": 2.65, + "grad_norm": 0.50390625, + "learning_rate": 6.19927048763745e-05, + "loss": 0.2702, + "step": 19835 + }, + { + "epoch": 2.65, + "grad_norm": 0.640625, + "learning_rate": 6.198193399988937e-05, + "loss": 0.3397, + "step": 19836 + }, + { + "epoch": 2.65, + "grad_norm": 0.421875, + "learning_rate": 6.197116363895356e-05, + "loss": 0.1971, + "step": 19837 + }, + { + "epoch": 2.65, + "grad_norm": 0.515625, + "learning_rate": 6.196039379371322e-05, + "loss": 0.2433, + "step": 19838 + }, + { + "epoch": 2.65, + "grad_norm": 0.46484375, + "learning_rate": 6.194962446431434e-05, + "loss": 0.2862, + "step": 19839 + }, + { + "epoch": 2.65, + "grad_norm": 0.65625, + "learning_rate": 6.193885565090298e-05, + "loss": 0.4938, + "step": 19840 + }, + { + "epoch": 2.65, + "grad_norm": 0.7421875, + "learning_rate": 6.192808735362515e-05, + "loss": 0.2908, + "step": 19841 + }, + { + "epoch": 2.65, + "grad_norm": 0.5703125, + "learning_rate": 6.19173195726269e-05, + "loss": 0.2988, + "step": 19842 + }, + { + "epoch": 2.65, + "grad_norm": 0.451171875, + "learning_rate": 6.190655230805424e-05, + "loss": 0.2984, + "step": 19843 + }, + { + "epoch": 2.65, + "grad_norm": 0.51953125, + "learning_rate": 6.189578556005318e-05, + "loss": 0.3947, + "step": 19844 + }, + { + "epoch": 2.65, + "grad_norm": 0.69140625, + "learning_rate": 6.188501932876975e-05, + "loss": 0.2736, + "step": 19845 + }, + { + "epoch": 2.65, + "grad_norm": 0.55078125, + "learning_rate": 6.187425361434988e-05, + "loss": 0.2863, + "step": 19846 + }, + { + "epoch": 2.65, + "grad_norm": 0.55859375, + "learning_rate": 6.186348841693962e-05, + "loss": 0.4748, + "step": 19847 + }, + { + "epoch": 2.65, + "grad_norm": 0.54296875, + "learning_rate": 6.185272373668491e-05, + "loss": 0.2498, + "step": 19848 + }, + { + "epoch": 2.65, + "grad_norm": 0.53125, + "learning_rate": 6.184195957373176e-05, + "loss": 0.2267, + "step": 19849 + }, + { + "epoch": 2.65, + "grad_norm": 0.453125, + "learning_rate": 6.183119592822613e-05, + "loss": 0.2405, + "step": 19850 + }, + { + "epoch": 2.65, + "grad_norm": 0.76171875, + "learning_rate": 6.182043280031398e-05, + "loss": 0.6783, + "step": 19851 + }, + { + "epoch": 2.65, + "grad_norm": 0.55078125, + "learning_rate": 6.180967019014126e-05, + "loss": 0.2479, + "step": 19852 + }, + { + "epoch": 2.65, + "grad_norm": 0.71484375, + "learning_rate": 6.179890809785392e-05, + "loss": 0.4663, + "step": 19853 + }, + { + "epoch": 2.65, + "grad_norm": 0.66015625, + "learning_rate": 6.178814652359793e-05, + "loss": 0.4136, + "step": 19854 + }, + { + "epoch": 2.65, + "grad_norm": 0.7734375, + "learning_rate": 6.177738546751915e-05, + "loss": 0.615, + "step": 19855 + }, + { + "epoch": 2.65, + "grad_norm": 0.6796875, + "learning_rate": 6.17666249297636e-05, + "loss": 0.2708, + "step": 19856 + }, + { + "epoch": 2.65, + "grad_norm": 0.63671875, + "learning_rate": 6.175586491047716e-05, + "loss": 0.3095, + "step": 19857 + }, + { + "epoch": 2.65, + "grad_norm": 0.61328125, + "learning_rate": 6.174510540980571e-05, + "loss": 0.3856, + "step": 19858 + }, + { + "epoch": 2.65, + "grad_norm": 0.72265625, + "learning_rate": 6.173434642789519e-05, + "loss": 0.6759, + "step": 19859 + }, + { + "epoch": 2.65, + "grad_norm": 0.52734375, + "learning_rate": 6.172358796489147e-05, + "loss": 0.5551, + "step": 19860 + }, + { + "epoch": 2.65, + "grad_norm": 0.59375, + "learning_rate": 6.171283002094048e-05, + "loss": 0.3681, + "step": 19861 + }, + { + "epoch": 2.65, + "grad_norm": 0.671875, + "learning_rate": 6.170207259618808e-05, + "loss": 0.2692, + "step": 19862 + }, + { + "epoch": 2.65, + "grad_norm": 0.515625, + "learning_rate": 6.169131569078017e-05, + "loss": 0.2497, + "step": 19863 + }, + { + "epoch": 2.65, + "grad_norm": 0.65625, + "learning_rate": 6.168055930486259e-05, + "loss": 0.3959, + "step": 19864 + }, + { + "epoch": 2.65, + "grad_norm": 0.482421875, + "learning_rate": 6.166980343858124e-05, + "loss": 0.2702, + "step": 19865 + }, + { + "epoch": 2.65, + "grad_norm": 0.80859375, + "learning_rate": 6.165904809208191e-05, + "loss": 0.5251, + "step": 19866 + }, + { + "epoch": 2.65, + "grad_norm": 0.53125, + "learning_rate": 6.164829326551055e-05, + "loss": 0.3284, + "step": 19867 + }, + { + "epoch": 2.65, + "grad_norm": 0.462890625, + "learning_rate": 6.163753895901292e-05, + "loss": 0.2248, + "step": 19868 + }, + { + "epoch": 2.65, + "grad_norm": 0.625, + "learning_rate": 6.16267851727349e-05, + "loss": 0.319, + "step": 19869 + }, + { + "epoch": 2.65, + "grad_norm": 0.474609375, + "learning_rate": 6.16160319068223e-05, + "loss": 0.3488, + "step": 19870 + }, + { + "epoch": 2.65, + "grad_norm": 0.6640625, + "learning_rate": 6.160527916142093e-05, + "loss": 0.314, + "step": 19871 + }, + { + "epoch": 2.65, + "grad_norm": 0.5390625, + "learning_rate": 6.159452693667662e-05, + "loss": 0.2348, + "step": 19872 + }, + { + "epoch": 2.65, + "grad_norm": 0.56640625, + "learning_rate": 6.158377523273515e-05, + "loss": 0.2619, + "step": 19873 + }, + { + "epoch": 2.65, + "grad_norm": 0.470703125, + "learning_rate": 6.157302404974236e-05, + "loss": 0.1948, + "step": 19874 + }, + { + "epoch": 2.65, + "grad_norm": 0.6171875, + "learning_rate": 6.156227338784404e-05, + "loss": 0.4181, + "step": 19875 + }, + { + "epoch": 2.65, + "grad_norm": 0.5390625, + "learning_rate": 6.155152324718594e-05, + "loss": 0.1696, + "step": 19876 + }, + { + "epoch": 2.65, + "grad_norm": 0.6484375, + "learning_rate": 6.154077362791387e-05, + "loss": 0.2079, + "step": 19877 + }, + { + "epoch": 2.65, + "grad_norm": 0.6796875, + "learning_rate": 6.153002453017358e-05, + "loss": 0.2455, + "step": 19878 + }, + { + "epoch": 2.65, + "grad_norm": 0.48828125, + "learning_rate": 6.151927595411087e-05, + "loss": 0.3425, + "step": 19879 + }, + { + "epoch": 2.65, + "grad_norm": 0.69140625, + "learning_rate": 6.150852789987145e-05, + "loss": 0.5207, + "step": 19880 + }, + { + "epoch": 2.65, + "grad_norm": 0.86328125, + "learning_rate": 6.149778036760114e-05, + "loss": 0.4447, + "step": 19881 + }, + { + "epoch": 2.65, + "grad_norm": 0.515625, + "learning_rate": 6.148703335744563e-05, + "loss": 0.1954, + "step": 19882 + }, + { + "epoch": 2.65, + "grad_norm": 0.54296875, + "learning_rate": 6.147628686955062e-05, + "loss": 0.2706, + "step": 19883 + }, + { + "epoch": 2.65, + "grad_norm": 0.53125, + "learning_rate": 6.146554090406193e-05, + "loss": 0.2648, + "step": 19884 + }, + { + "epoch": 2.65, + "grad_norm": 0.59375, + "learning_rate": 6.14547954611252e-05, + "loss": 0.1868, + "step": 19885 + }, + { + "epoch": 2.65, + "grad_norm": 0.546875, + "learning_rate": 6.144405054088622e-05, + "loss": 0.4897, + "step": 19886 + }, + { + "epoch": 2.65, + "grad_norm": 0.51171875, + "learning_rate": 6.143330614349061e-05, + "loss": 0.401, + "step": 19887 + }, + { + "epoch": 2.65, + "grad_norm": 0.71875, + "learning_rate": 6.142256226908417e-05, + "loss": 0.5482, + "step": 19888 + }, + { + "epoch": 2.65, + "grad_norm": 0.64453125, + "learning_rate": 6.141181891781251e-05, + "loss": 0.2671, + "step": 19889 + }, + { + "epoch": 2.65, + "grad_norm": 0.52734375, + "learning_rate": 6.140107608982136e-05, + "loss": 0.2914, + "step": 19890 + }, + { + "epoch": 2.65, + "grad_norm": 0.453125, + "learning_rate": 6.139033378525639e-05, + "loss": 0.3759, + "step": 19891 + }, + { + "epoch": 2.65, + "grad_norm": 0.4921875, + "learning_rate": 6.137959200426329e-05, + "loss": 0.1456, + "step": 19892 + }, + { + "epoch": 2.65, + "grad_norm": 0.66015625, + "learning_rate": 6.136885074698772e-05, + "loss": 0.6204, + "step": 19893 + }, + { + "epoch": 2.65, + "grad_norm": 0.66015625, + "learning_rate": 6.13581100135753e-05, + "loss": 0.6371, + "step": 19894 + }, + { + "epoch": 2.65, + "grad_norm": 0.64453125, + "learning_rate": 6.134736980417171e-05, + "loss": 0.3929, + "step": 19895 + }, + { + "epoch": 2.65, + "grad_norm": 0.65234375, + "learning_rate": 6.133663011892259e-05, + "loss": 0.2941, + "step": 19896 + }, + { + "epoch": 2.66, + "grad_norm": 0.44140625, + "learning_rate": 6.132589095797356e-05, + "loss": 0.1839, + "step": 19897 + }, + { + "epoch": 2.66, + "grad_norm": 0.5859375, + "learning_rate": 6.131515232147032e-05, + "loss": 0.235, + "step": 19898 + }, + { + "epoch": 2.66, + "grad_norm": 0.47265625, + "learning_rate": 6.13044142095584e-05, + "loss": 0.3056, + "step": 19899 + }, + { + "epoch": 2.66, + "grad_norm": 0.640625, + "learning_rate": 6.129367662238347e-05, + "loss": 0.5416, + "step": 19900 + }, + { + "epoch": 2.66, + "grad_norm": 0.70703125, + "learning_rate": 6.128293956009112e-05, + "loss": 0.3932, + "step": 19901 + }, + { + "epoch": 2.66, + "grad_norm": 0.5625, + "learning_rate": 6.127220302282699e-05, + "loss": 0.354, + "step": 19902 + }, + { + "epoch": 2.66, + "grad_norm": 0.50390625, + "learning_rate": 6.126146701073661e-05, + "loss": 0.2272, + "step": 19903 + }, + { + "epoch": 2.66, + "grad_norm": 0.65234375, + "learning_rate": 6.125073152396561e-05, + "loss": 0.3916, + "step": 19904 + }, + { + "epoch": 2.66, + "grad_norm": 0.77734375, + "learning_rate": 6.12399965626596e-05, + "loss": 0.3406, + "step": 19905 + }, + { + "epoch": 2.66, + "grad_norm": 0.423828125, + "learning_rate": 6.122926212696405e-05, + "loss": 0.2044, + "step": 19906 + }, + { + "epoch": 2.66, + "grad_norm": 0.59765625, + "learning_rate": 6.121852821702463e-05, + "loss": 0.2458, + "step": 19907 + }, + { + "epoch": 2.66, + "grad_norm": 0.58203125, + "learning_rate": 6.120779483298682e-05, + "loss": 0.2621, + "step": 19908 + }, + { + "epoch": 2.66, + "grad_norm": 0.56640625, + "learning_rate": 6.119706197499624e-05, + "loss": 0.2849, + "step": 19909 + }, + { + "epoch": 2.66, + "grad_norm": 0.609375, + "learning_rate": 6.118632964319836e-05, + "loss": 0.2494, + "step": 19910 + }, + { + "epoch": 2.66, + "grad_norm": 0.578125, + "learning_rate": 6.117559783773882e-05, + "loss": 0.5042, + "step": 19911 + }, + { + "epoch": 2.66, + "grad_norm": 0.470703125, + "learning_rate": 6.116486655876305e-05, + "loss": 0.2806, + "step": 19912 + }, + { + "epoch": 2.66, + "grad_norm": 0.6328125, + "learning_rate": 6.115413580641663e-05, + "loss": 0.5997, + "step": 19913 + }, + { + "epoch": 2.66, + "grad_norm": 0.578125, + "learning_rate": 6.114340558084504e-05, + "loss": 0.3806, + "step": 19914 + }, + { + "epoch": 2.66, + "grad_norm": 0.62109375, + "learning_rate": 6.113267588219384e-05, + "loss": 0.3299, + "step": 19915 + }, + { + "epoch": 2.66, + "grad_norm": 0.50390625, + "learning_rate": 6.112194671060847e-05, + "loss": 0.1851, + "step": 19916 + }, + { + "epoch": 2.66, + "grad_norm": 0.578125, + "learning_rate": 6.11112180662345e-05, + "loss": 0.2267, + "step": 19917 + }, + { + "epoch": 2.66, + "grad_norm": 0.60546875, + "learning_rate": 6.110048994921734e-05, + "loss": 0.2717, + "step": 19918 + }, + { + "epoch": 2.66, + "grad_norm": 0.498046875, + "learning_rate": 6.108976235970249e-05, + "loss": 0.3663, + "step": 19919 + }, + { + "epoch": 2.66, + "grad_norm": 0.54296875, + "learning_rate": 6.107903529783546e-05, + "loss": 0.2824, + "step": 19920 + }, + { + "epoch": 2.66, + "grad_norm": 0.625, + "learning_rate": 6.106830876376166e-05, + "loss": 0.5629, + "step": 19921 + }, + { + "epoch": 2.66, + "grad_norm": 0.59375, + "learning_rate": 6.105758275762659e-05, + "loss": 0.4068, + "step": 19922 + }, + { + "epoch": 2.66, + "grad_norm": 0.4921875, + "learning_rate": 6.10468572795757e-05, + "loss": 0.355, + "step": 19923 + }, + { + "epoch": 2.66, + "grad_norm": 0.423828125, + "learning_rate": 6.10361323297544e-05, + "loss": 0.1609, + "step": 19924 + }, + { + "epoch": 2.66, + "grad_norm": 0.51953125, + "learning_rate": 6.10254079083082e-05, + "loss": 0.4187, + "step": 19925 + }, + { + "epoch": 2.66, + "grad_norm": 0.458984375, + "learning_rate": 6.101468401538244e-05, + "loss": 0.2498, + "step": 19926 + }, + { + "epoch": 2.66, + "grad_norm": 0.69140625, + "learning_rate": 6.1003960651122613e-05, + "loss": 0.1785, + "step": 19927 + }, + { + "epoch": 2.66, + "grad_norm": 0.443359375, + "learning_rate": 6.099323781567408e-05, + "loss": 0.3547, + "step": 19928 + }, + { + "epoch": 2.66, + "grad_norm": 0.60546875, + "learning_rate": 6.0982515509182305e-05, + "loss": 0.2995, + "step": 19929 + }, + { + "epoch": 2.66, + "grad_norm": 0.69140625, + "learning_rate": 6.097179373179267e-05, + "loss": 0.4044, + "step": 19930 + }, + { + "epoch": 2.66, + "grad_norm": 0.78125, + "learning_rate": 6.0961072483650526e-05, + "loss": 0.3489, + "step": 19931 + }, + { + "epoch": 2.66, + "grad_norm": 0.52734375, + "learning_rate": 6.0950351764901324e-05, + "loss": 0.2403, + "step": 19932 + }, + { + "epoch": 2.66, + "grad_norm": 0.5703125, + "learning_rate": 6.093963157569038e-05, + "loss": 0.2617, + "step": 19933 + }, + { + "epoch": 2.66, + "grad_norm": 0.5546875, + "learning_rate": 6.092891191616314e-05, + "loss": 0.1694, + "step": 19934 + }, + { + "epoch": 2.66, + "grad_norm": 0.67578125, + "learning_rate": 6.091819278646489e-05, + "loss": 0.352, + "step": 19935 + }, + { + "epoch": 2.66, + "grad_norm": 0.6328125, + "learning_rate": 6.090747418674105e-05, + "loss": 0.4941, + "step": 19936 + }, + { + "epoch": 2.66, + "grad_norm": 0.49609375, + "learning_rate": 6.089675611713694e-05, + "loss": 0.3249, + "step": 19937 + }, + { + "epoch": 2.66, + "grad_norm": 0.66796875, + "learning_rate": 6.0886038577797924e-05, + "loss": 0.4395, + "step": 19938 + }, + { + "epoch": 2.66, + "grad_norm": 0.68359375, + "learning_rate": 6.0875321568869307e-05, + "loss": 0.2386, + "step": 19939 + }, + { + "epoch": 2.66, + "grad_norm": 0.62890625, + "learning_rate": 6.086460509049647e-05, + "loss": 0.3996, + "step": 19940 + }, + { + "epoch": 2.66, + "grad_norm": 0.396484375, + "learning_rate": 6.085388914282468e-05, + "loss": 0.1947, + "step": 19941 + }, + { + "epoch": 2.66, + "grad_norm": 0.72265625, + "learning_rate": 6.084317372599931e-05, + "loss": 0.5116, + "step": 19942 + }, + { + "epoch": 2.66, + "grad_norm": 0.58984375, + "learning_rate": 6.083245884016562e-05, + "loss": 0.3848, + "step": 19943 + }, + { + "epoch": 2.66, + "grad_norm": 0.5859375, + "learning_rate": 6.082174448546891e-05, + "loss": 0.5302, + "step": 19944 + }, + { + "epoch": 2.66, + "grad_norm": 0.478515625, + "learning_rate": 6.081103066205451e-05, + "loss": 0.3539, + "step": 19945 + }, + { + "epoch": 2.66, + "grad_norm": 0.71875, + "learning_rate": 6.080031737006766e-05, + "loss": 0.3372, + "step": 19946 + }, + { + "epoch": 2.66, + "grad_norm": 0.53125, + "learning_rate": 6.078960460965366e-05, + "loss": 0.1584, + "step": 19947 + }, + { + "epoch": 2.66, + "grad_norm": 0.53515625, + "learning_rate": 6.077889238095782e-05, + "loss": 0.4056, + "step": 19948 + }, + { + "epoch": 2.66, + "grad_norm": 0.5703125, + "learning_rate": 6.076818068412534e-05, + "loss": 0.3788, + "step": 19949 + }, + { + "epoch": 2.66, + "grad_norm": 0.55859375, + "learning_rate": 6.075746951930154e-05, + "loss": 0.4515, + "step": 19950 + }, + { + "epoch": 2.66, + "grad_norm": 0.4921875, + "learning_rate": 6.0746758886631606e-05, + "loss": 0.2045, + "step": 19951 + }, + { + "epoch": 2.66, + "grad_norm": 0.87890625, + "learning_rate": 6.073604878626085e-05, + "loss": 0.3513, + "step": 19952 + }, + { + "epoch": 2.66, + "grad_norm": 0.59375, + "learning_rate": 6.072533921833443e-05, + "loss": 0.3193, + "step": 19953 + }, + { + "epoch": 2.66, + "grad_norm": 0.69140625, + "learning_rate": 6.0714630182997665e-05, + "loss": 0.4152, + "step": 19954 + }, + { + "epoch": 2.66, + "grad_norm": 0.73046875, + "learning_rate": 6.070392168039571e-05, + "loss": 0.3951, + "step": 19955 + }, + { + "epoch": 2.66, + "grad_norm": 0.55078125, + "learning_rate": 6.069321371067378e-05, + "loss": 0.2184, + "step": 19956 + }, + { + "epoch": 2.66, + "grad_norm": 0.91015625, + "learning_rate": 6.068250627397711e-05, + "loss": 0.3458, + "step": 19957 + }, + { + "epoch": 2.66, + "grad_norm": 0.4765625, + "learning_rate": 6.067179937045089e-05, + "loss": 0.3301, + "step": 19958 + }, + { + "epoch": 2.66, + "grad_norm": 0.68359375, + "learning_rate": 6.066109300024031e-05, + "loss": 0.209, + "step": 19959 + }, + { + "epoch": 2.66, + "grad_norm": 0.62890625, + "learning_rate": 6.065038716349054e-05, + "loss": 0.5967, + "step": 19960 + }, + { + "epoch": 2.66, + "grad_norm": 0.53125, + "learning_rate": 6.063968186034681e-05, + "loss": 0.1697, + "step": 19961 + }, + { + "epoch": 2.66, + "grad_norm": 0.78515625, + "learning_rate": 6.062897709095421e-05, + "loss": 0.1831, + "step": 19962 + }, + { + "epoch": 2.66, + "grad_norm": 0.53125, + "learning_rate": 6.061827285545798e-05, + "loss": 0.7398, + "step": 19963 + }, + { + "epoch": 2.66, + "grad_norm": 0.49609375, + "learning_rate": 6.060756915400323e-05, + "loss": 0.2852, + "step": 19964 + }, + { + "epoch": 2.66, + "grad_norm": 0.5, + "learning_rate": 6.059686598673515e-05, + "loss": 0.2663, + "step": 19965 + }, + { + "epoch": 2.66, + "grad_norm": 0.62890625, + "learning_rate": 6.0586163353798864e-05, + "loss": 0.4939, + "step": 19966 + }, + { + "epoch": 2.66, + "grad_norm": 0.58203125, + "learning_rate": 6.057546125533946e-05, + "loss": 0.3789, + "step": 19967 + }, + { + "epoch": 2.66, + "grad_norm": 0.53515625, + "learning_rate": 6.056475969150213e-05, + "loss": 0.1777, + "step": 19968 + }, + { + "epoch": 2.66, + "grad_norm": 0.484375, + "learning_rate": 6.055405866243194e-05, + "loss": 0.3931, + "step": 19969 + }, + { + "epoch": 2.66, + "grad_norm": 0.30859375, + "learning_rate": 6.054335816827403e-05, + "loss": 0.1464, + "step": 19970 + }, + { + "epoch": 2.66, + "grad_norm": 0.546875, + "learning_rate": 6.053265820917353e-05, + "loss": 0.2049, + "step": 19971 + }, + { + "epoch": 2.67, + "grad_norm": 0.5, + "learning_rate": 6.05219587852755e-05, + "loss": 0.2446, + "step": 19972 + }, + { + "epoch": 2.67, + "grad_norm": 0.6796875, + "learning_rate": 6.0511259896725056e-05, + "loss": 0.4689, + "step": 19973 + }, + { + "epoch": 2.67, + "grad_norm": 0.5625, + "learning_rate": 6.050056154366726e-05, + "loss": 0.4797, + "step": 19974 + }, + { + "epoch": 2.67, + "grad_norm": 0.373046875, + "learning_rate": 6.048986372624722e-05, + "loss": 0.3314, + "step": 19975 + }, + { + "epoch": 2.67, + "grad_norm": 0.5390625, + "learning_rate": 6.047916644460996e-05, + "loss": 0.2888, + "step": 19976 + }, + { + "epoch": 2.67, + "grad_norm": 0.58984375, + "learning_rate": 6.046846969890059e-05, + "loss": 0.4047, + "step": 19977 + }, + { + "epoch": 2.67, + "grad_norm": 0.66796875, + "learning_rate": 6.0457773489264155e-05, + "loss": 0.4189, + "step": 19978 + }, + { + "epoch": 2.67, + "grad_norm": 0.5625, + "learning_rate": 6.0447077815845665e-05, + "loss": 0.3832, + "step": 19979 + }, + { + "epoch": 2.67, + "grad_norm": 0.85546875, + "learning_rate": 6.0436382678790195e-05, + "loss": 0.4524, + "step": 19980 + }, + { + "epoch": 2.67, + "grad_norm": 0.5625, + "learning_rate": 6.042568807824274e-05, + "loss": 0.4316, + "step": 19981 + }, + { + "epoch": 2.67, + "grad_norm": 0.5859375, + "learning_rate": 6.041499401434838e-05, + "loss": 0.2237, + "step": 19982 + }, + { + "epoch": 2.67, + "grad_norm": 0.5859375, + "learning_rate": 6.04043004872521e-05, + "loss": 0.3413, + "step": 19983 + }, + { + "epoch": 2.67, + "grad_norm": 0.59765625, + "learning_rate": 6.039360749709893e-05, + "loss": 0.7466, + "step": 19984 + }, + { + "epoch": 2.67, + "grad_norm": 0.55078125, + "learning_rate": 6.038291504403383e-05, + "loss": 0.4165, + "step": 19985 + }, + { + "epoch": 2.67, + "grad_norm": 0.48828125, + "learning_rate": 6.037222312820185e-05, + "loss": 0.3094, + "step": 19986 + }, + { + "epoch": 2.67, + "grad_norm": 0.494140625, + "learning_rate": 6.0361531749747947e-05, + "loss": 0.2513, + "step": 19987 + }, + { + "epoch": 2.67, + "grad_norm": 0.58203125, + "learning_rate": 6.035084090881713e-05, + "loss": 0.2743, + "step": 19988 + }, + { + "epoch": 2.67, + "grad_norm": 0.359375, + "learning_rate": 6.0340150605554335e-05, + "loss": 0.1768, + "step": 19989 + }, + { + "epoch": 2.67, + "grad_norm": 0.423828125, + "learning_rate": 6.032946084010459e-05, + "loss": 0.1289, + "step": 19990 + }, + { + "epoch": 2.67, + "grad_norm": 0.470703125, + "learning_rate": 6.03187716126128e-05, + "loss": 0.2396, + "step": 19991 + }, + { + "epoch": 2.67, + "grad_norm": 0.48828125, + "learning_rate": 6.030808292322392e-05, + "loss": 0.1861, + "step": 19992 + }, + { + "epoch": 2.67, + "grad_norm": 0.546875, + "learning_rate": 6.029739477208292e-05, + "loss": 0.3762, + "step": 19993 + }, + { + "epoch": 2.67, + "grad_norm": 0.486328125, + "learning_rate": 6.028670715933471e-05, + "loss": 0.3695, + "step": 19994 + }, + { + "epoch": 2.67, + "grad_norm": 0.41796875, + "learning_rate": 6.027602008512423e-05, + "loss": 0.2552, + "step": 19995 + }, + { + "epoch": 2.67, + "grad_norm": 0.4765625, + "learning_rate": 6.026533354959645e-05, + "loss": 0.2876, + "step": 19996 + }, + { + "epoch": 2.67, + "grad_norm": 0.67578125, + "learning_rate": 6.0254647552896216e-05, + "loss": 0.3033, + "step": 19997 + }, + { + "epoch": 2.67, + "grad_norm": 0.66015625, + "learning_rate": 6.024396209516849e-05, + "loss": 0.2375, + "step": 19998 + }, + { + "epoch": 2.67, + "grad_norm": 0.38671875, + "learning_rate": 6.023327717655813e-05, + "loss": 0.1427, + "step": 19999 + }, + { + "epoch": 2.67, + "grad_norm": 0.6171875, + "learning_rate": 6.0222592797210096e-05, + "loss": 0.4148, + "step": 20000 + }, + { + "epoch": 2.67, + "grad_norm": 0.546875, + "learning_rate": 6.021190895726919e-05, + "loss": 0.2161, + "step": 20001 + }, + { + "epoch": 2.67, + "grad_norm": 0.486328125, + "learning_rate": 6.020122565688038e-05, + "loss": 0.1688, + "step": 20002 + }, + { + "epoch": 2.67, + "grad_norm": 0.5859375, + "learning_rate": 6.0190542896188486e-05, + "loss": 0.5436, + "step": 20003 + }, + { + "epoch": 2.67, + "grad_norm": 0.408203125, + "learning_rate": 6.017986067533834e-05, + "loss": 0.2162, + "step": 20004 + }, + { + "epoch": 2.67, + "grad_norm": 0.46484375, + "learning_rate": 6.0169178994474874e-05, + "loss": 0.3142, + "step": 20005 + }, + { + "epoch": 2.67, + "grad_norm": 0.54296875, + "learning_rate": 6.0158497853742866e-05, + "loss": 0.3056, + "step": 20006 + }, + { + "epoch": 2.67, + "grad_norm": 0.62890625, + "learning_rate": 6.014781725328723e-05, + "loss": 0.4581, + "step": 20007 + }, + { + "epoch": 2.67, + "grad_norm": 0.57421875, + "learning_rate": 6.0137137193252754e-05, + "loss": 0.1782, + "step": 20008 + }, + { + "epoch": 2.67, + "grad_norm": 0.60546875, + "learning_rate": 6.012645767378429e-05, + "loss": 0.4024, + "step": 20009 + }, + { + "epoch": 2.67, + "grad_norm": 0.443359375, + "learning_rate": 6.0115778695026635e-05, + "loss": 0.2508, + "step": 20010 + }, + { + "epoch": 2.67, + "grad_norm": 0.494140625, + "learning_rate": 6.010510025712464e-05, + "loss": 0.2212, + "step": 20011 + }, + { + "epoch": 2.67, + "grad_norm": 0.55859375, + "learning_rate": 6.009442236022307e-05, + "loss": 0.5034, + "step": 20012 + }, + { + "epoch": 2.67, + "grad_norm": 0.439453125, + "learning_rate": 6.008374500446676e-05, + "loss": 0.2354, + "step": 20013 + }, + { + "epoch": 2.67, + "grad_norm": 0.5703125, + "learning_rate": 6.007306819000047e-05, + "loss": 0.2546, + "step": 20014 + }, + { + "epoch": 2.67, + "grad_norm": 0.53515625, + "learning_rate": 6.0062391916969054e-05, + "loss": 0.2379, + "step": 20015 + }, + { + "epoch": 2.67, + "grad_norm": 0.53125, + "learning_rate": 6.0051716185517194e-05, + "loss": 0.1867, + "step": 20016 + }, + { + "epoch": 2.67, + "grad_norm": 0.515625, + "learning_rate": 6.0041040995789686e-05, + "loss": 0.2337, + "step": 20017 + }, + { + "epoch": 2.67, + "grad_norm": 0.578125, + "learning_rate": 6.0030366347931333e-05, + "loss": 0.4019, + "step": 20018 + }, + { + "epoch": 2.67, + "grad_norm": 0.546875, + "learning_rate": 6.0019692242086836e-05, + "loss": 0.3409, + "step": 20019 + }, + { + "epoch": 2.67, + "grad_norm": 0.671875, + "learning_rate": 6.0009018678400976e-05, + "loss": 0.42, + "step": 20020 + }, + { + "epoch": 2.67, + "grad_norm": 0.51953125, + "learning_rate": 5.9998345657018515e-05, + "loss": 0.2146, + "step": 20021 + }, + { + "epoch": 2.67, + "grad_norm": 0.50390625, + "learning_rate": 5.998767317808414e-05, + "loss": 0.158, + "step": 20022 + }, + { + "epoch": 2.67, + "grad_norm": 0.61328125, + "learning_rate": 5.9977001241742595e-05, + "loss": 0.6613, + "step": 20023 + }, + { + "epoch": 2.67, + "grad_norm": 0.70703125, + "learning_rate": 5.9966329848138605e-05, + "loss": 0.2812, + "step": 20024 + }, + { + "epoch": 2.67, + "grad_norm": 0.69921875, + "learning_rate": 5.995565899741689e-05, + "loss": 0.7743, + "step": 20025 + }, + { + "epoch": 2.67, + "grad_norm": 0.5703125, + "learning_rate": 5.9944988689722114e-05, + "loss": 0.3179, + "step": 20026 + }, + { + "epoch": 2.67, + "grad_norm": 0.59765625, + "learning_rate": 5.993431892519903e-05, + "loss": 0.5658, + "step": 20027 + }, + { + "epoch": 2.67, + "grad_norm": 0.42578125, + "learning_rate": 5.992364970399229e-05, + "loss": 0.2546, + "step": 20028 + }, + { + "epoch": 2.67, + "grad_norm": 0.419921875, + "learning_rate": 5.9912981026246565e-05, + "loss": 0.2852, + "step": 20029 + }, + { + "epoch": 2.67, + "grad_norm": 0.6171875, + "learning_rate": 5.990231289210655e-05, + "loss": 0.3025, + "step": 20030 + }, + { + "epoch": 2.67, + "grad_norm": 0.57421875, + "learning_rate": 5.9891645301716894e-05, + "loss": 0.417, + "step": 20031 + }, + { + "epoch": 2.67, + "grad_norm": 0.703125, + "learning_rate": 5.9880978255222274e-05, + "loss": 0.6055, + "step": 20032 + }, + { + "epoch": 2.67, + "grad_norm": 0.376953125, + "learning_rate": 5.987031175276734e-05, + "loss": 0.18, + "step": 20033 + }, + { + "epoch": 2.67, + "grad_norm": 0.6484375, + "learning_rate": 5.985964579449675e-05, + "loss": 0.4698, + "step": 20034 + }, + { + "epoch": 2.67, + "grad_norm": 0.6015625, + "learning_rate": 5.9848980380555086e-05, + "loss": 0.478, + "step": 20035 + }, + { + "epoch": 2.67, + "grad_norm": 0.486328125, + "learning_rate": 5.983831551108705e-05, + "loss": 0.2754, + "step": 20036 + }, + { + "epoch": 2.67, + "grad_norm": 0.66796875, + "learning_rate": 5.982765118623721e-05, + "loss": 0.5179, + "step": 20037 + }, + { + "epoch": 2.67, + "grad_norm": 0.6171875, + "learning_rate": 5.981698740615023e-05, + "loss": 0.2718, + "step": 20038 + }, + { + "epoch": 2.67, + "grad_norm": 0.5703125, + "learning_rate": 5.980632417097071e-05, + "loss": 0.3267, + "step": 20039 + }, + { + "epoch": 2.67, + "grad_norm": 0.5078125, + "learning_rate": 5.9795661480843176e-05, + "loss": 0.3224, + "step": 20040 + }, + { + "epoch": 2.67, + "grad_norm": 0.5859375, + "learning_rate": 5.978499933591231e-05, + "loss": 0.4117, + "step": 20041 + }, + { + "epoch": 2.67, + "grad_norm": 0.8125, + "learning_rate": 5.977433773632264e-05, + "loss": 0.4246, + "step": 20042 + }, + { + "epoch": 2.67, + "grad_norm": 0.61328125, + "learning_rate": 5.976367668221876e-05, + "loss": 0.4502, + "step": 20043 + }, + { + "epoch": 2.67, + "grad_norm": 0.431640625, + "learning_rate": 5.975301617374527e-05, + "loss": 0.1881, + "step": 20044 + }, + { + "epoch": 2.67, + "grad_norm": 0.55078125, + "learning_rate": 5.974235621104669e-05, + "loss": 0.2919, + "step": 20045 + }, + { + "epoch": 2.67, + "grad_norm": 0.578125, + "learning_rate": 5.9731696794267625e-05, + "loss": 0.3533, + "step": 20046 + }, + { + "epoch": 2.68, + "grad_norm": 0.65625, + "learning_rate": 5.972103792355257e-05, + "loss": 0.4227, + "step": 20047 + }, + { + "epoch": 2.68, + "grad_norm": 0.462890625, + "learning_rate": 5.971037959904611e-05, + "loss": 0.375, + "step": 20048 + }, + { + "epoch": 2.68, + "grad_norm": 0.64453125, + "learning_rate": 5.969972182089274e-05, + "loss": 0.162, + "step": 20049 + }, + { + "epoch": 2.68, + "grad_norm": 0.44921875, + "learning_rate": 5.968906458923702e-05, + "loss": 0.2825, + "step": 20050 + }, + { + "epoch": 2.68, + "grad_norm": 0.53125, + "learning_rate": 5.967840790422349e-05, + "loss": 0.443, + "step": 20051 + }, + { + "epoch": 2.68, + "grad_norm": 0.69140625, + "learning_rate": 5.966775176599657e-05, + "loss": 0.254, + "step": 20052 + }, + { + "epoch": 2.68, + "grad_norm": 0.75, + "learning_rate": 5.9657096174700855e-05, + "loss": 0.3746, + "step": 20053 + }, + { + "epoch": 2.68, + "grad_norm": 0.56640625, + "learning_rate": 5.964644113048079e-05, + "loss": 0.3265, + "step": 20054 + }, + { + "epoch": 2.68, + "grad_norm": 0.72265625, + "learning_rate": 5.963578663348091e-05, + "loss": 0.6556, + "step": 20055 + }, + { + "epoch": 2.68, + "grad_norm": 0.5859375, + "learning_rate": 5.962513268384562e-05, + "loss": 0.3756, + "step": 20056 + }, + { + "epoch": 2.68, + "grad_norm": 0.67578125, + "learning_rate": 5.9614479281719485e-05, + "loss": 0.5298, + "step": 20057 + }, + { + "epoch": 2.68, + "grad_norm": 0.72265625, + "learning_rate": 5.96038264272469e-05, + "loss": 0.4661, + "step": 20058 + }, + { + "epoch": 2.68, + "grad_norm": 0.609375, + "learning_rate": 5.959317412057238e-05, + "loss": 0.3012, + "step": 20059 + }, + { + "epoch": 2.68, + "grad_norm": 0.484375, + "learning_rate": 5.958252236184033e-05, + "loss": 0.1356, + "step": 20060 + }, + { + "epoch": 2.68, + "grad_norm": 0.51171875, + "learning_rate": 5.957187115119524e-05, + "loss": 0.4148, + "step": 20061 + }, + { + "epoch": 2.68, + "grad_norm": 0.54296875, + "learning_rate": 5.95612204887815e-05, + "loss": 0.3849, + "step": 20062 + }, + { + "epoch": 2.68, + "grad_norm": 0.6328125, + "learning_rate": 5.955057037474361e-05, + "loss": 0.556, + "step": 20063 + }, + { + "epoch": 2.68, + "grad_norm": 0.58203125, + "learning_rate": 5.953992080922591e-05, + "loss": 0.2027, + "step": 20064 + }, + { + "epoch": 2.68, + "grad_norm": 0.5234375, + "learning_rate": 5.952927179237284e-05, + "loss": 0.3037, + "step": 20065 + }, + { + "epoch": 2.68, + "grad_norm": 0.64453125, + "learning_rate": 5.951862332432885e-05, + "loss": 0.4053, + "step": 20066 + }, + { + "epoch": 2.68, + "grad_norm": 0.51171875, + "learning_rate": 5.950797540523827e-05, + "loss": 0.2196, + "step": 20067 + }, + { + "epoch": 2.68, + "grad_norm": 0.54296875, + "learning_rate": 5.949732803524555e-05, + "loss": 0.2177, + "step": 20068 + }, + { + "epoch": 2.68, + "grad_norm": 0.703125, + "learning_rate": 5.948668121449506e-05, + "loss": 0.5539, + "step": 20069 + }, + { + "epoch": 2.68, + "grad_norm": 0.44921875, + "learning_rate": 5.947603494313116e-05, + "loss": 0.2471, + "step": 20070 + }, + { + "epoch": 2.68, + "grad_norm": 0.53515625, + "learning_rate": 5.946538922129825e-05, + "loss": 0.2723, + "step": 20071 + }, + { + "epoch": 2.68, + "grad_norm": 0.5234375, + "learning_rate": 5.945474404914066e-05, + "loss": 0.3143, + "step": 20072 + }, + { + "epoch": 2.68, + "grad_norm": 0.51171875, + "learning_rate": 5.944409942680278e-05, + "loss": 0.2745, + "step": 20073 + }, + { + "epoch": 2.68, + "grad_norm": 0.6015625, + "learning_rate": 5.9433455354428925e-05, + "loss": 0.3512, + "step": 20074 + }, + { + "epoch": 2.68, + "grad_norm": 1.046875, + "learning_rate": 5.9422811832163494e-05, + "loss": 0.3338, + "step": 20075 + }, + { + "epoch": 2.68, + "grad_norm": 0.72265625, + "learning_rate": 5.941216886015075e-05, + "loss": 0.4298, + "step": 20076 + }, + { + "epoch": 2.68, + "grad_norm": 0.5703125, + "learning_rate": 5.9401526438535027e-05, + "loss": 0.4563, + "step": 20077 + }, + { + "epoch": 2.68, + "grad_norm": 0.546875, + "learning_rate": 5.9390884567460694e-05, + "loss": 0.2684, + "step": 20078 + }, + { + "epoch": 2.68, + "grad_norm": 0.63671875, + "learning_rate": 5.9380243247071984e-05, + "loss": 0.3291, + "step": 20079 + }, + { + "epoch": 2.68, + "grad_norm": 0.796875, + "learning_rate": 5.9369602477513286e-05, + "loss": 0.3039, + "step": 20080 + }, + { + "epoch": 2.68, + "grad_norm": 0.5, + "learning_rate": 5.935896225892883e-05, + "loss": 0.2456, + "step": 20081 + }, + { + "epoch": 2.68, + "grad_norm": 0.50390625, + "learning_rate": 5.9348322591462943e-05, + "loss": 0.2828, + "step": 20082 + }, + { + "epoch": 2.68, + "grad_norm": 0.54296875, + "learning_rate": 5.933768347525987e-05, + "loss": 0.2887, + "step": 20083 + }, + { + "epoch": 2.68, + "grad_norm": 0.50390625, + "learning_rate": 5.932704491046393e-05, + "loss": 0.1753, + "step": 20084 + }, + { + "epoch": 2.68, + "grad_norm": 0.58203125, + "learning_rate": 5.931640689721935e-05, + "loss": 0.2119, + "step": 20085 + }, + { + "epoch": 2.68, + "grad_norm": 0.58203125, + "learning_rate": 5.930576943567041e-05, + "loss": 0.3078, + "step": 20086 + }, + { + "epoch": 2.68, + "grad_norm": 0.6015625, + "learning_rate": 5.92951325259614e-05, + "loss": 0.4445, + "step": 20087 + }, + { + "epoch": 2.68, + "grad_norm": 0.6171875, + "learning_rate": 5.9284496168236444e-05, + "loss": 0.299, + "step": 20088 + }, + { + "epoch": 2.68, + "grad_norm": 0.435546875, + "learning_rate": 5.927386036263989e-05, + "loss": 0.2808, + "step": 20089 + }, + { + "epoch": 2.68, + "grad_norm": 0.6484375, + "learning_rate": 5.92632251093159e-05, + "loss": 0.4256, + "step": 20090 + }, + { + "epoch": 2.68, + "grad_norm": 0.53125, + "learning_rate": 5.925259040840876e-05, + "loss": 0.3972, + "step": 20091 + }, + { + "epoch": 2.68, + "grad_norm": 0.73828125, + "learning_rate": 5.92419562600626e-05, + "loss": 0.4998, + "step": 20092 + }, + { + "epoch": 2.68, + "grad_norm": 0.47265625, + "learning_rate": 5.923132266442169e-05, + "loss": 0.171, + "step": 20093 + }, + { + "epoch": 2.68, + "grad_norm": 0.58203125, + "learning_rate": 5.922068962163022e-05, + "loss": 0.4131, + "step": 20094 + }, + { + "epoch": 2.68, + "grad_norm": 0.5078125, + "learning_rate": 5.921005713183235e-05, + "loss": 0.3461, + "step": 20095 + }, + { + "epoch": 2.68, + "grad_norm": 0.44140625, + "learning_rate": 5.919942519517231e-05, + "loss": 0.2071, + "step": 20096 + }, + { + "epoch": 2.68, + "grad_norm": 0.41796875, + "learning_rate": 5.918879381179423e-05, + "loss": 0.1567, + "step": 20097 + }, + { + "epoch": 2.68, + "grad_norm": 0.65625, + "learning_rate": 5.917816298184231e-05, + "loss": 0.7004, + "step": 20098 + }, + { + "epoch": 2.68, + "grad_norm": 0.55859375, + "learning_rate": 5.9167532705460694e-05, + "loss": 0.4555, + "step": 20099 + }, + { + "epoch": 2.68, + "grad_norm": 0.66015625, + "learning_rate": 5.9156902982793574e-05, + "loss": 0.4374, + "step": 20100 + }, + { + "epoch": 2.68, + "grad_norm": 0.3984375, + "learning_rate": 5.9146273813985035e-05, + "loss": 0.1734, + "step": 20101 + }, + { + "epoch": 2.68, + "grad_norm": 0.765625, + "learning_rate": 5.913564519917922e-05, + "loss": 0.5684, + "step": 20102 + }, + { + "epoch": 2.68, + "grad_norm": 0.59765625, + "learning_rate": 5.912501713852032e-05, + "loss": 0.2612, + "step": 20103 + }, + { + "epoch": 2.68, + "grad_norm": 0.6875, + "learning_rate": 5.911438963215239e-05, + "loss": 0.6148, + "step": 20104 + }, + { + "epoch": 2.68, + "grad_norm": 0.6171875, + "learning_rate": 5.91037626802196e-05, + "loss": 0.4574, + "step": 20105 + }, + { + "epoch": 2.68, + "grad_norm": 0.57421875, + "learning_rate": 5.909313628286601e-05, + "loss": 0.5034, + "step": 20106 + }, + { + "epoch": 2.68, + "grad_norm": 0.5859375, + "learning_rate": 5.908251044023578e-05, + "loss": 0.4637, + "step": 20107 + }, + { + "epoch": 2.68, + "grad_norm": 0.62890625, + "learning_rate": 5.907188515247293e-05, + "loss": 0.4627, + "step": 20108 + }, + { + "epoch": 2.68, + "grad_norm": 0.734375, + "learning_rate": 5.906126041972162e-05, + "loss": 0.306, + "step": 20109 + }, + { + "epoch": 2.68, + "grad_norm": 0.5703125, + "learning_rate": 5.905063624212586e-05, + "loss": 0.2719, + "step": 20110 + }, + { + "epoch": 2.68, + "grad_norm": 0.765625, + "learning_rate": 5.9040012619829786e-05, + "loss": 0.1885, + "step": 20111 + }, + { + "epoch": 2.68, + "grad_norm": 0.5546875, + "learning_rate": 5.902938955297744e-05, + "loss": 0.2209, + "step": 20112 + }, + { + "epoch": 2.68, + "grad_norm": 0.609375, + "learning_rate": 5.901876704171283e-05, + "loss": 0.4357, + "step": 20113 + }, + { + "epoch": 2.68, + "grad_norm": 0.64453125, + "learning_rate": 5.9008145086180064e-05, + "loss": 0.5, + "step": 20114 + }, + { + "epoch": 2.68, + "grad_norm": 0.75390625, + "learning_rate": 5.899752368652314e-05, + "loss": 0.411, + "step": 20115 + }, + { + "epoch": 2.68, + "grad_norm": 0.6171875, + "learning_rate": 5.8986902842886106e-05, + "loss": 0.4555, + "step": 20116 + }, + { + "epoch": 2.68, + "grad_norm": 0.5390625, + "learning_rate": 5.897628255541301e-05, + "loss": 0.388, + "step": 20117 + }, + { + "epoch": 2.68, + "grad_norm": 0.80859375, + "learning_rate": 5.896566282424782e-05, + "loss": 0.4903, + "step": 20118 + }, + { + "epoch": 2.68, + "grad_norm": 0.45703125, + "learning_rate": 5.895504364953462e-05, + "loss": 0.2695, + "step": 20119 + }, + { + "epoch": 2.68, + "grad_norm": 0.546875, + "learning_rate": 5.8944425031417326e-05, + "loss": 0.1484, + "step": 20120 + }, + { + "epoch": 2.68, + "grad_norm": 0.60546875, + "learning_rate": 5.893380697004003e-05, + "loss": 0.4237, + "step": 20121 + }, + { + "epoch": 2.69, + "grad_norm": 0.51953125, + "learning_rate": 5.892318946554662e-05, + "loss": 0.2723, + "step": 20122 + }, + { + "epoch": 2.69, + "grad_norm": 0.65234375, + "learning_rate": 5.8912572518081155e-05, + "loss": 0.3882, + "step": 20123 + }, + { + "epoch": 2.69, + "grad_norm": 0.458984375, + "learning_rate": 5.890195612778759e-05, + "loss": 0.1674, + "step": 20124 + }, + { + "epoch": 2.69, + "grad_norm": 0.470703125, + "learning_rate": 5.889134029480985e-05, + "loss": 0.208, + "step": 20125 + }, + { + "epoch": 2.69, + "grad_norm": 0.314453125, + "learning_rate": 5.888072501929193e-05, + "loss": 0.182, + "step": 20126 + }, + { + "epoch": 2.69, + "grad_norm": 0.66796875, + "learning_rate": 5.887011030137774e-05, + "loss": 0.3909, + "step": 20127 + }, + { + "epoch": 2.69, + "grad_norm": 0.38671875, + "learning_rate": 5.885949614121129e-05, + "loss": 0.0957, + "step": 20128 + }, + { + "epoch": 2.69, + "grad_norm": 0.62109375, + "learning_rate": 5.8848882538936445e-05, + "loss": 0.2116, + "step": 20129 + }, + { + "epoch": 2.69, + "grad_norm": 0.423828125, + "learning_rate": 5.883826949469718e-05, + "loss": 0.1571, + "step": 20130 + }, + { + "epoch": 2.69, + "grad_norm": 0.4453125, + "learning_rate": 5.882765700863739e-05, + "loss": 0.2844, + "step": 20131 + }, + { + "epoch": 2.69, + "grad_norm": 0.546875, + "learning_rate": 5.8817045080901004e-05, + "loss": 0.2723, + "step": 20132 + }, + { + "epoch": 2.69, + "grad_norm": 0.71484375, + "learning_rate": 5.88064337116319e-05, + "loss": 0.3804, + "step": 20133 + }, + { + "epoch": 2.69, + "grad_norm": 0.5078125, + "learning_rate": 5.8795822900974015e-05, + "loss": 0.2344, + "step": 20134 + }, + { + "epoch": 2.69, + "grad_norm": 0.67578125, + "learning_rate": 5.87852126490712e-05, + "loss": 0.4552, + "step": 20135 + }, + { + "epoch": 2.69, + "grad_norm": 0.6875, + "learning_rate": 5.877460295606738e-05, + "loss": 0.3121, + "step": 20136 + }, + { + "epoch": 2.69, + "grad_norm": 0.458984375, + "learning_rate": 5.8763993822106403e-05, + "loss": 0.1506, + "step": 20137 + }, + { + "epoch": 2.69, + "grad_norm": 0.59375, + "learning_rate": 5.8753385247332094e-05, + "loss": 0.3145, + "step": 20138 + }, + { + "epoch": 2.69, + "grad_norm": 0.5234375, + "learning_rate": 5.874277723188839e-05, + "loss": 0.461, + "step": 20139 + }, + { + "epoch": 2.69, + "grad_norm": 0.54296875, + "learning_rate": 5.873216977591908e-05, + "loss": 0.293, + "step": 20140 + }, + { + "epoch": 2.69, + "grad_norm": 0.6015625, + "learning_rate": 5.8721562879568025e-05, + "loss": 0.305, + "step": 20141 + }, + { + "epoch": 2.69, + "grad_norm": 0.478515625, + "learning_rate": 5.871095654297909e-05, + "loss": 0.1697, + "step": 20142 + }, + { + "epoch": 2.69, + "grad_norm": 0.5546875, + "learning_rate": 5.870035076629607e-05, + "loss": 0.5828, + "step": 20143 + }, + { + "epoch": 2.69, + "grad_norm": 0.56640625, + "learning_rate": 5.8689745549662825e-05, + "loss": 0.3551, + "step": 20144 + }, + { + "epoch": 2.69, + "grad_norm": 0.83984375, + "learning_rate": 5.867914089322312e-05, + "loss": 0.5598, + "step": 20145 + }, + { + "epoch": 2.69, + "grad_norm": 0.404296875, + "learning_rate": 5.8668536797120786e-05, + "loss": 0.1597, + "step": 20146 + }, + { + "epoch": 2.69, + "grad_norm": 0.65234375, + "learning_rate": 5.865793326149962e-05, + "loss": 0.4612, + "step": 20147 + }, + { + "epoch": 2.69, + "grad_norm": 0.6171875, + "learning_rate": 5.8647330286503445e-05, + "loss": 0.2996, + "step": 20148 + }, + { + "epoch": 2.69, + "grad_norm": 0.60546875, + "learning_rate": 5.8636727872276e-05, + "loss": 0.3215, + "step": 20149 + }, + { + "epoch": 2.69, + "grad_norm": 0.53125, + "learning_rate": 5.8626126018961045e-05, + "loss": 0.237, + "step": 20150 + }, + { + "epoch": 2.69, + "grad_norm": 0.8046875, + "learning_rate": 5.86155247267024e-05, + "loss": 0.3194, + "step": 20151 + }, + { + "epoch": 2.69, + "grad_norm": 0.5625, + "learning_rate": 5.860492399564377e-05, + "loss": 0.3923, + "step": 20152 + }, + { + "epoch": 2.69, + "grad_norm": 0.74609375, + "learning_rate": 5.859432382592896e-05, + "loss": 0.6386, + "step": 20153 + }, + { + "epoch": 2.69, + "grad_norm": 0.6171875, + "learning_rate": 5.858372421770166e-05, + "loss": 0.2815, + "step": 20154 + }, + { + "epoch": 2.69, + "grad_norm": 0.478515625, + "learning_rate": 5.857312517110569e-05, + "loss": 0.3596, + "step": 20155 + }, + { + "epoch": 2.69, + "grad_norm": 0.62109375, + "learning_rate": 5.856252668628469e-05, + "loss": 0.3224, + "step": 20156 + }, + { + "epoch": 2.69, + "grad_norm": 0.53515625, + "learning_rate": 5.855192876338245e-05, + "loss": 0.4455, + "step": 20157 + }, + { + "epoch": 2.69, + "grad_norm": 0.69921875, + "learning_rate": 5.854133140254263e-05, + "loss": 0.4598, + "step": 20158 + }, + { + "epoch": 2.69, + "grad_norm": 0.431640625, + "learning_rate": 5.853073460390899e-05, + "loss": 0.2598, + "step": 20159 + }, + { + "epoch": 2.69, + "grad_norm": 0.42578125, + "learning_rate": 5.852013836762522e-05, + "loss": 0.2035, + "step": 20160 + }, + { + "epoch": 2.69, + "grad_norm": 0.58203125, + "learning_rate": 5.8509542693834964e-05, + "loss": 0.5366, + "step": 20161 + }, + { + "epoch": 2.69, + "grad_norm": 0.69921875, + "learning_rate": 5.849894758268195e-05, + "loss": 0.3406, + "step": 20162 + }, + { + "epoch": 2.69, + "grad_norm": 0.419921875, + "learning_rate": 5.848835303430982e-05, + "loss": 0.1919, + "step": 20163 + }, + { + "epoch": 2.69, + "grad_norm": 0.67578125, + "learning_rate": 5.8477759048862294e-05, + "loss": 0.4294, + "step": 20164 + }, + { + "epoch": 2.69, + "grad_norm": 0.7109375, + "learning_rate": 5.8467165626482975e-05, + "loss": 0.606, + "step": 20165 + }, + { + "epoch": 2.69, + "grad_norm": 0.828125, + "learning_rate": 5.8456572767315555e-05, + "loss": 0.3998, + "step": 20166 + }, + { + "epoch": 2.69, + "grad_norm": 0.8046875, + "learning_rate": 5.844598047150368e-05, + "loss": 0.4465, + "step": 20167 + }, + { + "epoch": 2.69, + "grad_norm": 0.7578125, + "learning_rate": 5.843538873919097e-05, + "loss": 0.3967, + "step": 20168 + }, + { + "epoch": 2.69, + "grad_norm": 0.546875, + "learning_rate": 5.842479757052107e-05, + "loss": 0.3947, + "step": 20169 + }, + { + "epoch": 2.69, + "grad_norm": 0.6171875, + "learning_rate": 5.8414206965637594e-05, + "loss": 0.2997, + "step": 20170 + }, + { + "epoch": 2.69, + "grad_norm": 0.6875, + "learning_rate": 5.8403616924684166e-05, + "loss": 0.2969, + "step": 20171 + }, + { + "epoch": 2.69, + "grad_norm": 0.6171875, + "learning_rate": 5.8393027447804414e-05, + "loss": 0.3422, + "step": 20172 + }, + { + "epoch": 2.69, + "grad_norm": 0.48828125, + "learning_rate": 5.838243853514191e-05, + "loss": 0.3561, + "step": 20173 + }, + { + "epoch": 2.69, + "grad_norm": 0.6484375, + "learning_rate": 5.837185018684021e-05, + "loss": 0.5733, + "step": 20174 + }, + { + "epoch": 2.69, + "grad_norm": 0.51171875, + "learning_rate": 5.836126240304294e-05, + "loss": 0.3403, + "step": 20175 + }, + { + "epoch": 2.69, + "grad_norm": 0.5703125, + "learning_rate": 5.835067518389368e-05, + "loss": 0.2198, + "step": 20176 + }, + { + "epoch": 2.69, + "grad_norm": 0.6171875, + "learning_rate": 5.834008852953603e-05, + "loss": 0.3565, + "step": 20177 + }, + { + "epoch": 2.69, + "grad_norm": 0.55078125, + "learning_rate": 5.8329502440113484e-05, + "loss": 0.5254, + "step": 20178 + }, + { + "epoch": 2.69, + "grad_norm": 0.8359375, + "learning_rate": 5.831891691576963e-05, + "loss": 0.3844, + "step": 20179 + }, + { + "epoch": 2.69, + "grad_norm": 0.69140625, + "learning_rate": 5.8308331956648e-05, + "loss": 0.4778, + "step": 20180 + }, + { + "epoch": 2.69, + "grad_norm": 0.62890625, + "learning_rate": 5.829774756289219e-05, + "loss": 0.3623, + "step": 20181 + }, + { + "epoch": 2.69, + "grad_norm": 0.52734375, + "learning_rate": 5.828716373464565e-05, + "loss": 0.153, + "step": 20182 + }, + { + "epoch": 2.69, + "grad_norm": 0.515625, + "learning_rate": 5.8276580472051945e-05, + "loss": 0.1392, + "step": 20183 + }, + { + "epoch": 2.69, + "grad_norm": 0.67578125, + "learning_rate": 5.826599777525459e-05, + "loss": 0.4468, + "step": 20184 + }, + { + "epoch": 2.69, + "grad_norm": 0.6875, + "learning_rate": 5.825541564439712e-05, + "loss": 0.211, + "step": 20185 + }, + { + "epoch": 2.69, + "grad_norm": 0.443359375, + "learning_rate": 5.824483407962301e-05, + "loss": 0.2892, + "step": 20186 + }, + { + "epoch": 2.69, + "grad_norm": 0.65234375, + "learning_rate": 5.823425308107571e-05, + "loss": 0.1695, + "step": 20187 + }, + { + "epoch": 2.69, + "grad_norm": 0.5078125, + "learning_rate": 5.822367264889874e-05, + "loss": 0.3268, + "step": 20188 + }, + { + "epoch": 2.69, + "grad_norm": 0.71484375, + "learning_rate": 5.8213092783235576e-05, + "loss": 0.2538, + "step": 20189 + }, + { + "epoch": 2.69, + "grad_norm": 0.5546875, + "learning_rate": 5.82025134842297e-05, + "loss": 0.3605, + "step": 20190 + }, + { + "epoch": 2.69, + "grad_norm": 0.65234375, + "learning_rate": 5.81919347520246e-05, + "loss": 0.4154, + "step": 20191 + }, + { + "epoch": 2.69, + "grad_norm": 0.6015625, + "learning_rate": 5.818135658676367e-05, + "loss": 0.2928, + "step": 20192 + }, + { + "epoch": 2.69, + "grad_norm": 0.423828125, + "learning_rate": 5.817077898859038e-05, + "loss": 0.1556, + "step": 20193 + }, + { + "epoch": 2.69, + "grad_norm": 0.466796875, + "learning_rate": 5.816020195764816e-05, + "loss": 0.3365, + "step": 20194 + }, + { + "epoch": 2.69, + "grad_norm": 0.5546875, + "learning_rate": 5.81496254940805e-05, + "loss": 0.3493, + "step": 20195 + }, + { + "epoch": 2.69, + "grad_norm": 0.447265625, + "learning_rate": 5.8139049598030734e-05, + "loss": 0.1663, + "step": 20196 + }, + { + "epoch": 2.7, + "grad_norm": 0.43359375, + "learning_rate": 5.812847426964239e-05, + "loss": 0.2641, + "step": 20197 + }, + { + "epoch": 2.7, + "grad_norm": 0.81640625, + "learning_rate": 5.811789950905873e-05, + "loss": 0.5171, + "step": 20198 + }, + { + "epoch": 2.7, + "grad_norm": 0.75390625, + "learning_rate": 5.8107325316423245e-05, + "loss": 0.4206, + "step": 20199 + }, + { + "epoch": 2.7, + "grad_norm": 0.5625, + "learning_rate": 5.8096751691879356e-05, + "loss": 0.348, + "step": 20200 + }, + { + "epoch": 2.7, + "grad_norm": 0.462890625, + "learning_rate": 5.808617863557038e-05, + "loss": 0.2783, + "step": 20201 + }, + { + "epoch": 2.7, + "grad_norm": 0.68359375, + "learning_rate": 5.807560614763972e-05, + "loss": 0.4073, + "step": 20202 + }, + { + "epoch": 2.7, + "grad_norm": 0.5625, + "learning_rate": 5.806503422823073e-05, + "loss": 0.4628, + "step": 20203 + }, + { + "epoch": 2.7, + "grad_norm": 0.73828125, + "learning_rate": 5.805446287748684e-05, + "loss": 0.3336, + "step": 20204 + }, + { + "epoch": 2.7, + "grad_norm": 0.62109375, + "learning_rate": 5.80438920955513e-05, + "loss": 0.4056, + "step": 20205 + }, + { + "epoch": 2.7, + "grad_norm": 0.5625, + "learning_rate": 5.8033321882567534e-05, + "loss": 0.2899, + "step": 20206 + }, + { + "epoch": 2.7, + "grad_norm": 0.4453125, + "learning_rate": 5.802275223867883e-05, + "loss": 0.1715, + "step": 20207 + }, + { + "epoch": 2.7, + "grad_norm": 0.353515625, + "learning_rate": 5.801218316402861e-05, + "loss": 0.1309, + "step": 20208 + }, + { + "epoch": 2.7, + "grad_norm": 0.5703125, + "learning_rate": 5.800161465876013e-05, + "loss": 0.2672, + "step": 20209 + }, + { + "epoch": 2.7, + "grad_norm": 0.51953125, + "learning_rate": 5.799104672301667e-05, + "loss": 0.3879, + "step": 20210 + }, + { + "epoch": 2.7, + "grad_norm": 0.455078125, + "learning_rate": 5.7980479356941575e-05, + "loss": 0.2579, + "step": 20211 + }, + { + "epoch": 2.7, + "grad_norm": 0.498046875, + "learning_rate": 5.796991256067814e-05, + "loss": 0.3781, + "step": 20212 + }, + { + "epoch": 2.7, + "grad_norm": 0.84375, + "learning_rate": 5.7959346334369676e-05, + "loss": 0.5142, + "step": 20213 + }, + { + "epoch": 2.7, + "grad_norm": 0.69140625, + "learning_rate": 5.7948780678159496e-05, + "loss": 0.3451, + "step": 20214 + }, + { + "epoch": 2.7, + "grad_norm": 0.6640625, + "learning_rate": 5.7938215592190794e-05, + "loss": 0.2935, + "step": 20215 + }, + { + "epoch": 2.7, + "grad_norm": 0.6328125, + "learning_rate": 5.792765107660688e-05, + "loss": 0.4397, + "step": 20216 + }, + { + "epoch": 2.7, + "grad_norm": 0.61328125, + "learning_rate": 5.791708713155102e-05, + "loss": 0.3721, + "step": 20217 + }, + { + "epoch": 2.7, + "grad_norm": 0.62109375, + "learning_rate": 5.790652375716652e-05, + "loss": 0.3793, + "step": 20218 + }, + { + "epoch": 2.7, + "grad_norm": 0.57421875, + "learning_rate": 5.789596095359653e-05, + "loss": 0.1971, + "step": 20219 + }, + { + "epoch": 2.7, + "grad_norm": 0.69921875, + "learning_rate": 5.7885398720984315e-05, + "loss": 0.5821, + "step": 20220 + }, + { + "epoch": 2.7, + "grad_norm": 0.58984375, + "learning_rate": 5.7874837059473176e-05, + "loss": 0.2182, + "step": 20221 + }, + { + "epoch": 2.7, + "grad_norm": 0.458984375, + "learning_rate": 5.786427596920624e-05, + "loss": 0.3255, + "step": 20222 + }, + { + "epoch": 2.7, + "grad_norm": 0.41015625, + "learning_rate": 5.785371545032681e-05, + "loss": 0.2298, + "step": 20223 + }, + { + "epoch": 2.7, + "grad_norm": 0.46484375, + "learning_rate": 5.7843155502977995e-05, + "loss": 0.2267, + "step": 20224 + }, + { + "epoch": 2.7, + "grad_norm": 0.53515625, + "learning_rate": 5.783259612730305e-05, + "loss": 0.2449, + "step": 20225 + }, + { + "epoch": 2.7, + "grad_norm": 0.5234375, + "learning_rate": 5.782203732344518e-05, + "loss": 0.1755, + "step": 20226 + }, + { + "epoch": 2.7, + "grad_norm": 0.796875, + "learning_rate": 5.781147909154758e-05, + "loss": 0.4585, + "step": 20227 + }, + { + "epoch": 2.7, + "grad_norm": 0.5, + "learning_rate": 5.780092143175335e-05, + "loss": 0.234, + "step": 20228 + }, + { + "epoch": 2.7, + "grad_norm": 0.62109375, + "learning_rate": 5.7790364344205726e-05, + "loss": 0.2338, + "step": 20229 + }, + { + "epoch": 2.7, + "grad_norm": 0.68359375, + "learning_rate": 5.777980782904785e-05, + "loss": 0.7144, + "step": 20230 + }, + { + "epoch": 2.7, + "grad_norm": 0.5078125, + "learning_rate": 5.77692518864229e-05, + "loss": 0.2246, + "step": 20231 + }, + { + "epoch": 2.7, + "grad_norm": 0.470703125, + "learning_rate": 5.7758696516473985e-05, + "loss": 0.2044, + "step": 20232 + }, + { + "epoch": 2.7, + "grad_norm": 0.458984375, + "learning_rate": 5.7748141719344285e-05, + "loss": 0.1679, + "step": 20233 + }, + { + "epoch": 2.7, + "grad_norm": 0.609375, + "learning_rate": 5.773758749517685e-05, + "loss": 0.3715, + "step": 20234 + }, + { + "epoch": 2.7, + "grad_norm": 0.640625, + "learning_rate": 5.7727033844114866e-05, + "loss": 0.425, + "step": 20235 + }, + { + "epoch": 2.7, + "grad_norm": 0.515625, + "learning_rate": 5.771648076630146e-05, + "loss": 0.2932, + "step": 20236 + }, + { + "epoch": 2.7, + "grad_norm": 0.59765625, + "learning_rate": 5.770592826187968e-05, + "loss": 0.2885, + "step": 20237 + }, + { + "epoch": 2.7, + "grad_norm": 0.55078125, + "learning_rate": 5.769537633099266e-05, + "loss": 0.145, + "step": 20238 + }, + { + "epoch": 2.7, + "grad_norm": 0.609375, + "learning_rate": 5.768482497378348e-05, + "loss": 0.403, + "step": 20239 + }, + { + "epoch": 2.7, + "grad_norm": 0.53125, + "learning_rate": 5.767427419039524e-05, + "loss": 0.2521, + "step": 20240 + }, + { + "epoch": 2.7, + "grad_norm": 0.341796875, + "learning_rate": 5.766372398097104e-05, + "loss": 0.1025, + "step": 20241 + }, + { + "epoch": 2.7, + "grad_norm": 0.6015625, + "learning_rate": 5.765317434565387e-05, + "loss": 0.3646, + "step": 20242 + }, + { + "epoch": 2.7, + "grad_norm": 0.72265625, + "learning_rate": 5.764262528458684e-05, + "loss": 0.4099, + "step": 20243 + }, + { + "epoch": 2.7, + "grad_norm": 0.578125, + "learning_rate": 5.763207679791299e-05, + "loss": 0.3897, + "step": 20244 + }, + { + "epoch": 2.7, + "grad_norm": 0.58203125, + "learning_rate": 5.762152888577541e-05, + "loss": 0.2681, + "step": 20245 + }, + { + "epoch": 2.7, + "grad_norm": 0.66015625, + "learning_rate": 5.761098154831711e-05, + "loss": 0.5942, + "step": 20246 + }, + { + "epoch": 2.7, + "grad_norm": 0.70703125, + "learning_rate": 5.7600434785681055e-05, + "loss": 0.2352, + "step": 20247 + }, + { + "epoch": 2.7, + "grad_norm": 0.80859375, + "learning_rate": 5.758988859801032e-05, + "loss": 0.2531, + "step": 20248 + }, + { + "epoch": 2.7, + "grad_norm": 0.5703125, + "learning_rate": 5.757934298544792e-05, + "loss": 0.3027, + "step": 20249 + }, + { + "epoch": 2.7, + "grad_norm": 0.67578125, + "learning_rate": 5.7568797948136875e-05, + "loss": 0.5073, + "step": 20250 + }, + { + "epoch": 2.7, + "grad_norm": 0.5078125, + "learning_rate": 5.755825348622014e-05, + "loss": 0.2704, + "step": 20251 + }, + { + "epoch": 2.7, + "grad_norm": 0.6640625, + "learning_rate": 5.754770959984073e-05, + "loss": 0.4707, + "step": 20252 + }, + { + "epoch": 2.7, + "grad_norm": 0.5546875, + "learning_rate": 5.753716628914161e-05, + "loss": 0.4729, + "step": 20253 + }, + { + "epoch": 2.7, + "grad_norm": 0.5625, + "learning_rate": 5.752662355426581e-05, + "loss": 0.2564, + "step": 20254 + }, + { + "epoch": 2.7, + "grad_norm": 0.546875, + "learning_rate": 5.751608139535621e-05, + "loss": 0.3575, + "step": 20255 + }, + { + "epoch": 2.7, + "grad_norm": 0.609375, + "learning_rate": 5.750553981255582e-05, + "loss": 0.3041, + "step": 20256 + }, + { + "epoch": 2.7, + "grad_norm": 0.6171875, + "learning_rate": 5.749499880600762e-05, + "loss": 0.4819, + "step": 20257 + }, + { + "epoch": 2.7, + "grad_norm": 0.431640625, + "learning_rate": 5.748445837585448e-05, + "loss": 0.345, + "step": 20258 + }, + { + "epoch": 2.7, + "grad_norm": 0.49609375, + "learning_rate": 5.74739185222394e-05, + "loss": 0.2162, + "step": 20259 + }, + { + "epoch": 2.7, + "grad_norm": 0.451171875, + "learning_rate": 5.7463379245305246e-05, + "loss": 0.2478, + "step": 20260 + }, + { + "epoch": 2.7, + "grad_norm": 0.5703125, + "learning_rate": 5.7452840545194975e-05, + "loss": 0.3865, + "step": 20261 + }, + { + "epoch": 2.7, + "grad_norm": 0.4765625, + "learning_rate": 5.744230242205149e-05, + "loss": 0.2051, + "step": 20262 + }, + { + "epoch": 2.7, + "grad_norm": 0.60546875, + "learning_rate": 5.743176487601769e-05, + "loss": 0.4321, + "step": 20263 + }, + { + "epoch": 2.7, + "grad_norm": 0.5703125, + "learning_rate": 5.742122790723652e-05, + "loss": 0.2764, + "step": 20264 + }, + { + "epoch": 2.7, + "grad_norm": 0.55078125, + "learning_rate": 5.741069151585079e-05, + "loss": 0.4309, + "step": 20265 + }, + { + "epoch": 2.7, + "grad_norm": 0.53125, + "learning_rate": 5.7400155702003414e-05, + "loss": 0.1885, + "step": 20266 + }, + { + "epoch": 2.7, + "grad_norm": 0.66796875, + "learning_rate": 5.738962046583727e-05, + "loss": 0.3673, + "step": 20267 + }, + { + "epoch": 2.7, + "grad_norm": 0.57421875, + "learning_rate": 5.7379085807495245e-05, + "loss": 0.6413, + "step": 20268 + }, + { + "epoch": 2.7, + "grad_norm": 0.53125, + "learning_rate": 5.7368551727120144e-05, + "loss": 0.3276, + "step": 20269 + }, + { + "epoch": 2.7, + "grad_norm": 0.58203125, + "learning_rate": 5.7358018224854896e-05, + "loss": 0.5132, + "step": 20270 + }, + { + "epoch": 2.7, + "grad_norm": 0.39453125, + "learning_rate": 5.734748530084223e-05, + "loss": 0.1467, + "step": 20271 + }, + { + "epoch": 2.71, + "grad_norm": 0.5859375, + "learning_rate": 5.7336952955225034e-05, + "loss": 0.5242, + "step": 20272 + }, + { + "epoch": 2.71, + "grad_norm": 0.490234375, + "learning_rate": 5.7326421188146195e-05, + "loss": 0.2887, + "step": 20273 + }, + { + "epoch": 2.71, + "grad_norm": 0.443359375, + "learning_rate": 5.7315889999748415e-05, + "loss": 0.3088, + "step": 20274 + }, + { + "epoch": 2.71, + "grad_norm": 0.71875, + "learning_rate": 5.730535939017456e-05, + "loss": 0.266, + "step": 20275 + }, + { + "epoch": 2.71, + "grad_norm": 0.5390625, + "learning_rate": 5.729482935956745e-05, + "loss": 0.2873, + "step": 20276 + }, + { + "epoch": 2.71, + "grad_norm": 0.36328125, + "learning_rate": 5.728429990806989e-05, + "loss": 0.1709, + "step": 20277 + }, + { + "epoch": 2.71, + "grad_norm": 0.4296875, + "learning_rate": 5.727377103582461e-05, + "loss": 0.1353, + "step": 20278 + }, + { + "epoch": 2.71, + "grad_norm": 0.67578125, + "learning_rate": 5.72632427429744e-05, + "loss": 0.3258, + "step": 20279 + }, + { + "epoch": 2.71, + "grad_norm": 0.66015625, + "learning_rate": 5.725271502966205e-05, + "loss": 0.3236, + "step": 20280 + }, + { + "epoch": 2.71, + "grad_norm": 0.57421875, + "learning_rate": 5.7242187896030364e-05, + "loss": 0.3662, + "step": 20281 + }, + { + "epoch": 2.71, + "grad_norm": 0.4921875, + "learning_rate": 5.7231661342222056e-05, + "loss": 0.3826, + "step": 20282 + }, + { + "epoch": 2.71, + "grad_norm": 0.55078125, + "learning_rate": 5.722113536837982e-05, + "loss": 0.5524, + "step": 20283 + }, + { + "epoch": 2.71, + "grad_norm": 0.76171875, + "learning_rate": 5.7210609974646445e-05, + "loss": 0.2363, + "step": 20284 + }, + { + "epoch": 2.71, + "grad_norm": 0.546875, + "learning_rate": 5.720008516116466e-05, + "loss": 0.4158, + "step": 20285 + }, + { + "epoch": 2.71, + "grad_norm": 0.70703125, + "learning_rate": 5.718956092807719e-05, + "loss": 0.5802, + "step": 20286 + }, + { + "epoch": 2.71, + "grad_norm": 0.7109375, + "learning_rate": 5.7179037275526804e-05, + "loss": 0.4075, + "step": 20287 + }, + { + "epoch": 2.71, + "grad_norm": 0.5078125, + "learning_rate": 5.71685142036561e-05, + "loss": 0.2064, + "step": 20288 + }, + { + "epoch": 2.71, + "grad_norm": 0.47265625, + "learning_rate": 5.715799171260783e-05, + "loss": 0.1902, + "step": 20289 + }, + { + "epoch": 2.71, + "grad_norm": 0.6953125, + "learning_rate": 5.714746980252469e-05, + "loss": 0.4474, + "step": 20290 + }, + { + "epoch": 2.71, + "grad_norm": 0.62109375, + "learning_rate": 5.71369484735494e-05, + "loss": 0.5389, + "step": 20291 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 5.712642772582457e-05, + "loss": 0.5539, + "step": 20292 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 5.711590755949289e-05, + "loss": 0.4525, + "step": 20293 + }, + { + "epoch": 2.71, + "grad_norm": 0.4765625, + "learning_rate": 5.7105387974697063e-05, + "loss": 0.1298, + "step": 20294 + }, + { + "epoch": 2.71, + "grad_norm": 0.6015625, + "learning_rate": 5.7094868971579676e-05, + "loss": 0.2613, + "step": 20295 + }, + { + "epoch": 2.71, + "grad_norm": 0.51953125, + "learning_rate": 5.7084350550283426e-05, + "loss": 0.3863, + "step": 20296 + }, + { + "epoch": 2.71, + "grad_norm": 0.466796875, + "learning_rate": 5.7073832710950894e-05, + "loss": 0.2765, + "step": 20297 + }, + { + "epoch": 2.71, + "grad_norm": 0.6640625, + "learning_rate": 5.706331545372474e-05, + "loss": 0.4708, + "step": 20298 + }, + { + "epoch": 2.71, + "grad_norm": 0.65625, + "learning_rate": 5.705279877874758e-05, + "loss": 0.3456, + "step": 20299 + }, + { + "epoch": 2.71, + "grad_norm": 0.52734375, + "learning_rate": 5.704228268616208e-05, + "loss": 0.3082, + "step": 20300 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 5.703176717611074e-05, + "loss": 0.3424, + "step": 20301 + }, + { + "epoch": 2.71, + "grad_norm": 0.5625, + "learning_rate": 5.702125224873624e-05, + "loss": 0.2596, + "step": 20302 + }, + { + "epoch": 2.71, + "grad_norm": 0.41015625, + "learning_rate": 5.701073790418112e-05, + "loss": 0.1231, + "step": 20303 + }, + { + "epoch": 2.71, + "grad_norm": 0.57421875, + "learning_rate": 5.700022414258803e-05, + "loss": 0.3956, + "step": 20304 + }, + { + "epoch": 2.71, + "grad_norm": 0.69140625, + "learning_rate": 5.698971096409946e-05, + "loss": 0.2701, + "step": 20305 + }, + { + "epoch": 2.71, + "grad_norm": 0.396484375, + "learning_rate": 5.6979198368858054e-05, + "loss": 0.2228, + "step": 20306 + }, + { + "epoch": 2.71, + "grad_norm": 0.65234375, + "learning_rate": 5.69686863570063e-05, + "loss": 0.4052, + "step": 20307 + }, + { + "epoch": 2.71, + "grad_norm": 0.55078125, + "learning_rate": 5.6958174928686756e-05, + "loss": 0.4348, + "step": 20308 + }, + { + "epoch": 2.71, + "grad_norm": 1.125, + "learning_rate": 5.694766408404204e-05, + "loss": 0.2644, + "step": 20309 + }, + { + "epoch": 2.71, + "grad_norm": 0.58203125, + "learning_rate": 5.693715382321458e-05, + "loss": 0.4274, + "step": 20310 + }, + { + "epoch": 2.71, + "grad_norm": 0.72265625, + "learning_rate": 5.692664414634696e-05, + "loss": 0.3952, + "step": 20311 + }, + { + "epoch": 2.71, + "grad_norm": 0.62109375, + "learning_rate": 5.691613505358169e-05, + "loss": 0.5035, + "step": 20312 + }, + { + "epoch": 2.71, + "grad_norm": 0.7265625, + "learning_rate": 5.690562654506126e-05, + "loss": 0.3945, + "step": 20313 + }, + { + "epoch": 2.71, + "grad_norm": 0.6484375, + "learning_rate": 5.6895118620928255e-05, + "loss": 0.3779, + "step": 20314 + }, + { + "epoch": 2.71, + "grad_norm": 0.69140625, + "learning_rate": 5.688461128132505e-05, + "loss": 0.35, + "step": 20315 + }, + { + "epoch": 2.71, + "grad_norm": 0.65625, + "learning_rate": 5.687410452639421e-05, + "loss": 0.4609, + "step": 20316 + }, + { + "epoch": 2.71, + "grad_norm": 0.53515625, + "learning_rate": 5.686359835627817e-05, + "loss": 0.2649, + "step": 20317 + }, + { + "epoch": 2.71, + "grad_norm": 0.6484375, + "learning_rate": 5.685309277111947e-05, + "loss": 0.5194, + "step": 20318 + }, + { + "epoch": 2.71, + "grad_norm": 0.70703125, + "learning_rate": 5.684258777106052e-05, + "loss": 0.5323, + "step": 20319 + }, + { + "epoch": 2.71, + "grad_norm": 0.546875, + "learning_rate": 5.683208335624373e-05, + "loss": 0.3203, + "step": 20320 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 5.682157952681161e-05, + "loss": 0.2703, + "step": 20321 + }, + { + "epoch": 2.71, + "grad_norm": 0.6953125, + "learning_rate": 5.6811076282906575e-05, + "loss": 0.4488, + "step": 20322 + }, + { + "epoch": 2.71, + "grad_norm": 0.578125, + "learning_rate": 5.6800573624671104e-05, + "loss": 0.4235, + "step": 20323 + }, + { + "epoch": 2.71, + "grad_norm": 0.62890625, + "learning_rate": 5.6790071552247536e-05, + "loss": 0.3424, + "step": 20324 + }, + { + "epoch": 2.71, + "grad_norm": 0.515625, + "learning_rate": 5.677957006577834e-05, + "loss": 0.4801, + "step": 20325 + }, + { + "epoch": 2.71, + "grad_norm": 0.55859375, + "learning_rate": 5.67690691654059e-05, + "loss": 0.2951, + "step": 20326 + }, + { + "epoch": 2.71, + "grad_norm": 0.48828125, + "learning_rate": 5.675856885127269e-05, + "loss": 0.2795, + "step": 20327 + }, + { + "epoch": 2.71, + "grad_norm": 0.703125, + "learning_rate": 5.674806912352097e-05, + "loss": 0.9069, + "step": 20328 + }, + { + "epoch": 2.71, + "grad_norm": 0.56640625, + "learning_rate": 5.6737569982293205e-05, + "loss": 0.1581, + "step": 20329 + }, + { + "epoch": 2.71, + "grad_norm": 0.54296875, + "learning_rate": 5.672707142773179e-05, + "loss": 0.3046, + "step": 20330 + }, + { + "epoch": 2.71, + "grad_norm": 0.984375, + "learning_rate": 5.671657345997903e-05, + "loss": 0.2218, + "step": 20331 + }, + { + "epoch": 2.71, + "grad_norm": 0.65625, + "learning_rate": 5.670607607917735e-05, + "loss": 0.3832, + "step": 20332 + }, + { + "epoch": 2.71, + "grad_norm": 0.515625, + "learning_rate": 5.6695579285469025e-05, + "loss": 0.337, + "step": 20333 + }, + { + "epoch": 2.71, + "grad_norm": 0.578125, + "learning_rate": 5.668508307899644e-05, + "loss": 0.1719, + "step": 20334 + }, + { + "epoch": 2.71, + "grad_norm": 0.81640625, + "learning_rate": 5.667458745990192e-05, + "loss": 0.3321, + "step": 20335 + }, + { + "epoch": 2.71, + "grad_norm": 0.55078125, + "learning_rate": 5.66640924283278e-05, + "loss": 0.4523, + "step": 20336 + }, + { + "epoch": 2.71, + "grad_norm": 0.439453125, + "learning_rate": 5.6653597984416454e-05, + "loss": 0.1554, + "step": 20337 + }, + { + "epoch": 2.71, + "grad_norm": 0.65625, + "learning_rate": 5.664310412831007e-05, + "loss": 0.2705, + "step": 20338 + }, + { + "epoch": 2.71, + "grad_norm": 0.50390625, + "learning_rate": 5.6632610860151046e-05, + "loss": 0.2412, + "step": 20339 + }, + { + "epoch": 2.71, + "grad_norm": 0.498046875, + "learning_rate": 5.6622118180081655e-05, + "loss": 0.2602, + "step": 20340 + }, + { + "epoch": 2.71, + "grad_norm": 0.61328125, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.4088, + "step": 20341 + }, + { + "epoch": 2.71, + "grad_norm": 0.546875, + "learning_rate": 5.660113458478096e-05, + "loss": 0.387, + "step": 20342 + }, + { + "epoch": 2.71, + "grad_norm": 0.70703125, + "learning_rate": 5.659064366983413e-05, + "loss": 0.2906, + "step": 20343 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 5.6580153343546035e-05, + "loss": 0.3382, + "step": 20344 + }, + { + "epoch": 2.71, + "grad_norm": 0.83984375, + "learning_rate": 5.656966360605893e-05, + "loss": 0.55, + "step": 20345 + }, + { + "epoch": 2.71, + "grad_norm": 0.53125, + "learning_rate": 5.655917445751511e-05, + "loss": 0.2619, + "step": 20346 + }, + { + "epoch": 2.72, + "grad_norm": 0.51171875, + "learning_rate": 5.6548685898056716e-05, + "loss": 0.2979, + "step": 20347 + }, + { + "epoch": 2.72, + "grad_norm": 0.53125, + "learning_rate": 5.6538197927826045e-05, + "loss": 0.1759, + "step": 20348 + }, + { + "epoch": 2.72, + "grad_norm": 0.55078125, + "learning_rate": 5.652771054696528e-05, + "loss": 0.2444, + "step": 20349 + }, + { + "epoch": 2.72, + "grad_norm": 0.640625, + "learning_rate": 5.65172237556167e-05, + "loss": 0.2085, + "step": 20350 + }, + { + "epoch": 2.72, + "grad_norm": 0.6171875, + "learning_rate": 5.650673755392245e-05, + "loss": 0.4601, + "step": 20351 + }, + { + "epoch": 2.72, + "grad_norm": 0.49609375, + "learning_rate": 5.649625194202475e-05, + "loss": 0.1779, + "step": 20352 + }, + { + "epoch": 2.72, + "grad_norm": 0.75, + "learning_rate": 5.6485766920065775e-05, + "loss": 0.5266, + "step": 20353 + }, + { + "epoch": 2.72, + "grad_norm": 0.71875, + "learning_rate": 5.647528248818779e-05, + "loss": 0.556, + "step": 20354 + }, + { + "epoch": 2.72, + "grad_norm": 0.50390625, + "learning_rate": 5.64647986465329e-05, + "loss": 0.2511, + "step": 20355 + }, + { + "epoch": 2.72, + "grad_norm": 0.84375, + "learning_rate": 5.645431539524324e-05, + "loss": 0.3029, + "step": 20356 + }, + { + "epoch": 2.72, + "grad_norm": 0.44921875, + "learning_rate": 5.644383273446101e-05, + "loss": 0.3502, + "step": 20357 + }, + { + "epoch": 2.72, + "grad_norm": 0.443359375, + "learning_rate": 5.6433350664328365e-05, + "loss": 0.3151, + "step": 20358 + }, + { + "epoch": 2.72, + "grad_norm": 0.703125, + "learning_rate": 5.642286918498745e-05, + "loss": 0.6782, + "step": 20359 + }, + { + "epoch": 2.72, + "grad_norm": 0.73046875, + "learning_rate": 5.6412388296580424e-05, + "loss": 0.2352, + "step": 20360 + }, + { + "epoch": 2.72, + "grad_norm": 0.52734375, + "learning_rate": 5.640190799924936e-05, + "loss": 0.2833, + "step": 20361 + }, + { + "epoch": 2.72, + "grad_norm": 0.421875, + "learning_rate": 5.639142829313639e-05, + "loss": 0.1749, + "step": 20362 + }, + { + "epoch": 2.72, + "grad_norm": 0.609375, + "learning_rate": 5.6380949178383656e-05, + "loss": 0.4812, + "step": 20363 + }, + { + "epoch": 2.72, + "grad_norm": 0.69921875, + "learning_rate": 5.637047065513328e-05, + "loss": 0.319, + "step": 20364 + }, + { + "epoch": 2.72, + "grad_norm": 0.6171875, + "learning_rate": 5.635999272352728e-05, + "loss": 0.3534, + "step": 20365 + }, + { + "epoch": 2.72, + "grad_norm": 0.42578125, + "learning_rate": 5.634951538370778e-05, + "loss": 0.3134, + "step": 20366 + }, + { + "epoch": 2.72, + "grad_norm": 0.5390625, + "learning_rate": 5.6339038635816935e-05, + "loss": 0.3717, + "step": 20367 + }, + { + "epoch": 2.72, + "grad_norm": 0.703125, + "learning_rate": 5.632856247999667e-05, + "loss": 0.3698, + "step": 20368 + }, + { + "epoch": 2.72, + "grad_norm": 0.59765625, + "learning_rate": 5.631808691638919e-05, + "loss": 0.1819, + "step": 20369 + }, + { + "epoch": 2.72, + "grad_norm": 0.62109375, + "learning_rate": 5.630761194513644e-05, + "loss": 0.448, + "step": 20370 + }, + { + "epoch": 2.72, + "grad_norm": 0.63671875, + "learning_rate": 5.629713756638051e-05, + "loss": 0.277, + "step": 20371 + }, + { + "epoch": 2.72, + "grad_norm": 0.59765625, + "learning_rate": 5.628666378026345e-05, + "loss": 0.5236, + "step": 20372 + }, + { + "epoch": 2.72, + "grad_norm": 0.54296875, + "learning_rate": 5.6276190586927324e-05, + "loss": 0.2494, + "step": 20373 + }, + { + "epoch": 2.72, + "grad_norm": 0.546875, + "learning_rate": 5.6265717986514065e-05, + "loss": 0.3879, + "step": 20374 + }, + { + "epoch": 2.72, + "grad_norm": 0.7734375, + "learning_rate": 5.6255245979165736e-05, + "loss": 0.2671, + "step": 20375 + }, + { + "epoch": 2.72, + "grad_norm": 0.7265625, + "learning_rate": 5.624477456502437e-05, + "loss": 0.2188, + "step": 20376 + }, + { + "epoch": 2.72, + "grad_norm": 0.4609375, + "learning_rate": 5.6234303744231966e-05, + "loss": 0.2625, + "step": 20377 + }, + { + "epoch": 2.72, + "grad_norm": 0.52734375, + "learning_rate": 5.622383351693046e-05, + "loss": 0.3512, + "step": 20378 + }, + { + "epoch": 2.72, + "grad_norm": 0.56640625, + "learning_rate": 5.621336388326189e-05, + "loss": 0.2393, + "step": 20379 + }, + { + "epoch": 2.72, + "grad_norm": 0.44140625, + "learning_rate": 5.620289484336818e-05, + "loss": 0.1523, + "step": 20380 + }, + { + "epoch": 2.72, + "grad_norm": 0.57421875, + "learning_rate": 5.619242639739133e-05, + "loss": 0.414, + "step": 20381 + }, + { + "epoch": 2.72, + "grad_norm": 0.7421875, + "learning_rate": 5.6181958545473325e-05, + "loss": 0.2779, + "step": 20382 + }, + { + "epoch": 2.72, + "grad_norm": 0.56640625, + "learning_rate": 5.617149128775605e-05, + "loss": 0.292, + "step": 20383 + }, + { + "epoch": 2.72, + "grad_norm": 0.6640625, + "learning_rate": 5.616102462438147e-05, + "loss": 0.6673, + "step": 20384 + }, + { + "epoch": 2.72, + "grad_norm": 0.76171875, + "learning_rate": 5.615055855549154e-05, + "loss": 0.4524, + "step": 20385 + }, + { + "epoch": 2.72, + "grad_norm": 0.59765625, + "learning_rate": 5.6140093081228185e-05, + "loss": 0.3506, + "step": 20386 + }, + { + "epoch": 2.72, + "grad_norm": 0.578125, + "learning_rate": 5.612962820173335e-05, + "loss": 0.2432, + "step": 20387 + }, + { + "epoch": 2.72, + "grad_norm": 0.62109375, + "learning_rate": 5.6119163917148866e-05, + "loss": 0.2443, + "step": 20388 + }, + { + "epoch": 2.72, + "grad_norm": 0.58984375, + "learning_rate": 5.610870022761669e-05, + "loss": 0.2931, + "step": 20389 + }, + { + "epoch": 2.72, + "grad_norm": 0.546875, + "learning_rate": 5.609823713327871e-05, + "loss": 0.2008, + "step": 20390 + }, + { + "epoch": 2.72, + "grad_norm": 0.671875, + "learning_rate": 5.6087774634276856e-05, + "loss": 0.3037, + "step": 20391 + }, + { + "epoch": 2.72, + "grad_norm": 0.53125, + "learning_rate": 5.607731273075294e-05, + "loss": 0.3366, + "step": 20392 + }, + { + "epoch": 2.72, + "grad_norm": 0.52734375, + "learning_rate": 5.606685142284883e-05, + "loss": 0.271, + "step": 20393 + }, + { + "epoch": 2.72, + "grad_norm": 0.63671875, + "learning_rate": 5.605639071070641e-05, + "loss": 0.2677, + "step": 20394 + }, + { + "epoch": 2.72, + "grad_norm": 0.6328125, + "learning_rate": 5.604593059446752e-05, + "loss": 0.4314, + "step": 20395 + }, + { + "epoch": 2.72, + "grad_norm": 0.62890625, + "learning_rate": 5.603547107427408e-05, + "loss": 0.3252, + "step": 20396 + }, + { + "epoch": 2.72, + "grad_norm": 0.5546875, + "learning_rate": 5.602501215026782e-05, + "loss": 0.3078, + "step": 20397 + }, + { + "epoch": 2.72, + "grad_norm": 0.625, + "learning_rate": 5.601455382259062e-05, + "loss": 0.4217, + "step": 20398 + }, + { + "epoch": 2.72, + "grad_norm": 0.703125, + "learning_rate": 5.6004096091384295e-05, + "loss": 0.3186, + "step": 20399 + }, + { + "epoch": 2.72, + "grad_norm": 0.35546875, + "learning_rate": 5.599363895679071e-05, + "loss": 0.1289, + "step": 20400 + }, + { + "epoch": 2.72, + "grad_norm": 0.4765625, + "learning_rate": 5.5983182418951576e-05, + "loss": 0.2993, + "step": 20401 + }, + { + "epoch": 2.72, + "grad_norm": 0.6171875, + "learning_rate": 5.5972726478008744e-05, + "loss": 0.3169, + "step": 20402 + }, + { + "epoch": 2.72, + "grad_norm": 0.5625, + "learning_rate": 5.596227113410404e-05, + "loss": 0.2462, + "step": 20403 + }, + { + "epoch": 2.72, + "grad_norm": 0.67578125, + "learning_rate": 5.5951816387379154e-05, + "loss": 0.2928, + "step": 20404 + }, + { + "epoch": 2.72, + "grad_norm": 0.57421875, + "learning_rate": 5.594136223797595e-05, + "loss": 0.486, + "step": 20405 + }, + { + "epoch": 2.72, + "grad_norm": 0.80859375, + "learning_rate": 5.593090868603612e-05, + "loss": 0.4843, + "step": 20406 + }, + { + "epoch": 2.72, + "grad_norm": 0.67578125, + "learning_rate": 5.592045573170144e-05, + "loss": 0.2008, + "step": 20407 + }, + { + "epoch": 2.72, + "grad_norm": 0.5625, + "learning_rate": 5.591000337511367e-05, + "loss": 0.3891, + "step": 20408 + }, + { + "epoch": 2.72, + "grad_norm": 0.6484375, + "learning_rate": 5.589955161641456e-05, + "loss": 0.2187, + "step": 20409 + }, + { + "epoch": 2.72, + "grad_norm": 0.5625, + "learning_rate": 5.588910045574587e-05, + "loss": 0.3283, + "step": 20410 + }, + { + "epoch": 2.72, + "grad_norm": 0.6171875, + "learning_rate": 5.5878649893249245e-05, + "loss": 0.4293, + "step": 20411 + }, + { + "epoch": 2.72, + "grad_norm": 0.64453125, + "learning_rate": 5.586819992906645e-05, + "loss": 0.3447, + "step": 20412 + }, + { + "epoch": 2.72, + "grad_norm": 0.64453125, + "learning_rate": 5.5857750563339186e-05, + "loss": 0.3254, + "step": 20413 + }, + { + "epoch": 2.72, + "grad_norm": 0.50390625, + "learning_rate": 5.584730179620918e-05, + "loss": 0.2888, + "step": 20414 + }, + { + "epoch": 2.72, + "grad_norm": 0.4921875, + "learning_rate": 5.583685362781812e-05, + "loss": 0.2057, + "step": 20415 + }, + { + "epoch": 2.72, + "grad_norm": 0.6015625, + "learning_rate": 5.582640605830762e-05, + "loss": 0.2745, + "step": 20416 + }, + { + "epoch": 2.72, + "grad_norm": 0.5390625, + "learning_rate": 5.5815959087819404e-05, + "loss": 0.1821, + "step": 20417 + }, + { + "epoch": 2.72, + "grad_norm": 0.466796875, + "learning_rate": 5.580551271649515e-05, + "loss": 0.4436, + "step": 20418 + }, + { + "epoch": 2.72, + "grad_norm": 0.57421875, + "learning_rate": 5.579506694447652e-05, + "loss": 0.1935, + "step": 20419 + }, + { + "epoch": 2.72, + "grad_norm": 0.58203125, + "learning_rate": 5.5784621771905135e-05, + "loss": 0.479, + "step": 20420 + }, + { + "epoch": 2.72, + "grad_norm": 0.76171875, + "learning_rate": 5.577417719892266e-05, + "loss": 0.6346, + "step": 20421 + }, + { + "epoch": 2.73, + "grad_norm": 0.78125, + "learning_rate": 5.576373322567071e-05, + "loss": 0.2877, + "step": 20422 + }, + { + "epoch": 2.73, + "grad_norm": 0.64453125, + "learning_rate": 5.575328985229098e-05, + "loss": 0.2533, + "step": 20423 + }, + { + "epoch": 2.73, + "grad_norm": 0.625, + "learning_rate": 5.5742847078924985e-05, + "loss": 0.4152, + "step": 20424 + }, + { + "epoch": 2.73, + "grad_norm": 0.73828125, + "learning_rate": 5.573240490571441e-05, + "loss": 0.3375, + "step": 20425 + }, + { + "epoch": 2.73, + "grad_norm": 0.435546875, + "learning_rate": 5.572196333280081e-05, + "loss": 0.2276, + "step": 20426 + }, + { + "epoch": 2.73, + "grad_norm": 0.439453125, + "learning_rate": 5.571152236032586e-05, + "loss": 0.1498, + "step": 20427 + }, + { + "epoch": 2.73, + "grad_norm": 0.5703125, + "learning_rate": 5.570108198843108e-05, + "loss": 0.4766, + "step": 20428 + }, + { + "epoch": 2.73, + "grad_norm": 0.45703125, + "learning_rate": 5.569064221725803e-05, + "loss": 0.214, + "step": 20429 + }, + { + "epoch": 2.73, + "grad_norm": 0.8671875, + "learning_rate": 5.5680203046948296e-05, + "loss": 0.3909, + "step": 20430 + }, + { + "epoch": 2.73, + "grad_norm": 0.74609375, + "learning_rate": 5.566976447764347e-05, + "loss": 0.3473, + "step": 20431 + }, + { + "epoch": 2.73, + "grad_norm": 0.55859375, + "learning_rate": 5.5659326509485075e-05, + "loss": 0.2607, + "step": 20432 + }, + { + "epoch": 2.73, + "grad_norm": 0.6796875, + "learning_rate": 5.564888914261471e-05, + "loss": 0.333, + "step": 20433 + }, + { + "epoch": 2.73, + "grad_norm": 0.71875, + "learning_rate": 5.5638452377173846e-05, + "loss": 0.4088, + "step": 20434 + }, + { + "epoch": 2.73, + "grad_norm": 0.78515625, + "learning_rate": 5.5628016213304025e-05, + "loss": 0.3109, + "step": 20435 + }, + { + "epoch": 2.73, + "grad_norm": 0.455078125, + "learning_rate": 5.561758065114679e-05, + "loss": 0.2172, + "step": 20436 + }, + { + "epoch": 2.73, + "grad_norm": 0.65234375, + "learning_rate": 5.5607145690843685e-05, + "loss": 0.3618, + "step": 20437 + }, + { + "epoch": 2.73, + "grad_norm": 0.58203125, + "learning_rate": 5.5596711332536135e-05, + "loss": 0.3812, + "step": 20438 + }, + { + "epoch": 2.73, + "grad_norm": 0.45703125, + "learning_rate": 5.558627757636572e-05, + "loss": 0.2336, + "step": 20439 + }, + { + "epoch": 2.73, + "grad_norm": 0.6015625, + "learning_rate": 5.5575844422473835e-05, + "loss": 0.1698, + "step": 20440 + }, + { + "epoch": 2.73, + "grad_norm": 0.59375, + "learning_rate": 5.556541187100201e-05, + "loss": 0.2791, + "step": 20441 + }, + { + "epoch": 2.73, + "grad_norm": 0.66015625, + "learning_rate": 5.555497992209178e-05, + "loss": 0.3598, + "step": 20442 + }, + { + "epoch": 2.73, + "grad_norm": 0.6640625, + "learning_rate": 5.554454857588448e-05, + "loss": 0.2743, + "step": 20443 + }, + { + "epoch": 2.73, + "grad_norm": 0.58984375, + "learning_rate": 5.553411783252165e-05, + "loss": 0.2, + "step": 20444 + }, + { + "epoch": 2.73, + "grad_norm": 0.55859375, + "learning_rate": 5.552368769214471e-05, + "loss": 0.2257, + "step": 20445 + }, + { + "epoch": 2.73, + "grad_norm": 0.41015625, + "learning_rate": 5.551325815489517e-05, + "loss": 0.2289, + "step": 20446 + }, + { + "epoch": 2.73, + "grad_norm": 0.51953125, + "learning_rate": 5.550282922091433e-05, + "loss": 0.2074, + "step": 20447 + }, + { + "epoch": 2.73, + "grad_norm": 0.490234375, + "learning_rate": 5.549240089034371e-05, + "loss": 0.1202, + "step": 20448 + }, + { + "epoch": 2.73, + "grad_norm": 0.703125, + "learning_rate": 5.5481973163324684e-05, + "loss": 0.4203, + "step": 20449 + }, + { + "epoch": 2.73, + "grad_norm": 0.5625, + "learning_rate": 5.547154603999871e-05, + "loss": 0.1938, + "step": 20450 + }, + { + "epoch": 2.73, + "grad_norm": 0.58984375, + "learning_rate": 5.546111952050712e-05, + "loss": 0.2215, + "step": 20451 + }, + { + "epoch": 2.73, + "grad_norm": 0.60546875, + "learning_rate": 5.545069360499138e-05, + "loss": 0.3813, + "step": 20452 + }, + { + "epoch": 2.73, + "grad_norm": 0.515625, + "learning_rate": 5.544026829359277e-05, + "loss": 0.2238, + "step": 20453 + }, + { + "epoch": 2.73, + "grad_norm": 0.69140625, + "learning_rate": 5.5429843586452734e-05, + "loss": 0.4184, + "step": 20454 + }, + { + "epoch": 2.73, + "grad_norm": 0.64453125, + "learning_rate": 5.541941948371265e-05, + "loss": 0.4825, + "step": 20455 + }, + { + "epoch": 2.73, + "grad_norm": 0.64453125, + "learning_rate": 5.540899598551381e-05, + "loss": 0.4504, + "step": 20456 + }, + { + "epoch": 2.73, + "grad_norm": 0.64453125, + "learning_rate": 5.539857309199762e-05, + "loss": 0.4805, + "step": 20457 + }, + { + "epoch": 2.73, + "grad_norm": 0.470703125, + "learning_rate": 5.5388150803305396e-05, + "loss": 0.2616, + "step": 20458 + }, + { + "epoch": 2.73, + "grad_norm": 0.6953125, + "learning_rate": 5.537772911957847e-05, + "loss": 0.3016, + "step": 20459 + }, + { + "epoch": 2.73, + "grad_norm": 0.60546875, + "learning_rate": 5.536730804095822e-05, + "loss": 0.5144, + "step": 20460 + }, + { + "epoch": 2.73, + "grad_norm": 0.482421875, + "learning_rate": 5.535688756758588e-05, + "loss": 0.3576, + "step": 20461 + }, + { + "epoch": 2.73, + "grad_norm": 0.412109375, + "learning_rate": 5.53464676996028e-05, + "loss": 0.2131, + "step": 20462 + }, + { + "epoch": 2.73, + "grad_norm": 0.5390625, + "learning_rate": 5.533604843715028e-05, + "loss": 0.2613, + "step": 20463 + }, + { + "epoch": 2.73, + "grad_norm": 0.66015625, + "learning_rate": 5.5325629780369635e-05, + "loss": 0.2668, + "step": 20464 + }, + { + "epoch": 2.73, + "grad_norm": 0.70703125, + "learning_rate": 5.5315211729402126e-05, + "loss": 0.2966, + "step": 20465 + }, + { + "epoch": 2.73, + "grad_norm": 0.4609375, + "learning_rate": 5.5304794284389e-05, + "loss": 0.203, + "step": 20466 + }, + { + "epoch": 2.73, + "grad_norm": 0.48046875, + "learning_rate": 5.529437744547152e-05, + "loss": 0.2241, + "step": 20467 + }, + { + "epoch": 2.73, + "grad_norm": 0.68359375, + "learning_rate": 5.528396121279099e-05, + "loss": 0.246, + "step": 20468 + }, + { + "epoch": 2.73, + "grad_norm": 0.43359375, + "learning_rate": 5.5273545586488684e-05, + "loss": 0.152, + "step": 20469 + }, + { + "epoch": 2.73, + "grad_norm": 0.51171875, + "learning_rate": 5.5263130566705765e-05, + "loss": 0.3231, + "step": 20470 + }, + { + "epoch": 2.73, + "grad_norm": 0.69921875, + "learning_rate": 5.52527161535835e-05, + "loss": 0.5616, + "step": 20471 + }, + { + "epoch": 2.73, + "grad_norm": 0.40234375, + "learning_rate": 5.524230234726313e-05, + "loss": 0.1328, + "step": 20472 + }, + { + "epoch": 2.73, + "grad_norm": 0.6015625, + "learning_rate": 5.523188914788591e-05, + "loss": 0.4236, + "step": 20473 + }, + { + "epoch": 2.73, + "grad_norm": 0.56640625, + "learning_rate": 5.522147655559297e-05, + "loss": 0.3597, + "step": 20474 + }, + { + "epoch": 2.73, + "grad_norm": 0.53515625, + "learning_rate": 5.5211064570525525e-05, + "loss": 0.5904, + "step": 20475 + }, + { + "epoch": 2.73, + "grad_norm": 0.765625, + "learning_rate": 5.520065319282485e-05, + "loss": 0.2383, + "step": 20476 + }, + { + "epoch": 2.73, + "grad_norm": 0.96484375, + "learning_rate": 5.519024242263202e-05, + "loss": 0.4683, + "step": 20477 + }, + { + "epoch": 2.73, + "grad_norm": 0.58203125, + "learning_rate": 5.5179832260088294e-05, + "loss": 0.4317, + "step": 20478 + }, + { + "epoch": 2.73, + "grad_norm": 0.54296875, + "learning_rate": 5.5169422705334785e-05, + "loss": 0.2503, + "step": 20479 + }, + { + "epoch": 2.73, + "grad_norm": 0.6796875, + "learning_rate": 5.5159013758512665e-05, + "loss": 0.3708, + "step": 20480 + }, + { + "epoch": 2.73, + "grad_norm": 0.64453125, + "learning_rate": 5.51486054197631e-05, + "loss": 0.3313, + "step": 20481 + }, + { + "epoch": 2.73, + "grad_norm": 0.8203125, + "learning_rate": 5.513819768922723e-05, + "loss": 0.5539, + "step": 20482 + }, + { + "epoch": 2.73, + "grad_norm": 0.447265625, + "learning_rate": 5.512779056704623e-05, + "loss": 0.1333, + "step": 20483 + }, + { + "epoch": 2.73, + "grad_norm": 0.60546875, + "learning_rate": 5.5117384053361156e-05, + "loss": 0.211, + "step": 20484 + }, + { + "epoch": 2.73, + "grad_norm": 0.7578125, + "learning_rate": 5.510697814831316e-05, + "loss": 0.5312, + "step": 20485 + }, + { + "epoch": 2.73, + "grad_norm": 0.52734375, + "learning_rate": 5.509657285204335e-05, + "loss": 0.4361, + "step": 20486 + }, + { + "epoch": 2.73, + "grad_norm": 0.453125, + "learning_rate": 5.508616816469285e-05, + "loss": 0.2652, + "step": 20487 + }, + { + "epoch": 2.73, + "grad_norm": 0.494140625, + "learning_rate": 5.507576408640276e-05, + "loss": 0.3385, + "step": 20488 + }, + { + "epoch": 2.73, + "grad_norm": 0.7890625, + "learning_rate": 5.50653606173141e-05, + "loss": 0.3421, + "step": 20489 + }, + { + "epoch": 2.73, + "grad_norm": 0.5703125, + "learning_rate": 5.505495775756798e-05, + "loss": 0.3715, + "step": 20490 + }, + { + "epoch": 2.73, + "grad_norm": 0.57421875, + "learning_rate": 5.504455550730548e-05, + "loss": 0.3985, + "step": 20491 + }, + { + "epoch": 2.73, + "grad_norm": 0.62109375, + "learning_rate": 5.503415386666771e-05, + "loss": 0.4616, + "step": 20492 + }, + { + "epoch": 2.73, + "grad_norm": 0.56640625, + "learning_rate": 5.5023752835795614e-05, + "loss": 0.3963, + "step": 20493 + }, + { + "epoch": 2.73, + "grad_norm": 0.7265625, + "learning_rate": 5.501335241483031e-05, + "loss": 0.355, + "step": 20494 + }, + { + "epoch": 2.73, + "grad_norm": 0.400390625, + "learning_rate": 5.500295260391282e-05, + "loss": 0.1657, + "step": 20495 + }, + { + "epoch": 2.73, + "grad_norm": 0.9453125, + "learning_rate": 5.49925534031842e-05, + "loss": 0.287, + "step": 20496 + }, + { + "epoch": 2.74, + "grad_norm": 0.482421875, + "learning_rate": 5.498215481278541e-05, + "loss": 0.1427, + "step": 20497 + }, + { + "epoch": 2.74, + "grad_norm": 0.6484375, + "learning_rate": 5.497175683285749e-05, + "loss": 0.3862, + "step": 20498 + }, + { + "epoch": 2.74, + "grad_norm": 0.765625, + "learning_rate": 5.4961359463541437e-05, + "loss": 0.4289, + "step": 20499 + }, + { + "epoch": 2.74, + "grad_norm": 0.458984375, + "learning_rate": 5.495096270497832e-05, + "loss": 0.309, + "step": 20500 + }, + { + "epoch": 2.74, + "grad_norm": 0.6796875, + "learning_rate": 5.4940566557309035e-05, + "loss": 0.2234, + "step": 20501 + }, + { + "epoch": 2.74, + "grad_norm": 0.69921875, + "learning_rate": 5.493017102067456e-05, + "loss": 0.2761, + "step": 20502 + }, + { + "epoch": 2.74, + "grad_norm": 0.6484375, + "learning_rate": 5.491977609521589e-05, + "loss": 0.5449, + "step": 20503 + }, + { + "epoch": 2.74, + "grad_norm": 0.73046875, + "learning_rate": 5.4909381781073985e-05, + "loss": 0.4869, + "step": 20504 + }, + { + "epoch": 2.74, + "grad_norm": 0.6953125, + "learning_rate": 5.48989880783898e-05, + "loss": 0.331, + "step": 20505 + }, + { + "epoch": 2.74, + "grad_norm": 0.5078125, + "learning_rate": 5.488859498730434e-05, + "loss": 0.3347, + "step": 20506 + }, + { + "epoch": 2.74, + "grad_norm": 0.5703125, + "learning_rate": 5.487820250795843e-05, + "loss": 0.3755, + "step": 20507 + }, + { + "epoch": 2.74, + "grad_norm": 0.640625, + "learning_rate": 5.486781064049305e-05, + "loss": 0.4148, + "step": 20508 + }, + { + "epoch": 2.74, + "grad_norm": 0.478515625, + "learning_rate": 5.485741938504912e-05, + "loss": 0.2706, + "step": 20509 + }, + { + "epoch": 2.74, + "grad_norm": 0.60546875, + "learning_rate": 5.484702874176761e-05, + "loss": 0.4812, + "step": 20510 + }, + { + "epoch": 2.74, + "grad_norm": 0.5, + "learning_rate": 5.4836638710789324e-05, + "loss": 0.2652, + "step": 20511 + }, + { + "epoch": 2.74, + "grad_norm": 0.5625, + "learning_rate": 5.482624929225524e-05, + "loss": 0.2494, + "step": 20512 + }, + { + "epoch": 2.74, + "grad_norm": 0.5078125, + "learning_rate": 5.481586048630617e-05, + "loss": 0.3719, + "step": 20513 + }, + { + "epoch": 2.74, + "grad_norm": 0.5390625, + "learning_rate": 5.480547229308303e-05, + "loss": 0.2704, + "step": 20514 + }, + { + "epoch": 2.74, + "grad_norm": 0.625, + "learning_rate": 5.479508471272673e-05, + "loss": 0.1868, + "step": 20515 + }, + { + "epoch": 2.74, + "grad_norm": 0.6328125, + "learning_rate": 5.4784697745378046e-05, + "loss": 0.5922, + "step": 20516 + }, + { + "epoch": 2.74, + "grad_norm": 0.6015625, + "learning_rate": 5.47743113911779e-05, + "loss": 0.4668, + "step": 20517 + }, + { + "epoch": 2.74, + "grad_norm": 0.5078125, + "learning_rate": 5.47639256502671e-05, + "loss": 0.3234, + "step": 20518 + }, + { + "epoch": 2.74, + "grad_norm": 0.5234375, + "learning_rate": 5.475354052278654e-05, + "loss": 0.2826, + "step": 20519 + }, + { + "epoch": 2.74, + "grad_norm": 0.39453125, + "learning_rate": 5.474315600887698e-05, + "loss": 0.2975, + "step": 20520 + }, + { + "epoch": 2.74, + "grad_norm": 0.5546875, + "learning_rate": 5.473277210867926e-05, + "loss": 0.3985, + "step": 20521 + }, + { + "epoch": 2.74, + "grad_norm": 0.625, + "learning_rate": 5.4722388822334216e-05, + "loss": 0.2408, + "step": 20522 + }, + { + "epoch": 2.74, + "grad_norm": 0.515625, + "learning_rate": 5.4712006149982664e-05, + "loss": 0.2433, + "step": 20523 + }, + { + "epoch": 2.74, + "grad_norm": 0.5234375, + "learning_rate": 5.470162409176538e-05, + "loss": 0.4605, + "step": 20524 + }, + { + "epoch": 2.74, + "grad_norm": 0.58984375, + "learning_rate": 5.46912426478231e-05, + "loss": 0.4581, + "step": 20525 + }, + { + "epoch": 2.74, + "grad_norm": 0.70703125, + "learning_rate": 5.468086181829666e-05, + "loss": 0.4282, + "step": 20526 + }, + { + "epoch": 2.74, + "grad_norm": 0.54296875, + "learning_rate": 5.4670481603326816e-05, + "loss": 0.2741, + "step": 20527 + }, + { + "epoch": 2.74, + "grad_norm": 0.408203125, + "learning_rate": 5.466010200305438e-05, + "loss": 0.1652, + "step": 20528 + }, + { + "epoch": 2.74, + "grad_norm": 0.61328125, + "learning_rate": 5.4649723017620016e-05, + "loss": 0.3796, + "step": 20529 + }, + { + "epoch": 2.74, + "grad_norm": 0.671875, + "learning_rate": 5.46393446471645e-05, + "loss": 0.2712, + "step": 20530 + }, + { + "epoch": 2.74, + "grad_norm": 0.61328125, + "learning_rate": 5.462896689182859e-05, + "loss": 0.3086, + "step": 20531 + }, + { + "epoch": 2.74, + "grad_norm": 0.60546875, + "learning_rate": 5.461858975175301e-05, + "loss": 0.4377, + "step": 20532 + }, + { + "epoch": 2.74, + "grad_norm": 0.734375, + "learning_rate": 5.460821322707852e-05, + "loss": 0.3383, + "step": 20533 + }, + { + "epoch": 2.74, + "grad_norm": 0.546875, + "learning_rate": 5.459783731794575e-05, + "loss": 0.4228, + "step": 20534 + }, + { + "epoch": 2.74, + "grad_norm": 0.55859375, + "learning_rate": 5.458746202449545e-05, + "loss": 0.4727, + "step": 20535 + }, + { + "epoch": 2.74, + "grad_norm": 0.515625, + "learning_rate": 5.457708734686831e-05, + "loss": 0.2092, + "step": 20536 + }, + { + "epoch": 2.74, + "grad_norm": 0.546875, + "learning_rate": 5.456671328520504e-05, + "loss": 0.2964, + "step": 20537 + }, + { + "epoch": 2.74, + "grad_norm": 0.6953125, + "learning_rate": 5.4556339839646317e-05, + "loss": 0.2773, + "step": 20538 + }, + { + "epoch": 2.74, + "grad_norm": 0.470703125, + "learning_rate": 5.4545967010332744e-05, + "loss": 0.2132, + "step": 20539 + }, + { + "epoch": 2.74, + "grad_norm": 0.4609375, + "learning_rate": 5.453559479740502e-05, + "loss": 0.2864, + "step": 20540 + }, + { + "epoch": 2.74, + "grad_norm": 0.5859375, + "learning_rate": 5.4525223201003826e-05, + "loss": 0.1966, + "step": 20541 + }, + { + "epoch": 2.74, + "grad_norm": 0.7421875, + "learning_rate": 5.451485222126982e-05, + "loss": 0.2634, + "step": 20542 + }, + { + "epoch": 2.74, + "grad_norm": 0.5234375, + "learning_rate": 5.450448185834358e-05, + "loss": 0.2139, + "step": 20543 + }, + { + "epoch": 2.74, + "grad_norm": 0.455078125, + "learning_rate": 5.449411211236576e-05, + "loss": 0.2729, + "step": 20544 + }, + { + "epoch": 2.74, + "grad_norm": 0.55859375, + "learning_rate": 5.448374298347698e-05, + "loss": 0.4912, + "step": 20545 + }, + { + "epoch": 2.74, + "grad_norm": 0.6015625, + "learning_rate": 5.4473374471817906e-05, + "loss": 0.2684, + "step": 20546 + }, + { + "epoch": 2.74, + "grad_norm": 0.72265625, + "learning_rate": 5.446300657752905e-05, + "loss": 0.3067, + "step": 20547 + }, + { + "epoch": 2.74, + "grad_norm": 0.60546875, + "learning_rate": 5.445263930075105e-05, + "loss": 0.408, + "step": 20548 + }, + { + "epoch": 2.74, + "grad_norm": 0.44921875, + "learning_rate": 5.444227264162454e-05, + "loss": 0.2994, + "step": 20549 + }, + { + "epoch": 2.74, + "grad_norm": 0.6640625, + "learning_rate": 5.443190660028999e-05, + "loss": 0.4334, + "step": 20550 + }, + { + "epoch": 2.74, + "grad_norm": 0.7109375, + "learning_rate": 5.442154117688809e-05, + "loss": 0.4336, + "step": 20551 + }, + { + "epoch": 2.74, + "grad_norm": 0.578125, + "learning_rate": 5.44111763715593e-05, + "loss": 0.4029, + "step": 20552 + }, + { + "epoch": 2.74, + "grad_norm": 0.47265625, + "learning_rate": 5.440081218444422e-05, + "loss": 0.2622, + "step": 20553 + }, + { + "epoch": 2.74, + "grad_norm": 0.51953125, + "learning_rate": 5.4390448615683385e-05, + "loss": 0.2241, + "step": 20554 + }, + { + "epoch": 2.74, + "grad_norm": 0.734375, + "learning_rate": 5.4380085665417344e-05, + "loss": 0.3605, + "step": 20555 + }, + { + "epoch": 2.74, + "grad_norm": 1.03125, + "learning_rate": 5.436972333378666e-05, + "loss": 0.2149, + "step": 20556 + }, + { + "epoch": 2.74, + "grad_norm": 0.7265625, + "learning_rate": 5.435936162093177e-05, + "loss": 0.1691, + "step": 20557 + }, + { + "epoch": 2.74, + "grad_norm": 0.8203125, + "learning_rate": 5.4349000526993235e-05, + "loss": 0.3334, + "step": 20558 + }, + { + "epoch": 2.74, + "grad_norm": 0.60546875, + "learning_rate": 5.4338640052111545e-05, + "loss": 0.4104, + "step": 20559 + }, + { + "epoch": 2.74, + "grad_norm": 0.68359375, + "learning_rate": 5.432828019642724e-05, + "loss": 0.3183, + "step": 20560 + }, + { + "epoch": 2.74, + "grad_norm": 0.99609375, + "learning_rate": 5.431792096008077e-05, + "loss": 0.4977, + "step": 20561 + }, + { + "epoch": 2.74, + "grad_norm": 0.78125, + "learning_rate": 5.430756234321257e-05, + "loss": 0.5578, + "step": 20562 + }, + { + "epoch": 2.74, + "grad_norm": 0.7109375, + "learning_rate": 5.429720434596315e-05, + "loss": 0.4381, + "step": 20563 + }, + { + "epoch": 2.74, + "grad_norm": 0.6328125, + "learning_rate": 5.428684696847297e-05, + "loss": 0.3054, + "step": 20564 + }, + { + "epoch": 2.74, + "grad_norm": 0.61328125, + "learning_rate": 5.427649021088253e-05, + "loss": 0.3655, + "step": 20565 + }, + { + "epoch": 2.74, + "grad_norm": 0.6875, + "learning_rate": 5.426613407333219e-05, + "loss": 0.6005, + "step": 20566 + }, + { + "epoch": 2.74, + "grad_norm": 0.6953125, + "learning_rate": 5.42557785559624e-05, + "loss": 0.3914, + "step": 20567 + }, + { + "epoch": 2.74, + "grad_norm": 0.55859375, + "learning_rate": 5.424542365891363e-05, + "loss": 0.278, + "step": 20568 + }, + { + "epoch": 2.74, + "grad_norm": 0.546875, + "learning_rate": 5.423506938232632e-05, + "loss": 0.2233, + "step": 20569 + }, + { + "epoch": 2.74, + "grad_norm": 0.466796875, + "learning_rate": 5.4224715726340805e-05, + "loss": 0.1769, + "step": 20570 + }, + { + "epoch": 2.74, + "grad_norm": 0.515625, + "learning_rate": 5.421436269109752e-05, + "loss": 0.3023, + "step": 20571 + }, + { + "epoch": 2.75, + "grad_norm": 0.6640625, + "learning_rate": 5.420401027673685e-05, + "loss": 0.1802, + "step": 20572 + }, + { + "epoch": 2.75, + "grad_norm": 0.6875, + "learning_rate": 5.4193658483399244e-05, + "loss": 0.1731, + "step": 20573 + }, + { + "epoch": 2.75, + "grad_norm": 0.52734375, + "learning_rate": 5.418330731122503e-05, + "loss": 0.3937, + "step": 20574 + }, + { + "epoch": 2.75, + "grad_norm": 0.578125, + "learning_rate": 5.417295676035452e-05, + "loss": 0.2996, + "step": 20575 + }, + { + "epoch": 2.75, + "grad_norm": 0.6328125, + "learning_rate": 5.416260683092814e-05, + "loss": 0.1861, + "step": 20576 + }, + { + "epoch": 2.75, + "grad_norm": 0.60546875, + "learning_rate": 5.415225752308622e-05, + "loss": 0.3262, + "step": 20577 + }, + { + "epoch": 2.75, + "grad_norm": 0.65234375, + "learning_rate": 5.4141908836969125e-05, + "loss": 0.4564, + "step": 20578 + }, + { + "epoch": 2.75, + "grad_norm": 0.6171875, + "learning_rate": 5.4131560772717217e-05, + "loss": 0.3382, + "step": 20579 + }, + { + "epoch": 2.75, + "grad_norm": 0.5703125, + "learning_rate": 5.412121333047073e-05, + "loss": 0.2249, + "step": 20580 + }, + { + "epoch": 2.75, + "grad_norm": 0.609375, + "learning_rate": 5.411086651037005e-05, + "loss": 0.3244, + "step": 20581 + }, + { + "epoch": 2.75, + "grad_norm": 0.6015625, + "learning_rate": 5.410052031255546e-05, + "loss": 0.421, + "step": 20582 + }, + { + "epoch": 2.75, + "grad_norm": 0.7421875, + "learning_rate": 5.409017473716731e-05, + "loss": 0.3058, + "step": 20583 + }, + { + "epoch": 2.75, + "grad_norm": 0.58984375, + "learning_rate": 5.4079829784345824e-05, + "loss": 0.2209, + "step": 20584 + }, + { + "epoch": 2.75, + "grad_norm": 0.546875, + "learning_rate": 5.406948545423136e-05, + "loss": 0.2607, + "step": 20585 + }, + { + "epoch": 2.75, + "grad_norm": 0.53515625, + "learning_rate": 5.405914174696411e-05, + "loss": 0.4472, + "step": 20586 + }, + { + "epoch": 2.75, + "grad_norm": 0.76171875, + "learning_rate": 5.4048798662684376e-05, + "loss": 0.4552, + "step": 20587 + }, + { + "epoch": 2.75, + "grad_norm": 0.6328125, + "learning_rate": 5.403845620153247e-05, + "loss": 0.216, + "step": 20588 + }, + { + "epoch": 2.75, + "grad_norm": 0.48828125, + "learning_rate": 5.4028114363648554e-05, + "loss": 0.4022, + "step": 20589 + }, + { + "epoch": 2.75, + "grad_norm": 0.6875, + "learning_rate": 5.401777314917291e-05, + "loss": 0.361, + "step": 20590 + }, + { + "epoch": 2.75, + "grad_norm": 0.60546875, + "learning_rate": 5.400743255824577e-05, + "loss": 0.3125, + "step": 20591 + }, + { + "epoch": 2.75, + "grad_norm": 0.6015625, + "learning_rate": 5.39970925910074e-05, + "loss": 0.3598, + "step": 20592 + }, + { + "epoch": 2.75, + "grad_norm": 0.4921875, + "learning_rate": 5.398675324759796e-05, + "loss": 0.2524, + "step": 20593 + }, + { + "epoch": 2.75, + "grad_norm": 0.7578125, + "learning_rate": 5.397641452815765e-05, + "loss": 0.3181, + "step": 20594 + }, + { + "epoch": 2.75, + "grad_norm": 0.6484375, + "learning_rate": 5.396607643282672e-05, + "loss": 0.3038, + "step": 20595 + }, + { + "epoch": 2.75, + "grad_norm": 0.5859375, + "learning_rate": 5.395573896174536e-05, + "loss": 0.3284, + "step": 20596 + }, + { + "epoch": 2.75, + "grad_norm": 0.6484375, + "learning_rate": 5.3945402115053745e-05, + "loss": 0.2676, + "step": 20597 + }, + { + "epoch": 2.75, + "grad_norm": 0.7578125, + "learning_rate": 5.393506589289198e-05, + "loss": 0.404, + "step": 20598 + }, + { + "epoch": 2.75, + "grad_norm": 0.640625, + "learning_rate": 5.392473029540029e-05, + "loss": 0.3417, + "step": 20599 + }, + { + "epoch": 2.75, + "grad_norm": 0.5390625, + "learning_rate": 5.3914395322718834e-05, + "loss": 0.3515, + "step": 20600 + }, + { + "epoch": 2.75, + "grad_norm": 0.61328125, + "learning_rate": 5.390406097498779e-05, + "loss": 0.2828, + "step": 20601 + }, + { + "epoch": 2.75, + "grad_norm": 0.5625, + "learning_rate": 5.3893727252347225e-05, + "loss": 0.2974, + "step": 20602 + }, + { + "epoch": 2.75, + "grad_norm": 0.68359375, + "learning_rate": 5.3883394154937305e-05, + "loss": 0.5259, + "step": 20603 + }, + { + "epoch": 2.75, + "grad_norm": 0.609375, + "learning_rate": 5.387306168289817e-05, + "loss": 0.4996, + "step": 20604 + }, + { + "epoch": 2.75, + "grad_norm": 0.62890625, + "learning_rate": 5.386272983636992e-05, + "loss": 0.2859, + "step": 20605 + }, + { + "epoch": 2.75, + "grad_norm": 0.65625, + "learning_rate": 5.38523986154927e-05, + "loss": 0.4018, + "step": 20606 + }, + { + "epoch": 2.75, + "grad_norm": 0.54296875, + "learning_rate": 5.3842068020406524e-05, + "loss": 0.3107, + "step": 20607 + }, + { + "epoch": 2.75, + "grad_norm": 0.51953125, + "learning_rate": 5.3831738051251543e-05, + "loss": 0.3077, + "step": 20608 + }, + { + "epoch": 2.75, + "grad_norm": 0.56640625, + "learning_rate": 5.382140870816785e-05, + "loss": 0.3227, + "step": 20609 + }, + { + "epoch": 2.75, + "grad_norm": 0.7578125, + "learning_rate": 5.3811079991295465e-05, + "loss": 0.2942, + "step": 20610 + }, + { + "epoch": 2.75, + "grad_norm": 0.53125, + "learning_rate": 5.380075190077451e-05, + "loss": 0.2871, + "step": 20611 + }, + { + "epoch": 2.75, + "grad_norm": 0.59765625, + "learning_rate": 5.379042443674498e-05, + "loss": 0.4268, + "step": 20612 + }, + { + "epoch": 2.75, + "grad_norm": 0.56640625, + "learning_rate": 5.378009759934694e-05, + "loss": 0.153, + "step": 20613 + }, + { + "epoch": 2.75, + "grad_norm": 0.640625, + "learning_rate": 5.376977138872046e-05, + "loss": 0.2764, + "step": 20614 + }, + { + "epoch": 2.75, + "grad_norm": 0.56640625, + "learning_rate": 5.375944580500557e-05, + "loss": 0.2115, + "step": 20615 + }, + { + "epoch": 2.75, + "grad_norm": 0.625, + "learning_rate": 5.374912084834225e-05, + "loss": 0.3117, + "step": 20616 + }, + { + "epoch": 2.75, + "grad_norm": 0.466796875, + "learning_rate": 5.373879651887051e-05, + "loss": 0.2622, + "step": 20617 + }, + { + "epoch": 2.75, + "grad_norm": 0.73046875, + "learning_rate": 5.3728472816730393e-05, + "loss": 0.2963, + "step": 20618 + }, + { + "epoch": 2.75, + "grad_norm": 0.66015625, + "learning_rate": 5.371814974206193e-05, + "loss": 0.3506, + "step": 20619 + }, + { + "epoch": 2.75, + "grad_norm": 0.62890625, + "learning_rate": 5.370782729500501e-05, + "loss": 0.2941, + "step": 20620 + }, + { + "epoch": 2.75, + "grad_norm": 0.6015625, + "learning_rate": 5.369750547569966e-05, + "loss": 0.3442, + "step": 20621 + }, + { + "epoch": 2.75, + "grad_norm": 0.83203125, + "learning_rate": 5.368718428428589e-05, + "loss": 0.4745, + "step": 20622 + }, + { + "epoch": 2.75, + "grad_norm": 0.57421875, + "learning_rate": 5.3676863720903595e-05, + "loss": 0.2976, + "step": 20623 + }, + { + "epoch": 2.75, + "grad_norm": 0.4296875, + "learning_rate": 5.36665437856928e-05, + "loss": 0.3407, + "step": 20624 + }, + { + "epoch": 2.75, + "grad_norm": 0.484375, + "learning_rate": 5.365622447879336e-05, + "loss": 0.3118, + "step": 20625 + }, + { + "epoch": 2.75, + "grad_norm": 0.341796875, + "learning_rate": 5.364590580034525e-05, + "loss": 0.1943, + "step": 20626 + }, + { + "epoch": 2.75, + "grad_norm": 0.5390625, + "learning_rate": 5.3635587750488426e-05, + "loss": 0.4548, + "step": 20627 + }, + { + "epoch": 2.75, + "grad_norm": 0.671875, + "learning_rate": 5.362527032936277e-05, + "loss": 0.2558, + "step": 20628 + }, + { + "epoch": 2.75, + "grad_norm": 0.5546875, + "learning_rate": 5.361495353710828e-05, + "loss": 0.39, + "step": 20629 + }, + { + "epoch": 2.75, + "grad_norm": 0.69921875, + "learning_rate": 5.3604637373864716e-05, + "loss": 0.3491, + "step": 20630 + }, + { + "epoch": 2.75, + "grad_norm": 0.63671875, + "learning_rate": 5.3594321839772066e-05, + "loss": 0.4027, + "step": 20631 + }, + { + "epoch": 2.75, + "grad_norm": 0.6171875, + "learning_rate": 5.35840069349702e-05, + "loss": 0.2845, + "step": 20632 + }, + { + "epoch": 2.75, + "grad_norm": 0.6171875, + "learning_rate": 5.357369265959902e-05, + "loss": 0.32, + "step": 20633 + }, + { + "epoch": 2.75, + "grad_norm": 0.462890625, + "learning_rate": 5.3563379013798355e-05, + "loss": 0.1736, + "step": 20634 + }, + { + "epoch": 2.75, + "grad_norm": 0.490234375, + "learning_rate": 5.355306599770804e-05, + "loss": 0.1836, + "step": 20635 + }, + { + "epoch": 2.75, + "grad_norm": 0.5234375, + "learning_rate": 5.3542753611467965e-05, + "loss": 0.4088, + "step": 20636 + }, + { + "epoch": 2.75, + "grad_norm": 0.6328125, + "learning_rate": 5.353244185521798e-05, + "loss": 0.3695, + "step": 20637 + }, + { + "epoch": 2.75, + "grad_norm": 0.6640625, + "learning_rate": 5.352213072909791e-05, + "loss": 0.414, + "step": 20638 + }, + { + "epoch": 2.75, + "grad_norm": 0.52734375, + "learning_rate": 5.3511820233247564e-05, + "loss": 0.3169, + "step": 20639 + }, + { + "epoch": 2.75, + "grad_norm": 0.8359375, + "learning_rate": 5.350151036780676e-05, + "loss": 0.393, + "step": 20640 + }, + { + "epoch": 2.75, + "grad_norm": 0.625, + "learning_rate": 5.349120113291532e-05, + "loss": 0.5198, + "step": 20641 + }, + { + "epoch": 2.75, + "grad_norm": 0.6796875, + "learning_rate": 5.348089252871309e-05, + "loss": 0.2727, + "step": 20642 + }, + { + "epoch": 2.75, + "grad_norm": 0.59765625, + "learning_rate": 5.347058455533976e-05, + "loss": 0.4643, + "step": 20643 + }, + { + "epoch": 2.75, + "grad_norm": 0.48046875, + "learning_rate": 5.346027721293518e-05, + "loss": 0.171, + "step": 20644 + }, + { + "epoch": 2.75, + "grad_norm": 0.5390625, + "learning_rate": 5.344997050163909e-05, + "loss": 0.4136, + "step": 20645 + }, + { + "epoch": 2.76, + "grad_norm": 0.62890625, + "learning_rate": 5.343966442159134e-05, + "loss": 0.2079, + "step": 20646 + }, + { + "epoch": 2.76, + "grad_norm": 0.58203125, + "learning_rate": 5.342935897293161e-05, + "loss": 0.3847, + "step": 20647 + }, + { + "epoch": 2.76, + "grad_norm": 0.70703125, + "learning_rate": 5.341905415579962e-05, + "loss": 0.3988, + "step": 20648 + }, + { + "epoch": 2.76, + "grad_norm": 0.515625, + "learning_rate": 5.340874997033516e-05, + "loss": 0.2743, + "step": 20649 + }, + { + "epoch": 2.76, + "grad_norm": 0.5703125, + "learning_rate": 5.3398446416677936e-05, + "loss": 0.4588, + "step": 20650 + }, + { + "epoch": 2.76, + "grad_norm": 0.5, + "learning_rate": 5.33881434949677e-05, + "loss": 0.1705, + "step": 20651 + }, + { + "epoch": 2.76, + "grad_norm": 0.65234375, + "learning_rate": 5.337784120534419e-05, + "loss": 0.3943, + "step": 20652 + }, + { + "epoch": 2.76, + "grad_norm": 0.625, + "learning_rate": 5.336753954794703e-05, + "loss": 0.3702, + "step": 20653 + }, + { + "epoch": 2.76, + "grad_norm": 0.6875, + "learning_rate": 5.335723852291596e-05, + "loss": 0.4214, + "step": 20654 + }, + { + "epoch": 2.76, + "grad_norm": 0.515625, + "learning_rate": 5.3346938130390686e-05, + "loss": 0.4129, + "step": 20655 + }, + { + "epoch": 2.76, + "grad_norm": 0.46875, + "learning_rate": 5.3336638370510904e-05, + "loss": 0.3554, + "step": 20656 + }, + { + "epoch": 2.76, + "grad_norm": 0.62109375, + "learning_rate": 5.332633924341621e-05, + "loss": 0.5684, + "step": 20657 + }, + { + "epoch": 2.76, + "grad_norm": 0.58203125, + "learning_rate": 5.3316040749246367e-05, + "loss": 0.3615, + "step": 20658 + }, + { + "epoch": 2.76, + "grad_norm": 0.5625, + "learning_rate": 5.330574288814092e-05, + "loss": 0.1838, + "step": 20659 + }, + { + "epoch": 2.76, + "grad_norm": 0.5, + "learning_rate": 5.3295445660239564e-05, + "loss": 0.2588, + "step": 20660 + }, + { + "epoch": 2.76, + "grad_norm": 0.59765625, + "learning_rate": 5.328514906568198e-05, + "loss": 0.3674, + "step": 20661 + }, + { + "epoch": 2.76, + "grad_norm": 0.58203125, + "learning_rate": 5.3274853104607734e-05, + "loss": 0.3285, + "step": 20662 + }, + { + "epoch": 2.76, + "grad_norm": 0.56640625, + "learning_rate": 5.326455777715644e-05, + "loss": 0.3523, + "step": 20663 + }, + { + "epoch": 2.76, + "grad_norm": 0.62890625, + "learning_rate": 5.325426308346777e-05, + "loss": 0.4495, + "step": 20664 + }, + { + "epoch": 2.76, + "grad_norm": 0.5234375, + "learning_rate": 5.3243969023681316e-05, + "loss": 0.2324, + "step": 20665 + }, + { + "epoch": 2.76, + "grad_norm": 0.53515625, + "learning_rate": 5.323367559793661e-05, + "loss": 0.2362, + "step": 20666 + }, + { + "epoch": 2.76, + "grad_norm": 0.74609375, + "learning_rate": 5.3223382806373265e-05, + "loss": 0.5345, + "step": 20667 + }, + { + "epoch": 2.76, + "grad_norm": 0.6171875, + "learning_rate": 5.3213090649130895e-05, + "loss": 0.3599, + "step": 20668 + }, + { + "epoch": 2.76, + "grad_norm": 0.470703125, + "learning_rate": 5.320279912634907e-05, + "loss": 0.3297, + "step": 20669 + }, + { + "epoch": 2.76, + "grad_norm": 0.67578125, + "learning_rate": 5.319250823816731e-05, + "loss": 0.4339, + "step": 20670 + }, + { + "epoch": 2.76, + "grad_norm": 0.54296875, + "learning_rate": 5.318221798472516e-05, + "loss": 0.3092, + "step": 20671 + }, + { + "epoch": 2.76, + "grad_norm": 0.73046875, + "learning_rate": 5.3171928366162173e-05, + "loss": 0.7319, + "step": 20672 + }, + { + "epoch": 2.76, + "grad_norm": 0.75390625, + "learning_rate": 5.316163938261789e-05, + "loss": 0.4988, + "step": 20673 + }, + { + "epoch": 2.76, + "grad_norm": 0.43359375, + "learning_rate": 5.3151351034231874e-05, + "loss": 0.1706, + "step": 20674 + }, + { + "epoch": 2.76, + "grad_norm": 0.76171875, + "learning_rate": 5.314106332114358e-05, + "loss": 0.2073, + "step": 20675 + }, + { + "epoch": 2.76, + "grad_norm": 0.75, + "learning_rate": 5.313077624349254e-05, + "loss": 0.3675, + "step": 20676 + }, + { + "epoch": 2.76, + "grad_norm": 0.52734375, + "learning_rate": 5.312048980141824e-05, + "loss": 0.4667, + "step": 20677 + }, + { + "epoch": 2.76, + "grad_norm": 0.6796875, + "learning_rate": 5.3110203995060195e-05, + "loss": 0.3325, + "step": 20678 + }, + { + "epoch": 2.76, + "grad_norm": 0.7109375, + "learning_rate": 5.3099918824557916e-05, + "loss": 0.3866, + "step": 20679 + }, + { + "epoch": 2.76, + "grad_norm": 0.490234375, + "learning_rate": 5.308963429005081e-05, + "loss": 0.2296, + "step": 20680 + }, + { + "epoch": 2.76, + "grad_norm": 0.58203125, + "learning_rate": 5.307935039167835e-05, + "loss": 0.5018, + "step": 20681 + }, + { + "epoch": 2.76, + "grad_norm": 0.71875, + "learning_rate": 5.3069067129580055e-05, + "loss": 0.3423, + "step": 20682 + }, + { + "epoch": 2.76, + "grad_norm": 0.66796875, + "learning_rate": 5.3058784503895296e-05, + "loss": 0.1828, + "step": 20683 + }, + { + "epoch": 2.76, + "grad_norm": 0.4375, + "learning_rate": 5.304850251476359e-05, + "loss": 0.269, + "step": 20684 + }, + { + "epoch": 2.76, + "grad_norm": 0.7890625, + "learning_rate": 5.303822116232426e-05, + "loss": 0.5367, + "step": 20685 + }, + { + "epoch": 2.76, + "grad_norm": 0.7265625, + "learning_rate": 5.302794044671682e-05, + "loss": 0.5212, + "step": 20686 + }, + { + "epoch": 2.76, + "grad_norm": 0.5078125, + "learning_rate": 5.3017660368080643e-05, + "loss": 0.371, + "step": 20687 + }, + { + "epoch": 2.76, + "grad_norm": 0.80078125, + "learning_rate": 5.3007380926555174e-05, + "loss": 0.4104, + "step": 20688 + }, + { + "epoch": 2.76, + "grad_norm": 0.62109375, + "learning_rate": 5.299710212227975e-05, + "loss": 0.6481, + "step": 20689 + }, + { + "epoch": 2.76, + "grad_norm": 0.59765625, + "learning_rate": 5.298682395539379e-05, + "loss": 0.3669, + "step": 20690 + }, + { + "epoch": 2.76, + "grad_norm": 0.73046875, + "learning_rate": 5.2976546426036664e-05, + "loss": 0.5438, + "step": 20691 + }, + { + "epoch": 2.76, + "grad_norm": 0.58984375, + "learning_rate": 5.2966269534347804e-05, + "loss": 0.4896, + "step": 20692 + }, + { + "epoch": 2.76, + "grad_norm": 0.703125, + "learning_rate": 5.295599328046645e-05, + "loss": 0.3242, + "step": 20693 + }, + { + "epoch": 2.76, + "grad_norm": 0.5234375, + "learning_rate": 5.2945717664532065e-05, + "loss": 0.3002, + "step": 20694 + }, + { + "epoch": 2.76, + "grad_norm": 0.55859375, + "learning_rate": 5.293544268668392e-05, + "loss": 0.3821, + "step": 20695 + }, + { + "epoch": 2.76, + "grad_norm": 0.6484375, + "learning_rate": 5.2925168347061374e-05, + "loss": 0.5737, + "step": 20696 + }, + { + "epoch": 2.76, + "grad_norm": 0.59375, + "learning_rate": 5.291489464580379e-05, + "loss": 0.2631, + "step": 20697 + }, + { + "epoch": 2.76, + "grad_norm": 0.546875, + "learning_rate": 5.2904621583050426e-05, + "loss": 0.3858, + "step": 20698 + }, + { + "epoch": 2.76, + "grad_norm": 0.546875, + "learning_rate": 5.28943491589406e-05, + "loss": 0.4404, + "step": 20699 + }, + { + "epoch": 2.76, + "grad_norm": 0.361328125, + "learning_rate": 5.288407737361365e-05, + "loss": 0.1183, + "step": 20700 + }, + { + "epoch": 2.76, + "grad_norm": 0.59375, + "learning_rate": 5.2873806227208855e-05, + "loss": 0.2933, + "step": 20701 + }, + { + "epoch": 2.76, + "grad_norm": 0.5546875, + "learning_rate": 5.286353571986551e-05, + "loss": 0.22, + "step": 20702 + }, + { + "epoch": 2.76, + "grad_norm": 0.5625, + "learning_rate": 5.2853265851722855e-05, + "loss": 0.2394, + "step": 20703 + }, + { + "epoch": 2.76, + "grad_norm": 0.6015625, + "learning_rate": 5.284299662292016e-05, + "loss": 0.25, + "step": 20704 + }, + { + "epoch": 2.76, + "grad_norm": 0.7109375, + "learning_rate": 5.283272803359669e-05, + "loss": 0.4516, + "step": 20705 + }, + { + "epoch": 2.76, + "grad_norm": 0.66796875, + "learning_rate": 5.282246008389176e-05, + "loss": 0.5016, + "step": 20706 + }, + { + "epoch": 2.76, + "grad_norm": 0.5625, + "learning_rate": 5.2812192773944535e-05, + "loss": 0.4243, + "step": 20707 + }, + { + "epoch": 2.76, + "grad_norm": 0.384765625, + "learning_rate": 5.280192610389422e-05, + "loss": 0.147, + "step": 20708 + }, + { + "epoch": 2.76, + "grad_norm": 0.54296875, + "learning_rate": 5.279166007388008e-05, + "loss": 0.2845, + "step": 20709 + }, + { + "epoch": 2.76, + "grad_norm": 0.6484375, + "learning_rate": 5.278139468404133e-05, + "loss": 0.3795, + "step": 20710 + }, + { + "epoch": 2.76, + "grad_norm": 0.498046875, + "learning_rate": 5.277112993451721e-05, + "loss": 0.3433, + "step": 20711 + }, + { + "epoch": 2.76, + "grad_norm": 0.46875, + "learning_rate": 5.276086582544684e-05, + "loss": 0.4612, + "step": 20712 + }, + { + "epoch": 2.76, + "grad_norm": 0.53515625, + "learning_rate": 5.275060235696945e-05, + "loss": 0.3578, + "step": 20713 + }, + { + "epoch": 2.76, + "grad_norm": 0.55078125, + "learning_rate": 5.2740339529224214e-05, + "loss": 0.2614, + "step": 20714 + }, + { + "epoch": 2.76, + "grad_norm": 0.62890625, + "learning_rate": 5.273007734235036e-05, + "loss": 0.591, + "step": 20715 + }, + { + "epoch": 2.76, + "grad_norm": 0.3828125, + "learning_rate": 5.2719815796486924e-05, + "loss": 0.2635, + "step": 20716 + }, + { + "epoch": 2.76, + "grad_norm": 0.400390625, + "learning_rate": 5.270955489177316e-05, + "loss": 0.1188, + "step": 20717 + }, + { + "epoch": 2.76, + "grad_norm": 0.7109375, + "learning_rate": 5.269929462834817e-05, + "loss": 0.3038, + "step": 20718 + }, + { + "epoch": 2.76, + "grad_norm": 0.5546875, + "learning_rate": 5.268903500635114e-05, + "loss": 0.1649, + "step": 20719 + }, + { + "epoch": 2.76, + "grad_norm": 0.484375, + "learning_rate": 5.267877602592116e-05, + "loss": 0.3252, + "step": 20720 + }, + { + "epoch": 2.77, + "grad_norm": 0.357421875, + "learning_rate": 5.2668517687197314e-05, + "loss": 0.1869, + "step": 20721 + }, + { + "epoch": 2.77, + "grad_norm": 0.63671875, + "learning_rate": 5.265825999031874e-05, + "loss": 0.2275, + "step": 20722 + }, + { + "epoch": 2.77, + "grad_norm": 0.4921875, + "learning_rate": 5.2648002935424535e-05, + "loss": 0.2722, + "step": 20723 + }, + { + "epoch": 2.77, + "grad_norm": 0.625, + "learning_rate": 5.263774652265382e-05, + "loss": 0.3754, + "step": 20724 + }, + { + "epoch": 2.77, + "grad_norm": 0.6328125, + "learning_rate": 5.2627490752145684e-05, + "loss": 0.3147, + "step": 20725 + }, + { + "epoch": 2.77, + "grad_norm": 0.52734375, + "learning_rate": 5.2617235624039155e-05, + "loss": 0.3566, + "step": 20726 + }, + { + "epoch": 2.77, + "grad_norm": 0.77734375, + "learning_rate": 5.26069811384733e-05, + "loss": 0.4121, + "step": 20727 + }, + { + "epoch": 2.77, + "grad_norm": 0.75390625, + "learning_rate": 5.25967272955872e-05, + "loss": 0.3171, + "step": 20728 + }, + { + "epoch": 2.77, + "grad_norm": 0.76171875, + "learning_rate": 5.258647409551996e-05, + "loss": 0.6323, + "step": 20729 + }, + { + "epoch": 2.77, + "grad_norm": 0.671875, + "learning_rate": 5.25762215384105e-05, + "loss": 0.4997, + "step": 20730 + }, + { + "epoch": 2.77, + "grad_norm": 0.6875, + "learning_rate": 5.256596962439796e-05, + "loss": 0.4492, + "step": 20731 + }, + { + "epoch": 2.77, + "grad_norm": 0.671875, + "learning_rate": 5.255571835362128e-05, + "loss": 0.4253, + "step": 20732 + }, + { + "epoch": 2.77, + "grad_norm": 0.66015625, + "learning_rate": 5.254546772621949e-05, + "loss": 0.3457, + "step": 20733 + }, + { + "epoch": 2.77, + "grad_norm": 0.62109375, + "learning_rate": 5.253521774233168e-05, + "loss": 0.1961, + "step": 20734 + }, + { + "epoch": 2.77, + "grad_norm": 0.62109375, + "learning_rate": 5.252496840209672e-05, + "loss": 0.3529, + "step": 20735 + }, + { + "epoch": 2.77, + "grad_norm": 0.60546875, + "learning_rate": 5.251471970565367e-05, + "loss": 0.1773, + "step": 20736 + }, + { + "epoch": 2.77, + "grad_norm": 0.67578125, + "learning_rate": 5.250447165314149e-05, + "loss": 0.3992, + "step": 20737 + }, + { + "epoch": 2.77, + "grad_norm": 0.52734375, + "learning_rate": 5.2494224244699194e-05, + "loss": 0.3203, + "step": 20738 + }, + { + "epoch": 2.77, + "grad_norm": 0.625, + "learning_rate": 5.2483977480465675e-05, + "loss": 0.3084, + "step": 20739 + }, + { + "epoch": 2.77, + "grad_norm": 0.5078125, + "learning_rate": 5.247373136057992e-05, + "loss": 0.3924, + "step": 20740 + }, + { + "epoch": 2.77, + "grad_norm": 0.494140625, + "learning_rate": 5.2463485885180854e-05, + "loss": 0.2445, + "step": 20741 + }, + { + "epoch": 2.77, + "grad_norm": 0.61328125, + "learning_rate": 5.2453241054407474e-05, + "loss": 0.4652, + "step": 20742 + }, + { + "epoch": 2.77, + "grad_norm": 0.671875, + "learning_rate": 5.2442996868398664e-05, + "loss": 0.3645, + "step": 20743 + }, + { + "epoch": 2.77, + "grad_norm": 0.6015625, + "learning_rate": 5.2432753327293296e-05, + "loss": 0.4712, + "step": 20744 + }, + { + "epoch": 2.77, + "grad_norm": 0.4921875, + "learning_rate": 5.2422510431230324e-05, + "loss": 0.2068, + "step": 20745 + }, + { + "epoch": 2.77, + "grad_norm": 0.69921875, + "learning_rate": 5.241226818034865e-05, + "loss": 0.3412, + "step": 20746 + }, + { + "epoch": 2.77, + "grad_norm": 0.625, + "learning_rate": 5.240202657478719e-05, + "loss": 0.7411, + "step": 20747 + }, + { + "epoch": 2.77, + "grad_norm": 0.796875, + "learning_rate": 5.2391785614684764e-05, + "loss": 0.393, + "step": 20748 + }, + { + "epoch": 2.77, + "grad_norm": 0.7109375, + "learning_rate": 5.238154530018028e-05, + "loss": 0.661, + "step": 20749 + }, + { + "epoch": 2.77, + "grad_norm": 0.58984375, + "learning_rate": 5.2371305631412604e-05, + "loss": 0.272, + "step": 20750 + }, + { + "epoch": 2.77, + "grad_norm": 0.51953125, + "learning_rate": 5.236106660852058e-05, + "loss": 0.1903, + "step": 20751 + }, + { + "epoch": 2.77, + "grad_norm": 0.63671875, + "learning_rate": 5.2350828231643125e-05, + "loss": 0.4575, + "step": 20752 + }, + { + "epoch": 2.77, + "grad_norm": 0.5703125, + "learning_rate": 5.234059050091897e-05, + "loss": 0.2805, + "step": 20753 + }, + { + "epoch": 2.77, + "grad_norm": 0.74609375, + "learning_rate": 5.2330353416486996e-05, + "loss": 0.4064, + "step": 20754 + }, + { + "epoch": 2.77, + "grad_norm": 0.76171875, + "learning_rate": 5.232011697848607e-05, + "loss": 0.2376, + "step": 20755 + }, + { + "epoch": 2.77, + "grad_norm": 0.58203125, + "learning_rate": 5.230988118705491e-05, + "loss": 0.4312, + "step": 20756 + }, + { + "epoch": 2.77, + "grad_norm": 0.5234375, + "learning_rate": 5.229964604233242e-05, + "loss": 0.2097, + "step": 20757 + }, + { + "epoch": 2.77, + "grad_norm": 0.5546875, + "learning_rate": 5.22894115444573e-05, + "loss": 0.2785, + "step": 20758 + }, + { + "epoch": 2.77, + "grad_norm": 0.69921875, + "learning_rate": 5.227917769356837e-05, + "loss": 0.4605, + "step": 20759 + }, + { + "epoch": 2.77, + "grad_norm": 0.7578125, + "learning_rate": 5.2268944489804416e-05, + "loss": 0.4964, + "step": 20760 + }, + { + "epoch": 2.77, + "grad_norm": 0.69921875, + "learning_rate": 5.225871193330426e-05, + "loss": 0.3246, + "step": 20761 + }, + { + "epoch": 2.77, + "grad_norm": 0.515625, + "learning_rate": 5.224848002420657e-05, + "loss": 0.3979, + "step": 20762 + }, + { + "epoch": 2.77, + "grad_norm": 0.65234375, + "learning_rate": 5.223824876265013e-05, + "loss": 0.1872, + "step": 20763 + }, + { + "epoch": 2.77, + "grad_norm": 0.72265625, + "learning_rate": 5.222801814877369e-05, + "loss": 0.4322, + "step": 20764 + }, + { + "epoch": 2.77, + "grad_norm": 0.61328125, + "learning_rate": 5.221778818271603e-05, + "loss": 0.363, + "step": 20765 + }, + { + "epoch": 2.77, + "grad_norm": 0.55859375, + "learning_rate": 5.220755886461579e-05, + "loss": 0.4328, + "step": 20766 + }, + { + "epoch": 2.77, + "grad_norm": 0.6953125, + "learning_rate": 5.219733019461176e-05, + "loss": 0.3918, + "step": 20767 + }, + { + "epoch": 2.77, + "grad_norm": 0.466796875, + "learning_rate": 5.2187102172842574e-05, + "loss": 0.2207, + "step": 20768 + }, + { + "epoch": 2.77, + "grad_norm": 0.65625, + "learning_rate": 5.2176874799446976e-05, + "loss": 0.4641, + "step": 20769 + }, + { + "epoch": 2.77, + "grad_norm": 0.55078125, + "learning_rate": 5.2166648074563674e-05, + "loss": 0.3407, + "step": 20770 + }, + { + "epoch": 2.77, + "grad_norm": 0.8125, + "learning_rate": 5.2156421998331285e-05, + "loss": 0.3837, + "step": 20771 + }, + { + "epoch": 2.77, + "grad_norm": 0.5859375, + "learning_rate": 5.2146196570888525e-05, + "loss": 0.3385, + "step": 20772 + }, + { + "epoch": 2.77, + "grad_norm": 0.5078125, + "learning_rate": 5.213597179237405e-05, + "loss": 0.4013, + "step": 20773 + }, + { + "epoch": 2.77, + "grad_norm": 0.703125, + "learning_rate": 5.2125747662926524e-05, + "loss": 0.5649, + "step": 20774 + }, + { + "epoch": 2.77, + "grad_norm": 0.62890625, + "learning_rate": 5.2115524182684615e-05, + "loss": 0.5135, + "step": 20775 + }, + { + "epoch": 2.77, + "grad_norm": 0.486328125, + "learning_rate": 5.2105301351786884e-05, + "loss": 0.2116, + "step": 20776 + }, + { + "epoch": 2.77, + "grad_norm": 0.51953125, + "learning_rate": 5.209507917037202e-05, + "loss": 0.1813, + "step": 20777 + }, + { + "epoch": 2.77, + "grad_norm": 0.7109375, + "learning_rate": 5.208485763857862e-05, + "loss": 0.5601, + "step": 20778 + }, + { + "epoch": 2.77, + "grad_norm": 0.625, + "learning_rate": 5.207463675654535e-05, + "loss": 0.1888, + "step": 20779 + }, + { + "epoch": 2.77, + "grad_norm": 0.61328125, + "learning_rate": 5.206441652441076e-05, + "loss": 0.2869, + "step": 20780 + }, + { + "epoch": 2.77, + "grad_norm": 0.53515625, + "learning_rate": 5.20541969423134e-05, + "loss": 0.2006, + "step": 20781 + }, + { + "epoch": 2.77, + "grad_norm": 0.703125, + "learning_rate": 5.2043978010391913e-05, + "loss": 0.3963, + "step": 20782 + }, + { + "epoch": 2.77, + "grad_norm": 0.59765625, + "learning_rate": 5.203375972878486e-05, + "loss": 0.3647, + "step": 20783 + }, + { + "epoch": 2.77, + "grad_norm": 0.6796875, + "learning_rate": 5.202354209763083e-05, + "loss": 0.4983, + "step": 20784 + }, + { + "epoch": 2.77, + "grad_norm": 0.59375, + "learning_rate": 5.2013325117068336e-05, + "loss": 0.3769, + "step": 20785 + }, + { + "epoch": 2.77, + "grad_norm": 0.60546875, + "learning_rate": 5.200310878723596e-05, + "loss": 0.227, + "step": 20786 + }, + { + "epoch": 2.77, + "grad_norm": 0.392578125, + "learning_rate": 5.199289310827222e-05, + "loss": 0.287, + "step": 20787 + }, + { + "epoch": 2.77, + "grad_norm": 0.68359375, + "learning_rate": 5.19826780803157e-05, + "loss": 0.5355, + "step": 20788 + }, + { + "epoch": 2.77, + "grad_norm": 0.76171875, + "learning_rate": 5.197246370350485e-05, + "loss": 0.4397, + "step": 20789 + }, + { + "epoch": 2.77, + "grad_norm": 0.65234375, + "learning_rate": 5.1962249977978206e-05, + "loss": 0.3057, + "step": 20790 + }, + { + "epoch": 2.77, + "grad_norm": 0.703125, + "learning_rate": 5.195203690387429e-05, + "loss": 0.3351, + "step": 20791 + }, + { + "epoch": 2.77, + "grad_norm": 0.58984375, + "learning_rate": 5.1941824481331626e-05, + "loss": 0.4334, + "step": 20792 + }, + { + "epoch": 2.77, + "grad_norm": 0.69140625, + "learning_rate": 5.193161271048868e-05, + "loss": 0.345, + "step": 20793 + }, + { + "epoch": 2.77, + "grad_norm": 0.65234375, + "learning_rate": 5.192140159148386e-05, + "loss": 0.4556, + "step": 20794 + }, + { + "epoch": 2.77, + "grad_norm": 0.67578125, + "learning_rate": 5.19111911244557e-05, + "loss": 0.3023, + "step": 20795 + }, + { + "epoch": 2.78, + "grad_norm": 0.54296875, + "learning_rate": 5.190098130954264e-05, + "loss": 0.1697, + "step": 20796 + }, + { + "epoch": 2.78, + "grad_norm": 0.484375, + "learning_rate": 5.189077214688316e-05, + "loss": 0.3476, + "step": 20797 + }, + { + "epoch": 2.78, + "grad_norm": 0.45703125, + "learning_rate": 5.188056363661571e-05, + "loss": 0.2047, + "step": 20798 + }, + { + "epoch": 2.78, + "grad_norm": 0.57421875, + "learning_rate": 5.1870355778878665e-05, + "loss": 0.5792, + "step": 20799 + }, + { + "epoch": 2.78, + "grad_norm": 0.90234375, + "learning_rate": 5.186014857381048e-05, + "loss": 0.3067, + "step": 20800 + }, + { + "epoch": 2.78, + "grad_norm": 0.53515625, + "learning_rate": 5.184994202154959e-05, + "loss": 0.2984, + "step": 20801 + }, + { + "epoch": 2.78, + "grad_norm": 0.50390625, + "learning_rate": 5.1839736122234426e-05, + "loss": 0.2108, + "step": 20802 + }, + { + "epoch": 2.78, + "grad_norm": 0.6015625, + "learning_rate": 5.182953087600331e-05, + "loss": 0.3877, + "step": 20803 + }, + { + "epoch": 2.78, + "grad_norm": 0.53515625, + "learning_rate": 5.1819326282994695e-05, + "loss": 0.2707, + "step": 20804 + }, + { + "epoch": 2.78, + "grad_norm": 0.62890625, + "learning_rate": 5.1809122343346904e-05, + "loss": 0.1963, + "step": 20805 + }, + { + "epoch": 2.78, + "grad_norm": 1.125, + "learning_rate": 5.179891905719836e-05, + "loss": 0.6133, + "step": 20806 + }, + { + "epoch": 2.78, + "grad_norm": 0.55078125, + "learning_rate": 5.178871642468743e-05, + "loss": 0.3434, + "step": 20807 + }, + { + "epoch": 2.78, + "grad_norm": 0.453125, + "learning_rate": 5.177851444595241e-05, + "loss": 0.1881, + "step": 20808 + }, + { + "epoch": 2.78, + "grad_norm": 0.640625, + "learning_rate": 5.176831312113169e-05, + "loss": 0.1987, + "step": 20809 + }, + { + "epoch": 2.78, + "grad_norm": 0.70703125, + "learning_rate": 5.175811245036359e-05, + "loss": 0.317, + "step": 20810 + }, + { + "epoch": 2.78, + "grad_norm": 0.423828125, + "learning_rate": 5.1747912433786497e-05, + "loss": 0.162, + "step": 20811 + }, + { + "epoch": 2.78, + "grad_norm": 0.73828125, + "learning_rate": 5.173771307153863e-05, + "loss": 0.2959, + "step": 20812 + }, + { + "epoch": 2.78, + "grad_norm": 0.4453125, + "learning_rate": 5.172751436375835e-05, + "loss": 0.1717, + "step": 20813 + }, + { + "epoch": 2.78, + "grad_norm": 0.6171875, + "learning_rate": 5.171731631058396e-05, + "loss": 0.2679, + "step": 20814 + }, + { + "epoch": 2.78, + "grad_norm": 0.5390625, + "learning_rate": 5.170711891215378e-05, + "loss": 0.3934, + "step": 20815 + }, + { + "epoch": 2.78, + "grad_norm": 0.65625, + "learning_rate": 5.1696922168606065e-05, + "loss": 0.2731, + "step": 20816 + }, + { + "epoch": 2.78, + "grad_norm": 0.7265625, + "learning_rate": 5.168672608007905e-05, + "loss": 0.508, + "step": 20817 + }, + { + "epoch": 2.78, + "grad_norm": 0.5390625, + "learning_rate": 5.167653064671103e-05, + "loss": 0.1978, + "step": 20818 + }, + { + "epoch": 2.78, + "grad_norm": 0.734375, + "learning_rate": 5.166633586864028e-05, + "loss": 0.4658, + "step": 20819 + }, + { + "epoch": 2.78, + "grad_norm": 0.55078125, + "learning_rate": 5.165614174600507e-05, + "loss": 0.5396, + "step": 20820 + }, + { + "epoch": 2.78, + "grad_norm": 0.84765625, + "learning_rate": 5.1645948278943566e-05, + "loss": 0.4463, + "step": 20821 + }, + { + "epoch": 2.78, + "grad_norm": 0.6015625, + "learning_rate": 5.163575546759404e-05, + "loss": 0.2871, + "step": 20822 + }, + { + "epoch": 2.78, + "grad_norm": 0.5234375, + "learning_rate": 5.162556331209469e-05, + "loss": 0.3538, + "step": 20823 + }, + { + "epoch": 2.78, + "grad_norm": 0.5546875, + "learning_rate": 5.161537181258378e-05, + "loss": 0.6208, + "step": 20824 + }, + { + "epoch": 2.78, + "grad_norm": 0.451171875, + "learning_rate": 5.160518096919951e-05, + "loss": 0.3218, + "step": 20825 + }, + { + "epoch": 2.78, + "grad_norm": 0.625, + "learning_rate": 5.159499078207999e-05, + "loss": 0.4484, + "step": 20826 + }, + { + "epoch": 2.78, + "grad_norm": 0.73046875, + "learning_rate": 5.158480125136347e-05, + "loss": 0.2796, + "step": 20827 + }, + { + "epoch": 2.78, + "grad_norm": 0.5703125, + "learning_rate": 5.157461237718816e-05, + "loss": 0.1532, + "step": 20828 + }, + { + "epoch": 2.78, + "grad_norm": 0.6484375, + "learning_rate": 5.1564424159692134e-05, + "loss": 0.325, + "step": 20829 + }, + { + "epoch": 2.78, + "grad_norm": 0.52734375, + "learning_rate": 5.1554236599013664e-05, + "loss": 0.3237, + "step": 20830 + }, + { + "epoch": 2.78, + "grad_norm": 0.578125, + "learning_rate": 5.154404969529078e-05, + "loss": 0.3441, + "step": 20831 + }, + { + "epoch": 2.78, + "grad_norm": 0.53515625, + "learning_rate": 5.153386344866169e-05, + "loss": 0.196, + "step": 20832 + }, + { + "epoch": 2.78, + "grad_norm": 0.6328125, + "learning_rate": 5.1523677859264516e-05, + "loss": 0.5145, + "step": 20833 + }, + { + "epoch": 2.78, + "grad_norm": 0.7265625, + "learning_rate": 5.151349292723742e-05, + "loss": 0.3321, + "step": 20834 + }, + { + "epoch": 2.78, + "grad_norm": 0.69921875, + "learning_rate": 5.150330865271844e-05, + "loss": 0.3842, + "step": 20835 + }, + { + "epoch": 2.78, + "grad_norm": 0.6328125, + "learning_rate": 5.1493125035845714e-05, + "loss": 0.356, + "step": 20836 + }, + { + "epoch": 2.78, + "grad_norm": 0.5703125, + "learning_rate": 5.148294207675734e-05, + "loss": 0.3045, + "step": 20837 + }, + { + "epoch": 2.78, + "grad_norm": 0.66015625, + "learning_rate": 5.147275977559147e-05, + "loss": 0.3709, + "step": 20838 + }, + { + "epoch": 2.78, + "grad_norm": 0.61328125, + "learning_rate": 5.1462578132486074e-05, + "loss": 0.2779, + "step": 20839 + }, + { + "epoch": 2.78, + "grad_norm": 0.5078125, + "learning_rate": 5.14523971475793e-05, + "loss": 0.2355, + "step": 20840 + }, + { + "epoch": 2.78, + "grad_norm": 0.6171875, + "learning_rate": 5.144221682100914e-05, + "loss": 0.237, + "step": 20841 + }, + { + "epoch": 2.78, + "grad_norm": 0.4921875, + "learning_rate": 5.143203715291369e-05, + "loss": 0.3422, + "step": 20842 + }, + { + "epoch": 2.78, + "grad_norm": 0.44140625, + "learning_rate": 5.142185814343103e-05, + "loss": 0.2756, + "step": 20843 + }, + { + "epoch": 2.78, + "grad_norm": 0.451171875, + "learning_rate": 5.1411679792699094e-05, + "loss": 0.1368, + "step": 20844 + }, + { + "epoch": 2.78, + "grad_norm": 0.66015625, + "learning_rate": 5.1401502100855994e-05, + "loss": 0.282, + "step": 20845 + }, + { + "epoch": 2.78, + "grad_norm": 0.82421875, + "learning_rate": 5.1391325068039696e-05, + "loss": 0.3356, + "step": 20846 + }, + { + "epoch": 2.78, + "grad_norm": 0.447265625, + "learning_rate": 5.138114869438822e-05, + "loss": 0.3387, + "step": 20847 + }, + { + "epoch": 2.78, + "grad_norm": 0.625, + "learning_rate": 5.137097298003962e-05, + "loss": 0.2174, + "step": 20848 + }, + { + "epoch": 2.78, + "grad_norm": 0.65625, + "learning_rate": 5.13607979251318e-05, + "loss": 0.4769, + "step": 20849 + }, + { + "epoch": 2.78, + "grad_norm": 0.609375, + "learning_rate": 5.135062352980276e-05, + "loss": 0.5454, + "step": 20850 + }, + { + "epoch": 2.78, + "grad_norm": 0.62890625, + "learning_rate": 5.134044979419051e-05, + "loss": 0.2634, + "step": 20851 + }, + { + "epoch": 2.78, + "grad_norm": 0.78515625, + "learning_rate": 5.1330276718433024e-05, + "loss": 0.341, + "step": 20852 + }, + { + "epoch": 2.78, + "grad_norm": 0.56640625, + "learning_rate": 5.1320104302668204e-05, + "loss": 0.3636, + "step": 20853 + }, + { + "epoch": 2.78, + "grad_norm": 0.62890625, + "learning_rate": 5.1309932547034e-05, + "loss": 0.215, + "step": 20854 + }, + { + "epoch": 2.78, + "grad_norm": 0.6796875, + "learning_rate": 5.129976145166835e-05, + "loss": 0.374, + "step": 20855 + }, + { + "epoch": 2.78, + "grad_norm": 0.515625, + "learning_rate": 5.128959101670918e-05, + "loss": 0.2788, + "step": 20856 + }, + { + "epoch": 2.78, + "grad_norm": 0.49609375, + "learning_rate": 5.127942124229446e-05, + "loss": 0.3118, + "step": 20857 + }, + { + "epoch": 2.78, + "grad_norm": 0.6015625, + "learning_rate": 5.126925212856202e-05, + "loss": 0.2006, + "step": 20858 + }, + { + "epoch": 2.78, + "grad_norm": 0.63671875, + "learning_rate": 5.12590836756498e-05, + "loss": 0.4017, + "step": 20859 + }, + { + "epoch": 2.78, + "grad_norm": 0.52734375, + "learning_rate": 5.124891588369569e-05, + "loss": 0.2961, + "step": 20860 + }, + { + "epoch": 2.78, + "grad_norm": 0.6640625, + "learning_rate": 5.12387487528376e-05, + "loss": 0.5234, + "step": 20861 + }, + { + "epoch": 2.78, + "grad_norm": 0.58984375, + "learning_rate": 5.122858228321333e-05, + "loss": 0.3569, + "step": 20862 + }, + { + "epoch": 2.78, + "grad_norm": 0.703125, + "learning_rate": 5.12184164749608e-05, + "loss": 0.3412, + "step": 20863 + }, + { + "epoch": 2.78, + "grad_norm": 0.54296875, + "learning_rate": 5.120825132821787e-05, + "loss": 0.2656, + "step": 20864 + }, + { + "epoch": 2.78, + "grad_norm": 0.62109375, + "learning_rate": 5.119808684312234e-05, + "loss": 0.5412, + "step": 20865 + }, + { + "epoch": 2.78, + "grad_norm": 0.59765625, + "learning_rate": 5.11879230198121e-05, + "loss": 0.203, + "step": 20866 + }, + { + "epoch": 2.78, + "grad_norm": 0.671875, + "learning_rate": 5.117775985842492e-05, + "loss": 0.412, + "step": 20867 + }, + { + "epoch": 2.78, + "grad_norm": 0.56640625, + "learning_rate": 5.116759735909866e-05, + "loss": 0.2685, + "step": 20868 + }, + { + "epoch": 2.78, + "grad_norm": 0.55078125, + "learning_rate": 5.115743552197112e-05, + "loss": 0.2335, + "step": 20869 + }, + { + "epoch": 2.78, + "grad_norm": 0.6328125, + "learning_rate": 5.1147274347180095e-05, + "loss": 0.2883, + "step": 20870 + }, + { + "epoch": 2.79, + "grad_norm": 0.56640625, + "learning_rate": 5.113711383486341e-05, + "loss": 0.571, + "step": 20871 + }, + { + "epoch": 2.79, + "grad_norm": 0.5546875, + "learning_rate": 5.11269539851588e-05, + "loss": 0.1877, + "step": 20872 + }, + { + "epoch": 2.79, + "grad_norm": 0.5390625, + "learning_rate": 5.111679479820406e-05, + "loss": 0.3431, + "step": 20873 + }, + { + "epoch": 2.79, + "grad_norm": 0.75390625, + "learning_rate": 5.110663627413694e-05, + "loss": 0.3216, + "step": 20874 + }, + { + "epoch": 2.79, + "grad_norm": 0.55078125, + "learning_rate": 5.109647841309526e-05, + "loss": 0.1852, + "step": 20875 + }, + { + "epoch": 2.79, + "grad_norm": 0.83984375, + "learning_rate": 5.108632121521668e-05, + "loss": 0.344, + "step": 20876 + }, + { + "epoch": 2.79, + "grad_norm": 0.71875, + "learning_rate": 5.1076164680639016e-05, + "loss": 0.4594, + "step": 20877 + }, + { + "epoch": 2.79, + "grad_norm": 0.53125, + "learning_rate": 5.106600880949992e-05, + "loss": 0.3185, + "step": 20878 + }, + { + "epoch": 2.79, + "grad_norm": 0.43359375, + "learning_rate": 5.1055853601937145e-05, + "loss": 0.182, + "step": 20879 + }, + { + "epoch": 2.79, + "grad_norm": 0.6015625, + "learning_rate": 5.1045699058088444e-05, + "loss": 0.3436, + "step": 20880 + }, + { + "epoch": 2.79, + "grad_norm": 0.68359375, + "learning_rate": 5.103554517809146e-05, + "loss": 0.3543, + "step": 20881 + }, + { + "epoch": 2.79, + "grad_norm": 0.65625, + "learning_rate": 5.102539196208389e-05, + "loss": 0.3367, + "step": 20882 + }, + { + "epoch": 2.79, + "grad_norm": 0.42578125, + "learning_rate": 5.101523941020345e-05, + "loss": 0.321, + "step": 20883 + }, + { + "epoch": 2.79, + "grad_norm": 0.65234375, + "learning_rate": 5.1005087522587834e-05, + "loss": 0.3705, + "step": 20884 + }, + { + "epoch": 2.79, + "grad_norm": 0.5859375, + "learning_rate": 5.099493629937464e-05, + "loss": 0.3349, + "step": 20885 + }, + { + "epoch": 2.79, + "grad_norm": 0.7421875, + "learning_rate": 5.098478574070156e-05, + "loss": 0.6644, + "step": 20886 + }, + { + "epoch": 2.79, + "grad_norm": 0.5859375, + "learning_rate": 5.0974635846706244e-05, + "loss": 0.2959, + "step": 20887 + }, + { + "epoch": 2.79, + "grad_norm": 0.6171875, + "learning_rate": 5.096448661752637e-05, + "loss": 0.4141, + "step": 20888 + }, + { + "epoch": 2.79, + "grad_norm": 0.71484375, + "learning_rate": 5.095433805329952e-05, + "loss": 0.3038, + "step": 20889 + }, + { + "epoch": 2.79, + "grad_norm": 0.5234375, + "learning_rate": 5.0944190154163295e-05, + "loss": 0.206, + "step": 20890 + }, + { + "epoch": 2.79, + "grad_norm": 0.59765625, + "learning_rate": 5.093404292025533e-05, + "loss": 0.3824, + "step": 20891 + }, + { + "epoch": 2.79, + "grad_norm": 0.4765625, + "learning_rate": 5.092389635171324e-05, + "loss": 0.3207, + "step": 20892 + }, + { + "epoch": 2.79, + "grad_norm": 0.5703125, + "learning_rate": 5.0913750448674636e-05, + "loss": 0.2965, + "step": 20893 + }, + { + "epoch": 2.79, + "grad_norm": 0.5859375, + "learning_rate": 5.090360521127705e-05, + "loss": 0.2806, + "step": 20894 + }, + { + "epoch": 2.79, + "grad_norm": 0.65625, + "learning_rate": 5.08934606396581e-05, + "loss": 0.4533, + "step": 20895 + }, + { + "epoch": 2.79, + "grad_norm": 0.71875, + "learning_rate": 5.088331673395532e-05, + "loss": 0.4151, + "step": 20896 + }, + { + "epoch": 2.79, + "grad_norm": 0.5, + "learning_rate": 5.087317349430629e-05, + "loss": 0.1778, + "step": 20897 + }, + { + "epoch": 2.79, + "grad_norm": 0.484375, + "learning_rate": 5.086303092084861e-05, + "loss": 0.2548, + "step": 20898 + }, + { + "epoch": 2.79, + "grad_norm": 0.51953125, + "learning_rate": 5.085288901371972e-05, + "loss": 0.3449, + "step": 20899 + }, + { + "epoch": 2.79, + "grad_norm": 0.51953125, + "learning_rate": 5.0842747773057185e-05, + "loss": 0.3565, + "step": 20900 + }, + { + "epoch": 2.79, + "grad_norm": 0.63671875, + "learning_rate": 5.0832607198998584e-05, + "loss": 0.2595, + "step": 20901 + }, + { + "epoch": 2.79, + "grad_norm": 0.5703125, + "learning_rate": 5.0822467291681365e-05, + "loss": 0.5381, + "step": 20902 + }, + { + "epoch": 2.79, + "grad_norm": 0.6328125, + "learning_rate": 5.081232805124307e-05, + "loss": 0.4393, + "step": 20903 + }, + { + "epoch": 2.79, + "grad_norm": 0.64453125, + "learning_rate": 5.080218947782114e-05, + "loss": 0.4646, + "step": 20904 + }, + { + "epoch": 2.79, + "grad_norm": 0.671875, + "learning_rate": 5.079205157155309e-05, + "loss": 0.3006, + "step": 20905 + }, + { + "epoch": 2.79, + "grad_norm": 0.4296875, + "learning_rate": 5.078191433257642e-05, + "loss": 0.181, + "step": 20906 + }, + { + "epoch": 2.79, + "grad_norm": 0.7109375, + "learning_rate": 5.077177776102861e-05, + "loss": 0.7028, + "step": 20907 + }, + { + "epoch": 2.79, + "grad_norm": 0.65625, + "learning_rate": 5.0761641857047036e-05, + "loss": 0.4628, + "step": 20908 + }, + { + "epoch": 2.79, + "grad_norm": 0.73828125, + "learning_rate": 5.075150662076922e-05, + "loss": 0.3217, + "step": 20909 + }, + { + "epoch": 2.79, + "grad_norm": 0.66015625, + "learning_rate": 5.074137205233257e-05, + "loss": 0.4553, + "step": 20910 + }, + { + "epoch": 2.79, + "grad_norm": 0.6328125, + "learning_rate": 5.073123815187457e-05, + "loss": 0.2324, + "step": 20911 + }, + { + "epoch": 2.79, + "grad_norm": 0.451171875, + "learning_rate": 5.0721104919532566e-05, + "loss": 0.2444, + "step": 20912 + }, + { + "epoch": 2.79, + "grad_norm": 0.7890625, + "learning_rate": 5.071097235544404e-05, + "loss": 0.5296, + "step": 20913 + }, + { + "epoch": 2.79, + "grad_norm": 0.53125, + "learning_rate": 5.0700840459746326e-05, + "loss": 0.2986, + "step": 20914 + }, + { + "epoch": 2.79, + "grad_norm": 0.625, + "learning_rate": 5.069070923257685e-05, + "loss": 0.5117, + "step": 20915 + }, + { + "epoch": 2.79, + "grad_norm": 0.71875, + "learning_rate": 5.068057867407305e-05, + "loss": 0.4328, + "step": 20916 + }, + { + "epoch": 2.79, + "grad_norm": 0.65625, + "learning_rate": 5.0670448784372214e-05, + "loss": 0.2843, + "step": 20917 + }, + { + "epoch": 2.79, + "grad_norm": 0.546875, + "learning_rate": 5.0660319563611744e-05, + "loss": 0.4028, + "step": 20918 + }, + { + "epoch": 2.79, + "grad_norm": 0.474609375, + "learning_rate": 5.065019101192903e-05, + "loss": 0.1491, + "step": 20919 + }, + { + "epoch": 2.79, + "grad_norm": 0.49609375, + "learning_rate": 5.0640063129461365e-05, + "loss": 0.2736, + "step": 20920 + }, + { + "epoch": 2.79, + "grad_norm": 0.52734375, + "learning_rate": 5.062993591634619e-05, + "loss": 0.4355, + "step": 20921 + }, + { + "epoch": 2.79, + "grad_norm": 0.546875, + "learning_rate": 5.0619809372720725e-05, + "loss": 0.2593, + "step": 20922 + }, + { + "epoch": 2.79, + "grad_norm": 0.57421875, + "learning_rate": 5.0609683498722324e-05, + "loss": 0.3663, + "step": 20923 + }, + { + "epoch": 2.79, + "grad_norm": 0.5859375, + "learning_rate": 5.0599558294488325e-05, + "loss": 0.2575, + "step": 20924 + }, + { + "epoch": 2.79, + "grad_norm": 0.63671875, + "learning_rate": 5.0589433760156055e-05, + "loss": 0.3587, + "step": 20925 + }, + { + "epoch": 2.79, + "grad_norm": 0.443359375, + "learning_rate": 5.057930989586277e-05, + "loss": 0.3219, + "step": 20926 + }, + { + "epoch": 2.79, + "grad_norm": 0.5546875, + "learning_rate": 5.056918670174573e-05, + "loss": 0.378, + "step": 20927 + }, + { + "epoch": 2.79, + "grad_norm": 0.66796875, + "learning_rate": 5.055906417794225e-05, + "loss": 0.4968, + "step": 20928 + }, + { + "epoch": 2.79, + "grad_norm": 0.443359375, + "learning_rate": 5.054894232458958e-05, + "loss": 0.3296, + "step": 20929 + }, + { + "epoch": 2.79, + "grad_norm": 0.6015625, + "learning_rate": 5.0538821141825024e-05, + "loss": 0.3632, + "step": 20930 + }, + { + "epoch": 2.79, + "grad_norm": 0.6328125, + "learning_rate": 5.0528700629785764e-05, + "loss": 0.2486, + "step": 20931 + }, + { + "epoch": 2.79, + "grad_norm": 0.51171875, + "learning_rate": 5.051858078860907e-05, + "loss": 0.2975, + "step": 20932 + }, + { + "epoch": 2.79, + "grad_norm": 0.67578125, + "learning_rate": 5.050846161843219e-05, + "loss": 0.2802, + "step": 20933 + }, + { + "epoch": 2.79, + "grad_norm": 0.88671875, + "learning_rate": 5.0498343119392365e-05, + "loss": 0.2937, + "step": 20934 + }, + { + "epoch": 2.79, + "grad_norm": 0.63671875, + "learning_rate": 5.048822529162673e-05, + "loss": 0.3813, + "step": 20935 + }, + { + "epoch": 2.79, + "grad_norm": 0.91796875, + "learning_rate": 5.047810813527255e-05, + "loss": 0.5914, + "step": 20936 + }, + { + "epoch": 2.79, + "grad_norm": 0.64453125, + "learning_rate": 5.046799165046704e-05, + "loss": 0.6994, + "step": 20937 + }, + { + "epoch": 2.79, + "grad_norm": 0.76171875, + "learning_rate": 5.045787583734731e-05, + "loss": 0.3467, + "step": 20938 + }, + { + "epoch": 2.79, + "grad_norm": 0.77734375, + "learning_rate": 5.044776069605062e-05, + "loss": 0.4561, + "step": 20939 + }, + { + "epoch": 2.79, + "grad_norm": 0.5078125, + "learning_rate": 5.0437646226714066e-05, + "loss": 0.21, + "step": 20940 + }, + { + "epoch": 2.79, + "grad_norm": 0.5859375, + "learning_rate": 5.0427532429474834e-05, + "loss": 0.378, + "step": 20941 + }, + { + "epoch": 2.79, + "grad_norm": 0.7734375, + "learning_rate": 5.0417419304470074e-05, + "loss": 0.3029, + "step": 20942 + }, + { + "epoch": 2.79, + "grad_norm": 0.5703125, + "learning_rate": 5.0407306851836936e-05, + "loss": 0.4914, + "step": 20943 + }, + { + "epoch": 2.79, + "grad_norm": 0.490234375, + "learning_rate": 5.039719507171258e-05, + "loss": 0.2349, + "step": 20944 + }, + { + "epoch": 2.79, + "grad_norm": 0.64453125, + "learning_rate": 5.038708396423405e-05, + "loss": 0.3919, + "step": 20945 + }, + { + "epoch": 2.8, + "grad_norm": 0.671875, + "learning_rate": 5.0376973529538505e-05, + "loss": 0.4827, + "step": 20946 + }, + { + "epoch": 2.8, + "grad_norm": 0.67578125, + "learning_rate": 5.036686376776305e-05, + "loss": 0.3206, + "step": 20947 + }, + { + "epoch": 2.8, + "grad_norm": 0.50390625, + "learning_rate": 5.035675467904481e-05, + "loss": 0.2288, + "step": 20948 + }, + { + "epoch": 2.8, + "grad_norm": 0.54296875, + "learning_rate": 5.0346646263520857e-05, + "loss": 0.19, + "step": 20949 + }, + { + "epoch": 2.8, + "grad_norm": 0.52734375, + "learning_rate": 5.033653852132819e-05, + "loss": 0.4403, + "step": 20950 + }, + { + "epoch": 2.8, + "grad_norm": 0.435546875, + "learning_rate": 5.032643145260394e-05, + "loss": 0.1602, + "step": 20951 + }, + { + "epoch": 2.8, + "grad_norm": 0.66015625, + "learning_rate": 5.031632505748516e-05, + "loss": 0.4436, + "step": 20952 + }, + { + "epoch": 2.8, + "grad_norm": 0.53125, + "learning_rate": 5.030621933610893e-05, + "loss": 0.3455, + "step": 20953 + }, + { + "epoch": 2.8, + "grad_norm": 0.462890625, + "learning_rate": 5.0296114288612226e-05, + "loss": 0.2017, + "step": 20954 + }, + { + "epoch": 2.8, + "grad_norm": 0.57421875, + "learning_rate": 5.0286009915132123e-05, + "loss": 0.3099, + "step": 20955 + }, + { + "epoch": 2.8, + "grad_norm": 0.65234375, + "learning_rate": 5.0275906215805625e-05, + "loss": 0.3665, + "step": 20956 + }, + { + "epoch": 2.8, + "grad_norm": 0.57421875, + "learning_rate": 5.026580319076978e-05, + "loss": 0.2084, + "step": 20957 + }, + { + "epoch": 2.8, + "grad_norm": 0.546875, + "learning_rate": 5.025570084016153e-05, + "loss": 0.2065, + "step": 20958 + }, + { + "epoch": 2.8, + "grad_norm": 0.5234375, + "learning_rate": 5.024559916411791e-05, + "loss": 0.3461, + "step": 20959 + }, + { + "epoch": 2.8, + "grad_norm": 0.6328125, + "learning_rate": 5.0235498162775885e-05, + "loss": 0.5993, + "step": 20960 + }, + { + "epoch": 2.8, + "grad_norm": 0.56640625, + "learning_rate": 5.022539783627249e-05, + "loss": 0.3483, + "step": 20961 + }, + { + "epoch": 2.8, + "grad_norm": 0.5234375, + "learning_rate": 5.021529818474464e-05, + "loss": 0.2266, + "step": 20962 + }, + { + "epoch": 2.8, + "grad_norm": 0.609375, + "learning_rate": 5.020519920832927e-05, + "loss": 0.4517, + "step": 20963 + }, + { + "epoch": 2.8, + "grad_norm": 0.71484375, + "learning_rate": 5.019510090716335e-05, + "loss": 0.2743, + "step": 20964 + }, + { + "epoch": 2.8, + "grad_norm": 0.5234375, + "learning_rate": 5.0185003281383826e-05, + "loss": 0.1803, + "step": 20965 + }, + { + "epoch": 2.8, + "grad_norm": 0.8359375, + "learning_rate": 5.017490633112767e-05, + "loss": 0.3543, + "step": 20966 + }, + { + "epoch": 2.8, + "grad_norm": 0.59375, + "learning_rate": 5.016481005653171e-05, + "loss": 0.2099, + "step": 20967 + }, + { + "epoch": 2.8, + "grad_norm": 0.38671875, + "learning_rate": 5.015471445773292e-05, + "loss": 0.1383, + "step": 20968 + }, + { + "epoch": 2.8, + "grad_norm": 0.609375, + "learning_rate": 5.01446195348682e-05, + "loss": 0.1995, + "step": 20969 + }, + { + "epoch": 2.8, + "grad_norm": 0.6171875, + "learning_rate": 5.013452528807443e-05, + "loss": 0.5727, + "step": 20970 + }, + { + "epoch": 2.8, + "grad_norm": 0.376953125, + "learning_rate": 5.012443171748853e-05, + "loss": 0.1445, + "step": 20971 + }, + { + "epoch": 2.8, + "grad_norm": 0.74609375, + "learning_rate": 5.011433882324732e-05, + "loss": 0.539, + "step": 20972 + }, + { + "epoch": 2.8, + "grad_norm": 0.640625, + "learning_rate": 5.010424660548769e-05, + "loss": 0.4877, + "step": 20973 + }, + { + "epoch": 2.8, + "grad_norm": 0.6171875, + "learning_rate": 5.0094155064346534e-05, + "loss": 0.3436, + "step": 20974 + }, + { + "epoch": 2.8, + "grad_norm": 0.65625, + "learning_rate": 5.0084064199960625e-05, + "loss": 0.3733, + "step": 20975 + }, + { + "epoch": 2.8, + "grad_norm": 0.703125, + "learning_rate": 5.0073974012466886e-05, + "loss": 0.3043, + "step": 20976 + }, + { + "epoch": 2.8, + "grad_norm": 0.7109375, + "learning_rate": 5.0063884502002055e-05, + "loss": 0.4018, + "step": 20977 + }, + { + "epoch": 2.8, + "grad_norm": 0.64453125, + "learning_rate": 5.0053795668703006e-05, + "loss": 0.303, + "step": 20978 + }, + { + "epoch": 2.8, + "grad_norm": 0.640625, + "learning_rate": 5.0043707512706525e-05, + "loss": 0.589, + "step": 20979 + }, + { + "epoch": 2.8, + "grad_norm": 0.59375, + "learning_rate": 5.00336200341495e-05, + "loss": 0.2349, + "step": 20980 + }, + { + "epoch": 2.8, + "grad_norm": 0.6875, + "learning_rate": 5.00235332331686e-05, + "loss": 0.2282, + "step": 20981 + }, + { + "epoch": 2.8, + "grad_norm": 0.6015625, + "learning_rate": 5.0013447109900654e-05, + "loss": 0.2233, + "step": 20982 + }, + { + "epoch": 2.8, + "grad_norm": 0.5390625, + "learning_rate": 5.0003361664482476e-05, + "loss": 0.3657, + "step": 20983 + }, + { + "epoch": 2.8, + "grad_norm": 0.427734375, + "learning_rate": 4.9993276897050814e-05, + "loss": 0.2286, + "step": 20984 + }, + { + "epoch": 2.8, + "grad_norm": 0.453125, + "learning_rate": 4.9983192807742386e-05, + "loss": 0.2133, + "step": 20985 + }, + { + "epoch": 2.8, + "grad_norm": 0.6015625, + "learning_rate": 4.9973109396694e-05, + "loss": 0.3619, + "step": 20986 + }, + { + "epoch": 2.8, + "grad_norm": 0.51953125, + "learning_rate": 4.9963026664042315e-05, + "loss": 0.3268, + "step": 20987 + }, + { + "epoch": 2.8, + "grad_norm": 0.369140625, + "learning_rate": 4.99529446099241e-05, + "loss": 0.2324, + "step": 20988 + }, + { + "epoch": 2.8, + "grad_norm": 0.78515625, + "learning_rate": 4.9942863234476126e-05, + "loss": 0.4159, + "step": 20989 + }, + { + "epoch": 2.8, + "grad_norm": 0.515625, + "learning_rate": 4.9932782537834996e-05, + "loss": 0.3479, + "step": 20990 + }, + { + "epoch": 2.8, + "grad_norm": 0.5859375, + "learning_rate": 4.992270252013749e-05, + "loss": 0.2845, + "step": 20991 + }, + { + "epoch": 2.8, + "grad_norm": 0.56640625, + "learning_rate": 4.9912623181520255e-05, + "loss": 0.2837, + "step": 20992 + }, + { + "epoch": 2.8, + "grad_norm": 0.453125, + "learning_rate": 4.990254452212001e-05, + "loss": 0.224, + "step": 20993 + }, + { + "epoch": 2.8, + "grad_norm": 0.58984375, + "learning_rate": 4.989246654207344e-05, + "loss": 0.2755, + "step": 20994 + }, + { + "epoch": 2.8, + "grad_norm": 0.478515625, + "learning_rate": 4.988238924151716e-05, + "loss": 0.4257, + "step": 20995 + }, + { + "epoch": 2.8, + "grad_norm": 0.6875, + "learning_rate": 4.9872312620587836e-05, + "loss": 0.3058, + "step": 20996 + }, + { + "epoch": 2.8, + "grad_norm": 0.5859375, + "learning_rate": 4.986223667942214e-05, + "loss": 0.48, + "step": 20997 + }, + { + "epoch": 2.8, + "grad_norm": 0.36328125, + "learning_rate": 4.985216141815672e-05, + "loss": 0.1222, + "step": 20998 + }, + { + "epoch": 2.8, + "grad_norm": 0.6171875, + "learning_rate": 4.984208683692817e-05, + "loss": 0.3134, + "step": 20999 + }, + { + "epoch": 2.8, + "grad_norm": 0.70703125, + "learning_rate": 4.9832012935873076e-05, + "loss": 0.4608, + "step": 21000 + }, + { + "epoch": 2.8, + "grad_norm": 0.52734375, + "learning_rate": 4.9821939715128096e-05, + "loss": 0.3783, + "step": 21001 + }, + { + "epoch": 2.8, + "grad_norm": 0.58203125, + "learning_rate": 4.981186717482981e-05, + "loss": 0.3882, + "step": 21002 + }, + { + "epoch": 2.8, + "grad_norm": 0.640625, + "learning_rate": 4.980179531511485e-05, + "loss": 0.3527, + "step": 21003 + }, + { + "epoch": 2.8, + "grad_norm": 0.4375, + "learning_rate": 4.979172413611973e-05, + "loss": 0.1295, + "step": 21004 + }, + { + "epoch": 2.8, + "grad_norm": 0.486328125, + "learning_rate": 4.978165363798105e-05, + "loss": 0.1598, + "step": 21005 + }, + { + "epoch": 2.8, + "grad_norm": 0.43359375, + "learning_rate": 4.977158382083538e-05, + "loss": 0.2771, + "step": 21006 + }, + { + "epoch": 2.8, + "grad_norm": 0.578125, + "learning_rate": 4.9761514684819314e-05, + "loss": 0.3187, + "step": 21007 + }, + { + "epoch": 2.8, + "grad_norm": 0.55078125, + "learning_rate": 4.97514462300693e-05, + "loss": 0.3416, + "step": 21008 + }, + { + "epoch": 2.8, + "grad_norm": 0.65234375, + "learning_rate": 4.974137845672193e-05, + "loss": 0.3054, + "step": 21009 + }, + { + "epoch": 2.8, + "grad_norm": 0.63671875, + "learning_rate": 4.973131136491377e-05, + "loss": 0.5039, + "step": 21010 + }, + { + "epoch": 2.8, + "grad_norm": 0.8125, + "learning_rate": 4.9721244954781245e-05, + "loss": 0.4081, + "step": 21011 + }, + { + "epoch": 2.8, + "grad_norm": 0.65625, + "learning_rate": 4.971117922646095e-05, + "loss": 0.3808, + "step": 21012 + }, + { + "epoch": 2.8, + "grad_norm": 0.5, + "learning_rate": 4.970111418008929e-05, + "loss": 0.2412, + "step": 21013 + }, + { + "epoch": 2.8, + "grad_norm": 0.52734375, + "learning_rate": 4.9691049815802805e-05, + "loss": 0.3479, + "step": 21014 + }, + { + "epoch": 2.8, + "grad_norm": 0.5546875, + "learning_rate": 4.968098613373798e-05, + "loss": 0.2897, + "step": 21015 + }, + { + "epoch": 2.8, + "grad_norm": 0.640625, + "learning_rate": 4.967092313403128e-05, + "loss": 0.2119, + "step": 21016 + }, + { + "epoch": 2.8, + "grad_norm": 0.6796875, + "learning_rate": 4.96608608168192e-05, + "loss": 0.3532, + "step": 21017 + }, + { + "epoch": 2.8, + "grad_norm": 0.62109375, + "learning_rate": 4.965079918223811e-05, + "loss": 0.2899, + "step": 21018 + }, + { + "epoch": 2.8, + "grad_norm": 0.87109375, + "learning_rate": 4.9640738230424496e-05, + "loss": 0.4432, + "step": 21019 + }, + { + "epoch": 2.8, + "grad_norm": 0.66796875, + "learning_rate": 4.9630677961514806e-05, + "loss": 0.2071, + "step": 21020 + }, + { + "epoch": 2.81, + "grad_norm": 0.71484375, + "learning_rate": 4.9620618375645465e-05, + "loss": 0.2529, + "step": 21021 + }, + { + "epoch": 2.81, + "grad_norm": 0.55078125, + "learning_rate": 4.961055947295289e-05, + "loss": 0.536, + "step": 21022 + }, + { + "epoch": 2.81, + "grad_norm": 0.58984375, + "learning_rate": 4.960050125357343e-05, + "loss": 0.4367, + "step": 21023 + }, + { + "epoch": 2.81, + "grad_norm": 0.330078125, + "learning_rate": 4.959044371764351e-05, + "loss": 0.1051, + "step": 21024 + }, + { + "epoch": 2.81, + "grad_norm": 0.5546875, + "learning_rate": 4.958038686529953e-05, + "loss": 0.4595, + "step": 21025 + }, + { + "epoch": 2.81, + "grad_norm": 0.56640625, + "learning_rate": 4.95703306966779e-05, + "loss": 0.2951, + "step": 21026 + }, + { + "epoch": 2.81, + "grad_norm": 0.7578125, + "learning_rate": 4.9560275211914916e-05, + "loss": 0.2494, + "step": 21027 + }, + { + "epoch": 2.81, + "grad_norm": 0.640625, + "learning_rate": 4.9550220411146974e-05, + "loss": 0.4146, + "step": 21028 + }, + { + "epoch": 2.81, + "grad_norm": 0.51953125, + "learning_rate": 4.9540166294510426e-05, + "loss": 0.5039, + "step": 21029 + }, + { + "epoch": 2.81, + "grad_norm": 0.55859375, + "learning_rate": 4.953011286214163e-05, + "loss": 0.2738, + "step": 21030 + }, + { + "epoch": 2.81, + "grad_norm": 0.65234375, + "learning_rate": 4.952006011417687e-05, + "loss": 0.2558, + "step": 21031 + }, + { + "epoch": 2.81, + "grad_norm": 0.7421875, + "learning_rate": 4.951000805075249e-05, + "loss": 0.3444, + "step": 21032 + }, + { + "epoch": 2.81, + "grad_norm": 0.515625, + "learning_rate": 4.94999566720048e-05, + "loss": 0.3222, + "step": 21033 + }, + { + "epoch": 2.81, + "grad_norm": 0.5234375, + "learning_rate": 4.948990597807015e-05, + "loss": 0.4604, + "step": 21034 + }, + { + "epoch": 2.81, + "grad_norm": 0.6640625, + "learning_rate": 4.94798559690848e-05, + "loss": 0.4859, + "step": 21035 + }, + { + "epoch": 2.81, + "grad_norm": 0.48828125, + "learning_rate": 4.946980664518498e-05, + "loss": 0.1836, + "step": 21036 + }, + { + "epoch": 2.81, + "grad_norm": 0.4375, + "learning_rate": 4.9459758006507004e-05, + "loss": 0.2245, + "step": 21037 + }, + { + "epoch": 2.81, + "grad_norm": 0.466796875, + "learning_rate": 4.944971005318716e-05, + "loss": 0.4008, + "step": 21038 + }, + { + "epoch": 2.81, + "grad_norm": 0.61328125, + "learning_rate": 4.9439662785361715e-05, + "loss": 0.4921, + "step": 21039 + }, + { + "epoch": 2.81, + "grad_norm": 0.5, + "learning_rate": 4.942961620316687e-05, + "loss": 0.1907, + "step": 21040 + }, + { + "epoch": 2.81, + "grad_norm": 0.5234375, + "learning_rate": 4.941957030673887e-05, + "loss": 0.2582, + "step": 21041 + }, + { + "epoch": 2.81, + "grad_norm": 0.7421875, + "learning_rate": 4.940952509621397e-05, + "loss": 0.3338, + "step": 21042 + }, + { + "epoch": 2.81, + "grad_norm": 0.890625, + "learning_rate": 4.939948057172837e-05, + "loss": 0.3771, + "step": 21043 + }, + { + "epoch": 2.81, + "grad_norm": 0.6796875, + "learning_rate": 4.9389436733418335e-05, + "loss": 0.4833, + "step": 21044 + }, + { + "epoch": 2.81, + "grad_norm": 0.66015625, + "learning_rate": 4.9379393581419974e-05, + "loss": 0.3836, + "step": 21045 + }, + { + "epoch": 2.81, + "grad_norm": 0.455078125, + "learning_rate": 4.9369351115869535e-05, + "loss": 0.1983, + "step": 21046 + }, + { + "epoch": 2.81, + "grad_norm": 0.64453125, + "learning_rate": 4.935930933690321e-05, + "loss": 0.4235, + "step": 21047 + }, + { + "epoch": 2.81, + "grad_norm": 0.66796875, + "learning_rate": 4.934926824465713e-05, + "loss": 0.5064, + "step": 21048 + }, + { + "epoch": 2.81, + "grad_norm": 0.64453125, + "learning_rate": 4.933922783926751e-05, + "loss": 0.3943, + "step": 21049 + }, + { + "epoch": 2.81, + "grad_norm": 0.51953125, + "learning_rate": 4.932918812087044e-05, + "loss": 0.3633, + "step": 21050 + }, + { + "epoch": 2.81, + "grad_norm": 0.6171875, + "learning_rate": 4.93191490896021e-05, + "loss": 0.1983, + "step": 21051 + }, + { + "epoch": 2.81, + "grad_norm": 0.6015625, + "learning_rate": 4.9309110745598617e-05, + "loss": 0.392, + "step": 21052 + }, + { + "epoch": 2.81, + "grad_norm": 0.482421875, + "learning_rate": 4.9299073088996176e-05, + "loss": 0.2757, + "step": 21053 + }, + { + "epoch": 2.81, + "grad_norm": 0.490234375, + "learning_rate": 4.92890361199308e-05, + "loss": 0.3577, + "step": 21054 + }, + { + "epoch": 2.81, + "grad_norm": 0.59765625, + "learning_rate": 4.927899983853864e-05, + "loss": 0.2547, + "step": 21055 + }, + { + "epoch": 2.81, + "grad_norm": 0.54296875, + "learning_rate": 4.9268964244955796e-05, + "loss": 0.5302, + "step": 21056 + }, + { + "epoch": 2.81, + "grad_norm": 0.62890625, + "learning_rate": 4.925892933931838e-05, + "loss": 0.2936, + "step": 21057 + }, + { + "epoch": 2.81, + "grad_norm": 0.73828125, + "learning_rate": 4.924889512176243e-05, + "loss": 0.3176, + "step": 21058 + }, + { + "epoch": 2.81, + "grad_norm": 0.5625, + "learning_rate": 4.923886159242406e-05, + "loss": 0.2644, + "step": 21059 + }, + { + "epoch": 2.81, + "grad_norm": 0.455078125, + "learning_rate": 4.922882875143927e-05, + "loss": 0.2825, + "step": 21060 + }, + { + "epoch": 2.81, + "grad_norm": 0.57421875, + "learning_rate": 4.921879659894415e-05, + "loss": 0.2322, + "step": 21061 + }, + { + "epoch": 2.81, + "grad_norm": 0.4453125, + "learning_rate": 4.920876513507477e-05, + "loss": 0.3038, + "step": 21062 + }, + { + "epoch": 2.81, + "grad_norm": 0.71875, + "learning_rate": 4.91987343599671e-05, + "loss": 0.1595, + "step": 21063 + }, + { + "epoch": 2.81, + "grad_norm": 0.7421875, + "learning_rate": 4.918870427375719e-05, + "loss": 0.4, + "step": 21064 + }, + { + "epoch": 2.81, + "grad_norm": 0.52734375, + "learning_rate": 4.917867487658107e-05, + "loss": 0.3485, + "step": 21065 + }, + { + "epoch": 2.81, + "grad_norm": 0.75390625, + "learning_rate": 4.916864616857473e-05, + "loss": 0.3172, + "step": 21066 + }, + { + "epoch": 2.81, + "grad_norm": 0.494140625, + "learning_rate": 4.9158618149874216e-05, + "loss": 0.2237, + "step": 21067 + }, + { + "epoch": 2.81, + "grad_norm": 0.5, + "learning_rate": 4.9148590820615424e-05, + "loss": 0.3468, + "step": 21068 + }, + { + "epoch": 2.81, + "grad_norm": 0.498046875, + "learning_rate": 4.9138564180934375e-05, + "loss": 0.1529, + "step": 21069 + }, + { + "epoch": 2.81, + "grad_norm": 0.53515625, + "learning_rate": 4.912853823096703e-05, + "loss": 0.2492, + "step": 21070 + }, + { + "epoch": 2.81, + "grad_norm": 0.6328125, + "learning_rate": 4.911851297084942e-05, + "loss": 0.4986, + "step": 21071 + }, + { + "epoch": 2.81, + "grad_norm": 0.6015625, + "learning_rate": 4.910848840071741e-05, + "loss": 0.4751, + "step": 21072 + }, + { + "epoch": 2.81, + "grad_norm": 0.7578125, + "learning_rate": 4.909846452070693e-05, + "loss": 0.4288, + "step": 21073 + }, + { + "epoch": 2.81, + "grad_norm": 0.4765625, + "learning_rate": 4.908844133095394e-05, + "loss": 0.1644, + "step": 21074 + }, + { + "epoch": 2.81, + "grad_norm": 0.44921875, + "learning_rate": 4.907841883159435e-05, + "loss": 0.2139, + "step": 21075 + }, + { + "epoch": 2.81, + "grad_norm": 0.56640625, + "learning_rate": 4.9068397022764123e-05, + "loss": 0.2251, + "step": 21076 + }, + { + "epoch": 2.81, + "grad_norm": 0.431640625, + "learning_rate": 4.9058375904599084e-05, + "loss": 0.1027, + "step": 21077 + }, + { + "epoch": 2.81, + "grad_norm": 0.60546875, + "learning_rate": 4.904835547723516e-05, + "loss": 0.3536, + "step": 21078 + }, + { + "epoch": 2.81, + "grad_norm": 0.56640625, + "learning_rate": 4.903833574080825e-05, + "loss": 0.2636, + "step": 21079 + }, + { + "epoch": 2.81, + "grad_norm": 0.46875, + "learning_rate": 4.9028316695454236e-05, + "loss": 0.1464, + "step": 21080 + }, + { + "epoch": 2.81, + "grad_norm": 0.5625, + "learning_rate": 4.901829834130893e-05, + "loss": 0.3678, + "step": 21081 + }, + { + "epoch": 2.81, + "grad_norm": 0.9375, + "learning_rate": 4.900828067850822e-05, + "loss": 0.3363, + "step": 21082 + }, + { + "epoch": 2.81, + "grad_norm": 0.41796875, + "learning_rate": 4.8998263707187984e-05, + "loss": 0.2529, + "step": 21083 + }, + { + "epoch": 2.81, + "grad_norm": 0.50390625, + "learning_rate": 4.898824742748399e-05, + "loss": 0.1461, + "step": 21084 + }, + { + "epoch": 2.81, + "grad_norm": 0.7421875, + "learning_rate": 4.897823183953213e-05, + "loss": 0.4696, + "step": 21085 + }, + { + "epoch": 2.81, + "grad_norm": 0.60546875, + "learning_rate": 4.896821694346816e-05, + "loss": 0.3112, + "step": 21086 + }, + { + "epoch": 2.81, + "grad_norm": 0.44921875, + "learning_rate": 4.895820273942793e-05, + "loss": 0.1572, + "step": 21087 + }, + { + "epoch": 2.81, + "grad_norm": 0.6484375, + "learning_rate": 4.894818922754721e-05, + "loss": 0.5659, + "step": 21088 + }, + { + "epoch": 2.81, + "grad_norm": 0.578125, + "learning_rate": 4.8938176407961824e-05, + "loss": 0.3385, + "step": 21089 + }, + { + "epoch": 2.81, + "grad_norm": 0.54296875, + "learning_rate": 4.892816428080756e-05, + "loss": 0.4089, + "step": 21090 + }, + { + "epoch": 2.81, + "grad_norm": 0.55859375, + "learning_rate": 4.8918152846220145e-05, + "loss": 0.2908, + "step": 21091 + }, + { + "epoch": 2.81, + "grad_norm": 0.5625, + "learning_rate": 4.890814210433535e-05, + "loss": 0.3603, + "step": 21092 + }, + { + "epoch": 2.81, + "grad_norm": 0.47265625, + "learning_rate": 4.889813205528895e-05, + "loss": 0.3512, + "step": 21093 + }, + { + "epoch": 2.81, + "grad_norm": 0.50390625, + "learning_rate": 4.8888122699216696e-05, + "loss": 0.441, + "step": 21094 + }, + { + "epoch": 2.81, + "grad_norm": 0.66015625, + "learning_rate": 4.887811403625432e-05, + "loss": 0.4171, + "step": 21095 + }, + { + "epoch": 2.82, + "grad_norm": 0.515625, + "learning_rate": 4.886810606653748e-05, + "loss": 0.4843, + "step": 21096 + }, + { + "epoch": 2.82, + "grad_norm": 0.56640625, + "learning_rate": 4.885809879020194e-05, + "loss": 0.2769, + "step": 21097 + }, + { + "epoch": 2.82, + "grad_norm": 0.53515625, + "learning_rate": 4.88480922073834e-05, + "loss": 0.4219, + "step": 21098 + }, + { + "epoch": 2.82, + "grad_norm": 0.55078125, + "learning_rate": 4.8838086318217604e-05, + "loss": 0.224, + "step": 21099 + }, + { + "epoch": 2.82, + "grad_norm": 0.73828125, + "learning_rate": 4.882808112284015e-05, + "loss": 0.3298, + "step": 21100 + }, + { + "epoch": 2.82, + "grad_norm": 0.443359375, + "learning_rate": 4.881807662138675e-05, + "loss": 0.3109, + "step": 21101 + }, + { + "epoch": 2.82, + "grad_norm": 0.5625, + "learning_rate": 4.88080728139931e-05, + "loss": 0.2873, + "step": 21102 + }, + { + "epoch": 2.82, + "grad_norm": 0.7421875, + "learning_rate": 4.8798069700794855e-05, + "loss": 0.5575, + "step": 21103 + }, + { + "epoch": 2.82, + "grad_norm": 0.765625, + "learning_rate": 4.8788067281927616e-05, + "loss": 0.5749, + "step": 21104 + }, + { + "epoch": 2.82, + "grad_norm": 0.5078125, + "learning_rate": 4.877806555752705e-05, + "loss": 0.3783, + "step": 21105 + }, + { + "epoch": 2.82, + "grad_norm": 0.6953125, + "learning_rate": 4.8768064527728794e-05, + "loss": 0.3783, + "step": 21106 + }, + { + "epoch": 2.82, + "grad_norm": 0.66796875, + "learning_rate": 4.8758064192668504e-05, + "loss": 0.196, + "step": 21107 + }, + { + "epoch": 2.82, + "grad_norm": 0.5703125, + "learning_rate": 4.874806455248174e-05, + "loss": 0.4804, + "step": 21108 + }, + { + "epoch": 2.82, + "grad_norm": 0.6796875, + "learning_rate": 4.873806560730408e-05, + "loss": 0.2253, + "step": 21109 + }, + { + "epoch": 2.82, + "grad_norm": 0.58203125, + "learning_rate": 4.872806735727115e-05, + "loss": 0.4203, + "step": 21110 + }, + { + "epoch": 2.82, + "grad_norm": 0.5234375, + "learning_rate": 4.871806980251854e-05, + "loss": 0.3175, + "step": 21111 + }, + { + "epoch": 2.82, + "grad_norm": 0.72265625, + "learning_rate": 4.870807294318185e-05, + "loss": 0.2186, + "step": 21112 + }, + { + "epoch": 2.82, + "grad_norm": 0.7109375, + "learning_rate": 4.8698076779396575e-05, + "loss": 0.3737, + "step": 21113 + }, + { + "epoch": 2.82, + "grad_norm": 0.84375, + "learning_rate": 4.8688081311298296e-05, + "loss": 0.3135, + "step": 21114 + }, + { + "epoch": 2.82, + "grad_norm": 0.38671875, + "learning_rate": 4.8678086539022575e-05, + "loss": 0.1716, + "step": 21115 + }, + { + "epoch": 2.82, + "grad_norm": 0.3671875, + "learning_rate": 4.866809246270493e-05, + "loss": 0.1538, + "step": 21116 + }, + { + "epoch": 2.82, + "grad_norm": 0.390625, + "learning_rate": 4.8658099082480936e-05, + "loss": 0.1979, + "step": 21117 + }, + { + "epoch": 2.82, + "grad_norm": 0.466796875, + "learning_rate": 4.864810639848603e-05, + "loss": 0.2646, + "step": 21118 + }, + { + "epoch": 2.82, + "grad_norm": 0.55078125, + "learning_rate": 4.8638114410855794e-05, + "loss": 0.5638, + "step": 21119 + }, + { + "epoch": 2.82, + "grad_norm": 0.6328125, + "learning_rate": 4.862812311972567e-05, + "loss": 0.4273, + "step": 21120 + }, + { + "epoch": 2.82, + "grad_norm": 0.59375, + "learning_rate": 4.861813252523114e-05, + "loss": 0.5882, + "step": 21121 + }, + { + "epoch": 2.82, + "grad_norm": 0.67578125, + "learning_rate": 4.860814262750777e-05, + "loss": 0.2117, + "step": 21122 + }, + { + "epoch": 2.82, + "grad_norm": 0.3828125, + "learning_rate": 4.859815342669092e-05, + "loss": 0.1737, + "step": 21123 + }, + { + "epoch": 2.82, + "grad_norm": 0.373046875, + "learning_rate": 4.85881649229161e-05, + "loss": 0.2252, + "step": 21124 + }, + { + "epoch": 2.82, + "grad_norm": 0.5078125, + "learning_rate": 4.857817711631877e-05, + "loss": 0.2077, + "step": 21125 + }, + { + "epoch": 2.82, + "grad_norm": 0.59765625, + "learning_rate": 4.8568190007034385e-05, + "loss": 0.3313, + "step": 21126 + }, + { + "epoch": 2.82, + "grad_norm": 0.46484375, + "learning_rate": 4.8558203595198316e-05, + "loss": 0.1612, + "step": 21127 + }, + { + "epoch": 2.82, + "grad_norm": 0.5625, + "learning_rate": 4.854821788094602e-05, + "loss": 0.2926, + "step": 21128 + }, + { + "epoch": 2.82, + "grad_norm": 0.80078125, + "learning_rate": 4.853823286441293e-05, + "loss": 0.3873, + "step": 21129 + }, + { + "epoch": 2.82, + "grad_norm": 0.59765625, + "learning_rate": 4.852824854573444e-05, + "loss": 0.1946, + "step": 21130 + }, + { + "epoch": 2.82, + "grad_norm": 0.53515625, + "learning_rate": 4.851826492504592e-05, + "loss": 0.3013, + "step": 21131 + }, + { + "epoch": 2.82, + "grad_norm": 0.37890625, + "learning_rate": 4.85082820024828e-05, + "loss": 0.1203, + "step": 21132 + }, + { + "epoch": 2.82, + "grad_norm": 0.59765625, + "learning_rate": 4.849829977818038e-05, + "loss": 0.4189, + "step": 21133 + }, + { + "epoch": 2.82, + "grad_norm": 0.73046875, + "learning_rate": 4.848831825227407e-05, + "loss": 0.335, + "step": 21134 + }, + { + "epoch": 2.82, + "grad_norm": 0.53125, + "learning_rate": 4.847833742489926e-05, + "loss": 0.4163, + "step": 21135 + }, + { + "epoch": 2.82, + "grad_norm": 0.6171875, + "learning_rate": 4.846835729619123e-05, + "loss": 0.4272, + "step": 21136 + }, + { + "epoch": 2.82, + "grad_norm": 0.67578125, + "learning_rate": 4.845837786628533e-05, + "loss": 0.2788, + "step": 21137 + }, + { + "epoch": 2.82, + "grad_norm": 0.498046875, + "learning_rate": 4.844839913531692e-05, + "loss": 0.1349, + "step": 21138 + }, + { + "epoch": 2.82, + "grad_norm": 0.6640625, + "learning_rate": 4.8438421103421304e-05, + "loss": 0.471, + "step": 21139 + }, + { + "epoch": 2.82, + "grad_norm": 0.462890625, + "learning_rate": 4.8428443770733835e-05, + "loss": 0.1609, + "step": 21140 + }, + { + "epoch": 2.82, + "grad_norm": 0.546875, + "learning_rate": 4.8418467137389713e-05, + "loss": 0.3108, + "step": 21141 + }, + { + "epoch": 2.82, + "grad_norm": 0.51171875, + "learning_rate": 4.840849120352429e-05, + "loss": 0.2692, + "step": 21142 + }, + { + "epoch": 2.82, + "grad_norm": 0.71484375, + "learning_rate": 4.839851596927284e-05, + "loss": 0.2698, + "step": 21143 + }, + { + "epoch": 2.82, + "grad_norm": 0.6640625, + "learning_rate": 4.838854143477066e-05, + "loss": 0.322, + "step": 21144 + }, + { + "epoch": 2.82, + "grad_norm": 0.3515625, + "learning_rate": 4.837856760015299e-05, + "loss": 0.0985, + "step": 21145 + }, + { + "epoch": 2.82, + "grad_norm": 0.67578125, + "learning_rate": 4.836859446555503e-05, + "loss": 0.4357, + "step": 21146 + }, + { + "epoch": 2.82, + "grad_norm": 0.51953125, + "learning_rate": 4.8358622031112064e-05, + "loss": 0.2946, + "step": 21147 + }, + { + "epoch": 2.82, + "grad_norm": 0.70703125, + "learning_rate": 4.834865029695933e-05, + "loss": 0.3118, + "step": 21148 + }, + { + "epoch": 2.82, + "grad_norm": 0.75390625, + "learning_rate": 4.833867926323209e-05, + "loss": 0.6594, + "step": 21149 + }, + { + "epoch": 2.82, + "grad_norm": 0.6484375, + "learning_rate": 4.832870893006547e-05, + "loss": 0.3092, + "step": 21150 + }, + { + "epoch": 2.82, + "grad_norm": 0.6796875, + "learning_rate": 4.831873929759473e-05, + "loss": 0.3733, + "step": 21151 + }, + { + "epoch": 2.82, + "grad_norm": 0.6328125, + "learning_rate": 4.830877036595505e-05, + "loss": 0.4494, + "step": 21152 + }, + { + "epoch": 2.82, + "grad_norm": 0.72265625, + "learning_rate": 4.829880213528165e-05, + "loss": 0.2824, + "step": 21153 + }, + { + "epoch": 2.82, + "grad_norm": 0.5078125, + "learning_rate": 4.8288834605709646e-05, + "loss": 0.1993, + "step": 21154 + }, + { + "epoch": 2.82, + "grad_norm": 0.6484375, + "learning_rate": 4.8278867777374224e-05, + "loss": 0.5204, + "step": 21155 + }, + { + "epoch": 2.82, + "grad_norm": 0.52734375, + "learning_rate": 4.826890165041059e-05, + "loss": 0.1762, + "step": 21156 + }, + { + "epoch": 2.82, + "grad_norm": 0.546875, + "learning_rate": 4.825893622495381e-05, + "loss": 0.2824, + "step": 21157 + }, + { + "epoch": 2.82, + "grad_norm": 0.60546875, + "learning_rate": 4.824897150113911e-05, + "loss": 0.3239, + "step": 21158 + }, + { + "epoch": 2.82, + "grad_norm": 0.5234375, + "learning_rate": 4.8239007479101516e-05, + "loss": 0.1867, + "step": 21159 + }, + { + "epoch": 2.82, + "grad_norm": 0.7734375, + "learning_rate": 4.822904415897621e-05, + "loss": 0.2696, + "step": 21160 + }, + { + "epoch": 2.82, + "grad_norm": 0.61328125, + "learning_rate": 4.8219081540898295e-05, + "loss": 0.4433, + "step": 21161 + }, + { + "epoch": 2.82, + "grad_norm": 0.55078125, + "learning_rate": 4.820911962500286e-05, + "loss": 0.4068, + "step": 21162 + }, + { + "epoch": 2.82, + "grad_norm": 1.296875, + "learning_rate": 4.819915841142505e-05, + "loss": 0.3727, + "step": 21163 + }, + { + "epoch": 2.82, + "grad_norm": 0.4140625, + "learning_rate": 4.8189197900299855e-05, + "loss": 0.1191, + "step": 21164 + }, + { + "epoch": 2.82, + "grad_norm": 0.5703125, + "learning_rate": 4.817923809176238e-05, + "loss": 0.4974, + "step": 21165 + }, + { + "epoch": 2.82, + "grad_norm": 0.59375, + "learning_rate": 4.8169278985947705e-05, + "loss": 0.3598, + "step": 21166 + }, + { + "epoch": 2.82, + "grad_norm": 0.4765625, + "learning_rate": 4.815932058299091e-05, + "loss": 0.3104, + "step": 21167 + }, + { + "epoch": 2.82, + "grad_norm": 0.5, + "learning_rate": 4.8149362883026995e-05, + "loss": 0.27, + "step": 21168 + }, + { + "epoch": 2.82, + "grad_norm": 0.451171875, + "learning_rate": 4.8139405886190944e-05, + "loss": 0.1731, + "step": 21169 + }, + { + "epoch": 2.82, + "grad_norm": 0.59375, + "learning_rate": 4.812944959261786e-05, + "loss": 0.2513, + "step": 21170 + }, + { + "epoch": 2.83, + "grad_norm": 0.5078125, + "learning_rate": 4.811949400244271e-05, + "loss": 0.1322, + "step": 21171 + }, + { + "epoch": 2.83, + "grad_norm": 0.55859375, + "learning_rate": 4.8109539115800575e-05, + "loss": 0.2254, + "step": 21172 + }, + { + "epoch": 2.83, + "grad_norm": 0.56640625, + "learning_rate": 4.809958493282634e-05, + "loss": 0.3304, + "step": 21173 + }, + { + "epoch": 2.83, + "grad_norm": 0.53515625, + "learning_rate": 4.808963145365504e-05, + "loss": 0.5037, + "step": 21174 + }, + { + "epoch": 2.83, + "grad_norm": 0.6796875, + "learning_rate": 4.807967867842166e-05, + "loss": 0.2952, + "step": 21175 + }, + { + "epoch": 2.83, + "grad_norm": 0.6171875, + "learning_rate": 4.80697266072612e-05, + "loss": 0.6162, + "step": 21176 + }, + { + "epoch": 2.83, + "grad_norm": 0.6015625, + "learning_rate": 4.8059775240308534e-05, + "loss": 0.1878, + "step": 21177 + }, + { + "epoch": 2.83, + "grad_norm": 0.62109375, + "learning_rate": 4.8049824577698656e-05, + "loss": 0.337, + "step": 21178 + }, + { + "epoch": 2.83, + "grad_norm": 0.4453125, + "learning_rate": 4.803987461956648e-05, + "loss": 0.3381, + "step": 21179 + }, + { + "epoch": 2.83, + "grad_norm": 0.58984375, + "learning_rate": 4.802992536604701e-05, + "loss": 0.1971, + "step": 21180 + }, + { + "epoch": 2.83, + "grad_norm": 0.51171875, + "learning_rate": 4.80199768172751e-05, + "loss": 0.4407, + "step": 21181 + }, + { + "epoch": 2.83, + "grad_norm": 0.734375, + "learning_rate": 4.8010028973385624e-05, + "loss": 0.3694, + "step": 21182 + }, + { + "epoch": 2.83, + "grad_norm": 0.498046875, + "learning_rate": 4.800008183451353e-05, + "loss": 0.4013, + "step": 21183 + }, + { + "epoch": 2.83, + "grad_norm": 0.6796875, + "learning_rate": 4.799013540079369e-05, + "loss": 0.5258, + "step": 21184 + }, + { + "epoch": 2.83, + "grad_norm": 0.5703125, + "learning_rate": 4.798018967236104e-05, + "loss": 0.31, + "step": 21185 + }, + { + "epoch": 2.83, + "grad_norm": 0.921875, + "learning_rate": 4.797024464935036e-05, + "loss": 0.4145, + "step": 21186 + }, + { + "epoch": 2.83, + "grad_norm": 0.62890625, + "learning_rate": 4.7960300331896554e-05, + "loss": 0.4451, + "step": 21187 + }, + { + "epoch": 2.83, + "grad_norm": 0.72265625, + "learning_rate": 4.795035672013448e-05, + "loss": 0.2907, + "step": 21188 + }, + { + "epoch": 2.83, + "grad_norm": 0.482421875, + "learning_rate": 4.794041381419897e-05, + "loss": 0.3618, + "step": 21189 + }, + { + "epoch": 2.83, + "grad_norm": 0.6796875, + "learning_rate": 4.7930471614224904e-05, + "loss": 0.5301, + "step": 21190 + }, + { + "epoch": 2.83, + "grad_norm": 0.625, + "learning_rate": 4.7920530120347016e-05, + "loss": 0.4327, + "step": 21191 + }, + { + "epoch": 2.83, + "grad_norm": 0.54296875, + "learning_rate": 4.791058933270021e-05, + "loss": 0.3707, + "step": 21192 + }, + { + "epoch": 2.83, + "grad_norm": 0.439453125, + "learning_rate": 4.790064925141919e-05, + "loss": 0.1809, + "step": 21193 + }, + { + "epoch": 2.83, + "grad_norm": 0.70703125, + "learning_rate": 4.789070987663882e-05, + "loss": 0.3229, + "step": 21194 + }, + { + "epoch": 2.83, + "grad_norm": 0.64453125, + "learning_rate": 4.78807712084939e-05, + "loss": 0.2915, + "step": 21195 + }, + { + "epoch": 2.83, + "grad_norm": 0.427734375, + "learning_rate": 4.787083324711913e-05, + "loss": 0.142, + "step": 21196 + }, + { + "epoch": 2.83, + "grad_norm": 0.50390625, + "learning_rate": 4.786089599264932e-05, + "loss": 0.2321, + "step": 21197 + }, + { + "epoch": 2.83, + "grad_norm": 0.5234375, + "learning_rate": 4.785095944521922e-05, + "loss": 0.269, + "step": 21198 + }, + { + "epoch": 2.83, + "grad_norm": 0.67578125, + "learning_rate": 4.784102360496363e-05, + "loss": 0.3107, + "step": 21199 + }, + { + "epoch": 2.83, + "grad_norm": 0.69140625, + "learning_rate": 4.7831088472017184e-05, + "loss": 0.25, + "step": 21200 + }, + { + "epoch": 2.83, + "grad_norm": 0.56640625, + "learning_rate": 4.782115404651466e-05, + "loss": 0.3132, + "step": 21201 + }, + { + "epoch": 2.83, + "grad_norm": 0.578125, + "learning_rate": 4.781122032859079e-05, + "loss": 0.6738, + "step": 21202 + }, + { + "epoch": 2.83, + "grad_norm": 0.48828125, + "learning_rate": 4.780128731838028e-05, + "loss": 0.1796, + "step": 21203 + }, + { + "epoch": 2.83, + "grad_norm": 0.5703125, + "learning_rate": 4.7791355016017835e-05, + "loss": 0.3324, + "step": 21204 + }, + { + "epoch": 2.83, + "grad_norm": 0.478515625, + "learning_rate": 4.7781423421638086e-05, + "loss": 0.3316, + "step": 21205 + }, + { + "epoch": 2.83, + "grad_norm": 0.79296875, + "learning_rate": 4.777149253537574e-05, + "loss": 0.3512, + "step": 21206 + }, + { + "epoch": 2.83, + "grad_norm": 0.53515625, + "learning_rate": 4.7761562357365474e-05, + "loss": 0.1319, + "step": 21207 + }, + { + "epoch": 2.83, + "grad_norm": 0.66015625, + "learning_rate": 4.7751632887741994e-05, + "loss": 0.4009, + "step": 21208 + }, + { + "epoch": 2.83, + "grad_norm": 0.5703125, + "learning_rate": 4.7741704126639864e-05, + "loss": 0.1417, + "step": 21209 + }, + { + "epoch": 2.83, + "grad_norm": 0.6796875, + "learning_rate": 4.7731776074193754e-05, + "loss": 0.3886, + "step": 21210 + }, + { + "epoch": 2.83, + "grad_norm": 0.5859375, + "learning_rate": 4.772184873053832e-05, + "loss": 0.2643, + "step": 21211 + }, + { + "epoch": 2.83, + "grad_norm": 0.56640625, + "learning_rate": 4.771192209580817e-05, + "loss": 0.5297, + "step": 21212 + }, + { + "epoch": 2.83, + "grad_norm": 0.5859375, + "learning_rate": 4.770199617013795e-05, + "loss": 0.1652, + "step": 21213 + }, + { + "epoch": 2.83, + "grad_norm": 0.69921875, + "learning_rate": 4.769207095366218e-05, + "loss": 0.3759, + "step": 21214 + }, + { + "epoch": 2.83, + "grad_norm": 0.5, + "learning_rate": 4.7682146446515506e-05, + "loss": 0.2278, + "step": 21215 + }, + { + "epoch": 2.83, + "grad_norm": 0.52734375, + "learning_rate": 4.767222264883251e-05, + "loss": 0.1899, + "step": 21216 + }, + { + "epoch": 2.83, + "grad_norm": 0.4921875, + "learning_rate": 4.7662299560747795e-05, + "loss": 0.3814, + "step": 21217 + }, + { + "epoch": 2.83, + "grad_norm": 0.58984375, + "learning_rate": 4.765237718239588e-05, + "loss": 0.2992, + "step": 21218 + }, + { + "epoch": 2.83, + "grad_norm": 0.515625, + "learning_rate": 4.764245551391129e-05, + "loss": 0.3106, + "step": 21219 + }, + { + "epoch": 2.83, + "grad_norm": 0.5390625, + "learning_rate": 4.763253455542861e-05, + "loss": 0.3341, + "step": 21220 + }, + { + "epoch": 2.83, + "grad_norm": 0.921875, + "learning_rate": 4.762261430708237e-05, + "loss": 0.4136, + "step": 21221 + }, + { + "epoch": 2.83, + "grad_norm": 0.5546875, + "learning_rate": 4.761269476900713e-05, + "loss": 0.398, + "step": 21222 + }, + { + "epoch": 2.83, + "grad_norm": 0.4609375, + "learning_rate": 4.760277594133733e-05, + "loss": 0.4491, + "step": 21223 + }, + { + "epoch": 2.83, + "grad_norm": 0.6328125, + "learning_rate": 4.759285782420753e-05, + "loss": 0.2542, + "step": 21224 + }, + { + "epoch": 2.83, + "grad_norm": 0.482421875, + "learning_rate": 4.7582940417752195e-05, + "loss": 0.3411, + "step": 21225 + }, + { + "epoch": 2.83, + "grad_norm": 0.5390625, + "learning_rate": 4.7573023722105884e-05, + "loss": 0.1868, + "step": 21226 + }, + { + "epoch": 2.83, + "grad_norm": 0.546875, + "learning_rate": 4.756310773740297e-05, + "loss": 0.3192, + "step": 21227 + }, + { + "epoch": 2.83, + "grad_norm": 0.462890625, + "learning_rate": 4.755319246377797e-05, + "loss": 0.3427, + "step": 21228 + }, + { + "epoch": 2.83, + "grad_norm": 0.7109375, + "learning_rate": 4.754327790136538e-05, + "loss": 0.4314, + "step": 21229 + }, + { + "epoch": 2.83, + "grad_norm": 0.5078125, + "learning_rate": 4.7533364050299564e-05, + "loss": 0.1352, + "step": 21230 + }, + { + "epoch": 2.83, + "grad_norm": 0.6171875, + "learning_rate": 4.752345091071505e-05, + "loss": 0.5144, + "step": 21231 + }, + { + "epoch": 2.83, + "grad_norm": 0.5703125, + "learning_rate": 4.751353848274619e-05, + "loss": 0.2721, + "step": 21232 + }, + { + "epoch": 2.83, + "grad_norm": 0.4375, + "learning_rate": 4.750362676652742e-05, + "loss": 0.2972, + "step": 21233 + }, + { + "epoch": 2.83, + "grad_norm": 0.77734375, + "learning_rate": 4.749371576219317e-05, + "loss": 0.2981, + "step": 21234 + }, + { + "epoch": 2.83, + "grad_norm": 0.59375, + "learning_rate": 4.748380546987783e-05, + "loss": 0.4529, + "step": 21235 + }, + { + "epoch": 2.83, + "grad_norm": 0.51953125, + "learning_rate": 4.7473895889715824e-05, + "loss": 0.2557, + "step": 21236 + }, + { + "epoch": 2.83, + "grad_norm": 0.58203125, + "learning_rate": 4.746398702184147e-05, + "loss": 0.3067, + "step": 21237 + }, + { + "epoch": 2.83, + "grad_norm": 0.53125, + "learning_rate": 4.745407886638917e-05, + "loss": 0.4399, + "step": 21238 + }, + { + "epoch": 2.83, + "grad_norm": 0.5859375, + "learning_rate": 4.744417142349329e-05, + "loss": 0.4551, + "step": 21239 + }, + { + "epoch": 2.83, + "grad_norm": 0.609375, + "learning_rate": 4.743426469328821e-05, + "loss": 0.3499, + "step": 21240 + }, + { + "epoch": 2.83, + "grad_norm": 0.58203125, + "learning_rate": 4.7424358675908234e-05, + "loss": 0.7761, + "step": 21241 + }, + { + "epoch": 2.83, + "grad_norm": 0.61328125, + "learning_rate": 4.7414453371487654e-05, + "loss": 0.3345, + "step": 21242 + }, + { + "epoch": 2.83, + "grad_norm": 0.458984375, + "learning_rate": 4.740454878016084e-05, + "loss": 0.3637, + "step": 21243 + }, + { + "epoch": 2.83, + "grad_norm": 0.59375, + "learning_rate": 4.7394644902062105e-05, + "loss": 0.5358, + "step": 21244 + }, + { + "epoch": 2.83, + "grad_norm": 0.71875, + "learning_rate": 4.7384741737325775e-05, + "loss": 0.4201, + "step": 21245 + }, + { + "epoch": 2.84, + "grad_norm": 0.66015625, + "learning_rate": 4.7374839286086095e-05, + "loss": 0.2091, + "step": 21246 + }, + { + "epoch": 2.84, + "grad_norm": 0.54296875, + "learning_rate": 4.736493754847735e-05, + "loss": 0.2099, + "step": 21247 + }, + { + "epoch": 2.84, + "grad_norm": 0.73828125, + "learning_rate": 4.7355036524633845e-05, + "loss": 0.343, + "step": 21248 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 4.7345136214689866e-05, + "loss": 0.1934, + "step": 21249 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 4.733523661877959e-05, + "loss": 0.5146, + "step": 21250 + }, + { + "epoch": 2.84, + "grad_norm": 0.546875, + "learning_rate": 4.732533773703731e-05, + "loss": 0.3245, + "step": 21251 + }, + { + "epoch": 2.84, + "grad_norm": 0.59765625, + "learning_rate": 4.731543956959725e-05, + "loss": 0.5086, + "step": 21252 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 4.730554211659369e-05, + "loss": 0.5596, + "step": 21253 + }, + { + "epoch": 2.84, + "grad_norm": 0.462890625, + "learning_rate": 4.7295645378160805e-05, + "loss": 0.2147, + "step": 21254 + }, + { + "epoch": 2.84, + "grad_norm": 0.5390625, + "learning_rate": 4.728574935443274e-05, + "loss": 0.233, + "step": 21255 + }, + { + "epoch": 2.84, + "grad_norm": 0.455078125, + "learning_rate": 4.7275854045543755e-05, + "loss": 0.1901, + "step": 21256 + }, + { + "epoch": 2.84, + "grad_norm": 0.6484375, + "learning_rate": 4.726595945162802e-05, + "loss": 0.5412, + "step": 21257 + }, + { + "epoch": 2.84, + "grad_norm": 0.59375, + "learning_rate": 4.725606557281976e-05, + "loss": 0.2598, + "step": 21258 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 4.724617240925307e-05, + "loss": 0.4054, + "step": 21259 + }, + { + "epoch": 2.84, + "grad_norm": 0.796875, + "learning_rate": 4.723627996106214e-05, + "loss": 0.4086, + "step": 21260 + }, + { + "epoch": 2.84, + "grad_norm": 0.64453125, + "learning_rate": 4.72263882283811e-05, + "loss": 0.3242, + "step": 21261 + }, + { + "epoch": 2.84, + "grad_norm": 0.4921875, + "learning_rate": 4.721649721134412e-05, + "loss": 0.1797, + "step": 21262 + }, + { + "epoch": 2.84, + "grad_norm": 0.458984375, + "learning_rate": 4.720660691008535e-05, + "loss": 0.151, + "step": 21263 + }, + { + "epoch": 2.84, + "grad_norm": 0.474609375, + "learning_rate": 4.719671732473884e-05, + "loss": 0.4149, + "step": 21264 + }, + { + "epoch": 2.84, + "grad_norm": 0.5390625, + "learning_rate": 4.718682845543876e-05, + "loss": 0.3746, + "step": 21265 + }, + { + "epoch": 2.84, + "grad_norm": 0.474609375, + "learning_rate": 4.717694030231914e-05, + "loss": 0.1888, + "step": 21266 + }, + { + "epoch": 2.84, + "grad_norm": 0.640625, + "learning_rate": 4.716705286551413e-05, + "loss": 0.4673, + "step": 21267 + }, + { + "epoch": 2.84, + "grad_norm": 0.69140625, + "learning_rate": 4.715716614515781e-05, + "loss": 0.3525, + "step": 21268 + }, + { + "epoch": 2.84, + "grad_norm": 0.59375, + "learning_rate": 4.71472801413842e-05, + "loss": 0.3587, + "step": 21269 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 4.713739485432739e-05, + "loss": 0.3676, + "step": 21270 + }, + { + "epoch": 2.84, + "grad_norm": 0.6328125, + "learning_rate": 4.7127510284121433e-05, + "loss": 0.3034, + "step": 21271 + }, + { + "epoch": 2.84, + "grad_norm": 0.703125, + "learning_rate": 4.711762643090042e-05, + "loss": 0.4488, + "step": 21272 + }, + { + "epoch": 2.84, + "grad_norm": 0.37890625, + "learning_rate": 4.710774329479828e-05, + "loss": 0.1516, + "step": 21273 + }, + { + "epoch": 2.84, + "grad_norm": 0.58984375, + "learning_rate": 4.709786087594909e-05, + "loss": 0.4224, + "step": 21274 + }, + { + "epoch": 2.84, + "grad_norm": 0.37890625, + "learning_rate": 4.7087979174486866e-05, + "loss": 0.1659, + "step": 21275 + }, + { + "epoch": 2.84, + "grad_norm": 0.5546875, + "learning_rate": 4.7078098190545625e-05, + "loss": 0.2222, + "step": 21276 + }, + { + "epoch": 2.84, + "grad_norm": 0.470703125, + "learning_rate": 4.7068217924259364e-05, + "loss": 0.2623, + "step": 21277 + }, + { + "epoch": 2.84, + "grad_norm": 0.6484375, + "learning_rate": 4.7058338375761976e-05, + "loss": 0.4575, + "step": 21278 + }, + { + "epoch": 2.84, + "grad_norm": 0.671875, + "learning_rate": 4.704845954518751e-05, + "loss": 0.4416, + "step": 21279 + }, + { + "epoch": 2.84, + "grad_norm": 0.6953125, + "learning_rate": 4.7038581432669913e-05, + "loss": 0.3548, + "step": 21280 + }, + { + "epoch": 2.84, + "grad_norm": 0.59765625, + "learning_rate": 4.702870403834317e-05, + "loss": 0.3956, + "step": 21281 + }, + { + "epoch": 2.84, + "grad_norm": 0.609375, + "learning_rate": 4.701882736234118e-05, + "loss": 0.1449, + "step": 21282 + }, + { + "epoch": 2.84, + "grad_norm": 0.470703125, + "learning_rate": 4.700895140479789e-05, + "loss": 0.1777, + "step": 21283 + }, + { + "epoch": 2.84, + "grad_norm": 0.421875, + "learning_rate": 4.699907616584721e-05, + "loss": 0.2315, + "step": 21284 + }, + { + "epoch": 2.84, + "grad_norm": 0.3671875, + "learning_rate": 4.698920164562309e-05, + "loss": 0.1828, + "step": 21285 + }, + { + "epoch": 2.84, + "grad_norm": 0.5234375, + "learning_rate": 4.697932784425945e-05, + "loss": 0.2205, + "step": 21286 + }, + { + "epoch": 2.84, + "grad_norm": 0.490234375, + "learning_rate": 4.6969454761890116e-05, + "loss": 0.2638, + "step": 21287 + }, + { + "epoch": 2.84, + "grad_norm": 0.51171875, + "learning_rate": 4.695958239864902e-05, + "loss": 0.1642, + "step": 21288 + }, + { + "epoch": 2.84, + "grad_norm": 0.8203125, + "learning_rate": 4.694971075467005e-05, + "loss": 0.5716, + "step": 21289 + }, + { + "epoch": 2.84, + "grad_norm": 0.6171875, + "learning_rate": 4.693983983008702e-05, + "loss": 0.4485, + "step": 21290 + }, + { + "epoch": 2.84, + "grad_norm": 0.51953125, + "learning_rate": 4.692996962503384e-05, + "loss": 0.4079, + "step": 21291 + }, + { + "epoch": 2.84, + "grad_norm": 0.6953125, + "learning_rate": 4.692010013964431e-05, + "loss": 0.2815, + "step": 21292 + }, + { + "epoch": 2.84, + "grad_norm": 0.53125, + "learning_rate": 4.6910231374052286e-05, + "loss": 0.2152, + "step": 21293 + }, + { + "epoch": 2.84, + "grad_norm": 0.58203125, + "learning_rate": 4.690036332839159e-05, + "loss": 0.2318, + "step": 21294 + }, + { + "epoch": 2.84, + "grad_norm": 0.5859375, + "learning_rate": 4.689049600279609e-05, + "loss": 0.6552, + "step": 21295 + }, + { + "epoch": 2.84, + "grad_norm": 0.703125, + "learning_rate": 4.688062939739951e-05, + "loss": 0.4334, + "step": 21296 + }, + { + "epoch": 2.84, + "grad_norm": 0.7734375, + "learning_rate": 4.687076351233569e-05, + "loss": 0.33, + "step": 21297 + }, + { + "epoch": 2.84, + "grad_norm": 0.486328125, + "learning_rate": 4.686089834773841e-05, + "loss": 0.2221, + "step": 21298 + }, + { + "epoch": 2.84, + "grad_norm": 0.6484375, + "learning_rate": 4.6851033903741504e-05, + "loss": 0.5462, + "step": 21299 + }, + { + "epoch": 2.84, + "grad_norm": 0.625, + "learning_rate": 4.684117018047863e-05, + "loss": 0.3599, + "step": 21300 + }, + { + "epoch": 2.84, + "grad_norm": 0.78125, + "learning_rate": 4.6831307178083614e-05, + "loss": 0.453, + "step": 21301 + }, + { + "epoch": 2.84, + "grad_norm": 0.6796875, + "learning_rate": 4.6821444896690246e-05, + "loss": 0.3478, + "step": 21302 + }, + { + "epoch": 2.84, + "grad_norm": 0.62109375, + "learning_rate": 4.681158333643216e-05, + "loss": 0.3266, + "step": 21303 + }, + { + "epoch": 2.84, + "grad_norm": 0.58984375, + "learning_rate": 4.680172249744319e-05, + "loss": 0.3755, + "step": 21304 + }, + { + "epoch": 2.84, + "grad_norm": 0.71484375, + "learning_rate": 4.6791862379856964e-05, + "loss": 0.2835, + "step": 21305 + }, + { + "epoch": 2.84, + "grad_norm": 0.58984375, + "learning_rate": 4.6782002983807226e-05, + "loss": 0.3679, + "step": 21306 + }, + { + "epoch": 2.84, + "grad_norm": 0.640625, + "learning_rate": 4.677214430942769e-05, + "loss": 0.5622, + "step": 21307 + }, + { + "epoch": 2.84, + "grad_norm": 0.54296875, + "learning_rate": 4.676228635685204e-05, + "loss": 0.1449, + "step": 21308 + }, + { + "epoch": 2.84, + "grad_norm": 1.0546875, + "learning_rate": 4.675242912621399e-05, + "loss": 0.5488, + "step": 21309 + }, + { + "epoch": 2.84, + "grad_norm": 0.5078125, + "learning_rate": 4.674257261764714e-05, + "loss": 0.3994, + "step": 21310 + }, + { + "epoch": 2.84, + "grad_norm": 0.5859375, + "learning_rate": 4.6732716831285186e-05, + "loss": 0.4926, + "step": 21311 + }, + { + "epoch": 2.84, + "grad_norm": 0.51953125, + "learning_rate": 4.672286176726177e-05, + "loss": 0.4446, + "step": 21312 + }, + { + "epoch": 2.84, + "grad_norm": 0.5234375, + "learning_rate": 4.6713007425710584e-05, + "loss": 0.4207, + "step": 21313 + }, + { + "epoch": 2.84, + "grad_norm": 0.48046875, + "learning_rate": 4.670315380676522e-05, + "loss": 0.1453, + "step": 21314 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 4.669330091055925e-05, + "loss": 0.3263, + "step": 21315 + }, + { + "epoch": 2.84, + "grad_norm": 0.63671875, + "learning_rate": 4.6683448737226346e-05, + "loss": 0.4701, + "step": 21316 + }, + { + "epoch": 2.84, + "grad_norm": 0.51171875, + "learning_rate": 4.667359728690007e-05, + "loss": 0.2665, + "step": 21317 + }, + { + "epoch": 2.84, + "grad_norm": 0.45703125, + "learning_rate": 4.6663746559714104e-05, + "loss": 0.27, + "step": 21318 + }, + { + "epoch": 2.84, + "grad_norm": 0.470703125, + "learning_rate": 4.665389655580191e-05, + "loss": 0.3027, + "step": 21319 + }, + { + "epoch": 2.84, + "grad_norm": 0.578125, + "learning_rate": 4.664404727529713e-05, + "loss": 0.3898, + "step": 21320 + }, + { + "epoch": 2.85, + "grad_norm": 0.625, + "learning_rate": 4.66341987183333e-05, + "loss": 0.333, + "step": 21321 + }, + { + "epoch": 2.85, + "grad_norm": 0.7734375, + "learning_rate": 4.6624350885044036e-05, + "loss": 0.4097, + "step": 21322 + }, + { + "epoch": 2.85, + "grad_norm": 0.474609375, + "learning_rate": 4.661450377556278e-05, + "loss": 0.1789, + "step": 21323 + }, + { + "epoch": 2.85, + "grad_norm": 0.470703125, + "learning_rate": 4.6604657390023124e-05, + "loss": 0.3301, + "step": 21324 + }, + { + "epoch": 2.85, + "grad_norm": 0.478515625, + "learning_rate": 4.6594811728558584e-05, + "loss": 0.3477, + "step": 21325 + }, + { + "epoch": 2.85, + "grad_norm": 0.431640625, + "learning_rate": 4.6584966791302695e-05, + "loss": 0.3934, + "step": 21326 + }, + { + "epoch": 2.85, + "grad_norm": 0.53125, + "learning_rate": 4.6575122578388954e-05, + "loss": 0.2639, + "step": 21327 + }, + { + "epoch": 2.85, + "grad_norm": 0.5390625, + "learning_rate": 4.6565279089950795e-05, + "loss": 0.2723, + "step": 21328 + }, + { + "epoch": 2.85, + "grad_norm": 0.7421875, + "learning_rate": 4.655543632612175e-05, + "loss": 0.4326, + "step": 21329 + }, + { + "epoch": 2.85, + "grad_norm": 0.52734375, + "learning_rate": 4.6545594287035275e-05, + "loss": 0.2859, + "step": 21330 + }, + { + "epoch": 2.85, + "grad_norm": 0.62890625, + "learning_rate": 4.65357529728249e-05, + "loss": 0.4483, + "step": 21331 + }, + { + "epoch": 2.85, + "grad_norm": 0.79296875, + "learning_rate": 4.6525912383623994e-05, + "loss": 0.3832, + "step": 21332 + }, + { + "epoch": 2.85, + "grad_norm": 0.81640625, + "learning_rate": 4.651607251956602e-05, + "loss": 0.4495, + "step": 21333 + }, + { + "epoch": 2.85, + "grad_norm": 0.71875, + "learning_rate": 4.6506233380784436e-05, + "loss": 0.4171, + "step": 21334 + }, + { + "epoch": 2.85, + "grad_norm": 0.5625, + "learning_rate": 4.649639496741266e-05, + "loss": 0.3167, + "step": 21335 + }, + { + "epoch": 2.85, + "grad_norm": 0.6171875, + "learning_rate": 4.648655727958414e-05, + "loss": 0.2799, + "step": 21336 + }, + { + "epoch": 2.85, + "grad_norm": 0.388671875, + "learning_rate": 4.647672031743222e-05, + "loss": 0.3452, + "step": 21337 + }, + { + "epoch": 2.85, + "grad_norm": 0.671875, + "learning_rate": 4.646688408109036e-05, + "loss": 0.4172, + "step": 21338 + }, + { + "epoch": 2.85, + "grad_norm": 0.80859375, + "learning_rate": 4.645704857069186e-05, + "loss": 0.5213, + "step": 21339 + }, + { + "epoch": 2.85, + "grad_norm": 0.65625, + "learning_rate": 4.644721378637015e-05, + "loss": 0.4662, + "step": 21340 + }, + { + "epoch": 2.85, + "grad_norm": 0.5625, + "learning_rate": 4.643737972825863e-05, + "loss": 0.3096, + "step": 21341 + }, + { + "epoch": 2.85, + "grad_norm": 0.625, + "learning_rate": 4.642754639649058e-05, + "loss": 0.375, + "step": 21342 + }, + { + "epoch": 2.85, + "grad_norm": 0.53515625, + "learning_rate": 4.641771379119939e-05, + "loss": 0.2922, + "step": 21343 + }, + { + "epoch": 2.85, + "grad_norm": 0.7890625, + "learning_rate": 4.640788191251838e-05, + "loss": 0.2922, + "step": 21344 + }, + { + "epoch": 2.85, + "grad_norm": 0.484375, + "learning_rate": 4.6398050760580925e-05, + "loss": 0.1804, + "step": 21345 + }, + { + "epoch": 2.85, + "grad_norm": 0.671875, + "learning_rate": 4.638822033552027e-05, + "loss": 0.5498, + "step": 21346 + }, + { + "epoch": 2.85, + "grad_norm": 0.69921875, + "learning_rate": 4.637839063746975e-05, + "loss": 0.2165, + "step": 21347 + }, + { + "epoch": 2.85, + "grad_norm": 0.455078125, + "learning_rate": 4.636856166656267e-05, + "loss": 0.1639, + "step": 21348 + }, + { + "epoch": 2.85, + "grad_norm": 0.671875, + "learning_rate": 4.635873342293236e-05, + "loss": 0.469, + "step": 21349 + }, + { + "epoch": 2.85, + "grad_norm": 0.69140625, + "learning_rate": 4.6348905906712036e-05, + "loss": 0.2931, + "step": 21350 + }, + { + "epoch": 2.85, + "grad_norm": 0.88671875, + "learning_rate": 4.633907911803496e-05, + "loss": 0.4561, + "step": 21351 + }, + { + "epoch": 2.85, + "grad_norm": 0.765625, + "learning_rate": 4.6329253057034396e-05, + "loss": 0.4461, + "step": 21352 + }, + { + "epoch": 2.85, + "grad_norm": 0.58984375, + "learning_rate": 4.631942772384361e-05, + "loss": 0.355, + "step": 21353 + }, + { + "epoch": 2.85, + "grad_norm": 0.48828125, + "learning_rate": 4.630960311859587e-05, + "loss": 0.3033, + "step": 21354 + }, + { + "epoch": 2.85, + "grad_norm": 0.5625, + "learning_rate": 4.629977924142435e-05, + "loss": 0.3427, + "step": 21355 + }, + { + "epoch": 2.85, + "grad_norm": 0.462890625, + "learning_rate": 4.6289956092462264e-05, + "loss": 0.3411, + "step": 21356 + }, + { + "epoch": 2.85, + "grad_norm": 0.64453125, + "learning_rate": 4.6280133671842866e-05, + "loss": 0.2318, + "step": 21357 + }, + { + "epoch": 2.85, + "grad_norm": 0.55078125, + "learning_rate": 4.627031197969931e-05, + "loss": 0.5403, + "step": 21358 + }, + { + "epoch": 2.85, + "grad_norm": 0.75390625, + "learning_rate": 4.626049101616486e-05, + "loss": 0.4665, + "step": 21359 + }, + { + "epoch": 2.85, + "grad_norm": 0.47265625, + "learning_rate": 4.6250670781372605e-05, + "loss": 0.355, + "step": 21360 + }, + { + "epoch": 2.85, + "grad_norm": 0.66796875, + "learning_rate": 4.6240851275455744e-05, + "loss": 0.2364, + "step": 21361 + }, + { + "epoch": 2.85, + "grad_norm": 0.46875, + "learning_rate": 4.623103249854748e-05, + "loss": 0.1964, + "step": 21362 + }, + { + "epoch": 2.85, + "grad_norm": 0.7578125, + "learning_rate": 4.6221214450780894e-05, + "loss": 0.5235, + "step": 21363 + }, + { + "epoch": 2.85, + "grad_norm": 0.60546875, + "learning_rate": 4.621139713228918e-05, + "loss": 0.2586, + "step": 21364 + }, + { + "epoch": 2.85, + "grad_norm": 0.478515625, + "learning_rate": 4.6201580543205414e-05, + "loss": 0.1723, + "step": 21365 + }, + { + "epoch": 2.85, + "grad_norm": 0.55859375, + "learning_rate": 4.6191764683662744e-05, + "loss": 0.1787, + "step": 21366 + }, + { + "epoch": 2.85, + "grad_norm": 0.51953125, + "learning_rate": 4.618194955379427e-05, + "loss": 0.1873, + "step": 21367 + }, + { + "epoch": 2.85, + "grad_norm": 0.7734375, + "learning_rate": 4.617213515373314e-05, + "loss": 0.3789, + "step": 21368 + }, + { + "epoch": 2.85, + "grad_norm": 0.58203125, + "learning_rate": 4.616232148361237e-05, + "loss": 0.3427, + "step": 21369 + }, + { + "epoch": 2.85, + "grad_norm": 0.64453125, + "learning_rate": 4.6152508543565075e-05, + "loss": 0.306, + "step": 21370 + }, + { + "epoch": 2.85, + "grad_norm": 0.48046875, + "learning_rate": 4.6142696333724324e-05, + "loss": 0.3513, + "step": 21371 + }, + { + "epoch": 2.85, + "grad_norm": 0.65625, + "learning_rate": 4.613288485422321e-05, + "loss": 0.6293, + "step": 21372 + }, + { + "epoch": 2.85, + "grad_norm": 0.78515625, + "learning_rate": 4.612307410519472e-05, + "loss": 0.8245, + "step": 21373 + }, + { + "epoch": 2.85, + "grad_norm": 0.5859375, + "learning_rate": 4.6113264086771945e-05, + "loss": 0.3703, + "step": 21374 + }, + { + "epoch": 2.85, + "grad_norm": 0.326171875, + "learning_rate": 4.6103454799087864e-05, + "loss": 0.0999, + "step": 21375 + }, + { + "epoch": 2.85, + "grad_norm": 0.5390625, + "learning_rate": 4.6093646242275534e-05, + "loss": 0.374, + "step": 21376 + }, + { + "epoch": 2.85, + "grad_norm": 0.48828125, + "learning_rate": 4.6083838416467985e-05, + "loss": 0.21, + "step": 21377 + }, + { + "epoch": 2.85, + "grad_norm": 0.65625, + "learning_rate": 4.607403132179815e-05, + "loss": 0.4284, + "step": 21378 + }, + { + "epoch": 2.85, + "grad_norm": 0.65234375, + "learning_rate": 4.6064224958399075e-05, + "loss": 0.3036, + "step": 21379 + }, + { + "epoch": 2.85, + "grad_norm": 0.58984375, + "learning_rate": 4.605441932640371e-05, + "loss": 0.4205, + "step": 21380 + }, + { + "epoch": 2.85, + "grad_norm": 0.462890625, + "learning_rate": 4.604461442594504e-05, + "loss": 0.1293, + "step": 21381 + }, + { + "epoch": 2.85, + "grad_norm": 0.578125, + "learning_rate": 4.603481025715608e-05, + "loss": 0.2611, + "step": 21382 + }, + { + "epoch": 2.85, + "grad_norm": 0.42578125, + "learning_rate": 4.602500682016968e-05, + "loss": 0.2669, + "step": 21383 + }, + { + "epoch": 2.85, + "grad_norm": 0.58984375, + "learning_rate": 4.601520411511883e-05, + "loss": 0.3219, + "step": 21384 + }, + { + "epoch": 2.85, + "grad_norm": 0.55859375, + "learning_rate": 4.6005402142136456e-05, + "loss": 0.3855, + "step": 21385 + }, + { + "epoch": 2.85, + "grad_norm": 0.546875, + "learning_rate": 4.599560090135552e-05, + "loss": 0.3699, + "step": 21386 + }, + { + "epoch": 2.85, + "grad_norm": 0.515625, + "learning_rate": 4.5985800392908893e-05, + "loss": 0.3791, + "step": 21387 + }, + { + "epoch": 2.85, + "grad_norm": 0.5546875, + "learning_rate": 4.597600061692945e-05, + "loss": 0.3475, + "step": 21388 + }, + { + "epoch": 2.85, + "grad_norm": 0.79296875, + "learning_rate": 4.5966201573550115e-05, + "loss": 0.3018, + "step": 21389 + }, + { + "epoch": 2.85, + "grad_norm": 0.451171875, + "learning_rate": 4.595640326290376e-05, + "loss": 0.2294, + "step": 21390 + }, + { + "epoch": 2.85, + "grad_norm": 0.73828125, + "learning_rate": 4.594660568512329e-05, + "loss": 0.2531, + "step": 21391 + }, + { + "epoch": 2.85, + "grad_norm": 0.73046875, + "learning_rate": 4.593680884034151e-05, + "loss": 0.3807, + "step": 21392 + }, + { + "epoch": 2.85, + "grad_norm": 0.70703125, + "learning_rate": 4.59270127286913e-05, + "loss": 0.2985, + "step": 21393 + }, + { + "epoch": 2.85, + "grad_norm": 0.56640625, + "learning_rate": 4.59172173503055e-05, + "loss": 0.4129, + "step": 21394 + }, + { + "epoch": 2.85, + "grad_norm": 0.48828125, + "learning_rate": 4.590742270531697e-05, + "loss": 0.1441, + "step": 21395 + }, + { + "epoch": 2.86, + "grad_norm": 0.50390625, + "learning_rate": 4.5897628793858485e-05, + "loss": 0.1531, + "step": 21396 + }, + { + "epoch": 2.86, + "grad_norm": 0.5859375, + "learning_rate": 4.588783561606286e-05, + "loss": 0.3602, + "step": 21397 + }, + { + "epoch": 2.86, + "grad_norm": 0.50390625, + "learning_rate": 4.5878043172062927e-05, + "loss": 0.1497, + "step": 21398 + }, + { + "epoch": 2.86, + "grad_norm": 0.66015625, + "learning_rate": 4.5868251461991484e-05, + "loss": 0.2171, + "step": 21399 + }, + { + "epoch": 2.86, + "grad_norm": 0.578125, + "learning_rate": 4.58584604859813e-05, + "loss": 0.1907, + "step": 21400 + }, + { + "epoch": 2.86, + "grad_norm": 0.65234375, + "learning_rate": 4.584867024416511e-05, + "loss": 0.4823, + "step": 21401 + }, + { + "epoch": 2.86, + "grad_norm": 0.59765625, + "learning_rate": 4.5838880736675706e-05, + "loss": 0.4753, + "step": 21402 + }, + { + "epoch": 2.86, + "grad_norm": 0.435546875, + "learning_rate": 4.582909196364583e-05, + "loss": 0.1311, + "step": 21403 + }, + { + "epoch": 2.86, + "grad_norm": 0.609375, + "learning_rate": 4.581930392520828e-05, + "loss": 0.493, + "step": 21404 + }, + { + "epoch": 2.86, + "grad_norm": 0.578125, + "learning_rate": 4.580951662149569e-05, + "loss": 0.1917, + "step": 21405 + }, + { + "epoch": 2.86, + "grad_norm": 0.671875, + "learning_rate": 4.579973005264085e-05, + "loss": 0.2944, + "step": 21406 + }, + { + "epoch": 2.86, + "grad_norm": 0.68359375, + "learning_rate": 4.578994421877645e-05, + "loss": 0.5065, + "step": 21407 + }, + { + "epoch": 2.86, + "grad_norm": 0.55859375, + "learning_rate": 4.5780159120035194e-05, + "loss": 0.2459, + "step": 21408 + }, + { + "epoch": 2.86, + "grad_norm": 0.68359375, + "learning_rate": 4.577037475654982e-05, + "loss": 0.218, + "step": 21409 + }, + { + "epoch": 2.86, + "grad_norm": 0.6171875, + "learning_rate": 4.576059112845293e-05, + "loss": 0.2382, + "step": 21410 + }, + { + "epoch": 2.86, + "grad_norm": 0.5390625, + "learning_rate": 4.5750808235877274e-05, + "loss": 0.2034, + "step": 21411 + }, + { + "epoch": 2.86, + "grad_norm": 0.69140625, + "learning_rate": 4.5741026078955443e-05, + "loss": 0.5433, + "step": 21412 + }, + { + "epoch": 2.86, + "grad_norm": 0.55078125, + "learning_rate": 4.573124465782012e-05, + "loss": 0.3518, + "step": 21413 + }, + { + "epoch": 2.86, + "grad_norm": 0.64453125, + "learning_rate": 4.5721463972603974e-05, + "loss": 0.6209, + "step": 21414 + }, + { + "epoch": 2.86, + "grad_norm": 0.54296875, + "learning_rate": 4.5711684023439585e-05, + "loss": 0.1792, + "step": 21415 + }, + { + "epoch": 2.86, + "grad_norm": 0.451171875, + "learning_rate": 4.57019048104596e-05, + "loss": 0.1964, + "step": 21416 + }, + { + "epoch": 2.86, + "grad_norm": 0.58203125, + "learning_rate": 4.569212633379662e-05, + "loss": 0.4919, + "step": 21417 + }, + { + "epoch": 2.86, + "grad_norm": 0.8203125, + "learning_rate": 4.568234859358332e-05, + "loss": 0.5394, + "step": 21418 + }, + { + "epoch": 2.86, + "grad_norm": 0.5703125, + "learning_rate": 4.5672571589952194e-05, + "loss": 0.2611, + "step": 21419 + }, + { + "epoch": 2.86, + "grad_norm": 0.73828125, + "learning_rate": 4.5662795323035854e-05, + "loss": 0.5539, + "step": 21420 + }, + { + "epoch": 2.86, + "grad_norm": 0.412109375, + "learning_rate": 4.565301979296689e-05, + "loss": 0.1442, + "step": 21421 + }, + { + "epoch": 2.86, + "grad_norm": 0.51953125, + "learning_rate": 4.56432449998779e-05, + "loss": 0.2893, + "step": 21422 + }, + { + "epoch": 2.86, + "grad_norm": 0.56640625, + "learning_rate": 4.563347094390138e-05, + "loss": 0.1882, + "step": 21423 + }, + { + "epoch": 2.86, + "grad_norm": 0.50390625, + "learning_rate": 4.562369762516986e-05, + "loss": 0.2107, + "step": 21424 + }, + { + "epoch": 2.86, + "grad_norm": 0.5078125, + "learning_rate": 4.56139250438159e-05, + "loss": 0.233, + "step": 21425 + }, + { + "epoch": 2.86, + "grad_norm": 0.52734375, + "learning_rate": 4.560415319997201e-05, + "loss": 0.4292, + "step": 21426 + }, + { + "epoch": 2.86, + "grad_norm": 0.56640625, + "learning_rate": 4.559438209377076e-05, + "loss": 0.2995, + "step": 21427 + }, + { + "epoch": 2.86, + "grad_norm": 0.74609375, + "learning_rate": 4.558461172534456e-05, + "loss": 0.4757, + "step": 21428 + }, + { + "epoch": 2.86, + "grad_norm": 0.6875, + "learning_rate": 4.557484209482595e-05, + "loss": 0.2218, + "step": 21429 + }, + { + "epoch": 2.86, + "grad_norm": 0.578125, + "learning_rate": 4.556507320234742e-05, + "loss": 0.3715, + "step": 21430 + }, + { + "epoch": 2.86, + "grad_norm": 0.66796875, + "learning_rate": 4.555530504804143e-05, + "loss": 0.26, + "step": 21431 + }, + { + "epoch": 2.86, + "grad_norm": 0.96875, + "learning_rate": 4.554553763204048e-05, + "loss": 0.7642, + "step": 21432 + }, + { + "epoch": 2.86, + "grad_norm": 0.4609375, + "learning_rate": 4.553577095447695e-05, + "loss": 0.1625, + "step": 21433 + }, + { + "epoch": 2.86, + "grad_norm": 0.79296875, + "learning_rate": 4.552600501548333e-05, + "loss": 0.4064, + "step": 21434 + }, + { + "epoch": 2.86, + "grad_norm": 0.83203125, + "learning_rate": 4.5516239815192074e-05, + "loss": 0.3402, + "step": 21435 + }, + { + "epoch": 2.86, + "grad_norm": 0.8359375, + "learning_rate": 4.5506475353735535e-05, + "loss": 0.4938, + "step": 21436 + }, + { + "epoch": 2.86, + "grad_norm": 0.6171875, + "learning_rate": 4.5496711631246206e-05, + "loss": 0.301, + "step": 21437 + }, + { + "epoch": 2.86, + "grad_norm": 0.6484375, + "learning_rate": 4.54869486478564e-05, + "loss": 0.5604, + "step": 21438 + }, + { + "epoch": 2.86, + "grad_norm": 0.80078125, + "learning_rate": 4.5477186403698566e-05, + "loss": 0.3257, + "step": 21439 + }, + { + "epoch": 2.86, + "grad_norm": 0.78125, + "learning_rate": 4.546742489890507e-05, + "loss": 0.2887, + "step": 21440 + }, + { + "epoch": 2.86, + "grad_norm": 0.51953125, + "learning_rate": 4.545766413360833e-05, + "loss": 0.5257, + "step": 21441 + }, + { + "epoch": 2.86, + "grad_norm": 0.72265625, + "learning_rate": 4.544790410794064e-05, + "loss": 0.4576, + "step": 21442 + }, + { + "epoch": 2.86, + "grad_norm": 0.6328125, + "learning_rate": 4.543814482203438e-05, + "loss": 0.2503, + "step": 21443 + }, + { + "epoch": 2.86, + "grad_norm": 0.578125, + "learning_rate": 4.542838627602188e-05, + "loss": 0.4417, + "step": 21444 + }, + { + "epoch": 2.86, + "grad_norm": 0.58203125, + "learning_rate": 4.541862847003554e-05, + "loss": 0.2702, + "step": 21445 + }, + { + "epoch": 2.86, + "grad_norm": 0.65625, + "learning_rate": 4.5408871404207584e-05, + "loss": 0.4399, + "step": 21446 + }, + { + "epoch": 2.86, + "grad_norm": 0.59375, + "learning_rate": 4.5399115078670406e-05, + "loss": 0.2699, + "step": 21447 + }, + { + "epoch": 2.86, + "grad_norm": 0.54296875, + "learning_rate": 4.538935949355623e-05, + "loss": 0.3229, + "step": 21448 + }, + { + "epoch": 2.86, + "grad_norm": 0.65625, + "learning_rate": 4.537960464899739e-05, + "loss": 0.2213, + "step": 21449 + }, + { + "epoch": 2.86, + "grad_norm": 0.734375, + "learning_rate": 4.5369850545126204e-05, + "loss": 0.4137, + "step": 21450 + }, + { + "epoch": 2.86, + "grad_norm": 0.8125, + "learning_rate": 4.5360097182074866e-05, + "loss": 0.5417, + "step": 21451 + }, + { + "epoch": 2.86, + "grad_norm": 0.5390625, + "learning_rate": 4.5350344559975676e-05, + "loss": 0.2987, + "step": 21452 + }, + { + "epoch": 2.86, + "grad_norm": 0.6015625, + "learning_rate": 4.5340592678960905e-05, + "loss": 0.4985, + "step": 21453 + }, + { + "epoch": 2.86, + "grad_norm": 0.64453125, + "learning_rate": 4.533084153916276e-05, + "loss": 0.4992, + "step": 21454 + }, + { + "epoch": 2.86, + "grad_norm": 0.69140625, + "learning_rate": 4.5321091140713536e-05, + "loss": 0.6757, + "step": 21455 + }, + { + "epoch": 2.86, + "grad_norm": 0.51171875, + "learning_rate": 4.531134148374536e-05, + "loss": 0.463, + "step": 21456 + }, + { + "epoch": 2.86, + "grad_norm": 0.515625, + "learning_rate": 4.530159256839051e-05, + "loss": 0.3133, + "step": 21457 + }, + { + "epoch": 2.86, + "grad_norm": 0.361328125, + "learning_rate": 4.5291844394781144e-05, + "loss": 0.2303, + "step": 21458 + }, + { + "epoch": 2.86, + "grad_norm": 0.52734375, + "learning_rate": 4.528209696304954e-05, + "loss": 0.3053, + "step": 21459 + }, + { + "epoch": 2.86, + "grad_norm": 0.6484375, + "learning_rate": 4.5272350273327815e-05, + "loss": 0.3284, + "step": 21460 + }, + { + "epoch": 2.86, + "grad_norm": 0.62109375, + "learning_rate": 4.5262604325748096e-05, + "loss": 0.4756, + "step": 21461 + }, + { + "epoch": 2.86, + "grad_norm": 0.53515625, + "learning_rate": 4.5252859120442606e-05, + "loss": 0.163, + "step": 21462 + }, + { + "epoch": 2.86, + "grad_norm": 0.53125, + "learning_rate": 4.524311465754347e-05, + "loss": 0.1959, + "step": 21463 + }, + { + "epoch": 2.86, + "grad_norm": 0.66796875, + "learning_rate": 4.5233370937182885e-05, + "loss": 0.2782, + "step": 21464 + }, + { + "epoch": 2.86, + "grad_norm": 0.4765625, + "learning_rate": 4.522362795949291e-05, + "loss": 0.2972, + "step": 21465 + }, + { + "epoch": 2.86, + "grad_norm": 0.63671875, + "learning_rate": 4.521388572460569e-05, + "loss": 0.325, + "step": 21466 + }, + { + "epoch": 2.86, + "grad_norm": 0.65234375, + "learning_rate": 4.520414423265334e-05, + "loss": 0.3652, + "step": 21467 + }, + { + "epoch": 2.86, + "grad_norm": 0.6171875, + "learning_rate": 4.519440348376799e-05, + "loss": 0.3959, + "step": 21468 + }, + { + "epoch": 2.86, + "grad_norm": 0.474609375, + "learning_rate": 4.518466347808168e-05, + "loss": 0.2482, + "step": 21469 + }, + { + "epoch": 2.86, + "grad_norm": 0.49609375, + "learning_rate": 4.51749242157265e-05, + "loss": 0.3679, + "step": 21470 + }, + { + "epoch": 2.87, + "grad_norm": 0.57421875, + "learning_rate": 4.516518569683454e-05, + "loss": 0.1602, + "step": 21471 + }, + { + "epoch": 2.87, + "grad_norm": 0.65234375, + "learning_rate": 4.51554479215379e-05, + "loss": 0.2668, + "step": 21472 + }, + { + "epoch": 2.87, + "grad_norm": 0.53125, + "learning_rate": 4.514571088996857e-05, + "loss": 0.22, + "step": 21473 + }, + { + "epoch": 2.87, + "grad_norm": 0.59375, + "learning_rate": 4.513597460225858e-05, + "loss": 0.4135, + "step": 21474 + }, + { + "epoch": 2.87, + "grad_norm": 0.6328125, + "learning_rate": 4.512623905853999e-05, + "loss": 0.4481, + "step": 21475 + }, + { + "epoch": 2.87, + "grad_norm": 0.5625, + "learning_rate": 4.51165042589448e-05, + "loss": 0.589, + "step": 21476 + }, + { + "epoch": 2.87, + "grad_norm": 0.37890625, + "learning_rate": 4.510677020360509e-05, + "loss": 0.2079, + "step": 21477 + }, + { + "epoch": 2.87, + "grad_norm": 0.49609375, + "learning_rate": 4.5097036892652757e-05, + "loss": 0.1412, + "step": 21478 + }, + { + "epoch": 2.87, + "grad_norm": 0.482421875, + "learning_rate": 4.5087304326219834e-05, + "loss": 0.2198, + "step": 21479 + }, + { + "epoch": 2.87, + "grad_norm": 0.490234375, + "learning_rate": 4.507757250443833e-05, + "loss": 0.3342, + "step": 21480 + }, + { + "epoch": 2.87, + "grad_norm": 0.64453125, + "learning_rate": 4.506784142744017e-05, + "loss": 0.2538, + "step": 21481 + }, + { + "epoch": 2.87, + "grad_norm": 0.6796875, + "learning_rate": 4.505811109535739e-05, + "loss": 0.2509, + "step": 21482 + }, + { + "epoch": 2.87, + "grad_norm": 0.451171875, + "learning_rate": 4.5048381508321834e-05, + "loss": 0.1917, + "step": 21483 + }, + { + "epoch": 2.87, + "grad_norm": 0.81640625, + "learning_rate": 4.503865266646554e-05, + "loss": 0.6097, + "step": 21484 + }, + { + "epoch": 2.87, + "grad_norm": 0.66015625, + "learning_rate": 4.502892456992035e-05, + "loss": 0.5983, + "step": 21485 + }, + { + "epoch": 2.87, + "grad_norm": 0.73828125, + "learning_rate": 4.501919721881822e-05, + "loss": 0.3805, + "step": 21486 + }, + { + "epoch": 2.87, + "grad_norm": 0.50390625, + "learning_rate": 4.5009470613291096e-05, + "loss": 0.3054, + "step": 21487 + }, + { + "epoch": 2.87, + "grad_norm": 0.53515625, + "learning_rate": 4.499974475347081e-05, + "loss": 0.4523, + "step": 21488 + }, + { + "epoch": 2.87, + "grad_norm": 0.5546875, + "learning_rate": 4.499001963948929e-05, + "loss": 0.4591, + "step": 21489 + }, + { + "epoch": 2.87, + "grad_norm": 0.58984375, + "learning_rate": 4.49802952714784e-05, + "loss": 0.4234, + "step": 21490 + }, + { + "epoch": 2.87, + "grad_norm": 0.6015625, + "learning_rate": 4.497057164957006e-05, + "loss": 0.1723, + "step": 21491 + }, + { + "epoch": 2.87, + "grad_norm": 0.62109375, + "learning_rate": 4.4960848773896046e-05, + "loss": 0.441, + "step": 21492 + }, + { + "epoch": 2.87, + "grad_norm": 0.53125, + "learning_rate": 4.4951126644588246e-05, + "loss": 0.3894, + "step": 21493 + }, + { + "epoch": 2.87, + "grad_norm": 0.55078125, + "learning_rate": 4.4941405261778504e-05, + "loss": 0.3333, + "step": 21494 + }, + { + "epoch": 2.87, + "grad_norm": 0.58203125, + "learning_rate": 4.493168462559867e-05, + "loss": 0.2228, + "step": 21495 + }, + { + "epoch": 2.87, + "grad_norm": 0.57421875, + "learning_rate": 4.492196473618054e-05, + "loss": 0.2742, + "step": 21496 + }, + { + "epoch": 2.87, + "grad_norm": 0.56640625, + "learning_rate": 4.4912245593655885e-05, + "loss": 0.3956, + "step": 21497 + }, + { + "epoch": 2.87, + "grad_norm": 0.609375, + "learning_rate": 4.490252719815653e-05, + "loss": 0.457, + "step": 21498 + }, + { + "epoch": 2.87, + "grad_norm": 0.734375, + "learning_rate": 4.489280954981427e-05, + "loss": 0.3126, + "step": 21499 + }, + { + "epoch": 2.87, + "grad_norm": 0.6328125, + "learning_rate": 4.4883092648760914e-05, + "loss": 0.3976, + "step": 21500 + }, + { + "epoch": 2.87, + "grad_norm": 0.59765625, + "learning_rate": 4.487337649512816e-05, + "loss": 0.2833, + "step": 21501 + }, + { + "epoch": 2.87, + "grad_norm": 0.58984375, + "learning_rate": 4.48636610890478e-05, + "loss": 0.2301, + "step": 21502 + }, + { + "epoch": 2.87, + "grad_norm": 0.65625, + "learning_rate": 4.485394643065158e-05, + "loss": 0.5045, + "step": 21503 + }, + { + "epoch": 2.87, + "grad_norm": 0.458984375, + "learning_rate": 4.484423252007125e-05, + "loss": 0.2405, + "step": 21504 + }, + { + "epoch": 2.87, + "grad_norm": 0.80078125, + "learning_rate": 4.483451935743855e-05, + "loss": 0.4524, + "step": 21505 + }, + { + "epoch": 2.87, + "grad_norm": 0.609375, + "learning_rate": 4.482480694288514e-05, + "loss": 0.3058, + "step": 21506 + }, + { + "epoch": 2.87, + "grad_norm": 0.55859375, + "learning_rate": 4.481509527654276e-05, + "loss": 0.294, + "step": 21507 + }, + { + "epoch": 2.87, + "grad_norm": 0.703125, + "learning_rate": 4.4805384358543144e-05, + "loss": 0.3379, + "step": 21508 + }, + { + "epoch": 2.87, + "grad_norm": 0.55078125, + "learning_rate": 4.47956741890179e-05, + "loss": 0.412, + "step": 21509 + }, + { + "epoch": 2.87, + "grad_norm": 0.69921875, + "learning_rate": 4.4785964768098776e-05, + "loss": 0.5628, + "step": 21510 + }, + { + "epoch": 2.87, + "grad_norm": 0.5234375, + "learning_rate": 4.477625609591738e-05, + "loss": 0.3445, + "step": 21511 + }, + { + "epoch": 2.87, + "grad_norm": 0.578125, + "learning_rate": 4.476654817260538e-05, + "loss": 0.249, + "step": 21512 + }, + { + "epoch": 2.87, + "grad_norm": 0.64453125, + "learning_rate": 4.4756840998294445e-05, + "loss": 0.206, + "step": 21513 + }, + { + "epoch": 2.87, + "grad_norm": 0.5234375, + "learning_rate": 4.4747134573116245e-05, + "loss": 0.372, + "step": 21514 + }, + { + "epoch": 2.87, + "grad_norm": 0.51953125, + "learning_rate": 4.47374288972023e-05, + "loss": 0.1877, + "step": 21515 + }, + { + "epoch": 2.87, + "grad_norm": 0.53515625, + "learning_rate": 4.472772397068431e-05, + "loss": 0.2172, + "step": 21516 + }, + { + "epoch": 2.87, + "grad_norm": 0.69140625, + "learning_rate": 4.471801979369385e-05, + "loss": 0.526, + "step": 21517 + }, + { + "epoch": 2.87, + "grad_norm": 0.53515625, + "learning_rate": 4.470831636636256e-05, + "loss": 0.2883, + "step": 21518 + }, + { + "epoch": 2.87, + "grad_norm": 0.58203125, + "learning_rate": 4.4698613688821946e-05, + "loss": 0.3812, + "step": 21519 + }, + { + "epoch": 2.87, + "grad_norm": 0.5859375, + "learning_rate": 4.4688911761203656e-05, + "loss": 0.3931, + "step": 21520 + }, + { + "epoch": 2.87, + "grad_norm": 0.625, + "learning_rate": 4.467921058363919e-05, + "loss": 0.446, + "step": 21521 + }, + { + "epoch": 2.87, + "grad_norm": 0.66796875, + "learning_rate": 4.466951015626013e-05, + "loss": 0.618, + "step": 21522 + }, + { + "epoch": 2.87, + "grad_norm": 0.298828125, + "learning_rate": 4.465981047919806e-05, + "loss": 0.1687, + "step": 21523 + }, + { + "epoch": 2.87, + "grad_norm": 0.59765625, + "learning_rate": 4.465011155258444e-05, + "loss": 0.4465, + "step": 21524 + }, + { + "epoch": 2.87, + "grad_norm": 0.453125, + "learning_rate": 4.464041337655083e-05, + "loss": 0.209, + "step": 21525 + }, + { + "epoch": 2.87, + "grad_norm": 0.42578125, + "learning_rate": 4.463071595122874e-05, + "loss": 0.2981, + "step": 21526 + }, + { + "epoch": 2.87, + "grad_norm": 0.5078125, + "learning_rate": 4.462101927674969e-05, + "loss": 0.2491, + "step": 21527 + }, + { + "epoch": 2.87, + "grad_norm": 0.5546875, + "learning_rate": 4.4611323353245184e-05, + "loss": 0.1566, + "step": 21528 + }, + { + "epoch": 2.87, + "grad_norm": 0.59375, + "learning_rate": 4.460162818084666e-05, + "loss": 0.3421, + "step": 21529 + }, + { + "epoch": 2.87, + "grad_norm": 0.6640625, + "learning_rate": 4.45919337596856e-05, + "loss": 0.3728, + "step": 21530 + }, + { + "epoch": 2.87, + "grad_norm": 0.58984375, + "learning_rate": 4.4582240089893476e-05, + "loss": 0.3785, + "step": 21531 + }, + { + "epoch": 2.87, + "grad_norm": 0.671875, + "learning_rate": 4.457254717160178e-05, + "loss": 0.3566, + "step": 21532 + }, + { + "epoch": 2.87, + "grad_norm": 0.57421875, + "learning_rate": 4.4562855004941926e-05, + "loss": 0.3256, + "step": 21533 + }, + { + "epoch": 2.87, + "grad_norm": 0.498046875, + "learning_rate": 4.45531635900453e-05, + "loss": 0.2381, + "step": 21534 + }, + { + "epoch": 2.87, + "grad_norm": 0.447265625, + "learning_rate": 4.454347292704336e-05, + "loss": 0.2113, + "step": 21535 + }, + { + "epoch": 2.87, + "grad_norm": 0.66796875, + "learning_rate": 4.453378301606752e-05, + "loss": 0.2706, + "step": 21536 + }, + { + "epoch": 2.87, + "grad_norm": 0.6015625, + "learning_rate": 4.452409385724922e-05, + "loss": 0.4086, + "step": 21537 + }, + { + "epoch": 2.87, + "grad_norm": 0.6484375, + "learning_rate": 4.4514405450719766e-05, + "loss": 0.5278, + "step": 21538 + }, + { + "epoch": 2.87, + "grad_norm": 0.49609375, + "learning_rate": 4.450471779661058e-05, + "loss": 0.295, + "step": 21539 + }, + { + "epoch": 2.87, + "grad_norm": 0.6640625, + "learning_rate": 4.4495030895053056e-05, + "loss": 0.2311, + "step": 21540 + }, + { + "epoch": 2.87, + "grad_norm": 0.50390625, + "learning_rate": 4.4485344746178557e-05, + "loss": 0.2784, + "step": 21541 + }, + { + "epoch": 2.87, + "grad_norm": 0.69140625, + "learning_rate": 4.4475659350118385e-05, + "loss": 0.4218, + "step": 21542 + }, + { + "epoch": 2.87, + "grad_norm": 0.53515625, + "learning_rate": 4.446597470700391e-05, + "loss": 0.3678, + "step": 21543 + }, + { + "epoch": 2.87, + "grad_norm": 0.60546875, + "learning_rate": 4.4456290816966485e-05, + "loss": 0.4237, + "step": 21544 + }, + { + "epoch": 2.87, + "grad_norm": 0.56640625, + "learning_rate": 4.444660768013738e-05, + "loss": 0.3707, + "step": 21545 + }, + { + "epoch": 2.88, + "grad_norm": 0.42578125, + "learning_rate": 4.443692529664795e-05, + "loss": 0.1537, + "step": 21546 + }, + { + "epoch": 2.88, + "grad_norm": 0.447265625, + "learning_rate": 4.442724366662945e-05, + "loss": 0.4072, + "step": 21547 + }, + { + "epoch": 2.88, + "grad_norm": 0.6015625, + "learning_rate": 4.441756279021319e-05, + "loss": 0.2716, + "step": 21548 + }, + { + "epoch": 2.88, + "grad_norm": 0.57421875, + "learning_rate": 4.440788266753044e-05, + "loss": 0.4249, + "step": 21549 + }, + { + "epoch": 2.88, + "grad_norm": 0.58984375, + "learning_rate": 4.4398203298712524e-05, + "loss": 0.3827, + "step": 21550 + }, + { + "epoch": 2.88, + "grad_norm": 0.453125, + "learning_rate": 4.438852468389062e-05, + "loss": 0.3663, + "step": 21551 + }, + { + "epoch": 2.88, + "grad_norm": 0.734375, + "learning_rate": 4.4378846823196005e-05, + "loss": 0.5658, + "step": 21552 + }, + { + "epoch": 2.88, + "grad_norm": 0.7890625, + "learning_rate": 4.436916971675994e-05, + "loss": 0.3173, + "step": 21553 + }, + { + "epoch": 2.88, + "grad_norm": 0.48828125, + "learning_rate": 4.4359493364713626e-05, + "loss": 0.2026, + "step": 21554 + }, + { + "epoch": 2.88, + "grad_norm": 0.58984375, + "learning_rate": 4.434981776718833e-05, + "loss": 0.2999, + "step": 21555 + }, + { + "epoch": 2.88, + "grad_norm": 0.5234375, + "learning_rate": 4.434014292431522e-05, + "loss": 0.2817, + "step": 21556 + }, + { + "epoch": 2.88, + "grad_norm": 0.69921875, + "learning_rate": 4.433046883622547e-05, + "loss": 0.506, + "step": 21557 + }, + { + "epoch": 2.88, + "grad_norm": 0.625, + "learning_rate": 4.432079550305027e-05, + "loss": 0.3767, + "step": 21558 + }, + { + "epoch": 2.88, + "grad_norm": 0.54296875, + "learning_rate": 4.4311122924920835e-05, + "loss": 0.3669, + "step": 21559 + }, + { + "epoch": 2.88, + "grad_norm": 0.53515625, + "learning_rate": 4.4301451101968335e-05, + "loss": 0.3644, + "step": 21560 + }, + { + "epoch": 2.88, + "grad_norm": 0.54296875, + "learning_rate": 4.429178003432388e-05, + "loss": 0.3045, + "step": 21561 + }, + { + "epoch": 2.88, + "grad_norm": 0.71484375, + "learning_rate": 4.4282109722118646e-05, + "loss": 0.4672, + "step": 21562 + }, + { + "epoch": 2.88, + "grad_norm": 0.69921875, + "learning_rate": 4.427244016548375e-05, + "loss": 0.2502, + "step": 21563 + }, + { + "epoch": 2.88, + "grad_norm": 0.41015625, + "learning_rate": 4.426277136455038e-05, + "loss": 0.2067, + "step": 21564 + }, + { + "epoch": 2.88, + "grad_norm": 0.75, + "learning_rate": 4.4253103319449565e-05, + "loss": 0.3579, + "step": 21565 + }, + { + "epoch": 2.88, + "grad_norm": 0.52734375, + "learning_rate": 4.424343603031243e-05, + "loss": 0.2371, + "step": 21566 + }, + { + "epoch": 2.88, + "grad_norm": 0.6015625, + "learning_rate": 4.4233769497270104e-05, + "loss": 0.3668, + "step": 21567 + }, + { + "epoch": 2.88, + "grad_norm": 0.5546875, + "learning_rate": 4.422410372045369e-05, + "loss": 0.232, + "step": 21568 + }, + { + "epoch": 2.88, + "grad_norm": 0.640625, + "learning_rate": 4.421443869999422e-05, + "loss": 0.2956, + "step": 21569 + }, + { + "epoch": 2.88, + "grad_norm": 0.5625, + "learning_rate": 4.4204774436022735e-05, + "loss": 0.2851, + "step": 21570 + }, + { + "epoch": 2.88, + "grad_norm": 0.61328125, + "learning_rate": 4.41951109286703e-05, + "loss": 0.2299, + "step": 21571 + }, + { + "epoch": 2.88, + "grad_norm": 0.69140625, + "learning_rate": 4.418544817806799e-05, + "loss": 0.536, + "step": 21572 + }, + { + "epoch": 2.88, + "grad_norm": 0.765625, + "learning_rate": 4.417578618434687e-05, + "loss": 0.3267, + "step": 21573 + }, + { + "epoch": 2.88, + "grad_norm": 0.470703125, + "learning_rate": 4.4166124947637865e-05, + "loss": 0.2919, + "step": 21574 + }, + { + "epoch": 2.88, + "grad_norm": 0.6015625, + "learning_rate": 4.4156464468072044e-05, + "loss": 0.3976, + "step": 21575 + }, + { + "epoch": 2.88, + "grad_norm": 0.49609375, + "learning_rate": 4.4146804745780404e-05, + "loss": 0.1948, + "step": 21576 + }, + { + "epoch": 2.88, + "grad_norm": 0.5859375, + "learning_rate": 4.413714578089394e-05, + "loss": 0.3412, + "step": 21577 + }, + { + "epoch": 2.88, + "grad_norm": 0.6328125, + "learning_rate": 4.4127487573543666e-05, + "loss": 0.402, + "step": 21578 + }, + { + "epoch": 2.88, + "grad_norm": 0.478515625, + "learning_rate": 4.4117830123860484e-05, + "loss": 0.1329, + "step": 21579 + }, + { + "epoch": 2.88, + "grad_norm": 0.54296875, + "learning_rate": 4.4108173431975384e-05, + "loss": 0.4153, + "step": 21580 + }, + { + "epoch": 2.88, + "grad_norm": 0.6875, + "learning_rate": 4.4098517498019375e-05, + "loss": 0.6832, + "step": 21581 + }, + { + "epoch": 2.88, + "grad_norm": 0.4375, + "learning_rate": 4.40888623221233e-05, + "loss": 0.238, + "step": 21582 + }, + { + "epoch": 2.88, + "grad_norm": 0.6796875, + "learning_rate": 4.407920790441817e-05, + "loss": 0.2539, + "step": 21583 + }, + { + "epoch": 2.88, + "grad_norm": 0.640625, + "learning_rate": 4.406955424503484e-05, + "loss": 0.5181, + "step": 21584 + }, + { + "epoch": 2.88, + "grad_norm": 0.59765625, + "learning_rate": 4.4059901344104246e-05, + "loss": 0.3016, + "step": 21585 + }, + { + "epoch": 2.88, + "grad_norm": 0.82421875, + "learning_rate": 4.40502492017573e-05, + "loss": 0.3486, + "step": 21586 + }, + { + "epoch": 2.88, + "grad_norm": 0.490234375, + "learning_rate": 4.4040597818124904e-05, + "loss": 0.344, + "step": 21587 + }, + { + "epoch": 2.88, + "grad_norm": 0.3828125, + "learning_rate": 4.4030947193337893e-05, + "loss": 0.2125, + "step": 21588 + }, + { + "epoch": 2.88, + "grad_norm": 0.53125, + "learning_rate": 4.402129732752715e-05, + "loss": 0.2668, + "step": 21589 + }, + { + "epoch": 2.88, + "grad_norm": 0.55859375, + "learning_rate": 4.401164822082355e-05, + "loss": 0.2546, + "step": 21590 + }, + { + "epoch": 2.88, + "grad_norm": 0.6953125, + "learning_rate": 4.4001999873357966e-05, + "loss": 0.5751, + "step": 21591 + }, + { + "epoch": 2.88, + "grad_norm": 0.451171875, + "learning_rate": 4.3992352285261165e-05, + "loss": 0.2155, + "step": 21592 + }, + { + "epoch": 2.88, + "grad_norm": 0.796875, + "learning_rate": 4.398270545666405e-05, + "loss": 0.5889, + "step": 21593 + }, + { + "epoch": 2.88, + "grad_norm": 0.73046875, + "learning_rate": 4.397305938769737e-05, + "loss": 0.2994, + "step": 21594 + }, + { + "epoch": 2.88, + "grad_norm": 0.703125, + "learning_rate": 4.3963414078491946e-05, + "loss": 0.4151, + "step": 21595 + }, + { + "epoch": 2.88, + "grad_norm": 0.62109375, + "learning_rate": 4.3953769529178644e-05, + "loss": 0.5097, + "step": 21596 + }, + { + "epoch": 2.88, + "grad_norm": 0.53125, + "learning_rate": 4.394412573988815e-05, + "loss": 0.3092, + "step": 21597 + }, + { + "epoch": 2.88, + "grad_norm": 0.7421875, + "learning_rate": 4.3934482710751304e-05, + "loss": 0.3445, + "step": 21598 + }, + { + "epoch": 2.88, + "grad_norm": 0.64453125, + "learning_rate": 4.392484044189884e-05, + "loss": 0.4357, + "step": 21599 + }, + { + "epoch": 2.88, + "grad_norm": 0.478515625, + "learning_rate": 4.391519893346153e-05, + "loss": 0.3161, + "step": 21600 + }, + { + "epoch": 2.88, + "grad_norm": 0.58984375, + "learning_rate": 4.390555818557016e-05, + "loss": 0.38, + "step": 21601 + }, + { + "epoch": 2.88, + "grad_norm": 0.546875, + "learning_rate": 4.389591819835538e-05, + "loss": 0.4617, + "step": 21602 + }, + { + "epoch": 2.88, + "grad_norm": 0.6328125, + "learning_rate": 4.388627897194796e-05, + "loss": 0.5645, + "step": 21603 + }, + { + "epoch": 2.88, + "grad_norm": 0.74609375, + "learning_rate": 4.3876640506478616e-05, + "loss": 0.4468, + "step": 21604 + }, + { + "epoch": 2.88, + "grad_norm": 0.796875, + "learning_rate": 4.3867002802078074e-05, + "loss": 0.299, + "step": 21605 + }, + { + "epoch": 2.88, + "grad_norm": 0.40234375, + "learning_rate": 4.3857365858877e-05, + "loss": 0.1548, + "step": 21606 + }, + { + "epoch": 2.88, + "grad_norm": 0.38671875, + "learning_rate": 4.384772967700604e-05, + "loss": 0.0962, + "step": 21607 + }, + { + "epoch": 2.88, + "grad_norm": 0.47265625, + "learning_rate": 4.3838094256595905e-05, + "loss": 0.1565, + "step": 21608 + }, + { + "epoch": 2.88, + "grad_norm": 0.490234375, + "learning_rate": 4.382845959777726e-05, + "loss": 0.3139, + "step": 21609 + }, + { + "epoch": 2.88, + "grad_norm": 0.625, + "learning_rate": 4.381882570068079e-05, + "loss": 0.5774, + "step": 21610 + }, + { + "epoch": 2.88, + "grad_norm": 0.5390625, + "learning_rate": 4.380919256543706e-05, + "loss": 0.1996, + "step": 21611 + }, + { + "epoch": 2.88, + "grad_norm": 0.6015625, + "learning_rate": 4.379956019217675e-05, + "loss": 0.2971, + "step": 21612 + }, + { + "epoch": 2.88, + "grad_norm": 0.478515625, + "learning_rate": 4.378992858103046e-05, + "loss": 0.2753, + "step": 21613 + }, + { + "epoch": 2.88, + "grad_norm": 0.59375, + "learning_rate": 4.378029773212886e-05, + "loss": 0.3659, + "step": 21614 + }, + { + "epoch": 2.88, + "grad_norm": 0.46875, + "learning_rate": 4.377066764560247e-05, + "loss": 0.3203, + "step": 21615 + }, + { + "epoch": 2.88, + "grad_norm": 0.62890625, + "learning_rate": 4.376103832158191e-05, + "loss": 0.4513, + "step": 21616 + }, + { + "epoch": 2.88, + "grad_norm": 0.45703125, + "learning_rate": 4.375140976019779e-05, + "loss": 0.2059, + "step": 21617 + }, + { + "epoch": 2.88, + "grad_norm": 0.66796875, + "learning_rate": 4.374178196158063e-05, + "loss": 0.647, + "step": 21618 + }, + { + "epoch": 2.88, + "grad_norm": 0.78125, + "learning_rate": 4.373215492586105e-05, + "loss": 0.5408, + "step": 21619 + }, + { + "epoch": 2.88, + "grad_norm": 0.55859375, + "learning_rate": 4.3722528653169525e-05, + "loss": 0.2736, + "step": 21620 + }, + { + "epoch": 2.89, + "grad_norm": 0.625, + "learning_rate": 4.371290314363663e-05, + "loss": 0.2834, + "step": 21621 + }, + { + "epoch": 2.89, + "grad_norm": 0.50390625, + "learning_rate": 4.3703278397392885e-05, + "loss": 0.3577, + "step": 21622 + }, + { + "epoch": 2.89, + "grad_norm": 0.6171875, + "learning_rate": 4.3693654414568864e-05, + "loss": 0.3219, + "step": 21623 + }, + { + "epoch": 2.89, + "grad_norm": 0.54296875, + "learning_rate": 4.368403119529498e-05, + "loss": 0.3784, + "step": 21624 + }, + { + "epoch": 2.89, + "grad_norm": 1.0078125, + "learning_rate": 4.3674408739701776e-05, + "loss": 0.3934, + "step": 21625 + }, + { + "epoch": 2.89, + "grad_norm": 0.765625, + "learning_rate": 4.3664787047919744e-05, + "loss": 0.3507, + "step": 21626 + }, + { + "epoch": 2.89, + "grad_norm": 0.56640625, + "learning_rate": 4.3655166120079364e-05, + "loss": 0.2481, + "step": 21627 + }, + { + "epoch": 2.89, + "grad_norm": 0.53515625, + "learning_rate": 4.3645545956311104e-05, + "loss": 0.3229, + "step": 21628 + }, + { + "epoch": 2.89, + "grad_norm": 0.5625, + "learning_rate": 4.363592655674543e-05, + "loss": 0.4543, + "step": 21629 + }, + { + "epoch": 2.89, + "grad_norm": 0.75390625, + "learning_rate": 4.362630792151272e-05, + "loss": 0.3206, + "step": 21630 + }, + { + "epoch": 2.89, + "grad_norm": 0.55078125, + "learning_rate": 4.361669005074347e-05, + "loss": 0.2936, + "step": 21631 + }, + { + "epoch": 2.89, + "grad_norm": 0.443359375, + "learning_rate": 4.3607072944568074e-05, + "loss": 0.2444, + "step": 21632 + }, + { + "epoch": 2.89, + "grad_norm": 0.671875, + "learning_rate": 4.3597456603117007e-05, + "loss": 0.3634, + "step": 21633 + }, + { + "epoch": 2.89, + "grad_norm": 0.58984375, + "learning_rate": 4.358784102652058e-05, + "loss": 0.4921, + "step": 21634 + }, + { + "epoch": 2.89, + "grad_norm": 0.48828125, + "learning_rate": 4.357822621490925e-05, + "loss": 0.3263, + "step": 21635 + }, + { + "epoch": 2.89, + "grad_norm": 0.423828125, + "learning_rate": 4.3568612168413366e-05, + "loss": 0.1485, + "step": 21636 + }, + { + "epoch": 2.89, + "grad_norm": 0.5546875, + "learning_rate": 4.355899888716337e-05, + "loss": 0.2781, + "step": 21637 + }, + { + "epoch": 2.89, + "grad_norm": 0.66796875, + "learning_rate": 4.354938637128952e-05, + "loss": 0.3401, + "step": 21638 + }, + { + "epoch": 2.89, + "grad_norm": 0.7109375, + "learning_rate": 4.3539774620922233e-05, + "loss": 0.3352, + "step": 21639 + }, + { + "epoch": 2.89, + "grad_norm": 0.67578125, + "learning_rate": 4.3530163636191835e-05, + "loss": 0.4423, + "step": 21640 + }, + { + "epoch": 2.89, + "grad_norm": 0.625, + "learning_rate": 4.352055341722871e-05, + "loss": 0.4365, + "step": 21641 + }, + { + "epoch": 2.89, + "grad_norm": 0.4453125, + "learning_rate": 4.351094396416311e-05, + "loss": 0.1489, + "step": 21642 + }, + { + "epoch": 2.89, + "grad_norm": 0.8359375, + "learning_rate": 4.3501335277125333e-05, + "loss": 0.445, + "step": 21643 + }, + { + "epoch": 2.89, + "grad_norm": 0.6796875, + "learning_rate": 4.34917273562457e-05, + "loss": 0.433, + "step": 21644 + }, + { + "epoch": 2.89, + "grad_norm": 0.71484375, + "learning_rate": 4.348212020165453e-05, + "loss": 0.3795, + "step": 21645 + }, + { + "epoch": 2.89, + "grad_norm": 0.65625, + "learning_rate": 4.3472513813482104e-05, + "loss": 0.3428, + "step": 21646 + }, + { + "epoch": 2.89, + "grad_norm": 0.59375, + "learning_rate": 4.346290819185864e-05, + "loss": 0.2757, + "step": 21647 + }, + { + "epoch": 2.89, + "grad_norm": 0.796875, + "learning_rate": 4.345330333691443e-05, + "loss": 0.3105, + "step": 21648 + }, + { + "epoch": 2.89, + "grad_norm": 0.59375, + "learning_rate": 4.3443699248779725e-05, + "loss": 0.4139, + "step": 21649 + }, + { + "epoch": 2.89, + "grad_norm": 0.73828125, + "learning_rate": 4.343409592758474e-05, + "loss": 0.4651, + "step": 21650 + }, + { + "epoch": 2.89, + "grad_norm": 0.5390625, + "learning_rate": 4.3424493373459766e-05, + "loss": 0.4321, + "step": 21651 + }, + { + "epoch": 2.89, + "grad_norm": 0.6640625, + "learning_rate": 4.341489158653492e-05, + "loss": 0.4633, + "step": 21652 + }, + { + "epoch": 2.89, + "grad_norm": 0.7578125, + "learning_rate": 4.340529056694047e-05, + "loss": 0.5071, + "step": 21653 + }, + { + "epoch": 2.89, + "grad_norm": 0.90234375, + "learning_rate": 4.339569031480663e-05, + "loss": 0.4956, + "step": 21654 + }, + { + "epoch": 2.89, + "grad_norm": 0.5078125, + "learning_rate": 4.338609083026354e-05, + "loss": 0.2106, + "step": 21655 + }, + { + "epoch": 2.89, + "grad_norm": 0.5703125, + "learning_rate": 4.3376492113441414e-05, + "loss": 0.3542, + "step": 21656 + }, + { + "epoch": 2.89, + "grad_norm": 0.59765625, + "learning_rate": 4.3366894164470376e-05, + "loss": 0.2196, + "step": 21657 + }, + { + "epoch": 2.89, + "grad_norm": 0.6484375, + "learning_rate": 4.335729698348058e-05, + "loss": 0.2326, + "step": 21658 + }, + { + "epoch": 2.89, + "grad_norm": 0.64453125, + "learning_rate": 4.33477005706022e-05, + "loss": 0.4595, + "step": 21659 + }, + { + "epoch": 2.89, + "grad_norm": 0.4296875, + "learning_rate": 4.3338104925965395e-05, + "loss": 0.1967, + "step": 21660 + }, + { + "epoch": 2.89, + "grad_norm": 0.9140625, + "learning_rate": 4.332851004970021e-05, + "loss": 0.3389, + "step": 21661 + }, + { + "epoch": 2.89, + "grad_norm": 0.61328125, + "learning_rate": 4.331891594193681e-05, + "loss": 0.3177, + "step": 21662 + }, + { + "epoch": 2.89, + "grad_norm": 0.6953125, + "learning_rate": 4.330932260280528e-05, + "loss": 0.3246, + "step": 21663 + }, + { + "epoch": 2.89, + "grad_norm": 0.51953125, + "learning_rate": 4.329973003243576e-05, + "loss": 0.3543, + "step": 21664 + }, + { + "epoch": 2.89, + "grad_norm": 0.66796875, + "learning_rate": 4.3290138230958256e-05, + "loss": 0.468, + "step": 21665 + }, + { + "epoch": 2.89, + "grad_norm": 0.609375, + "learning_rate": 4.32805471985029e-05, + "loss": 0.3856, + "step": 21666 + }, + { + "epoch": 2.89, + "grad_norm": 0.5546875, + "learning_rate": 4.3270956935199705e-05, + "loss": 0.2789, + "step": 21667 + }, + { + "epoch": 2.89, + "grad_norm": 0.51171875, + "learning_rate": 4.3261367441178725e-05, + "loss": 0.1348, + "step": 21668 + }, + { + "epoch": 2.89, + "grad_norm": 0.498046875, + "learning_rate": 4.325177871657007e-05, + "loss": 0.2366, + "step": 21669 + }, + { + "epoch": 2.89, + "grad_norm": 0.53515625, + "learning_rate": 4.324219076150366e-05, + "loss": 0.2589, + "step": 21670 + }, + { + "epoch": 2.89, + "grad_norm": 0.52734375, + "learning_rate": 4.3232603576109586e-05, + "loss": 0.2506, + "step": 21671 + }, + { + "epoch": 2.89, + "grad_norm": 0.427734375, + "learning_rate": 4.322301716051784e-05, + "loss": 0.1586, + "step": 21672 + }, + { + "epoch": 2.89, + "grad_norm": 0.69921875, + "learning_rate": 4.321343151485841e-05, + "loss": 0.4448, + "step": 21673 + }, + { + "epoch": 2.89, + "grad_norm": 0.66015625, + "learning_rate": 4.320384663926135e-05, + "loss": 0.2566, + "step": 21674 + }, + { + "epoch": 2.89, + "grad_norm": 0.71484375, + "learning_rate": 4.319426253385653e-05, + "loss": 0.4954, + "step": 21675 + }, + { + "epoch": 2.89, + "grad_norm": 0.74609375, + "learning_rate": 4.318467919877397e-05, + "loss": 0.5912, + "step": 21676 + }, + { + "epoch": 2.89, + "grad_norm": 0.53125, + "learning_rate": 4.317509663414363e-05, + "loss": 0.3495, + "step": 21677 + }, + { + "epoch": 2.89, + "grad_norm": 0.65625, + "learning_rate": 4.316551484009548e-05, + "loss": 0.4457, + "step": 21678 + }, + { + "epoch": 2.89, + "grad_norm": 0.60546875, + "learning_rate": 4.315593381675942e-05, + "loss": 0.3033, + "step": 21679 + }, + { + "epoch": 2.89, + "grad_norm": 0.5625, + "learning_rate": 4.314635356426535e-05, + "loss": 0.3905, + "step": 21680 + }, + { + "epoch": 2.89, + "grad_norm": 0.640625, + "learning_rate": 4.313677408274321e-05, + "loss": 0.3459, + "step": 21681 + }, + { + "epoch": 2.89, + "grad_norm": 0.68359375, + "learning_rate": 4.3127195372322916e-05, + "loss": 0.5667, + "step": 21682 + }, + { + "epoch": 2.89, + "grad_norm": 0.546875, + "learning_rate": 4.3117617433134386e-05, + "loss": 0.5068, + "step": 21683 + }, + { + "epoch": 2.89, + "grad_norm": 0.671875, + "learning_rate": 4.3108040265307433e-05, + "loss": 0.5091, + "step": 21684 + }, + { + "epoch": 2.89, + "grad_norm": 0.66015625, + "learning_rate": 4.309846386897196e-05, + "loss": 0.475, + "step": 21685 + }, + { + "epoch": 2.89, + "grad_norm": 0.4609375, + "learning_rate": 4.3088888244257855e-05, + "loss": 0.2893, + "step": 21686 + }, + { + "epoch": 2.89, + "grad_norm": 0.451171875, + "learning_rate": 4.3079313391294976e-05, + "loss": 0.1912, + "step": 21687 + }, + { + "epoch": 2.89, + "grad_norm": 0.87109375, + "learning_rate": 4.3069739310213097e-05, + "loss": 0.405, + "step": 21688 + }, + { + "epoch": 2.89, + "grad_norm": 0.466796875, + "learning_rate": 4.306016600114209e-05, + "loss": 0.3322, + "step": 21689 + }, + { + "epoch": 2.89, + "grad_norm": 0.51953125, + "learning_rate": 4.305059346421182e-05, + "loss": 0.1942, + "step": 21690 + }, + { + "epoch": 2.89, + "grad_norm": 0.56640625, + "learning_rate": 4.304102169955201e-05, + "loss": 0.411, + "step": 21691 + }, + { + "epoch": 2.89, + "grad_norm": 0.6328125, + "learning_rate": 4.303145070729254e-05, + "loss": 0.5134, + "step": 21692 + }, + { + "epoch": 2.89, + "grad_norm": 0.48828125, + "learning_rate": 4.302188048756313e-05, + "loss": 0.1809, + "step": 21693 + }, + { + "epoch": 2.89, + "grad_norm": 0.484375, + "learning_rate": 4.3012311040493594e-05, + "loss": 0.387, + "step": 21694 + }, + { + "epoch": 2.89, + "grad_norm": 0.65625, + "learning_rate": 4.300274236621368e-05, + "loss": 0.1813, + "step": 21695 + }, + { + "epoch": 2.9, + "grad_norm": 0.54296875, + "learning_rate": 4.29931744648532e-05, + "loss": 0.3836, + "step": 21696 + }, + { + "epoch": 2.9, + "grad_norm": 0.66796875, + "learning_rate": 4.298360733654183e-05, + "loss": 0.3335, + "step": 21697 + }, + { + "epoch": 2.9, + "grad_norm": 0.53515625, + "learning_rate": 4.2974040981409325e-05, + "loss": 0.2723, + "step": 21698 + }, + { + "epoch": 2.9, + "grad_norm": 0.6171875, + "learning_rate": 4.296447539958544e-05, + "loss": 0.5607, + "step": 21699 + }, + { + "epoch": 2.9, + "grad_norm": 0.7421875, + "learning_rate": 4.295491059119986e-05, + "loss": 0.3711, + "step": 21700 + }, + { + "epoch": 2.9, + "grad_norm": 0.5390625, + "learning_rate": 4.294534655638235e-05, + "loss": 0.4338, + "step": 21701 + }, + { + "epoch": 2.9, + "grad_norm": 0.61328125, + "learning_rate": 4.2935783295262556e-05, + "loss": 0.195, + "step": 21702 + }, + { + "epoch": 2.9, + "grad_norm": 0.52734375, + "learning_rate": 4.2926220807970116e-05, + "loss": 0.4422, + "step": 21703 + }, + { + "epoch": 2.9, + "grad_norm": 0.7734375, + "learning_rate": 4.291665909463477e-05, + "loss": 0.6078, + "step": 21704 + }, + { + "epoch": 2.9, + "grad_norm": 0.6796875, + "learning_rate": 4.290709815538615e-05, + "loss": 0.2055, + "step": 21705 + }, + { + "epoch": 2.9, + "grad_norm": 0.85546875, + "learning_rate": 4.2897537990353966e-05, + "loss": 0.4283, + "step": 21706 + }, + { + "epoch": 2.9, + "grad_norm": 0.58984375, + "learning_rate": 4.2887978599667776e-05, + "loss": 0.6544, + "step": 21707 + }, + { + "epoch": 2.9, + "grad_norm": 0.53125, + "learning_rate": 4.2878419983457244e-05, + "loss": 0.3859, + "step": 21708 + }, + { + "epoch": 2.9, + "grad_norm": 0.66015625, + "learning_rate": 4.2868862141852006e-05, + "loss": 0.4997, + "step": 21709 + }, + { + "epoch": 2.9, + "grad_norm": 0.65234375, + "learning_rate": 4.285930507498169e-05, + "loss": 0.2157, + "step": 21710 + }, + { + "epoch": 2.9, + "grad_norm": 0.6328125, + "learning_rate": 4.284974878297584e-05, + "loss": 0.6084, + "step": 21711 + }, + { + "epoch": 2.9, + "grad_norm": 0.5546875, + "learning_rate": 4.284019326596408e-05, + "loss": 0.3258, + "step": 21712 + }, + { + "epoch": 2.9, + "grad_norm": 0.59375, + "learning_rate": 4.2830638524075975e-05, + "loss": 0.4718, + "step": 21713 + }, + { + "epoch": 2.9, + "grad_norm": 0.72265625, + "learning_rate": 4.2821084557441136e-05, + "loss": 0.5927, + "step": 21714 + }, + { + "epoch": 2.9, + "grad_norm": 0.6953125, + "learning_rate": 4.281153136618911e-05, + "loss": 0.2662, + "step": 21715 + }, + { + "epoch": 2.9, + "grad_norm": 0.51171875, + "learning_rate": 4.280197895044936e-05, + "loss": 0.5911, + "step": 21716 + }, + { + "epoch": 2.9, + "grad_norm": 0.52734375, + "learning_rate": 4.2792427310351504e-05, + "loss": 0.2208, + "step": 21717 + }, + { + "epoch": 2.9, + "grad_norm": 0.8359375, + "learning_rate": 4.2782876446025033e-05, + "loss": 0.2745, + "step": 21718 + }, + { + "epoch": 2.9, + "grad_norm": 0.92578125, + "learning_rate": 4.277332635759952e-05, + "loss": 0.4173, + "step": 21719 + }, + { + "epoch": 2.9, + "grad_norm": 0.640625, + "learning_rate": 4.27637770452044e-05, + "loss": 0.2339, + "step": 21720 + }, + { + "epoch": 2.9, + "grad_norm": 0.70703125, + "learning_rate": 4.27542285089692e-05, + "loss": 0.2255, + "step": 21721 + }, + { + "epoch": 2.9, + "grad_norm": 0.50390625, + "learning_rate": 4.274468074902339e-05, + "loss": 0.3765, + "step": 21722 + }, + { + "epoch": 2.9, + "grad_norm": 0.5234375, + "learning_rate": 4.273513376549646e-05, + "loss": 0.3555, + "step": 21723 + }, + { + "epoch": 2.9, + "grad_norm": 0.77734375, + "learning_rate": 4.27255875585179e-05, + "loss": 0.5677, + "step": 21724 + }, + { + "epoch": 2.9, + "grad_norm": 0.478515625, + "learning_rate": 4.27160421282171e-05, + "loss": 0.1897, + "step": 21725 + }, + { + "epoch": 2.9, + "grad_norm": 0.4921875, + "learning_rate": 4.270649747472357e-05, + "loss": 0.3817, + "step": 21726 + }, + { + "epoch": 2.9, + "grad_norm": 0.49609375, + "learning_rate": 4.2696953598166664e-05, + "loss": 0.3386, + "step": 21727 + }, + { + "epoch": 2.9, + "grad_norm": 0.65625, + "learning_rate": 4.268741049867585e-05, + "loss": 0.2743, + "step": 21728 + }, + { + "epoch": 2.9, + "grad_norm": 0.828125, + "learning_rate": 4.267786817638056e-05, + "loss": 0.3848, + "step": 21729 + }, + { + "epoch": 2.9, + "grad_norm": 0.81640625, + "learning_rate": 4.266832663141015e-05, + "loss": 0.2937, + "step": 21730 + }, + { + "epoch": 2.9, + "grad_norm": 0.546875, + "learning_rate": 4.265878586389401e-05, + "loss": 0.3906, + "step": 21731 + }, + { + "epoch": 2.9, + "grad_norm": 0.65625, + "learning_rate": 4.264924587396154e-05, + "loss": 0.2202, + "step": 21732 + }, + { + "epoch": 2.9, + "grad_norm": 0.5234375, + "learning_rate": 4.2639706661742126e-05, + "loss": 0.2307, + "step": 21733 + }, + { + "epoch": 2.9, + "grad_norm": 0.69921875, + "learning_rate": 4.2630168227365086e-05, + "loss": 0.3299, + "step": 21734 + }, + { + "epoch": 2.9, + "grad_norm": 0.470703125, + "learning_rate": 4.2620630570959775e-05, + "loss": 0.2373, + "step": 21735 + }, + { + "epoch": 2.9, + "grad_norm": 0.51953125, + "learning_rate": 4.2611093692655545e-05, + "loss": 0.3208, + "step": 21736 + }, + { + "epoch": 2.9, + "grad_norm": 0.5625, + "learning_rate": 4.2601557592581754e-05, + "loss": 0.2068, + "step": 21737 + }, + { + "epoch": 2.9, + "grad_norm": 0.55078125, + "learning_rate": 4.259202227086764e-05, + "loss": 0.2624, + "step": 21738 + }, + { + "epoch": 2.9, + "grad_norm": 0.625, + "learning_rate": 4.258248772764258e-05, + "loss": 0.3637, + "step": 21739 + }, + { + "epoch": 2.9, + "grad_norm": 0.45703125, + "learning_rate": 4.25729539630358e-05, + "loss": 0.1743, + "step": 21740 + }, + { + "epoch": 2.9, + "grad_norm": 0.5859375, + "learning_rate": 4.256342097717663e-05, + "loss": 0.2314, + "step": 21741 + }, + { + "epoch": 2.9, + "grad_norm": 0.53515625, + "learning_rate": 4.255388877019436e-05, + "loss": 0.2851, + "step": 21742 + }, + { + "epoch": 2.9, + "grad_norm": 0.75390625, + "learning_rate": 4.25443573422182e-05, + "loss": 0.4632, + "step": 21743 + }, + { + "epoch": 2.9, + "grad_norm": 0.60546875, + "learning_rate": 4.2534826693377426e-05, + "loss": 0.2473, + "step": 21744 + }, + { + "epoch": 2.9, + "grad_norm": 0.65625, + "learning_rate": 4.2525296823801286e-05, + "loss": 0.2356, + "step": 21745 + }, + { + "epoch": 2.9, + "grad_norm": 0.4296875, + "learning_rate": 4.2515767733618997e-05, + "loss": 0.2093, + "step": 21746 + }, + { + "epoch": 2.9, + "grad_norm": 0.61328125, + "learning_rate": 4.2506239422959834e-05, + "loss": 0.3225, + "step": 21747 + }, + { + "epoch": 2.9, + "grad_norm": 0.486328125, + "learning_rate": 4.249671189195293e-05, + "loss": 0.1575, + "step": 21748 + }, + { + "epoch": 2.9, + "grad_norm": 0.4765625, + "learning_rate": 4.248718514072751e-05, + "loss": 0.1599, + "step": 21749 + }, + { + "epoch": 2.9, + "grad_norm": 0.51953125, + "learning_rate": 4.2477659169412786e-05, + "loss": 0.2094, + "step": 21750 + }, + { + "epoch": 2.9, + "grad_norm": 0.361328125, + "learning_rate": 4.2468133978137945e-05, + "loss": 0.1887, + "step": 21751 + }, + { + "epoch": 2.9, + "grad_norm": 0.83984375, + "learning_rate": 4.245860956703214e-05, + "loss": 0.324, + "step": 21752 + }, + { + "epoch": 2.9, + "grad_norm": 0.6875, + "learning_rate": 4.244908593622448e-05, + "loss": 0.3534, + "step": 21753 + }, + { + "epoch": 2.9, + "grad_norm": 0.66015625, + "learning_rate": 4.243956308584414e-05, + "loss": 0.3234, + "step": 21754 + }, + { + "epoch": 2.9, + "grad_norm": 0.734375, + "learning_rate": 4.243004101602027e-05, + "loss": 0.2564, + "step": 21755 + }, + { + "epoch": 2.9, + "grad_norm": 0.470703125, + "learning_rate": 4.242051972688204e-05, + "loss": 0.1716, + "step": 21756 + }, + { + "epoch": 2.9, + "grad_norm": 0.7109375, + "learning_rate": 4.241099921855847e-05, + "loss": 0.4463, + "step": 21757 + }, + { + "epoch": 2.9, + "grad_norm": 0.57421875, + "learning_rate": 4.2401479491178705e-05, + "loss": 0.4047, + "step": 21758 + }, + { + "epoch": 2.9, + "grad_norm": 0.609375, + "learning_rate": 4.2391960544871854e-05, + "loss": 0.4983, + "step": 21759 + }, + { + "epoch": 2.9, + "grad_norm": 0.65234375, + "learning_rate": 4.238244237976703e-05, + "loss": 0.4302, + "step": 21760 + }, + { + "epoch": 2.9, + "grad_norm": 0.5390625, + "learning_rate": 4.2372924995993226e-05, + "loss": 0.1385, + "step": 21761 + }, + { + "epoch": 2.9, + "grad_norm": 0.515625, + "learning_rate": 4.236340839367955e-05, + "loss": 0.2509, + "step": 21762 + }, + { + "epoch": 2.9, + "grad_norm": 0.61328125, + "learning_rate": 4.235389257295508e-05, + "loss": 0.3483, + "step": 21763 + }, + { + "epoch": 2.9, + "grad_norm": 0.54296875, + "learning_rate": 4.234437753394878e-05, + "loss": 0.3071, + "step": 21764 + }, + { + "epoch": 2.9, + "grad_norm": 0.291015625, + "learning_rate": 4.233486327678976e-05, + "loss": 0.098, + "step": 21765 + }, + { + "epoch": 2.9, + "grad_norm": 0.59375, + "learning_rate": 4.232534980160697e-05, + "loss": 0.2089, + "step": 21766 + }, + { + "epoch": 2.9, + "grad_norm": 0.625, + "learning_rate": 4.231583710852946e-05, + "loss": 0.339, + "step": 21767 + }, + { + "epoch": 2.9, + "grad_norm": 0.5546875, + "learning_rate": 4.2306325197686214e-05, + "loss": 0.4259, + "step": 21768 + }, + { + "epoch": 2.9, + "grad_norm": 0.447265625, + "learning_rate": 4.229681406920626e-05, + "loss": 0.3132, + "step": 21769 + }, + { + "epoch": 2.9, + "grad_norm": 0.56640625, + "learning_rate": 4.228730372321851e-05, + "loss": 0.354, + "step": 21770 + }, + { + "epoch": 2.91, + "grad_norm": 0.62109375, + "learning_rate": 4.227779415985196e-05, + "loss": 0.2306, + "step": 21771 + }, + { + "epoch": 2.91, + "grad_norm": 0.64453125, + "learning_rate": 4.226828537923556e-05, + "loss": 0.2766, + "step": 21772 + }, + { + "epoch": 2.91, + "grad_norm": 0.494140625, + "learning_rate": 4.225877738149827e-05, + "loss": 0.2186, + "step": 21773 + }, + { + "epoch": 2.91, + "grad_norm": 0.640625, + "learning_rate": 4.224927016676905e-05, + "loss": 0.6717, + "step": 21774 + }, + { + "epoch": 2.91, + "grad_norm": 0.482421875, + "learning_rate": 4.2239763735176785e-05, + "loss": 0.351, + "step": 21775 + }, + { + "epoch": 2.91, + "grad_norm": 0.478515625, + "learning_rate": 4.2230258086850374e-05, + "loss": 0.3151, + "step": 21776 + }, + { + "epoch": 2.91, + "grad_norm": 0.65625, + "learning_rate": 4.222075322191872e-05, + "loss": 0.281, + "step": 21777 + }, + { + "epoch": 2.91, + "grad_norm": 0.546875, + "learning_rate": 4.2211249140510746e-05, + "loss": 0.3503, + "step": 21778 + }, + { + "epoch": 2.91, + "grad_norm": 0.484375, + "learning_rate": 4.220174584275536e-05, + "loss": 0.2187, + "step": 21779 + }, + { + "epoch": 2.91, + "grad_norm": 0.65625, + "learning_rate": 4.219224332878136e-05, + "loss": 0.2034, + "step": 21780 + }, + { + "epoch": 2.91, + "grad_norm": 0.69140625, + "learning_rate": 4.218274159871763e-05, + "loss": 0.4858, + "step": 21781 + }, + { + "epoch": 2.91, + "grad_norm": 0.75, + "learning_rate": 4.2173240652693036e-05, + "loss": 0.3689, + "step": 21782 + }, + { + "epoch": 2.91, + "grad_norm": 0.5546875, + "learning_rate": 4.216374049083646e-05, + "loss": 0.4359, + "step": 21783 + }, + { + "epoch": 2.91, + "grad_norm": 0.70703125, + "learning_rate": 4.2154241113276626e-05, + "loss": 0.5005, + "step": 21784 + }, + { + "epoch": 2.91, + "grad_norm": 0.55078125, + "learning_rate": 4.214474252014242e-05, + "loss": 0.3797, + "step": 21785 + }, + { + "epoch": 2.91, + "grad_norm": 0.73046875, + "learning_rate": 4.213524471156263e-05, + "loss": 0.5251, + "step": 21786 + }, + { + "epoch": 2.91, + "grad_norm": 0.53515625, + "learning_rate": 4.212574768766611e-05, + "loss": 0.4037, + "step": 21787 + }, + { + "epoch": 2.91, + "grad_norm": 0.65234375, + "learning_rate": 4.2116251448581576e-05, + "loss": 0.4291, + "step": 21788 + }, + { + "epoch": 2.91, + "grad_norm": 0.54296875, + "learning_rate": 4.210675599443779e-05, + "loss": 0.3436, + "step": 21789 + }, + { + "epoch": 2.91, + "grad_norm": 0.51953125, + "learning_rate": 4.209726132536356e-05, + "loss": 0.2925, + "step": 21790 + }, + { + "epoch": 2.91, + "grad_norm": 0.45703125, + "learning_rate": 4.2087767441487616e-05, + "loss": 0.2667, + "step": 21791 + }, + { + "epoch": 2.91, + "grad_norm": 0.64453125, + "learning_rate": 4.207827434293875e-05, + "loss": 0.2617, + "step": 21792 + }, + { + "epoch": 2.91, + "grad_norm": 0.59375, + "learning_rate": 4.206878202984562e-05, + "loss": 0.3759, + "step": 21793 + }, + { + "epoch": 2.91, + "grad_norm": 0.46484375, + "learning_rate": 4.2059290502337e-05, + "loss": 0.1909, + "step": 21794 + }, + { + "epoch": 2.91, + "grad_norm": 0.55078125, + "learning_rate": 4.2049799760541575e-05, + "loss": 0.3163, + "step": 21795 + }, + { + "epoch": 2.91, + "grad_norm": 0.5234375, + "learning_rate": 4.204030980458806e-05, + "loss": 0.2402, + "step": 21796 + }, + { + "epoch": 2.91, + "grad_norm": 0.54296875, + "learning_rate": 4.203082063460518e-05, + "loss": 0.2065, + "step": 21797 + }, + { + "epoch": 2.91, + "grad_norm": 0.51953125, + "learning_rate": 4.202133225072153e-05, + "loss": 0.306, + "step": 21798 + }, + { + "epoch": 2.91, + "grad_norm": 0.44921875, + "learning_rate": 4.201184465306588e-05, + "loss": 0.1648, + "step": 21799 + }, + { + "epoch": 2.91, + "grad_norm": 0.48046875, + "learning_rate": 4.200235784176678e-05, + "loss": 0.2182, + "step": 21800 + }, + { + "epoch": 2.91, + "grad_norm": 0.59765625, + "learning_rate": 4.199287181695294e-05, + "loss": 0.3105, + "step": 21801 + }, + { + "epoch": 2.91, + "grad_norm": 0.734375, + "learning_rate": 4.1983386578753025e-05, + "loss": 0.2278, + "step": 21802 + }, + { + "epoch": 2.91, + "grad_norm": 0.625, + "learning_rate": 4.197390212729558e-05, + "loss": 0.4539, + "step": 21803 + }, + { + "epoch": 2.91, + "grad_norm": 0.72265625, + "learning_rate": 4.196441846270927e-05, + "loss": 0.2683, + "step": 21804 + }, + { + "epoch": 2.91, + "grad_norm": 0.5625, + "learning_rate": 4.195493558512268e-05, + "loss": 0.2566, + "step": 21805 + }, + { + "epoch": 2.91, + "grad_norm": 0.470703125, + "learning_rate": 4.194545349466447e-05, + "loss": 0.381, + "step": 21806 + }, + { + "epoch": 2.91, + "grad_norm": 0.4765625, + "learning_rate": 4.1935972191463114e-05, + "loss": 0.2537, + "step": 21807 + }, + { + "epoch": 2.91, + "grad_norm": 0.671875, + "learning_rate": 4.192649167564725e-05, + "loss": 0.5364, + "step": 21808 + }, + { + "epoch": 2.91, + "grad_norm": 0.58984375, + "learning_rate": 4.191701194734543e-05, + "loss": 0.3569, + "step": 21809 + }, + { + "epoch": 2.91, + "grad_norm": 0.5078125, + "learning_rate": 4.190753300668625e-05, + "loss": 0.2876, + "step": 21810 + }, + { + "epoch": 2.91, + "grad_norm": 0.421875, + "learning_rate": 4.18980548537982e-05, + "loss": 0.1676, + "step": 21811 + }, + { + "epoch": 2.91, + "grad_norm": 0.57421875, + "learning_rate": 4.188857748880978e-05, + "loss": 0.2506, + "step": 21812 + }, + { + "epoch": 2.91, + "grad_norm": 0.7734375, + "learning_rate": 4.187910091184953e-05, + "loss": 0.5448, + "step": 21813 + }, + { + "epoch": 2.91, + "grad_norm": 0.56640625, + "learning_rate": 4.186962512304599e-05, + "loss": 0.2175, + "step": 21814 + }, + { + "epoch": 2.91, + "grad_norm": 0.8203125, + "learning_rate": 4.186015012252767e-05, + "loss": 0.4015, + "step": 21815 + }, + { + "epoch": 2.91, + "grad_norm": 0.4765625, + "learning_rate": 4.1850675910422996e-05, + "loss": 0.3821, + "step": 21816 + }, + { + "epoch": 2.91, + "grad_norm": 0.59765625, + "learning_rate": 4.184120248686048e-05, + "loss": 0.4511, + "step": 21817 + }, + { + "epoch": 2.91, + "grad_norm": 0.55078125, + "learning_rate": 4.183172985196858e-05, + "loss": 0.3493, + "step": 21818 + }, + { + "epoch": 2.91, + "grad_norm": 0.66796875, + "learning_rate": 4.182225800587576e-05, + "loss": 0.4355, + "step": 21819 + }, + { + "epoch": 2.91, + "grad_norm": 0.69140625, + "learning_rate": 4.181278694871049e-05, + "loss": 0.4327, + "step": 21820 + }, + { + "epoch": 2.91, + "grad_norm": 0.57421875, + "learning_rate": 4.1803316680601135e-05, + "loss": 0.4063, + "step": 21821 + }, + { + "epoch": 2.91, + "grad_norm": 0.55078125, + "learning_rate": 4.179384720167616e-05, + "loss": 0.3804, + "step": 21822 + }, + { + "epoch": 2.91, + "grad_norm": 0.66796875, + "learning_rate": 4.178437851206397e-05, + "loss": 0.5308, + "step": 21823 + }, + { + "epoch": 2.91, + "grad_norm": 0.53125, + "learning_rate": 4.1774910611893016e-05, + "loss": 0.3339, + "step": 21824 + }, + { + "epoch": 2.91, + "grad_norm": 0.80078125, + "learning_rate": 4.176544350129164e-05, + "loss": 0.3259, + "step": 21825 + }, + { + "epoch": 2.91, + "grad_norm": 0.44921875, + "learning_rate": 4.1755977180388184e-05, + "loss": 0.297, + "step": 21826 + }, + { + "epoch": 2.91, + "grad_norm": 0.6171875, + "learning_rate": 4.174651164931105e-05, + "loss": 0.3899, + "step": 21827 + }, + { + "epoch": 2.91, + "grad_norm": 0.439453125, + "learning_rate": 4.1737046908188614e-05, + "loss": 0.243, + "step": 21828 + }, + { + "epoch": 2.91, + "grad_norm": 0.5859375, + "learning_rate": 4.172758295714926e-05, + "loss": 0.2956, + "step": 21829 + }, + { + "epoch": 2.91, + "grad_norm": 0.59765625, + "learning_rate": 4.171811979632123e-05, + "loss": 0.5714, + "step": 21830 + }, + { + "epoch": 2.91, + "grad_norm": 0.5390625, + "learning_rate": 4.170865742583291e-05, + "loss": 0.3085, + "step": 21831 + }, + { + "epoch": 2.91, + "grad_norm": 0.58203125, + "learning_rate": 4.16991958458126e-05, + "loss": 0.3726, + "step": 21832 + }, + { + "epoch": 2.91, + "grad_norm": 0.578125, + "learning_rate": 4.168973505638865e-05, + "loss": 0.3715, + "step": 21833 + }, + { + "epoch": 2.91, + "grad_norm": 0.66015625, + "learning_rate": 4.168027505768928e-05, + "loss": 0.2757, + "step": 21834 + }, + { + "epoch": 2.91, + "grad_norm": 0.5625, + "learning_rate": 4.16708158498428e-05, + "loss": 0.3303, + "step": 21835 + }, + { + "epoch": 2.91, + "grad_norm": 0.57421875, + "learning_rate": 4.166135743297754e-05, + "loss": 0.2188, + "step": 21836 + }, + { + "epoch": 2.91, + "grad_norm": 0.578125, + "learning_rate": 4.165189980722167e-05, + "loss": 0.3311, + "step": 21837 + }, + { + "epoch": 2.91, + "grad_norm": 0.51953125, + "learning_rate": 4.1642442972703534e-05, + "loss": 0.2471, + "step": 21838 + }, + { + "epoch": 2.91, + "grad_norm": 0.6484375, + "learning_rate": 4.1632986929551284e-05, + "loss": 0.4126, + "step": 21839 + }, + { + "epoch": 2.91, + "grad_norm": 0.5234375, + "learning_rate": 4.162353167789319e-05, + "loss": 0.15, + "step": 21840 + }, + { + "epoch": 2.91, + "grad_norm": 0.447265625, + "learning_rate": 4.161407721785747e-05, + "loss": 0.2175, + "step": 21841 + }, + { + "epoch": 2.91, + "grad_norm": 0.3359375, + "learning_rate": 4.160462354957237e-05, + "loss": 0.1455, + "step": 21842 + }, + { + "epoch": 2.91, + "grad_norm": 0.625, + "learning_rate": 4.159517067316603e-05, + "loss": 0.3908, + "step": 21843 + }, + { + "epoch": 2.91, + "grad_norm": 0.64453125, + "learning_rate": 4.1585718588766645e-05, + "loss": 0.4962, + "step": 21844 + }, + { + "epoch": 2.91, + "grad_norm": 0.8046875, + "learning_rate": 4.157626729650241e-05, + "loss": 0.4909, + "step": 21845 + }, + { + "epoch": 2.92, + "grad_norm": 0.46484375, + "learning_rate": 4.156681679650148e-05, + "loss": 0.4421, + "step": 21846 + }, + { + "epoch": 2.92, + "grad_norm": 0.546875, + "learning_rate": 4.155736708889206e-05, + "loss": 0.2559, + "step": 21847 + }, + { + "epoch": 2.92, + "grad_norm": 0.66015625, + "learning_rate": 4.1547918173802255e-05, + "loss": 0.3389, + "step": 21848 + }, + { + "epoch": 2.92, + "grad_norm": 0.5859375, + "learning_rate": 4.1538470051360155e-05, + "loss": 0.276, + "step": 21849 + }, + { + "epoch": 2.92, + "grad_norm": 0.4140625, + "learning_rate": 4.152902272169391e-05, + "loss": 0.1511, + "step": 21850 + }, + { + "epoch": 2.92, + "grad_norm": 0.62890625, + "learning_rate": 4.151957618493166e-05, + "loss": 0.5838, + "step": 21851 + }, + { + "epoch": 2.92, + "grad_norm": 0.3984375, + "learning_rate": 4.151013044120151e-05, + "loss": 0.1807, + "step": 21852 + }, + { + "epoch": 2.92, + "grad_norm": 0.62109375, + "learning_rate": 4.150068549063151e-05, + "loss": 0.3118, + "step": 21853 + }, + { + "epoch": 2.92, + "grad_norm": 0.64453125, + "learning_rate": 4.149124133334975e-05, + "loss": 0.279, + "step": 21854 + }, + { + "epoch": 2.92, + "grad_norm": 0.6953125, + "learning_rate": 4.1481797969484304e-05, + "loss": 0.2534, + "step": 21855 + }, + { + "epoch": 2.92, + "grad_norm": 0.671875, + "learning_rate": 4.147235539916328e-05, + "loss": 0.4733, + "step": 21856 + }, + { + "epoch": 2.92, + "grad_norm": 0.49609375, + "learning_rate": 4.1462913622514635e-05, + "loss": 0.2403, + "step": 21857 + }, + { + "epoch": 2.92, + "grad_norm": 0.5546875, + "learning_rate": 4.1453472639666457e-05, + "loss": 0.3113, + "step": 21858 + }, + { + "epoch": 2.92, + "grad_norm": 0.859375, + "learning_rate": 4.144403245074676e-05, + "loss": 0.3783, + "step": 21859 + }, + { + "epoch": 2.92, + "grad_norm": 0.71875, + "learning_rate": 4.143459305588361e-05, + "loss": 0.2856, + "step": 21860 + }, + { + "epoch": 2.92, + "grad_norm": 0.671875, + "learning_rate": 4.142515445520495e-05, + "loss": 0.2921, + "step": 21861 + }, + { + "epoch": 2.92, + "grad_norm": 0.59375, + "learning_rate": 4.1415716648838765e-05, + "loss": 0.5975, + "step": 21862 + }, + { + "epoch": 2.92, + "grad_norm": 0.6796875, + "learning_rate": 4.140627963691305e-05, + "loss": 0.3708, + "step": 21863 + }, + { + "epoch": 2.92, + "grad_norm": 0.72265625, + "learning_rate": 4.1396843419555796e-05, + "loss": 0.4277, + "step": 21864 + }, + { + "epoch": 2.92, + "grad_norm": 0.6796875, + "learning_rate": 4.138740799689499e-05, + "loss": 0.2752, + "step": 21865 + }, + { + "epoch": 2.92, + "grad_norm": 0.404296875, + "learning_rate": 4.137797336905851e-05, + "loss": 0.2504, + "step": 21866 + }, + { + "epoch": 2.92, + "grad_norm": 0.5859375, + "learning_rate": 4.1368539536174345e-05, + "loss": 0.327, + "step": 21867 + }, + { + "epoch": 2.92, + "grad_norm": 0.60546875, + "learning_rate": 4.1359106498370406e-05, + "loss": 0.6177, + "step": 21868 + }, + { + "epoch": 2.92, + "grad_norm": 0.546875, + "learning_rate": 4.134967425577462e-05, + "loss": 0.2892, + "step": 21869 + }, + { + "epoch": 2.92, + "grad_norm": 0.5234375, + "learning_rate": 4.1340242808514926e-05, + "loss": 0.4013, + "step": 21870 + }, + { + "epoch": 2.92, + "grad_norm": 0.67578125, + "learning_rate": 4.133081215671915e-05, + "loss": 0.5539, + "step": 21871 + }, + { + "epoch": 2.92, + "grad_norm": 0.7421875, + "learning_rate": 4.132138230051525e-05, + "loss": 0.3748, + "step": 21872 + }, + { + "epoch": 2.92, + "grad_norm": 0.68359375, + "learning_rate": 4.131195324003103e-05, + "loss": 0.4463, + "step": 21873 + }, + { + "epoch": 2.92, + "grad_norm": 0.70703125, + "learning_rate": 4.13025249753944e-05, + "loss": 0.5747, + "step": 21874 + }, + { + "epoch": 2.92, + "grad_norm": 0.734375, + "learning_rate": 4.129309750673324e-05, + "loss": 0.4587, + "step": 21875 + }, + { + "epoch": 2.92, + "grad_norm": 0.61328125, + "learning_rate": 4.12836708341753e-05, + "loss": 0.4211, + "step": 21876 + }, + { + "epoch": 2.92, + "grad_norm": 0.75, + "learning_rate": 4.1274244957848485e-05, + "loss": 0.2497, + "step": 21877 + }, + { + "epoch": 2.92, + "grad_norm": 0.65234375, + "learning_rate": 4.126481987788059e-05, + "loss": 0.4259, + "step": 21878 + }, + { + "epoch": 2.92, + "grad_norm": 0.7421875, + "learning_rate": 4.125539559439947e-05, + "loss": 0.6392, + "step": 21879 + }, + { + "epoch": 2.92, + "grad_norm": 0.62890625, + "learning_rate": 4.1245972107532846e-05, + "loss": 0.2856, + "step": 21880 + }, + { + "epoch": 2.92, + "grad_norm": 0.54296875, + "learning_rate": 4.1236549417408564e-05, + "loss": 0.1746, + "step": 21881 + }, + { + "epoch": 2.92, + "grad_norm": 0.5625, + "learning_rate": 4.122712752415437e-05, + "loss": 0.392, + "step": 21882 + }, + { + "epoch": 2.92, + "grad_norm": 0.66015625, + "learning_rate": 4.1217706427898094e-05, + "loss": 0.3096, + "step": 21883 + }, + { + "epoch": 2.92, + "grad_norm": 0.63671875, + "learning_rate": 4.120828612876745e-05, + "loss": 0.4789, + "step": 21884 + }, + { + "epoch": 2.92, + "grad_norm": 0.57421875, + "learning_rate": 4.119886662689013e-05, + "loss": 0.434, + "step": 21885 + }, + { + "epoch": 2.92, + "grad_norm": 0.8203125, + "learning_rate": 4.118944792239392e-05, + "loss": 0.4723, + "step": 21886 + }, + { + "epoch": 2.92, + "grad_norm": 0.5234375, + "learning_rate": 4.1180030015406544e-05, + "loss": 0.1503, + "step": 21887 + }, + { + "epoch": 2.92, + "grad_norm": 0.5546875, + "learning_rate": 4.1170612906055736e-05, + "loss": 0.3189, + "step": 21888 + }, + { + "epoch": 2.92, + "grad_norm": 0.59765625, + "learning_rate": 4.116119659446915e-05, + "loss": 0.386, + "step": 21889 + }, + { + "epoch": 2.92, + "grad_norm": 0.52734375, + "learning_rate": 4.11517810807745e-05, + "loss": 0.3467, + "step": 21890 + }, + { + "epoch": 2.92, + "grad_norm": 0.65625, + "learning_rate": 4.114236636509946e-05, + "loss": 0.4448, + "step": 21891 + }, + { + "epoch": 2.92, + "grad_norm": 0.5234375, + "learning_rate": 4.113295244757172e-05, + "loss": 0.408, + "step": 21892 + }, + { + "epoch": 2.92, + "grad_norm": 0.6875, + "learning_rate": 4.112353932831895e-05, + "loss": 0.4916, + "step": 21893 + }, + { + "epoch": 2.92, + "grad_norm": 0.66015625, + "learning_rate": 4.111412700746874e-05, + "loss": 0.3141, + "step": 21894 + }, + { + "epoch": 2.92, + "grad_norm": 0.61328125, + "learning_rate": 4.110471548514876e-05, + "loss": 0.4827, + "step": 21895 + }, + { + "epoch": 2.92, + "grad_norm": 0.40234375, + "learning_rate": 4.1095304761486676e-05, + "loss": 0.2414, + "step": 21896 + }, + { + "epoch": 2.92, + "grad_norm": 0.6875, + "learning_rate": 4.108589483661002e-05, + "loss": 0.4695, + "step": 21897 + }, + { + "epoch": 2.92, + "grad_norm": 0.7109375, + "learning_rate": 4.1076485710646485e-05, + "loss": 0.4017, + "step": 21898 + }, + { + "epoch": 2.92, + "grad_norm": 0.4140625, + "learning_rate": 4.106707738372357e-05, + "loss": 0.2046, + "step": 21899 + }, + { + "epoch": 2.92, + "grad_norm": 0.6953125, + "learning_rate": 4.105766985596893e-05, + "loss": 0.4698, + "step": 21900 + }, + { + "epoch": 2.92, + "grad_norm": 0.5859375, + "learning_rate": 4.10482631275101e-05, + "loss": 0.2636, + "step": 21901 + }, + { + "epoch": 2.92, + "grad_norm": 0.62109375, + "learning_rate": 4.10388571984747e-05, + "loss": 0.7397, + "step": 21902 + }, + { + "epoch": 2.92, + "grad_norm": 0.76953125, + "learning_rate": 4.10294520689902e-05, + "loss": 0.4108, + "step": 21903 + }, + { + "epoch": 2.92, + "grad_norm": 0.439453125, + "learning_rate": 4.102004773918418e-05, + "loss": 0.1342, + "step": 21904 + }, + { + "epoch": 2.92, + "grad_norm": 0.5546875, + "learning_rate": 4.101064420918417e-05, + "loss": 0.3307, + "step": 21905 + }, + { + "epoch": 2.92, + "grad_norm": 0.68359375, + "learning_rate": 4.1001241479117704e-05, + "loss": 0.3937, + "step": 21906 + }, + { + "epoch": 2.92, + "grad_norm": 0.55078125, + "learning_rate": 4.0991839549112245e-05, + "loss": 0.1793, + "step": 21907 + }, + { + "epoch": 2.92, + "grad_norm": 0.52734375, + "learning_rate": 4.098243841929531e-05, + "loss": 0.3432, + "step": 21908 + }, + { + "epoch": 2.92, + "grad_norm": 1.1796875, + "learning_rate": 4.0973038089794416e-05, + "loss": 0.3418, + "step": 21909 + }, + { + "epoch": 2.92, + "grad_norm": 0.5390625, + "learning_rate": 4.096363856073698e-05, + "loss": 0.3078, + "step": 21910 + }, + { + "epoch": 2.92, + "grad_norm": 0.71875, + "learning_rate": 4.095423983225053e-05, + "loss": 0.5675, + "step": 21911 + }, + { + "epoch": 2.92, + "grad_norm": 0.703125, + "learning_rate": 4.094484190446245e-05, + "loss": 0.5194, + "step": 21912 + }, + { + "epoch": 2.92, + "grad_norm": 0.53515625, + "learning_rate": 4.093544477750021e-05, + "loss": 0.2224, + "step": 21913 + }, + { + "epoch": 2.92, + "grad_norm": 0.6796875, + "learning_rate": 4.092604845149125e-05, + "loss": 0.4005, + "step": 21914 + }, + { + "epoch": 2.92, + "grad_norm": 0.640625, + "learning_rate": 4.0916652926563015e-05, + "loss": 0.2996, + "step": 21915 + }, + { + "epoch": 2.92, + "grad_norm": 0.60546875, + "learning_rate": 4.0907258202842855e-05, + "loss": 0.5415, + "step": 21916 + }, + { + "epoch": 2.92, + "grad_norm": 0.66796875, + "learning_rate": 4.08978642804582e-05, + "loss": 0.5912, + "step": 21917 + }, + { + "epoch": 2.92, + "grad_norm": 0.60546875, + "learning_rate": 4.0888471159536434e-05, + "loss": 0.3724, + "step": 21918 + }, + { + "epoch": 2.92, + "grad_norm": 0.63671875, + "learning_rate": 4.087907884020494e-05, + "loss": 0.2443, + "step": 21919 + }, + { + "epoch": 2.93, + "grad_norm": 0.58984375, + "learning_rate": 4.086968732259112e-05, + "loss": 0.4021, + "step": 21920 + }, + { + "epoch": 2.93, + "grad_norm": 0.73828125, + "learning_rate": 4.086029660682228e-05, + "loss": 0.3367, + "step": 21921 + }, + { + "epoch": 2.93, + "grad_norm": 0.5703125, + "learning_rate": 4.085090669302575e-05, + "loss": 0.4, + "step": 21922 + }, + { + "epoch": 2.93, + "grad_norm": 0.5234375, + "learning_rate": 4.084151758132887e-05, + "loss": 0.2838, + "step": 21923 + }, + { + "epoch": 2.93, + "grad_norm": 0.60546875, + "learning_rate": 4.083212927185899e-05, + "loss": 0.2169, + "step": 21924 + }, + { + "epoch": 2.93, + "grad_norm": 0.490234375, + "learning_rate": 4.082274176474344e-05, + "loss": 0.437, + "step": 21925 + }, + { + "epoch": 2.93, + "grad_norm": 0.640625, + "learning_rate": 4.081335506010947e-05, + "loss": 0.4264, + "step": 21926 + }, + { + "epoch": 2.93, + "grad_norm": 0.59765625, + "learning_rate": 4.0803969158084374e-05, + "loss": 0.2678, + "step": 21927 + }, + { + "epoch": 2.93, + "grad_norm": 0.6953125, + "learning_rate": 4.079458405879546e-05, + "loss": 0.3178, + "step": 21928 + }, + { + "epoch": 2.93, + "grad_norm": 0.62109375, + "learning_rate": 4.078519976237002e-05, + "loss": 0.3063, + "step": 21929 + }, + { + "epoch": 2.93, + "grad_norm": 0.5625, + "learning_rate": 4.077581626893523e-05, + "loss": 0.3797, + "step": 21930 + }, + { + "epoch": 2.93, + "grad_norm": 0.56640625, + "learning_rate": 4.076643357861839e-05, + "loss": 0.298, + "step": 21931 + }, + { + "epoch": 2.93, + "grad_norm": 0.61328125, + "learning_rate": 4.075705169154672e-05, + "loss": 0.26, + "step": 21932 + }, + { + "epoch": 2.93, + "grad_norm": 0.61328125, + "learning_rate": 4.0747670607847486e-05, + "loss": 0.2611, + "step": 21933 + }, + { + "epoch": 2.93, + "grad_norm": 0.61328125, + "learning_rate": 4.073829032764788e-05, + "loss": 0.3084, + "step": 21934 + }, + { + "epoch": 2.93, + "grad_norm": 0.5390625, + "learning_rate": 4.072891085107504e-05, + "loss": 0.2395, + "step": 21935 + }, + { + "epoch": 2.93, + "grad_norm": 0.48046875, + "learning_rate": 4.0719532178256216e-05, + "loss": 0.3075, + "step": 21936 + }, + { + "epoch": 2.93, + "grad_norm": 0.73828125, + "learning_rate": 4.071015430931857e-05, + "loss": 0.5453, + "step": 21937 + }, + { + "epoch": 2.93, + "grad_norm": 0.625, + "learning_rate": 4.070077724438933e-05, + "loss": 0.3447, + "step": 21938 + }, + { + "epoch": 2.93, + "grad_norm": 0.486328125, + "learning_rate": 4.069140098359557e-05, + "loss": 0.2626, + "step": 21939 + }, + { + "epoch": 2.93, + "grad_norm": 0.302734375, + "learning_rate": 4.0682025527064486e-05, + "loss": 0.1049, + "step": 21940 + }, + { + "epoch": 2.93, + "grad_norm": 0.77734375, + "learning_rate": 4.067265087492319e-05, + "loss": 0.5226, + "step": 21941 + }, + { + "epoch": 2.93, + "grad_norm": 0.66015625, + "learning_rate": 4.066327702729884e-05, + "loss": 0.4677, + "step": 21942 + }, + { + "epoch": 2.93, + "grad_norm": 0.9453125, + "learning_rate": 4.0653903984318565e-05, + "loss": 0.3455, + "step": 21943 + }, + { + "epoch": 2.93, + "grad_norm": 0.62890625, + "learning_rate": 4.0644531746109404e-05, + "loss": 0.348, + "step": 21944 + }, + { + "epoch": 2.93, + "grad_norm": 0.498046875, + "learning_rate": 4.063516031279852e-05, + "loss": 0.3631, + "step": 21945 + }, + { + "epoch": 2.93, + "grad_norm": 0.60546875, + "learning_rate": 4.062578968451293e-05, + "loss": 0.4269, + "step": 21946 + }, + { + "epoch": 2.93, + "grad_norm": 0.5078125, + "learning_rate": 4.061641986137973e-05, + "loss": 0.27, + "step": 21947 + }, + { + "epoch": 2.93, + "grad_norm": 0.55078125, + "learning_rate": 4.060705084352603e-05, + "loss": 0.2238, + "step": 21948 + }, + { + "epoch": 2.93, + "grad_norm": 0.6015625, + "learning_rate": 4.0597682631078804e-05, + "loss": 0.3065, + "step": 21949 + }, + { + "epoch": 2.93, + "grad_norm": 0.376953125, + "learning_rate": 4.058831522416512e-05, + "loss": 0.255, + "step": 21950 + }, + { + "epoch": 2.93, + "grad_norm": 0.671875, + "learning_rate": 4.057894862291202e-05, + "loss": 0.3375, + "step": 21951 + }, + { + "epoch": 2.93, + "grad_norm": 0.65625, + "learning_rate": 4.056958282744654e-05, + "loss": 0.5733, + "step": 21952 + }, + { + "epoch": 2.93, + "grad_norm": 0.5859375, + "learning_rate": 4.056021783789563e-05, + "loss": 0.4247, + "step": 21953 + }, + { + "epoch": 2.93, + "grad_norm": 0.58984375, + "learning_rate": 4.055085365438631e-05, + "loss": 0.322, + "step": 21954 + }, + { + "epoch": 2.93, + "grad_norm": 0.59375, + "learning_rate": 4.054149027704558e-05, + "loss": 0.3174, + "step": 21955 + }, + { + "epoch": 2.93, + "grad_norm": 0.8671875, + "learning_rate": 4.053212770600042e-05, + "loss": 0.3826, + "step": 21956 + }, + { + "epoch": 2.93, + "grad_norm": 0.5078125, + "learning_rate": 4.0522765941377796e-05, + "loss": 0.2912, + "step": 21957 + }, + { + "epoch": 2.93, + "grad_norm": 0.56640625, + "learning_rate": 4.051340498330458e-05, + "loss": 0.3796, + "step": 21958 + }, + { + "epoch": 2.93, + "grad_norm": 0.640625, + "learning_rate": 4.050404483190778e-05, + "loss": 0.3994, + "step": 21959 + }, + { + "epoch": 2.93, + "grad_norm": 0.7265625, + "learning_rate": 4.049468548731432e-05, + "loss": 0.6702, + "step": 21960 + }, + { + "epoch": 2.93, + "grad_norm": 0.6875, + "learning_rate": 4.0485326949651146e-05, + "loss": 0.5372, + "step": 21961 + }, + { + "epoch": 2.93, + "grad_norm": 0.484375, + "learning_rate": 4.0475969219045104e-05, + "loss": 0.1831, + "step": 21962 + }, + { + "epoch": 2.93, + "grad_norm": 0.57421875, + "learning_rate": 4.046661229562312e-05, + "loss": 0.2296, + "step": 21963 + }, + { + "epoch": 2.93, + "grad_norm": 0.640625, + "learning_rate": 4.045725617951208e-05, + "loss": 0.5257, + "step": 21964 + }, + { + "epoch": 2.93, + "grad_norm": 0.5234375, + "learning_rate": 4.044790087083886e-05, + "loss": 0.3447, + "step": 21965 + }, + { + "epoch": 2.93, + "grad_norm": 0.66796875, + "learning_rate": 4.043854636973036e-05, + "loss": 0.2782, + "step": 21966 + }, + { + "epoch": 2.93, + "grad_norm": 0.53515625, + "learning_rate": 4.042919267631336e-05, + "loss": 0.2582, + "step": 21967 + }, + { + "epoch": 2.93, + "grad_norm": 0.5, + "learning_rate": 4.0419839790714745e-05, + "loss": 0.2058, + "step": 21968 + }, + { + "epoch": 2.93, + "grad_norm": 0.8125, + "learning_rate": 4.041048771306137e-05, + "loss": 0.3068, + "step": 21969 + }, + { + "epoch": 2.93, + "grad_norm": 0.6875, + "learning_rate": 4.040113644347999e-05, + "loss": 0.5251, + "step": 21970 + }, + { + "epoch": 2.93, + "grad_norm": 0.53515625, + "learning_rate": 4.0391785982097495e-05, + "loss": 0.3851, + "step": 21971 + }, + { + "epoch": 2.93, + "grad_norm": 0.8359375, + "learning_rate": 4.0382436329040594e-05, + "loss": 0.6842, + "step": 21972 + }, + { + "epoch": 2.93, + "grad_norm": 0.734375, + "learning_rate": 4.0373087484436126e-05, + "loss": 0.4243, + "step": 21973 + }, + { + "epoch": 2.93, + "grad_norm": 0.384765625, + "learning_rate": 4.0363739448410854e-05, + "loss": 0.2228, + "step": 21974 + }, + { + "epoch": 2.93, + "grad_norm": 0.62109375, + "learning_rate": 4.0354392221091584e-05, + "loss": 0.3008, + "step": 21975 + }, + { + "epoch": 2.93, + "grad_norm": 0.546875, + "learning_rate": 4.034504580260501e-05, + "loss": 0.3068, + "step": 21976 + }, + { + "epoch": 2.93, + "grad_norm": 0.4140625, + "learning_rate": 4.0335700193077895e-05, + "loss": 0.2269, + "step": 21977 + }, + { + "epoch": 2.93, + "grad_norm": 0.58203125, + "learning_rate": 4.032635539263697e-05, + "loss": 0.539, + "step": 21978 + }, + { + "epoch": 2.93, + "grad_norm": 0.4453125, + "learning_rate": 4.031701140140901e-05, + "loss": 0.2077, + "step": 21979 + }, + { + "epoch": 2.93, + "grad_norm": 0.474609375, + "learning_rate": 4.030766821952064e-05, + "loss": 0.2985, + "step": 21980 + }, + { + "epoch": 2.93, + "grad_norm": 0.57421875, + "learning_rate": 4.029832584709864e-05, + "loss": 0.2413, + "step": 21981 + }, + { + "epoch": 2.93, + "grad_norm": 0.60546875, + "learning_rate": 4.028898428426961e-05, + "loss": 0.2595, + "step": 21982 + }, + { + "epoch": 2.93, + "grad_norm": 0.53125, + "learning_rate": 4.027964353116029e-05, + "loss": 0.2472, + "step": 21983 + }, + { + "epoch": 2.93, + "grad_norm": 0.47265625, + "learning_rate": 4.027030358789736e-05, + "loss": 0.3167, + "step": 21984 + }, + { + "epoch": 2.93, + "grad_norm": 0.470703125, + "learning_rate": 4.026096445460741e-05, + "loss": 0.4965, + "step": 21985 + }, + { + "epoch": 2.93, + "grad_norm": 0.4375, + "learning_rate": 4.025162613141713e-05, + "loss": 0.2909, + "step": 21986 + }, + { + "epoch": 2.93, + "grad_norm": 0.59765625, + "learning_rate": 4.024228861845314e-05, + "loss": 0.2449, + "step": 21987 + }, + { + "epoch": 2.93, + "grad_norm": 0.439453125, + "learning_rate": 4.02329519158421e-05, + "loss": 0.1233, + "step": 21988 + }, + { + "epoch": 2.93, + "grad_norm": 0.67578125, + "learning_rate": 4.022361602371056e-05, + "loss": 0.4648, + "step": 21989 + }, + { + "epoch": 2.93, + "grad_norm": 0.5390625, + "learning_rate": 4.0214280942185157e-05, + "loss": 0.4299, + "step": 21990 + }, + { + "epoch": 2.93, + "grad_norm": 0.50390625, + "learning_rate": 4.0204946671392464e-05, + "loss": 0.2701, + "step": 21991 + }, + { + "epoch": 2.93, + "grad_norm": 0.6640625, + "learning_rate": 4.019561321145907e-05, + "loss": 0.4094, + "step": 21992 + }, + { + "epoch": 2.93, + "grad_norm": 0.546875, + "learning_rate": 4.018628056251159e-05, + "loss": 0.3605, + "step": 21993 + }, + { + "epoch": 2.93, + "grad_norm": 0.58203125, + "learning_rate": 4.017694872467652e-05, + "loss": 0.2922, + "step": 21994 + }, + { + "epoch": 2.94, + "grad_norm": 0.51171875, + "learning_rate": 4.016761769808039e-05, + "loss": 0.2579, + "step": 21995 + }, + { + "epoch": 2.94, + "grad_norm": 0.76171875, + "learning_rate": 4.015828748284978e-05, + "loss": 0.5139, + "step": 21996 + }, + { + "epoch": 2.94, + "grad_norm": 0.447265625, + "learning_rate": 4.014895807911118e-05, + "loss": 0.1902, + "step": 21997 + }, + { + "epoch": 2.94, + "grad_norm": 0.51953125, + "learning_rate": 4.013962948699116e-05, + "loss": 0.1835, + "step": 21998 + }, + { + "epoch": 2.94, + "grad_norm": 0.50390625, + "learning_rate": 4.0130301706616135e-05, + "loss": 0.4299, + "step": 21999 + }, + { + "epoch": 2.94, + "grad_norm": 0.671875, + "learning_rate": 4.012097473811266e-05, + "loss": 0.4612, + "step": 22000 + }, + { + "epoch": 2.94, + "grad_norm": 0.80078125, + "learning_rate": 4.0111648581607186e-05, + "loss": 0.3118, + "step": 22001 + }, + { + "epoch": 2.94, + "grad_norm": 0.76953125, + "learning_rate": 4.0102323237226234e-05, + "loss": 0.5392, + "step": 22002 + }, + { + "epoch": 2.94, + "grad_norm": 0.55859375, + "learning_rate": 4.0092998705096184e-05, + "loss": 0.298, + "step": 22003 + }, + { + "epoch": 2.94, + "grad_norm": 0.69140625, + "learning_rate": 4.0083674985343534e-05, + "loss": 0.4694, + "step": 22004 + }, + { + "epoch": 2.94, + "grad_norm": 0.5703125, + "learning_rate": 4.007435207809469e-05, + "loss": 0.2606, + "step": 22005 + }, + { + "epoch": 2.94, + "grad_norm": 0.71484375, + "learning_rate": 4.006502998347612e-05, + "loss": 0.4511, + "step": 22006 + }, + { + "epoch": 2.94, + "grad_norm": 0.56640625, + "learning_rate": 4.005570870161423e-05, + "loss": 0.2096, + "step": 22007 + }, + { + "epoch": 2.94, + "grad_norm": 0.435546875, + "learning_rate": 4.004638823263536e-05, + "loss": 0.2461, + "step": 22008 + }, + { + "epoch": 2.94, + "grad_norm": 0.388671875, + "learning_rate": 4.003706857666593e-05, + "loss": 0.1831, + "step": 22009 + }, + { + "epoch": 2.94, + "grad_norm": 0.64453125, + "learning_rate": 4.0027749733832344e-05, + "loss": 0.3709, + "step": 22010 + }, + { + "epoch": 2.94, + "grad_norm": 0.451171875, + "learning_rate": 4.001843170426099e-05, + "loss": 0.2483, + "step": 22011 + }, + { + "epoch": 2.94, + "grad_norm": 0.8203125, + "learning_rate": 4.000911448807818e-05, + "loss": 0.3293, + "step": 22012 + }, + { + "epoch": 2.94, + "grad_norm": 0.65625, + "learning_rate": 3.9999798085410266e-05, + "loss": 0.613, + "step": 22013 + }, + { + "epoch": 2.94, + "grad_norm": 0.4765625, + "learning_rate": 3.9990482496383594e-05, + "loss": 0.2301, + "step": 22014 + }, + { + "epoch": 2.94, + "grad_norm": 0.400390625, + "learning_rate": 3.99811677211245e-05, + "loss": 0.3359, + "step": 22015 + }, + { + "epoch": 2.94, + "grad_norm": 0.640625, + "learning_rate": 3.9971853759759335e-05, + "loss": 0.3738, + "step": 22016 + }, + { + "epoch": 2.94, + "grad_norm": 0.57421875, + "learning_rate": 3.996254061241431e-05, + "loss": 0.3036, + "step": 22017 + }, + { + "epoch": 2.94, + "grad_norm": 0.474609375, + "learning_rate": 3.9953228279215804e-05, + "loss": 0.1727, + "step": 22018 + }, + { + "epoch": 2.94, + "grad_norm": 0.4765625, + "learning_rate": 3.994391676029004e-05, + "loss": 0.1536, + "step": 22019 + }, + { + "epoch": 2.94, + "grad_norm": 0.419921875, + "learning_rate": 3.993460605576329e-05, + "loss": 0.2024, + "step": 22020 + }, + { + "epoch": 2.94, + "grad_norm": 0.703125, + "learning_rate": 3.992529616576187e-05, + "loss": 0.6076, + "step": 22021 + }, + { + "epoch": 2.94, + "grad_norm": 0.65625, + "learning_rate": 3.991598709041196e-05, + "loss": 0.4005, + "step": 22022 + }, + { + "epoch": 2.94, + "grad_norm": 0.494140625, + "learning_rate": 3.990667882983981e-05, + "loss": 0.4422, + "step": 22023 + }, + { + "epoch": 2.94, + "grad_norm": 0.400390625, + "learning_rate": 3.989737138417169e-05, + "loss": 0.2918, + "step": 22024 + }, + { + "epoch": 2.94, + "grad_norm": 0.56640625, + "learning_rate": 3.98880647535338e-05, + "loss": 0.33, + "step": 22025 + }, + { + "epoch": 2.94, + "grad_norm": 0.578125, + "learning_rate": 3.987875893805231e-05, + "loss": 0.3996, + "step": 22026 + }, + { + "epoch": 2.94, + "grad_norm": 0.5859375, + "learning_rate": 3.986945393785343e-05, + "loss": 0.1988, + "step": 22027 + }, + { + "epoch": 2.94, + "grad_norm": 0.62890625, + "learning_rate": 3.9860149753063345e-05, + "loss": 0.4386, + "step": 22028 + }, + { + "epoch": 2.94, + "grad_norm": 0.6875, + "learning_rate": 3.985084638380826e-05, + "loss": 0.3362, + "step": 22029 + }, + { + "epoch": 2.94, + "grad_norm": 0.48828125, + "learning_rate": 3.9841543830214314e-05, + "loss": 0.3959, + "step": 22030 + }, + { + "epoch": 2.94, + "grad_norm": 0.71484375, + "learning_rate": 3.983224209240759e-05, + "loss": 0.4322, + "step": 22031 + }, + { + "epoch": 2.94, + "grad_norm": 0.62890625, + "learning_rate": 3.982294117051429e-05, + "loss": 0.3901, + "step": 22032 + }, + { + "epoch": 2.94, + "grad_norm": 0.5078125, + "learning_rate": 3.981364106466052e-05, + "loss": 0.208, + "step": 22033 + }, + { + "epoch": 2.94, + "grad_norm": 0.66015625, + "learning_rate": 3.980434177497245e-05, + "loss": 0.4269, + "step": 22034 + }, + { + "epoch": 2.94, + "grad_norm": 0.59765625, + "learning_rate": 3.9795043301576105e-05, + "loss": 0.4108, + "step": 22035 + }, + { + "epoch": 2.94, + "grad_norm": 0.7890625, + "learning_rate": 3.9785745644597606e-05, + "loss": 0.4332, + "step": 22036 + }, + { + "epoch": 2.94, + "grad_norm": 0.51953125, + "learning_rate": 3.977644880416305e-05, + "loss": 0.3669, + "step": 22037 + }, + { + "epoch": 2.94, + "grad_norm": 0.60546875, + "learning_rate": 3.976715278039849e-05, + "loss": 0.4462, + "step": 22038 + }, + { + "epoch": 2.94, + "grad_norm": 0.48046875, + "learning_rate": 3.9757857573430026e-05, + "loss": 0.307, + "step": 22039 + }, + { + "epoch": 2.94, + "grad_norm": 0.482421875, + "learning_rate": 3.974856318338365e-05, + "loss": 0.1757, + "step": 22040 + }, + { + "epoch": 2.94, + "grad_norm": 0.56640625, + "learning_rate": 3.973926961038542e-05, + "loss": 0.4604, + "step": 22041 + }, + { + "epoch": 2.94, + "grad_norm": 0.74609375, + "learning_rate": 3.972997685456141e-05, + "loss": 0.3488, + "step": 22042 + }, + { + "epoch": 2.94, + "grad_norm": 0.66796875, + "learning_rate": 3.9720684916037554e-05, + "loss": 0.4618, + "step": 22043 + }, + { + "epoch": 2.94, + "grad_norm": 0.6640625, + "learning_rate": 3.971139379493994e-05, + "loss": 0.2305, + "step": 22044 + }, + { + "epoch": 2.94, + "grad_norm": 0.4765625, + "learning_rate": 3.9702103491394484e-05, + "loss": 0.1805, + "step": 22045 + }, + { + "epoch": 2.94, + "grad_norm": 0.73828125, + "learning_rate": 3.969281400552719e-05, + "loss": 0.3927, + "step": 22046 + }, + { + "epoch": 2.94, + "grad_norm": 0.796875, + "learning_rate": 3.9683525337464054e-05, + "loss": 0.5309, + "step": 22047 + }, + { + "epoch": 2.94, + "grad_norm": 0.53125, + "learning_rate": 3.967423748733107e-05, + "loss": 0.192, + "step": 22048 + }, + { + "epoch": 2.94, + "grad_norm": 0.578125, + "learning_rate": 3.966495045525409e-05, + "loss": 0.4373, + "step": 22049 + }, + { + "epoch": 2.94, + "grad_norm": 0.75390625, + "learning_rate": 3.965566424135911e-05, + "loss": 0.7455, + "step": 22050 + }, + { + "epoch": 2.94, + "grad_norm": 0.65625, + "learning_rate": 3.964637884577206e-05, + "loss": 0.3181, + "step": 22051 + }, + { + "epoch": 2.94, + "grad_norm": 0.77734375, + "learning_rate": 3.963709426861887e-05, + "loss": 0.5602, + "step": 22052 + }, + { + "epoch": 2.94, + "grad_norm": 0.5, + "learning_rate": 3.96278105100254e-05, + "loss": 0.1635, + "step": 22053 + }, + { + "epoch": 2.94, + "grad_norm": 0.6796875, + "learning_rate": 3.961852757011759e-05, + "loss": 0.6012, + "step": 22054 + }, + { + "epoch": 2.94, + "grad_norm": 0.65625, + "learning_rate": 3.960924544902127e-05, + "loss": 0.3993, + "step": 22055 + }, + { + "epoch": 2.94, + "grad_norm": 0.5625, + "learning_rate": 3.9599964146862335e-05, + "loss": 0.2175, + "step": 22056 + }, + { + "epoch": 2.94, + "grad_norm": 0.56640625, + "learning_rate": 3.959068366376669e-05, + "loss": 0.4125, + "step": 22057 + }, + { + "epoch": 2.94, + "grad_norm": 0.63671875, + "learning_rate": 3.958140399986011e-05, + "loss": 0.2295, + "step": 22058 + }, + { + "epoch": 2.94, + "grad_norm": 0.65234375, + "learning_rate": 3.957212515526846e-05, + "loss": 0.3643, + "step": 22059 + }, + { + "epoch": 2.94, + "grad_norm": 0.69921875, + "learning_rate": 3.9562847130117585e-05, + "loss": 0.6241, + "step": 22060 + }, + { + "epoch": 2.94, + "grad_norm": 0.546875, + "learning_rate": 3.955356992453332e-05, + "loss": 0.4852, + "step": 22061 + }, + { + "epoch": 2.94, + "grad_norm": 0.58203125, + "learning_rate": 3.9544293538641405e-05, + "loss": 0.1667, + "step": 22062 + }, + { + "epoch": 2.94, + "grad_norm": 0.47265625, + "learning_rate": 3.9535017972567675e-05, + "loss": 0.1373, + "step": 22063 + }, + { + "epoch": 2.94, + "grad_norm": 0.5546875, + "learning_rate": 3.952574322643792e-05, + "loss": 0.4113, + "step": 22064 + }, + { + "epoch": 2.94, + "grad_norm": 0.71875, + "learning_rate": 3.9516469300377875e-05, + "loss": 0.447, + "step": 22065 + }, + { + "epoch": 2.94, + "grad_norm": 0.609375, + "learning_rate": 3.9507196194513374e-05, + "loss": 0.3884, + "step": 22066 + }, + { + "epoch": 2.94, + "grad_norm": 0.6484375, + "learning_rate": 3.9497923908970114e-05, + "loss": 0.2191, + "step": 22067 + }, + { + "epoch": 2.94, + "grad_norm": 0.8046875, + "learning_rate": 3.9488652443873795e-05, + "loss": 0.2128, + "step": 22068 + }, + { + "epoch": 2.94, + "grad_norm": 0.7890625, + "learning_rate": 3.947938179935019e-05, + "loss": 0.3437, + "step": 22069 + }, + { + "epoch": 2.95, + "grad_norm": 0.6640625, + "learning_rate": 3.9470111975525016e-05, + "loss": 0.2458, + "step": 22070 + }, + { + "epoch": 2.95, + "grad_norm": 0.41796875, + "learning_rate": 3.946084297252399e-05, + "loss": 0.2771, + "step": 22071 + }, + { + "epoch": 2.95, + "grad_norm": 0.5625, + "learning_rate": 3.9451574790472765e-05, + "loss": 0.4447, + "step": 22072 + }, + { + "epoch": 2.95, + "grad_norm": 0.6171875, + "learning_rate": 3.944230742949704e-05, + "loss": 0.2046, + "step": 22073 + }, + { + "epoch": 2.95, + "grad_norm": 0.462890625, + "learning_rate": 3.9433040889722496e-05, + "loss": 0.1941, + "step": 22074 + }, + { + "epoch": 2.95, + "grad_norm": 0.51953125, + "learning_rate": 3.9423775171274824e-05, + "loss": 0.4108, + "step": 22075 + }, + { + "epoch": 2.95, + "grad_norm": 0.5546875, + "learning_rate": 3.94145102742796e-05, + "loss": 0.2054, + "step": 22076 + }, + { + "epoch": 2.95, + "grad_norm": 0.5234375, + "learning_rate": 3.94052461988625e-05, + "loss": 0.1753, + "step": 22077 + }, + { + "epoch": 2.95, + "grad_norm": 0.6015625, + "learning_rate": 3.9395982945149155e-05, + "loss": 0.4397, + "step": 22078 + }, + { + "epoch": 2.95, + "grad_norm": 0.515625, + "learning_rate": 3.9386720513265206e-05, + "loss": 0.3604, + "step": 22079 + }, + { + "epoch": 2.95, + "grad_norm": 0.4609375, + "learning_rate": 3.937745890333623e-05, + "loss": 0.2312, + "step": 22080 + }, + { + "epoch": 2.95, + "grad_norm": 0.6015625, + "learning_rate": 3.936819811548779e-05, + "loss": 0.5155, + "step": 22081 + }, + { + "epoch": 2.95, + "grad_norm": 0.6015625, + "learning_rate": 3.935893814984549e-05, + "loss": 0.323, + "step": 22082 + }, + { + "epoch": 2.95, + "grad_norm": 0.61328125, + "learning_rate": 3.93496790065349e-05, + "loss": 0.433, + "step": 22083 + }, + { + "epoch": 2.95, + "grad_norm": 0.546875, + "learning_rate": 3.9340420685681636e-05, + "loss": 0.2445, + "step": 22084 + }, + { + "epoch": 2.95, + "grad_norm": 0.640625, + "learning_rate": 3.9331163187411155e-05, + "loss": 0.5826, + "step": 22085 + }, + { + "epoch": 2.95, + "grad_norm": 0.65234375, + "learning_rate": 3.932190651184905e-05, + "loss": 0.2571, + "step": 22086 + }, + { + "epoch": 2.95, + "grad_norm": 0.72265625, + "learning_rate": 3.931265065912083e-05, + "loss": 0.3263, + "step": 22087 + }, + { + "epoch": 2.95, + "grad_norm": 0.59375, + "learning_rate": 3.930339562935201e-05, + "loss": 0.3879, + "step": 22088 + }, + { + "epoch": 2.95, + "grad_norm": 0.78515625, + "learning_rate": 3.929414142266814e-05, + "loss": 0.3477, + "step": 22089 + }, + { + "epoch": 2.95, + "grad_norm": 0.65625, + "learning_rate": 3.928488803919465e-05, + "loss": 0.2881, + "step": 22090 + }, + { + "epoch": 2.95, + "grad_norm": 0.5390625, + "learning_rate": 3.927563547905706e-05, + "loss": 0.188, + "step": 22091 + }, + { + "epoch": 2.95, + "grad_norm": 0.478515625, + "learning_rate": 3.9266383742380794e-05, + "loss": 0.3759, + "step": 22092 + }, + { + "epoch": 2.95, + "grad_norm": 0.5703125, + "learning_rate": 3.925713282929134e-05, + "loss": 0.614, + "step": 22093 + }, + { + "epoch": 2.95, + "grad_norm": 0.68359375, + "learning_rate": 3.92478827399142e-05, + "loss": 0.4469, + "step": 22094 + }, + { + "epoch": 2.95, + "grad_norm": 0.52734375, + "learning_rate": 3.923863347437472e-05, + "loss": 0.3247, + "step": 22095 + }, + { + "epoch": 2.95, + "grad_norm": 0.6328125, + "learning_rate": 3.922938503279836e-05, + "loss": 0.4066, + "step": 22096 + }, + { + "epoch": 2.95, + "grad_norm": 0.484375, + "learning_rate": 3.922013741531054e-05, + "loss": 0.1626, + "step": 22097 + }, + { + "epoch": 2.95, + "grad_norm": 0.703125, + "learning_rate": 3.921089062203671e-05, + "loss": 0.4868, + "step": 22098 + }, + { + "epoch": 2.95, + "grad_norm": 0.5078125, + "learning_rate": 3.9201644653102173e-05, + "loss": 0.341, + "step": 22099 + }, + { + "epoch": 2.95, + "grad_norm": 0.5625, + "learning_rate": 3.919239950863236e-05, + "loss": 0.2534, + "step": 22100 + }, + { + "epoch": 2.95, + "grad_norm": 0.66015625, + "learning_rate": 3.9183155188752644e-05, + "loss": 0.3604, + "step": 22101 + }, + { + "epoch": 2.95, + "grad_norm": 0.75390625, + "learning_rate": 3.917391169358839e-05, + "loss": 0.4405, + "step": 22102 + }, + { + "epoch": 2.95, + "grad_norm": 0.6171875, + "learning_rate": 3.916466902326494e-05, + "loss": 0.3679, + "step": 22103 + }, + { + "epoch": 2.95, + "grad_norm": 0.5546875, + "learning_rate": 3.915542717790759e-05, + "loss": 0.1874, + "step": 22104 + }, + { + "epoch": 2.95, + "grad_norm": 0.83984375, + "learning_rate": 3.91461861576417e-05, + "loss": 0.4808, + "step": 22105 + }, + { + "epoch": 2.95, + "grad_norm": 0.58203125, + "learning_rate": 3.913694596259257e-05, + "loss": 0.2654, + "step": 22106 + }, + { + "epoch": 2.95, + "grad_norm": 0.6796875, + "learning_rate": 3.912770659288556e-05, + "loss": 0.4101, + "step": 22107 + }, + { + "epoch": 2.95, + "grad_norm": 0.48046875, + "learning_rate": 3.911846804864588e-05, + "loss": 0.2763, + "step": 22108 + }, + { + "epoch": 2.95, + "grad_norm": 0.447265625, + "learning_rate": 3.910923032999885e-05, + "loss": 0.1802, + "step": 22109 + }, + { + "epoch": 2.95, + "grad_norm": 0.65625, + "learning_rate": 3.909999343706973e-05, + "loss": 0.4567, + "step": 22110 + }, + { + "epoch": 2.95, + "grad_norm": 0.83984375, + "learning_rate": 3.9090757369983786e-05, + "loss": 0.33, + "step": 22111 + }, + { + "epoch": 2.95, + "grad_norm": 0.6328125, + "learning_rate": 3.90815221288663e-05, + "loss": 0.3565, + "step": 22112 + }, + { + "epoch": 2.95, + "grad_norm": 0.625, + "learning_rate": 3.9072287713842423e-05, + "loss": 0.3256, + "step": 22113 + }, + { + "epoch": 2.95, + "grad_norm": 0.59765625, + "learning_rate": 3.906305412503745e-05, + "loss": 0.4302, + "step": 22114 + }, + { + "epoch": 2.95, + "grad_norm": 0.61328125, + "learning_rate": 3.905382136257659e-05, + "loss": 0.3131, + "step": 22115 + }, + { + "epoch": 2.95, + "grad_norm": 0.79296875, + "learning_rate": 3.9044589426585e-05, + "loss": 0.2172, + "step": 22116 + }, + { + "epoch": 2.95, + "grad_norm": 0.59375, + "learning_rate": 3.9035358317187934e-05, + "loss": 0.3844, + "step": 22117 + }, + { + "epoch": 2.95, + "grad_norm": 0.6640625, + "learning_rate": 3.90261280345105e-05, + "loss": 0.5006, + "step": 22118 + }, + { + "epoch": 2.95, + "grad_norm": 0.6328125, + "learning_rate": 3.90168985786779e-05, + "loss": 0.4018, + "step": 22119 + }, + { + "epoch": 2.95, + "grad_norm": 0.59375, + "learning_rate": 3.90076699498153e-05, + "loss": 0.2077, + "step": 22120 + }, + { + "epoch": 2.95, + "grad_norm": 0.59765625, + "learning_rate": 3.8998442148047865e-05, + "loss": 0.1892, + "step": 22121 + }, + { + "epoch": 2.95, + "grad_norm": 0.5625, + "learning_rate": 3.898921517350067e-05, + "loss": 0.2357, + "step": 22122 + }, + { + "epoch": 2.95, + "grad_norm": 0.64453125, + "learning_rate": 3.897998902629888e-05, + "loss": 0.3203, + "step": 22123 + }, + { + "epoch": 2.95, + "grad_norm": 0.388671875, + "learning_rate": 3.8970763706567605e-05, + "loss": 0.1697, + "step": 22124 + }, + { + "epoch": 2.95, + "grad_norm": 0.578125, + "learning_rate": 3.896153921443198e-05, + "loss": 0.159, + "step": 22125 + }, + { + "epoch": 2.95, + "grad_norm": 0.3984375, + "learning_rate": 3.8952315550017006e-05, + "loss": 0.1045, + "step": 22126 + }, + { + "epoch": 2.95, + "grad_norm": 0.609375, + "learning_rate": 3.894309271344786e-05, + "loss": 0.2183, + "step": 22127 + }, + { + "epoch": 2.95, + "grad_norm": 0.73046875, + "learning_rate": 3.893387070484952e-05, + "loss": 0.4007, + "step": 22128 + }, + { + "epoch": 2.95, + "grad_norm": 0.54296875, + "learning_rate": 3.892464952434709e-05, + "loss": 0.4939, + "step": 22129 + }, + { + "epoch": 2.95, + "grad_norm": 0.70703125, + "learning_rate": 3.891542917206565e-05, + "loss": 0.4319, + "step": 22130 + }, + { + "epoch": 2.95, + "grad_norm": 0.546875, + "learning_rate": 3.890620964813015e-05, + "loss": 0.2947, + "step": 22131 + }, + { + "epoch": 2.95, + "grad_norm": 0.55078125, + "learning_rate": 3.889699095266567e-05, + "loss": 0.4524, + "step": 22132 + }, + { + "epoch": 2.95, + "grad_norm": 0.515625, + "learning_rate": 3.88877730857972e-05, + "loss": 0.1785, + "step": 22133 + }, + { + "epoch": 2.95, + "grad_norm": 0.6171875, + "learning_rate": 3.887855604764978e-05, + "loss": 0.3275, + "step": 22134 + }, + { + "epoch": 2.95, + "grad_norm": 0.7890625, + "learning_rate": 3.886933983834834e-05, + "loss": 0.265, + "step": 22135 + }, + { + "epoch": 2.95, + "grad_norm": 0.671875, + "learning_rate": 3.886012445801788e-05, + "loss": 0.4654, + "step": 22136 + }, + { + "epoch": 2.95, + "grad_norm": 0.5703125, + "learning_rate": 3.885090990678337e-05, + "loss": 0.2701, + "step": 22137 + }, + { + "epoch": 2.95, + "grad_norm": 0.68359375, + "learning_rate": 3.884169618476977e-05, + "loss": 0.4463, + "step": 22138 + }, + { + "epoch": 2.95, + "grad_norm": 0.61328125, + "learning_rate": 3.883248329210205e-05, + "loss": 0.4135, + "step": 22139 + }, + { + "epoch": 2.95, + "grad_norm": 0.58984375, + "learning_rate": 3.882327122890512e-05, + "loss": 0.2238, + "step": 22140 + }, + { + "epoch": 2.95, + "grad_norm": 0.6171875, + "learning_rate": 3.881405999530386e-05, + "loss": 0.4718, + "step": 22141 + }, + { + "epoch": 2.95, + "grad_norm": 0.52734375, + "learning_rate": 3.880484959142322e-05, + "loss": 0.3953, + "step": 22142 + }, + { + "epoch": 2.95, + "grad_norm": 0.6171875, + "learning_rate": 3.879564001738809e-05, + "loss": 0.5464, + "step": 22143 + }, + { + "epoch": 2.95, + "grad_norm": 0.4765625, + "learning_rate": 3.87864312733234e-05, + "loss": 0.3344, + "step": 22144 + }, + { + "epoch": 2.96, + "grad_norm": 0.69140625, + "learning_rate": 3.877722335935394e-05, + "loss": 0.4339, + "step": 22145 + }, + { + "epoch": 2.96, + "grad_norm": 0.6796875, + "learning_rate": 3.8768016275604635e-05, + "loss": 0.389, + "step": 22146 + }, + { + "epoch": 2.96, + "grad_norm": 0.5859375, + "learning_rate": 3.8758810022200333e-05, + "loss": 0.4528, + "step": 22147 + }, + { + "epoch": 2.96, + "grad_norm": 0.5, + "learning_rate": 3.87496045992659e-05, + "loss": 0.3437, + "step": 22148 + }, + { + "epoch": 2.96, + "grad_norm": 0.640625, + "learning_rate": 3.8740400006926106e-05, + "loss": 0.5169, + "step": 22149 + }, + { + "epoch": 2.96, + "grad_norm": 0.7890625, + "learning_rate": 3.87311962453058e-05, + "loss": 0.3775, + "step": 22150 + }, + { + "epoch": 2.96, + "grad_norm": 0.80078125, + "learning_rate": 3.872199331452982e-05, + "loss": 0.3362, + "step": 22151 + }, + { + "epoch": 2.96, + "grad_norm": 0.5859375, + "learning_rate": 3.871279121472292e-05, + "loss": 0.4246, + "step": 22152 + }, + { + "epoch": 2.96, + "grad_norm": 0.6484375, + "learning_rate": 3.870358994600994e-05, + "loss": 0.1948, + "step": 22153 + }, + { + "epoch": 2.96, + "grad_norm": 0.65625, + "learning_rate": 3.869438950851558e-05, + "loss": 0.2502, + "step": 22154 + }, + { + "epoch": 2.96, + "grad_norm": 0.470703125, + "learning_rate": 3.868518990236464e-05, + "loss": 0.533, + "step": 22155 + }, + { + "epoch": 2.96, + "grad_norm": 0.55859375, + "learning_rate": 3.8675991127681884e-05, + "loss": 0.2868, + "step": 22156 + }, + { + "epoch": 2.96, + "grad_norm": 0.5703125, + "learning_rate": 3.8666793184592076e-05, + "loss": 0.444, + "step": 22157 + }, + { + "epoch": 2.96, + "grad_norm": 0.61328125, + "learning_rate": 3.865759607321988e-05, + "loss": 0.498, + "step": 22158 + }, + { + "epoch": 2.96, + "grad_norm": 0.56640625, + "learning_rate": 3.864839979369006e-05, + "loss": 0.372, + "step": 22159 + }, + { + "epoch": 2.96, + "grad_norm": 0.63671875, + "learning_rate": 3.86392043461273e-05, + "loss": 0.3636, + "step": 22160 + }, + { + "epoch": 2.96, + "grad_norm": 0.64453125, + "learning_rate": 3.8630009730656325e-05, + "loss": 0.2337, + "step": 22161 + }, + { + "epoch": 2.96, + "grad_norm": 0.53125, + "learning_rate": 3.862081594740182e-05, + "loss": 0.1736, + "step": 22162 + }, + { + "epoch": 2.96, + "grad_norm": 0.47265625, + "learning_rate": 3.861162299648843e-05, + "loss": 0.1273, + "step": 22163 + }, + { + "epoch": 2.96, + "grad_norm": 0.87109375, + "learning_rate": 3.860243087804085e-05, + "loss": 0.6073, + "step": 22164 + }, + { + "epoch": 2.96, + "grad_norm": 0.484375, + "learning_rate": 3.859323959218369e-05, + "loss": 0.2408, + "step": 22165 + }, + { + "epoch": 2.96, + "grad_norm": 0.6640625, + "learning_rate": 3.85840491390416e-05, + "loss": 0.3189, + "step": 22166 + }, + { + "epoch": 2.96, + "grad_norm": 0.5625, + "learning_rate": 3.857485951873926e-05, + "loss": 0.3797, + "step": 22167 + }, + { + "epoch": 2.96, + "grad_norm": 0.392578125, + "learning_rate": 3.8565670731401214e-05, + "loss": 0.2391, + "step": 22168 + }, + { + "epoch": 2.96, + "grad_norm": 0.4921875, + "learning_rate": 3.8556482777152106e-05, + "loss": 0.3813, + "step": 22169 + }, + { + "epoch": 2.96, + "grad_norm": 0.78125, + "learning_rate": 3.854729565611652e-05, + "loss": 0.3352, + "step": 22170 + }, + { + "epoch": 2.96, + "grad_norm": 0.5390625, + "learning_rate": 3.8538109368419075e-05, + "loss": 0.1963, + "step": 22171 + }, + { + "epoch": 2.96, + "grad_norm": 0.5078125, + "learning_rate": 3.852892391418429e-05, + "loss": 0.2632, + "step": 22172 + }, + { + "epoch": 2.96, + "grad_norm": 0.5234375, + "learning_rate": 3.851973929353674e-05, + "loss": 0.2759, + "step": 22173 + }, + { + "epoch": 2.96, + "grad_norm": 0.478515625, + "learning_rate": 3.8510555506600974e-05, + "loss": 0.3354, + "step": 22174 + }, + { + "epoch": 2.96, + "grad_norm": 0.49609375, + "learning_rate": 3.8501372553501595e-05, + "loss": 0.3214, + "step": 22175 + }, + { + "epoch": 2.96, + "grad_norm": 0.51953125, + "learning_rate": 3.849219043436306e-05, + "loss": 0.3015, + "step": 22176 + }, + { + "epoch": 2.96, + "grad_norm": 0.451171875, + "learning_rate": 3.848300914930987e-05, + "loss": 0.1766, + "step": 22177 + }, + { + "epoch": 2.96, + "grad_norm": 0.578125, + "learning_rate": 3.8473828698466554e-05, + "loss": 0.2977, + "step": 22178 + }, + { + "epoch": 2.96, + "grad_norm": 0.640625, + "learning_rate": 3.8464649081957605e-05, + "loss": 0.3377, + "step": 22179 + }, + { + "epoch": 2.96, + "grad_norm": 0.734375, + "learning_rate": 3.8455470299907535e-05, + "loss": 0.4165, + "step": 22180 + }, + { + "epoch": 2.96, + "grad_norm": 0.5234375, + "learning_rate": 3.844629235244077e-05, + "loss": 0.3151, + "step": 22181 + }, + { + "epoch": 2.96, + "grad_norm": 0.70703125, + "learning_rate": 3.8437115239681775e-05, + "loss": 0.6053, + "step": 22182 + }, + { + "epoch": 2.96, + "grad_norm": 0.55859375, + "learning_rate": 3.842793896175502e-05, + "loss": 0.5563, + "step": 22183 + }, + { + "epoch": 2.96, + "grad_norm": 0.76953125, + "learning_rate": 3.84187635187849e-05, + "loss": 0.4578, + "step": 22184 + }, + { + "epoch": 2.96, + "grad_norm": 0.6953125, + "learning_rate": 3.840958891089593e-05, + "loss": 0.4268, + "step": 22185 + }, + { + "epoch": 2.96, + "grad_norm": 0.40234375, + "learning_rate": 3.840041513821243e-05, + "loss": 0.2254, + "step": 22186 + }, + { + "epoch": 2.96, + "grad_norm": 0.578125, + "learning_rate": 3.839124220085882e-05, + "loss": 0.4464, + "step": 22187 + }, + { + "epoch": 2.96, + "grad_norm": 0.51171875, + "learning_rate": 3.8382070098959555e-05, + "loss": 0.3036, + "step": 22188 + }, + { + "epoch": 2.96, + "grad_norm": 0.56640625, + "learning_rate": 3.8372898832638924e-05, + "loss": 0.564, + "step": 22189 + }, + { + "epoch": 2.96, + "grad_norm": 0.77734375, + "learning_rate": 3.836372840202137e-05, + "loss": 0.5259, + "step": 22190 + }, + { + "epoch": 2.96, + "grad_norm": 0.447265625, + "learning_rate": 3.835455880723118e-05, + "loss": 0.1382, + "step": 22191 + }, + { + "epoch": 2.96, + "grad_norm": 0.609375, + "learning_rate": 3.834539004839274e-05, + "loss": 0.2996, + "step": 22192 + }, + { + "epoch": 2.96, + "grad_norm": 0.67578125, + "learning_rate": 3.833622212563038e-05, + "loss": 0.1775, + "step": 22193 + }, + { + "epoch": 2.96, + "grad_norm": 0.4375, + "learning_rate": 3.832705503906846e-05, + "loss": 0.2199, + "step": 22194 + }, + { + "epoch": 2.96, + "grad_norm": 0.5078125, + "learning_rate": 3.8317888788831215e-05, + "loss": 0.2026, + "step": 22195 + }, + { + "epoch": 2.96, + "grad_norm": 0.65234375, + "learning_rate": 3.8308723375042985e-05, + "loss": 0.5448, + "step": 22196 + }, + { + "epoch": 2.96, + "grad_norm": 0.5859375, + "learning_rate": 3.829955879782807e-05, + "loss": 0.3428, + "step": 22197 + }, + { + "epoch": 2.96, + "grad_norm": 0.4921875, + "learning_rate": 3.8290395057310766e-05, + "loss": 0.4857, + "step": 22198 + }, + { + "epoch": 2.96, + "grad_norm": 0.5859375, + "learning_rate": 3.8281232153615276e-05, + "loss": 0.2656, + "step": 22199 + }, + { + "epoch": 2.96, + "grad_norm": 0.63671875, + "learning_rate": 3.827207008686592e-05, + "loss": 0.2378, + "step": 22200 + }, + { + "epoch": 2.96, + "grad_norm": 0.5703125, + "learning_rate": 3.826290885718687e-05, + "loss": 0.216, + "step": 22201 + }, + { + "epoch": 2.96, + "grad_norm": 0.84375, + "learning_rate": 3.825374846470241e-05, + "loss": 0.4046, + "step": 22202 + }, + { + "epoch": 2.96, + "grad_norm": 0.5703125, + "learning_rate": 3.8244588909536786e-05, + "loss": 0.3363, + "step": 22203 + }, + { + "epoch": 2.96, + "grad_norm": 0.5546875, + "learning_rate": 3.8235430191814126e-05, + "loss": 0.2906, + "step": 22204 + }, + { + "epoch": 2.96, + "grad_norm": 0.51171875, + "learning_rate": 3.822627231165867e-05, + "loss": 0.2001, + "step": 22205 + }, + { + "epoch": 2.96, + "grad_norm": 0.74609375, + "learning_rate": 3.8217115269194624e-05, + "loss": 0.3637, + "step": 22206 + }, + { + "epoch": 2.96, + "grad_norm": 0.54296875, + "learning_rate": 3.820795906454617e-05, + "loss": 0.2728, + "step": 22207 + }, + { + "epoch": 2.96, + "grad_norm": 0.4921875, + "learning_rate": 3.819880369783742e-05, + "loss": 0.2183, + "step": 22208 + }, + { + "epoch": 2.96, + "grad_norm": 0.4609375, + "learning_rate": 3.8189649169192546e-05, + "loss": 0.2453, + "step": 22209 + }, + { + "epoch": 2.96, + "grad_norm": 0.52734375, + "learning_rate": 3.8180495478735715e-05, + "loss": 0.1794, + "step": 22210 + }, + { + "epoch": 2.96, + "grad_norm": 0.7890625, + "learning_rate": 3.817134262659102e-05, + "loss": 0.5906, + "step": 22211 + }, + { + "epoch": 2.96, + "grad_norm": 0.703125, + "learning_rate": 3.816219061288264e-05, + "loss": 0.5235, + "step": 22212 + }, + { + "epoch": 2.96, + "grad_norm": 0.5703125, + "learning_rate": 3.815303943773466e-05, + "loss": 0.1785, + "step": 22213 + }, + { + "epoch": 2.96, + "grad_norm": 0.71875, + "learning_rate": 3.81438891012711e-05, + "loss": 0.3141, + "step": 22214 + }, + { + "epoch": 2.96, + "grad_norm": 0.7265625, + "learning_rate": 3.813473960361612e-05, + "loss": 0.3508, + "step": 22215 + }, + { + "epoch": 2.96, + "grad_norm": 0.80078125, + "learning_rate": 3.812559094489377e-05, + "loss": 0.3492, + "step": 22216 + }, + { + "epoch": 2.96, + "grad_norm": 0.734375, + "learning_rate": 3.8116443125228154e-05, + "loss": 0.3341, + "step": 22217 + }, + { + "epoch": 2.96, + "grad_norm": 0.54296875, + "learning_rate": 3.8107296144743256e-05, + "loss": 0.205, + "step": 22218 + }, + { + "epoch": 2.96, + "grad_norm": 0.5703125, + "learning_rate": 3.809815000356314e-05, + "loss": 0.3466, + "step": 22219 + }, + { + "epoch": 2.97, + "grad_norm": 0.72265625, + "learning_rate": 3.808900470181182e-05, + "loss": 0.3311, + "step": 22220 + }, + { + "epoch": 2.97, + "grad_norm": 0.69921875, + "learning_rate": 3.8079860239613395e-05, + "loss": 0.4212, + "step": 22221 + }, + { + "epoch": 2.97, + "grad_norm": 0.5625, + "learning_rate": 3.807071661709175e-05, + "loss": 0.2795, + "step": 22222 + }, + { + "epoch": 2.97, + "grad_norm": 0.62109375, + "learning_rate": 3.8061573834370944e-05, + "loss": 0.384, + "step": 22223 + }, + { + "epoch": 2.97, + "grad_norm": 0.66796875, + "learning_rate": 3.805243189157498e-05, + "loss": 0.426, + "step": 22224 + }, + { + "epoch": 2.97, + "grad_norm": 0.462890625, + "learning_rate": 3.8043290788827765e-05, + "loss": 0.2386, + "step": 22225 + }, + { + "epoch": 2.97, + "grad_norm": 0.82421875, + "learning_rate": 3.803415052625332e-05, + "loss": 0.4734, + "step": 22226 + }, + { + "epoch": 2.97, + "grad_norm": 0.640625, + "learning_rate": 3.802501110397553e-05, + "loss": 0.2544, + "step": 22227 + }, + { + "epoch": 2.97, + "grad_norm": 0.66015625, + "learning_rate": 3.8015872522118366e-05, + "loss": 0.469, + "step": 22228 + }, + { + "epoch": 2.97, + "grad_norm": 0.451171875, + "learning_rate": 3.8006734780805755e-05, + "loss": 0.3586, + "step": 22229 + }, + { + "epoch": 2.97, + "grad_norm": 0.52734375, + "learning_rate": 3.799759788016164e-05, + "loss": 0.2411, + "step": 22230 + }, + { + "epoch": 2.97, + "grad_norm": 0.59765625, + "learning_rate": 3.798846182030985e-05, + "loss": 0.4394, + "step": 22231 + }, + { + "epoch": 2.97, + "grad_norm": 0.59765625, + "learning_rate": 3.797932660137432e-05, + "loss": 0.3683, + "step": 22232 + }, + { + "epoch": 2.97, + "grad_norm": 0.54296875, + "learning_rate": 3.7970192223478926e-05, + "loss": 0.2228, + "step": 22233 + }, + { + "epoch": 2.97, + "grad_norm": 0.5859375, + "learning_rate": 3.7961058686747544e-05, + "loss": 0.3741, + "step": 22234 + }, + { + "epoch": 2.97, + "grad_norm": 0.54296875, + "learning_rate": 3.795192599130405e-05, + "loss": 0.366, + "step": 22235 + }, + { + "epoch": 2.97, + "grad_norm": 0.6484375, + "learning_rate": 3.794279413727226e-05, + "loss": 0.3128, + "step": 22236 + }, + { + "epoch": 2.97, + "grad_norm": 0.60546875, + "learning_rate": 3.7933663124775974e-05, + "loss": 0.4316, + "step": 22237 + }, + { + "epoch": 2.97, + "grad_norm": 0.70703125, + "learning_rate": 3.792453295393906e-05, + "loss": 0.4026, + "step": 22238 + }, + { + "epoch": 2.97, + "grad_norm": 0.6953125, + "learning_rate": 3.791540362488532e-05, + "loss": 0.4454, + "step": 22239 + }, + { + "epoch": 2.97, + "grad_norm": 0.5859375, + "learning_rate": 3.790627513773857e-05, + "loss": 0.1942, + "step": 22240 + }, + { + "epoch": 2.97, + "grad_norm": 0.640625, + "learning_rate": 3.789714749262255e-05, + "loss": 0.246, + "step": 22241 + }, + { + "epoch": 2.97, + "grad_norm": 0.53515625, + "learning_rate": 3.788802068966107e-05, + "loss": 0.4476, + "step": 22242 + }, + { + "epoch": 2.97, + "grad_norm": 0.64453125, + "learning_rate": 3.78788947289779e-05, + "loss": 0.6636, + "step": 22243 + }, + { + "epoch": 2.97, + "grad_norm": 0.5078125, + "learning_rate": 3.786976961069681e-05, + "loss": 0.215, + "step": 22244 + }, + { + "epoch": 2.97, + "grad_norm": 0.6640625, + "learning_rate": 3.7860645334941504e-05, + "loss": 0.4485, + "step": 22245 + }, + { + "epoch": 2.97, + "grad_norm": 0.62109375, + "learning_rate": 3.785152190183571e-05, + "loss": 0.2639, + "step": 22246 + }, + { + "epoch": 2.97, + "grad_norm": 0.67578125, + "learning_rate": 3.7842399311503176e-05, + "loss": 0.2401, + "step": 22247 + }, + { + "epoch": 2.97, + "grad_norm": 0.58203125, + "learning_rate": 3.7833277564067625e-05, + "loss": 0.3771, + "step": 22248 + }, + { + "epoch": 2.97, + "grad_norm": 0.53125, + "learning_rate": 3.7824156659652735e-05, + "loss": 0.3858, + "step": 22249 + }, + { + "epoch": 2.97, + "grad_norm": 0.56640625, + "learning_rate": 3.781503659838214e-05, + "loss": 0.28, + "step": 22250 + }, + { + "epoch": 2.97, + "grad_norm": 0.59375, + "learning_rate": 3.780591738037956e-05, + "loss": 0.4094, + "step": 22251 + }, + { + "epoch": 2.97, + "grad_norm": 0.73046875, + "learning_rate": 3.779679900576867e-05, + "loss": 0.5689, + "step": 22252 + }, + { + "epoch": 2.97, + "grad_norm": 0.5234375, + "learning_rate": 3.778768147467313e-05, + "loss": 0.4617, + "step": 22253 + }, + { + "epoch": 2.97, + "grad_norm": 0.51171875, + "learning_rate": 3.777856478721653e-05, + "loss": 0.1676, + "step": 22254 + }, + { + "epoch": 2.97, + "grad_norm": 0.71875, + "learning_rate": 3.7769448943522525e-05, + "loss": 0.6084, + "step": 22255 + }, + { + "epoch": 2.97, + "grad_norm": 0.498046875, + "learning_rate": 3.776033394371472e-05, + "loss": 0.3742, + "step": 22256 + }, + { + "epoch": 2.97, + "grad_norm": 0.69921875, + "learning_rate": 3.7751219787916747e-05, + "loss": 0.2063, + "step": 22257 + }, + { + "epoch": 2.97, + "grad_norm": 0.7734375, + "learning_rate": 3.774210647625221e-05, + "loss": 0.2507, + "step": 22258 + }, + { + "epoch": 2.97, + "grad_norm": 0.8046875, + "learning_rate": 3.773299400884463e-05, + "loss": 0.4123, + "step": 22259 + }, + { + "epoch": 2.97, + "grad_norm": 0.51171875, + "learning_rate": 3.772388238581762e-05, + "loss": 0.1359, + "step": 22260 + }, + { + "epoch": 2.97, + "grad_norm": 0.52734375, + "learning_rate": 3.771477160729476e-05, + "loss": 0.3347, + "step": 22261 + }, + { + "epoch": 2.97, + "grad_norm": 0.60546875, + "learning_rate": 3.770566167339955e-05, + "loss": 0.2273, + "step": 22262 + }, + { + "epoch": 2.97, + "grad_norm": 0.66015625, + "learning_rate": 3.7696552584255574e-05, + "loss": 0.5404, + "step": 22263 + }, + { + "epoch": 2.97, + "grad_norm": 0.55859375, + "learning_rate": 3.76874443399863e-05, + "loss": 0.3892, + "step": 22264 + }, + { + "epoch": 2.97, + "grad_norm": 0.5390625, + "learning_rate": 3.7678336940715265e-05, + "loss": 0.2262, + "step": 22265 + }, + { + "epoch": 2.97, + "grad_norm": 0.70703125, + "learning_rate": 3.766923038656599e-05, + "loss": 0.4372, + "step": 22266 + }, + { + "epoch": 2.97, + "grad_norm": 0.61328125, + "learning_rate": 3.766012467766198e-05, + "loss": 0.5527, + "step": 22267 + }, + { + "epoch": 2.97, + "grad_norm": 0.75390625, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.4146, + "step": 22268 + }, + { + "epoch": 2.97, + "grad_norm": 0.828125, + "learning_rate": 3.764191579608351e-05, + "loss": 0.4596, + "step": 22269 + }, + { + "epoch": 2.97, + "grad_norm": 0.6484375, + "learning_rate": 3.763281262365602e-05, + "loss": 0.2158, + "step": 22270 + }, + { + "epoch": 2.97, + "grad_norm": 0.59765625, + "learning_rate": 3.762371029696765e-05, + "loss": 0.2292, + "step": 22271 + }, + { + "epoch": 2.97, + "grad_norm": 0.53125, + "learning_rate": 3.761460881614177e-05, + "loss": 0.4858, + "step": 22272 + }, + { + "epoch": 2.97, + "grad_norm": 0.55078125, + "learning_rate": 3.7605508181301866e-05, + "loss": 0.4071, + "step": 22273 + }, + { + "epoch": 2.97, + "grad_norm": 0.69140625, + "learning_rate": 3.7596408392571284e-05, + "loss": 0.6689, + "step": 22274 + }, + { + "epoch": 2.97, + "grad_norm": 0.57421875, + "learning_rate": 3.758730945007344e-05, + "loss": 0.3143, + "step": 22275 + }, + { + "epoch": 2.97, + "grad_norm": 0.5234375, + "learning_rate": 3.757821135393178e-05, + "loss": 0.2886, + "step": 22276 + }, + { + "epoch": 2.97, + "grad_norm": 0.57421875, + "learning_rate": 3.75691141042696e-05, + "loss": 0.3461, + "step": 22277 + }, + { + "epoch": 2.97, + "grad_norm": 0.640625, + "learning_rate": 3.75600177012103e-05, + "loss": 0.5748, + "step": 22278 + }, + { + "epoch": 2.97, + "grad_norm": 0.66015625, + "learning_rate": 3.755092214487724e-05, + "loss": 0.3427, + "step": 22279 + }, + { + "epoch": 2.97, + "grad_norm": 0.68359375, + "learning_rate": 3.754182743539378e-05, + "loss": 0.3978, + "step": 22280 + }, + { + "epoch": 2.97, + "grad_norm": 0.46484375, + "learning_rate": 3.75327335728832e-05, + "loss": 0.2159, + "step": 22281 + }, + { + "epoch": 2.97, + "grad_norm": 0.57421875, + "learning_rate": 3.752364055746884e-05, + "loss": 0.1872, + "step": 22282 + }, + { + "epoch": 2.97, + "grad_norm": 0.625, + "learning_rate": 3.751454838927402e-05, + "loss": 0.2018, + "step": 22283 + }, + { + "epoch": 2.97, + "grad_norm": 0.5859375, + "learning_rate": 3.750545706842201e-05, + "loss": 0.2621, + "step": 22284 + }, + { + "epoch": 2.97, + "grad_norm": 0.453125, + "learning_rate": 3.7496366595036155e-05, + "loss": 0.2749, + "step": 22285 + }, + { + "epoch": 2.97, + "grad_norm": 0.53515625, + "learning_rate": 3.748727696923968e-05, + "loss": 0.3433, + "step": 22286 + }, + { + "epoch": 2.97, + "grad_norm": 0.74609375, + "learning_rate": 3.747818819115582e-05, + "loss": 0.5071, + "step": 22287 + }, + { + "epoch": 2.97, + "grad_norm": 0.91796875, + "learning_rate": 3.7469100260907856e-05, + "loss": 0.2666, + "step": 22288 + }, + { + "epoch": 2.97, + "grad_norm": 0.71484375, + "learning_rate": 3.7460013178619014e-05, + "loss": 0.5784, + "step": 22289 + }, + { + "epoch": 2.97, + "grad_norm": 0.375, + "learning_rate": 3.7450926944412566e-05, + "loss": 0.1094, + "step": 22290 + }, + { + "epoch": 2.97, + "grad_norm": 0.5078125, + "learning_rate": 3.744184155841167e-05, + "loss": 0.3542, + "step": 22291 + }, + { + "epoch": 2.97, + "grad_norm": 0.486328125, + "learning_rate": 3.743275702073954e-05, + "loss": 0.372, + "step": 22292 + }, + { + "epoch": 2.97, + "grad_norm": 0.62890625, + "learning_rate": 3.742367333151939e-05, + "loss": 0.1559, + "step": 22293 + }, + { + "epoch": 2.97, + "grad_norm": 0.42578125, + "learning_rate": 3.741459049087442e-05, + "loss": 0.3336, + "step": 22294 + }, + { + "epoch": 2.98, + "grad_norm": 0.57421875, + "learning_rate": 3.740550849892773e-05, + "loss": 0.4448, + "step": 22295 + }, + { + "epoch": 2.98, + "grad_norm": 0.60546875, + "learning_rate": 3.739642735580253e-05, + "loss": 0.3457, + "step": 22296 + }, + { + "epoch": 2.98, + "grad_norm": 0.63671875, + "learning_rate": 3.738734706162198e-05, + "loss": 0.4807, + "step": 22297 + }, + { + "epoch": 2.98, + "grad_norm": 0.515625, + "learning_rate": 3.7378267616509154e-05, + "loss": 0.182, + "step": 22298 + }, + { + "epoch": 2.98, + "grad_norm": 0.58984375, + "learning_rate": 3.736918902058725e-05, + "loss": 0.268, + "step": 22299 + }, + { + "epoch": 2.98, + "grad_norm": 0.6015625, + "learning_rate": 3.7360111273979304e-05, + "loss": 0.249, + "step": 22300 + }, + { + "epoch": 2.98, + "grad_norm": 0.75, + "learning_rate": 3.735103437680845e-05, + "loss": 0.4242, + "step": 22301 + }, + { + "epoch": 2.98, + "grad_norm": 0.53125, + "learning_rate": 3.734195832919777e-05, + "loss": 0.3265, + "step": 22302 + }, + { + "epoch": 2.98, + "grad_norm": 0.609375, + "learning_rate": 3.733288313127039e-05, + "loss": 0.1837, + "step": 22303 + }, + { + "epoch": 2.98, + "grad_norm": 0.5703125, + "learning_rate": 3.732380878314931e-05, + "loss": 0.3921, + "step": 22304 + }, + { + "epoch": 2.98, + "grad_norm": 0.703125, + "learning_rate": 3.731473528495759e-05, + "loss": 0.192, + "step": 22305 + }, + { + "epoch": 2.98, + "grad_norm": 0.72265625, + "learning_rate": 3.7305662636818296e-05, + "loss": 0.3206, + "step": 22306 + }, + { + "epoch": 2.98, + "grad_norm": 0.640625, + "learning_rate": 3.729659083885445e-05, + "loss": 0.2586, + "step": 22307 + }, + { + "epoch": 2.98, + "grad_norm": 0.466796875, + "learning_rate": 3.728751989118912e-05, + "loss": 0.1587, + "step": 22308 + }, + { + "epoch": 2.98, + "grad_norm": 0.6328125, + "learning_rate": 3.727844979394526e-05, + "loss": 0.4514, + "step": 22309 + }, + { + "epoch": 2.98, + "grad_norm": 0.4921875, + "learning_rate": 3.726938054724585e-05, + "loss": 0.1619, + "step": 22310 + }, + { + "epoch": 2.98, + "grad_norm": 0.875, + "learning_rate": 3.726031215121389e-05, + "loss": 0.594, + "step": 22311 + }, + { + "epoch": 2.98, + "grad_norm": 0.4609375, + "learning_rate": 3.725124460597237e-05, + "loss": 0.2278, + "step": 22312 + }, + { + "epoch": 2.98, + "grad_norm": 0.65234375, + "learning_rate": 3.724217791164427e-05, + "loss": 0.293, + "step": 22313 + }, + { + "epoch": 2.98, + "grad_norm": 0.435546875, + "learning_rate": 3.7233112068352496e-05, + "loss": 0.1439, + "step": 22314 + }, + { + "epoch": 2.98, + "grad_norm": 0.5390625, + "learning_rate": 3.722404707621999e-05, + "loss": 0.4186, + "step": 22315 + }, + { + "epoch": 2.98, + "grad_norm": 0.78125, + "learning_rate": 3.72149829353697e-05, + "loss": 0.2338, + "step": 22316 + }, + { + "epoch": 2.98, + "grad_norm": 0.72265625, + "learning_rate": 3.720591964592458e-05, + "loss": 0.5567, + "step": 22317 + }, + { + "epoch": 2.98, + "grad_norm": 0.6484375, + "learning_rate": 3.719685720800745e-05, + "loss": 0.2831, + "step": 22318 + }, + { + "epoch": 2.98, + "grad_norm": 0.498046875, + "learning_rate": 3.718779562174124e-05, + "loss": 0.1184, + "step": 22319 + }, + { + "epoch": 2.98, + "grad_norm": 0.75, + "learning_rate": 3.7178734887248835e-05, + "loss": 0.3132, + "step": 22320 + }, + { + "epoch": 2.98, + "grad_norm": 0.625, + "learning_rate": 3.716967500465315e-05, + "loss": 0.362, + "step": 22321 + }, + { + "epoch": 2.98, + "grad_norm": 0.53515625, + "learning_rate": 3.716061597407697e-05, + "loss": 0.3615, + "step": 22322 + }, + { + "epoch": 2.98, + "grad_norm": 0.4765625, + "learning_rate": 3.7151557795643154e-05, + "loss": 0.2703, + "step": 22323 + }, + { + "epoch": 2.98, + "grad_norm": 0.5234375, + "learning_rate": 3.714250046947455e-05, + "loss": 0.2316, + "step": 22324 + }, + { + "epoch": 2.98, + "grad_norm": 0.65234375, + "learning_rate": 3.7133443995693975e-05, + "loss": 0.3823, + "step": 22325 + }, + { + "epoch": 2.98, + "grad_norm": 0.71484375, + "learning_rate": 3.712438837442428e-05, + "loss": 0.3187, + "step": 22326 + }, + { + "epoch": 2.98, + "grad_norm": 0.494140625, + "learning_rate": 3.71153336057882e-05, + "loss": 0.4454, + "step": 22327 + }, + { + "epoch": 2.98, + "grad_norm": 0.6875, + "learning_rate": 3.710627968990855e-05, + "loss": 0.472, + "step": 22328 + }, + { + "epoch": 2.98, + "grad_norm": 0.55078125, + "learning_rate": 3.7097226626908124e-05, + "loss": 0.322, + "step": 22329 + }, + { + "epoch": 2.98, + "grad_norm": 0.68359375, + "learning_rate": 3.708817441690967e-05, + "loss": 0.2427, + "step": 22330 + }, + { + "epoch": 2.98, + "grad_norm": 0.58984375, + "learning_rate": 3.707912306003598e-05, + "loss": 0.5063, + "step": 22331 + }, + { + "epoch": 2.98, + "grad_norm": 0.5703125, + "learning_rate": 3.707007255640973e-05, + "loss": 0.2943, + "step": 22332 + }, + { + "epoch": 2.98, + "grad_norm": 0.78125, + "learning_rate": 3.706102290615369e-05, + "loss": 0.4252, + "step": 22333 + }, + { + "epoch": 2.98, + "grad_norm": 0.453125, + "learning_rate": 3.70519741093906e-05, + "loss": 0.2341, + "step": 22334 + }, + { + "epoch": 2.98, + "grad_norm": 0.7734375, + "learning_rate": 3.7042926166243106e-05, + "loss": 0.4821, + "step": 22335 + }, + { + "epoch": 2.98, + "grad_norm": 0.65234375, + "learning_rate": 3.703387907683398e-05, + "loss": 0.2756, + "step": 22336 + }, + { + "epoch": 2.98, + "grad_norm": 0.578125, + "learning_rate": 3.702483284128583e-05, + "loss": 0.2448, + "step": 22337 + }, + { + "epoch": 2.98, + "grad_norm": 0.66015625, + "learning_rate": 3.701578745972135e-05, + "loss": 0.376, + "step": 22338 + }, + { + "epoch": 2.98, + "grad_norm": 0.81640625, + "learning_rate": 3.7006742932263226e-05, + "loss": 0.37, + "step": 22339 + }, + { + "epoch": 2.98, + "grad_norm": 0.66015625, + "learning_rate": 3.699769925903414e-05, + "loss": 0.4133, + "step": 22340 + }, + { + "epoch": 2.98, + "grad_norm": 0.498046875, + "learning_rate": 3.698865644015663e-05, + "loss": 0.2735, + "step": 22341 + }, + { + "epoch": 2.98, + "grad_norm": 0.6015625, + "learning_rate": 3.69796144757534e-05, + "loss": 0.268, + "step": 22342 + }, + { + "epoch": 2.98, + "grad_norm": 0.466796875, + "learning_rate": 3.697057336594703e-05, + "loss": 0.2755, + "step": 22343 + }, + { + "epoch": 2.98, + "grad_norm": 0.64453125, + "learning_rate": 3.696153311086018e-05, + "loss": 0.5099, + "step": 22344 + }, + { + "epoch": 2.98, + "grad_norm": 0.55078125, + "learning_rate": 3.695249371061537e-05, + "loss": 0.2296, + "step": 22345 + }, + { + "epoch": 2.98, + "grad_norm": 0.58984375, + "learning_rate": 3.694345516533523e-05, + "loss": 0.4291, + "step": 22346 + }, + { + "epoch": 2.98, + "grad_norm": 0.6796875, + "learning_rate": 3.693441747514228e-05, + "loss": 0.3641, + "step": 22347 + }, + { + "epoch": 2.98, + "grad_norm": 0.61328125, + "learning_rate": 3.6925380640159105e-05, + "loss": 0.4198, + "step": 22348 + }, + { + "epoch": 2.98, + "grad_norm": 0.6328125, + "learning_rate": 3.6916344660508274e-05, + "loss": 0.401, + "step": 22349 + }, + { + "epoch": 2.98, + "grad_norm": 0.703125, + "learning_rate": 3.6907309536312276e-05, + "loss": 0.4607, + "step": 22350 + }, + { + "epoch": 2.98, + "grad_norm": 0.42578125, + "learning_rate": 3.689827526769365e-05, + "loss": 0.2969, + "step": 22351 + }, + { + "epoch": 2.98, + "grad_norm": 0.625, + "learning_rate": 3.68892418547749e-05, + "loss": 0.3899, + "step": 22352 + }, + { + "epoch": 2.98, + "grad_norm": 0.515625, + "learning_rate": 3.688020929767858e-05, + "loss": 0.2794, + "step": 22353 + }, + { + "epoch": 2.98, + "grad_norm": 0.5859375, + "learning_rate": 3.6871177596527084e-05, + "loss": 0.2819, + "step": 22354 + }, + { + "epoch": 2.98, + "grad_norm": 0.6953125, + "learning_rate": 3.686214675144295e-05, + "loss": 0.5589, + "step": 22355 + }, + { + "epoch": 2.98, + "grad_norm": 0.52734375, + "learning_rate": 3.6853116762548626e-05, + "loss": 0.3654, + "step": 22356 + }, + { + "epoch": 2.98, + "grad_norm": 0.59765625, + "learning_rate": 3.6844087629966565e-05, + "loss": 0.4217, + "step": 22357 + }, + { + "epoch": 2.98, + "grad_norm": 0.6796875, + "learning_rate": 3.683505935381923e-05, + "loss": 0.3128, + "step": 22358 + }, + { + "epoch": 2.98, + "grad_norm": 0.478515625, + "learning_rate": 3.682603193422904e-05, + "loss": 0.2371, + "step": 22359 + }, + { + "epoch": 2.98, + "grad_norm": 0.5234375, + "learning_rate": 3.681700537131837e-05, + "loss": 0.2671, + "step": 22360 + }, + { + "epoch": 2.98, + "grad_norm": 0.51171875, + "learning_rate": 3.6807979665209654e-05, + "loss": 0.3445, + "step": 22361 + }, + { + "epoch": 2.98, + "grad_norm": 0.76171875, + "learning_rate": 3.6798954816025286e-05, + "loss": 0.3733, + "step": 22362 + }, + { + "epoch": 2.98, + "grad_norm": 0.67578125, + "learning_rate": 3.67899308238877e-05, + "loss": 0.3814, + "step": 22363 + }, + { + "epoch": 2.98, + "grad_norm": 0.63671875, + "learning_rate": 3.6780907688919175e-05, + "loss": 0.3119, + "step": 22364 + }, + { + "epoch": 2.98, + "grad_norm": 0.51171875, + "learning_rate": 3.6771885411242124e-05, + "loss": 0.2242, + "step": 22365 + }, + { + "epoch": 2.98, + "grad_norm": 0.44921875, + "learning_rate": 3.676286399097888e-05, + "loss": 0.2554, + "step": 22366 + }, + { + "epoch": 2.98, + "grad_norm": 0.6640625, + "learning_rate": 3.675384342825181e-05, + "loss": 0.4098, + "step": 22367 + }, + { + "epoch": 2.98, + "grad_norm": 0.6796875, + "learning_rate": 3.6744823723183195e-05, + "loss": 0.4581, + "step": 22368 + }, + { + "epoch": 2.98, + "grad_norm": 0.609375, + "learning_rate": 3.6735804875895364e-05, + "loss": 0.2633, + "step": 22369 + }, + { + "epoch": 2.99, + "grad_norm": 0.65234375, + "learning_rate": 3.6726786886510656e-05, + "loss": 0.4942, + "step": 22370 + }, + { + "epoch": 2.99, + "grad_norm": 0.609375, + "learning_rate": 3.671776975515129e-05, + "loss": 0.3024, + "step": 22371 + }, + { + "epoch": 2.99, + "grad_norm": 0.54296875, + "learning_rate": 3.670875348193962e-05, + "loss": 0.3549, + "step": 22372 + }, + { + "epoch": 2.99, + "grad_norm": 0.74609375, + "learning_rate": 3.669973806699783e-05, + "loss": 0.273, + "step": 22373 + }, + { + "epoch": 2.99, + "grad_norm": 0.59765625, + "learning_rate": 3.6690723510448224e-05, + "loss": 0.3043, + "step": 22374 + }, + { + "epoch": 2.99, + "grad_norm": 0.69921875, + "learning_rate": 3.668170981241303e-05, + "loss": 0.238, + "step": 22375 + }, + { + "epoch": 2.99, + "grad_norm": 0.5859375, + "learning_rate": 3.667269697301453e-05, + "loss": 0.379, + "step": 22376 + }, + { + "epoch": 2.99, + "grad_norm": 0.65625, + "learning_rate": 3.666368499237487e-05, + "loss": 0.4134, + "step": 22377 + }, + { + "epoch": 2.99, + "grad_norm": 0.63671875, + "learning_rate": 3.665467387061628e-05, + "loss": 0.2783, + "step": 22378 + }, + { + "epoch": 2.99, + "grad_norm": 0.5, + "learning_rate": 3.664566360786098e-05, + "loss": 0.1941, + "step": 22379 + }, + { + "epoch": 2.99, + "grad_norm": 0.60546875, + "learning_rate": 3.663665420423114e-05, + "loss": 0.22, + "step": 22380 + }, + { + "epoch": 2.99, + "grad_norm": 0.75, + "learning_rate": 3.6627645659848964e-05, + "loss": 0.3544, + "step": 22381 + }, + { + "epoch": 2.99, + "grad_norm": 0.59765625, + "learning_rate": 3.661863797483659e-05, + "loss": 0.4474, + "step": 22382 + }, + { + "epoch": 2.99, + "grad_norm": 0.396484375, + "learning_rate": 3.6609631149316114e-05, + "loss": 0.1732, + "step": 22383 + }, + { + "epoch": 2.99, + "grad_norm": 0.61328125, + "learning_rate": 3.660062518340974e-05, + "loss": 0.2561, + "step": 22384 + }, + { + "epoch": 2.99, + "grad_norm": 0.80859375, + "learning_rate": 3.659162007723958e-05, + "loss": 0.2721, + "step": 22385 + }, + { + "epoch": 2.99, + "grad_norm": 0.640625, + "learning_rate": 3.6582615830927766e-05, + "loss": 0.2647, + "step": 22386 + }, + { + "epoch": 2.99, + "grad_norm": 0.58984375, + "learning_rate": 3.6573612444596363e-05, + "loss": 0.3026, + "step": 22387 + }, + { + "epoch": 2.99, + "grad_norm": 0.7265625, + "learning_rate": 3.6564609918367464e-05, + "loss": 0.4989, + "step": 22388 + }, + { + "epoch": 2.99, + "grad_norm": 0.5078125, + "learning_rate": 3.655560825236318e-05, + "loss": 0.2856, + "step": 22389 + }, + { + "epoch": 2.99, + "grad_norm": 0.8671875, + "learning_rate": 3.6546607446705606e-05, + "loss": 0.6779, + "step": 22390 + }, + { + "epoch": 2.99, + "grad_norm": 0.59375, + "learning_rate": 3.6537607501516715e-05, + "loss": 0.2991, + "step": 22391 + }, + { + "epoch": 2.99, + "grad_norm": 0.373046875, + "learning_rate": 3.65286084169186e-05, + "loss": 0.1352, + "step": 22392 + }, + { + "epoch": 2.99, + "grad_norm": 0.62109375, + "learning_rate": 3.651961019303328e-05, + "loss": 0.2693, + "step": 22393 + }, + { + "epoch": 2.99, + "grad_norm": 0.58203125, + "learning_rate": 3.651061282998284e-05, + "loss": 0.3704, + "step": 22394 + }, + { + "epoch": 2.99, + "grad_norm": 0.57421875, + "learning_rate": 3.650161632788923e-05, + "loss": 0.2323, + "step": 22395 + }, + { + "epoch": 2.99, + "grad_norm": 0.625, + "learning_rate": 3.649262068687441e-05, + "loss": 0.4491, + "step": 22396 + }, + { + "epoch": 2.99, + "grad_norm": 0.5859375, + "learning_rate": 3.648362590706042e-05, + "loss": 0.3784, + "step": 22397 + }, + { + "epoch": 2.99, + "grad_norm": 0.51171875, + "learning_rate": 3.647463198856924e-05, + "loss": 0.4468, + "step": 22398 + }, + { + "epoch": 2.99, + "grad_norm": 0.6953125, + "learning_rate": 3.646563893152284e-05, + "loss": 0.2906, + "step": 22399 + }, + { + "epoch": 2.99, + "grad_norm": 0.66015625, + "learning_rate": 3.645664673604312e-05, + "loss": 0.6152, + "step": 22400 + }, + { + "epoch": 2.99, + "grad_norm": 0.59765625, + "learning_rate": 3.6447655402252054e-05, + "loss": 0.3045, + "step": 22401 + }, + { + "epoch": 2.99, + "grad_norm": 0.75, + "learning_rate": 3.643866493027157e-05, + "loss": 0.2929, + "step": 22402 + }, + { + "epoch": 2.99, + "grad_norm": 0.64453125, + "learning_rate": 3.642967532022358e-05, + "loss": 0.6791, + "step": 22403 + }, + { + "epoch": 2.99, + "grad_norm": 0.62109375, + "learning_rate": 3.642068657223003e-05, + "loss": 0.3464, + "step": 22404 + }, + { + "epoch": 2.99, + "grad_norm": 0.8046875, + "learning_rate": 3.641169868641273e-05, + "loss": 0.4832, + "step": 22405 + }, + { + "epoch": 2.99, + "grad_norm": 0.62890625, + "learning_rate": 3.640271166289365e-05, + "loss": 0.3625, + "step": 22406 + }, + { + "epoch": 2.99, + "grad_norm": 0.48046875, + "learning_rate": 3.639372550179458e-05, + "loss": 0.311, + "step": 22407 + }, + { + "epoch": 2.99, + "grad_norm": 0.609375, + "learning_rate": 3.6384740203237424e-05, + "loss": 0.2522, + "step": 22408 + }, + { + "epoch": 2.99, + "grad_norm": 0.50390625, + "learning_rate": 3.637575576734404e-05, + "loss": 0.4172, + "step": 22409 + }, + { + "epoch": 2.99, + "grad_norm": 0.70703125, + "learning_rate": 3.636677219423621e-05, + "loss": 0.1973, + "step": 22410 + }, + { + "epoch": 2.99, + "grad_norm": 0.55078125, + "learning_rate": 3.63577894840358e-05, + "loss": 0.311, + "step": 22411 + }, + { + "epoch": 2.99, + "grad_norm": 0.5, + "learning_rate": 3.634880763686459e-05, + "loss": 0.1532, + "step": 22412 + }, + { + "epoch": 2.99, + "grad_norm": 0.609375, + "learning_rate": 3.633982665284444e-05, + "loss": 0.4166, + "step": 22413 + }, + { + "epoch": 2.99, + "grad_norm": 0.55078125, + "learning_rate": 3.633084653209707e-05, + "loss": 0.3782, + "step": 22414 + }, + { + "epoch": 2.99, + "grad_norm": 0.546875, + "learning_rate": 3.632186727474428e-05, + "loss": 0.2018, + "step": 22415 + }, + { + "epoch": 2.99, + "grad_norm": 0.59765625, + "learning_rate": 3.6312888880907844e-05, + "loss": 0.5165, + "step": 22416 + }, + { + "epoch": 2.99, + "grad_norm": 0.58984375, + "learning_rate": 3.630391135070954e-05, + "loss": 0.274, + "step": 22417 + }, + { + "epoch": 2.99, + "grad_norm": 0.578125, + "learning_rate": 3.629493468427104e-05, + "loss": 0.2652, + "step": 22418 + }, + { + "epoch": 2.99, + "grad_norm": 0.49609375, + "learning_rate": 3.6285958881714145e-05, + "loss": 0.1896, + "step": 22419 + }, + { + "epoch": 2.99, + "grad_norm": 0.5546875, + "learning_rate": 3.6276983943160506e-05, + "loss": 0.267, + "step": 22420 + }, + { + "epoch": 2.99, + "grad_norm": 0.66796875, + "learning_rate": 3.626800986873187e-05, + "loss": 0.5054, + "step": 22421 + }, + { + "epoch": 2.99, + "grad_norm": 0.77734375, + "learning_rate": 3.625903665854995e-05, + "loss": 0.3868, + "step": 22422 + }, + { + "epoch": 2.99, + "grad_norm": 0.71875, + "learning_rate": 3.6250064312736364e-05, + "loss": 0.3739, + "step": 22423 + }, + { + "epoch": 2.99, + "grad_norm": 0.62109375, + "learning_rate": 3.6241092831412825e-05, + "loss": 0.5393, + "step": 22424 + }, + { + "epoch": 2.99, + "grad_norm": 0.59375, + "learning_rate": 3.623212221470098e-05, + "loss": 0.4565, + "step": 22425 + }, + { + "epoch": 2.99, + "grad_norm": 1.234375, + "learning_rate": 3.622315246272253e-05, + "loss": 0.1968, + "step": 22426 + }, + { + "epoch": 2.99, + "grad_norm": 0.486328125, + "learning_rate": 3.621418357559903e-05, + "loss": 0.2035, + "step": 22427 + }, + { + "epoch": 2.99, + "grad_norm": 0.640625, + "learning_rate": 3.620521555345212e-05, + "loss": 0.3021, + "step": 22428 + }, + { + "epoch": 2.99, + "grad_norm": 0.60546875, + "learning_rate": 3.6196248396403446e-05, + "loss": 0.4024, + "step": 22429 + }, + { + "epoch": 2.99, + "grad_norm": 0.50390625, + "learning_rate": 3.6187282104574595e-05, + "loss": 0.4084, + "step": 22430 + }, + { + "epoch": 2.99, + "grad_norm": 0.64453125, + "learning_rate": 3.6178316678087174e-05, + "loss": 0.2956, + "step": 22431 + }, + { + "epoch": 2.99, + "grad_norm": 0.478515625, + "learning_rate": 3.616935211706275e-05, + "loss": 0.2355, + "step": 22432 + }, + { + "epoch": 2.99, + "grad_norm": 0.5703125, + "learning_rate": 3.6160388421622834e-05, + "loss": 0.3489, + "step": 22433 + }, + { + "epoch": 2.99, + "grad_norm": 0.703125, + "learning_rate": 3.6151425591889035e-05, + "loss": 0.2652, + "step": 22434 + }, + { + "epoch": 2.99, + "grad_norm": 0.76953125, + "learning_rate": 3.614246362798288e-05, + "loss": 0.6065, + "step": 22435 + }, + { + "epoch": 2.99, + "grad_norm": 0.578125, + "learning_rate": 3.613350253002593e-05, + "loss": 0.218, + "step": 22436 + }, + { + "epoch": 2.99, + "grad_norm": 0.61328125, + "learning_rate": 3.6124542298139644e-05, + "loss": 0.3431, + "step": 22437 + }, + { + "epoch": 2.99, + "grad_norm": 0.703125, + "learning_rate": 3.6115582932445555e-05, + "loss": 0.2288, + "step": 22438 + }, + { + "epoch": 2.99, + "grad_norm": 0.65234375, + "learning_rate": 3.6106624433065174e-05, + "loss": 0.2985, + "step": 22439 + }, + { + "epoch": 2.99, + "grad_norm": 0.63671875, + "learning_rate": 3.609766680012e-05, + "loss": 0.4817, + "step": 22440 + }, + { + "epoch": 2.99, + "grad_norm": 0.61328125, + "learning_rate": 3.608871003373144e-05, + "loss": 0.352, + "step": 22441 + }, + { + "epoch": 2.99, + "grad_norm": 0.64453125, + "learning_rate": 3.6079754134020996e-05, + "loss": 0.2696, + "step": 22442 + }, + { + "epoch": 2.99, + "grad_norm": 0.5625, + "learning_rate": 3.607079910111013e-05, + "loss": 0.2643, + "step": 22443 + }, + { + "epoch": 2.99, + "grad_norm": 0.7734375, + "learning_rate": 3.6061844935120235e-05, + "loss": 0.4209, + "step": 22444 + }, + { + "epoch": 3.0, + "grad_norm": 0.79296875, + "learning_rate": 3.605289163617279e-05, + "loss": 0.2061, + "step": 22445 + }, + { + "epoch": 3.0, + "grad_norm": 0.49609375, + "learning_rate": 3.604393920438914e-05, + "loss": 0.228, + "step": 22446 + }, + { + "epoch": 3.0, + "grad_norm": 0.6640625, + "learning_rate": 3.603498763989073e-05, + "loss": 0.424, + "step": 22447 + }, + { + "epoch": 3.0, + "grad_norm": 0.64453125, + "learning_rate": 3.602603694279892e-05, + "loss": 0.2707, + "step": 22448 + }, + { + "epoch": 3.0, + "grad_norm": 0.5078125, + "learning_rate": 3.601708711323516e-05, + "loss": 0.266, + "step": 22449 + }, + { + "epoch": 3.0, + "grad_norm": 0.703125, + "learning_rate": 3.600813815132072e-05, + "loss": 0.3691, + "step": 22450 + }, + { + "epoch": 3.0, + "grad_norm": 0.7421875, + "learning_rate": 3.5999190057177e-05, + "loss": 0.5559, + "step": 22451 + }, + { + "epoch": 3.0, + "grad_norm": 0.734375, + "learning_rate": 3.5990242830925336e-05, + "loss": 0.3412, + "step": 22452 + }, + { + "epoch": 3.0, + "grad_norm": 0.59765625, + "learning_rate": 3.598129647268705e-05, + "loss": 0.3345, + "step": 22453 + }, + { + "epoch": 3.0, + "grad_norm": 0.486328125, + "learning_rate": 3.5972350982583526e-05, + "loss": 0.3157, + "step": 22454 + }, + { + "epoch": 3.0, + "grad_norm": 0.5625, + "learning_rate": 3.596340636073601e-05, + "loss": 0.212, + "step": 22455 + }, + { + "epoch": 3.0, + "grad_norm": 0.6484375, + "learning_rate": 3.595446260726576e-05, + "loss": 0.2053, + "step": 22456 + }, + { + "epoch": 3.0, + "grad_norm": 0.447265625, + "learning_rate": 3.5945519722294093e-05, + "loss": 0.2109, + "step": 22457 + }, + { + "epoch": 3.0, + "grad_norm": 0.7109375, + "learning_rate": 3.593657770594231e-05, + "loss": 0.4085, + "step": 22458 + }, + { + "epoch": 3.0, + "grad_norm": 0.6015625, + "learning_rate": 3.592763655833167e-05, + "loss": 0.1479, + "step": 22459 + }, + { + "epoch": 3.0, + "grad_norm": 0.396484375, + "learning_rate": 3.591869627958338e-05, + "loss": 0.1758, + "step": 22460 + }, + { + "epoch": 3.0, + "grad_norm": 0.4609375, + "learning_rate": 3.5909756869818675e-05, + "loss": 0.1772, + "step": 22461 + }, + { + "epoch": 3.0, + "grad_norm": 0.70703125, + "learning_rate": 3.59008183291588e-05, + "loss": 0.408, + "step": 22462 + }, + { + "epoch": 3.0, + "grad_norm": 0.5546875, + "learning_rate": 3.589188065772502e-05, + "loss": 0.2895, + "step": 22463 + }, + { + "epoch": 3.0, + "grad_norm": 0.7421875, + "learning_rate": 3.588294385563844e-05, + "loss": 0.3553, + "step": 22464 + }, + { + "epoch": 3.0, + "grad_norm": 0.61328125, + "learning_rate": 3.587400792302028e-05, + "loss": 0.5662, + "step": 22465 + }, + { + "epoch": 3.0, + "grad_norm": 0.77734375, + "learning_rate": 3.5865072859991745e-05, + "loss": 0.1937, + "step": 22466 + }, + { + "epoch": 3.0, + "grad_norm": 0.65234375, + "learning_rate": 3.585613866667401e-05, + "loss": 0.4729, + "step": 22467 + }, + { + "epoch": 3.0, + "grad_norm": 0.61328125, + "learning_rate": 3.5847205343188204e-05, + "loss": 0.3812, + "step": 22468 + }, + { + "epoch": 3.0, + "grad_norm": 0.67578125, + "learning_rate": 3.583827288965543e-05, + "loss": 0.2222, + "step": 22469 + }, + { + "epoch": 3.0, + "grad_norm": 0.53515625, + "learning_rate": 3.582934130619685e-05, + "loss": 0.3512, + "step": 22470 + }, + { + "epoch": 3.0, + "grad_norm": 0.51953125, + "learning_rate": 3.5820410592933594e-05, + "loss": 0.2052, + "step": 22471 + }, + { + "epoch": 3.0, + "grad_norm": 0.66796875, + "learning_rate": 3.581148074998679e-05, + "loss": 0.279, + "step": 22472 + }, + { + "epoch": 3.0, + "grad_norm": 0.88671875, + "learning_rate": 3.5802551777477476e-05, + "loss": 0.3241, + "step": 22473 + }, + { + "epoch": 3.0, + "grad_norm": 0.62890625, + "learning_rate": 3.579362367552676e-05, + "loss": 0.4609, + "step": 22474 + }, + { + "epoch": 3.0, + "grad_norm": 0.6640625, + "learning_rate": 3.578469644425571e-05, + "loss": 0.3058, + "step": 22475 + }, + { + "epoch": 3.0, + "grad_norm": 0.6640625, + "learning_rate": 3.5775770083785384e-05, + "loss": 0.2929, + "step": 22476 + }, + { + "epoch": 3.0, + "grad_norm": 0.62109375, + "learning_rate": 3.5766844594236884e-05, + "loss": 0.4971, + "step": 22477 + }, + { + "epoch": 3.0, + "grad_norm": 0.5546875, + "learning_rate": 3.5757919975731144e-05, + "loss": 0.3009, + "step": 22478 + }, + { + "epoch": 3.0, + "grad_norm": 0.5078125, + "learning_rate": 3.574899622838928e-05, + "loss": 0.2798, + "step": 22479 + }, + { + "epoch": 3.0, + "grad_norm": 0.68359375, + "learning_rate": 3.574007335233224e-05, + "loss": 0.5972, + "step": 22480 + }, + { + "epoch": 3.0, + "grad_norm": 0.58984375, + "learning_rate": 3.573115134768104e-05, + "loss": 0.2629, + "step": 22481 + }, + { + "epoch": 3.0, + "grad_norm": 0.5625, + "learning_rate": 3.572223021455671e-05, + "loss": 0.3197, + "step": 22482 + }, + { + "epoch": 3.0, + "grad_norm": 0.6328125, + "learning_rate": 3.5713309953080144e-05, + "loss": 0.3617, + "step": 22483 + }, + { + "epoch": 3.0, + "grad_norm": 0.578125, + "learning_rate": 3.570439056337237e-05, + "loss": 0.5239, + "step": 22484 + }, + { + "epoch": 3.0, + "grad_norm": 0.466796875, + "learning_rate": 3.5695472045554314e-05, + "loss": 0.4153, + "step": 22485 + }, + { + "epoch": 3.0, + "grad_norm": 0.64453125, + "learning_rate": 3.568655439974697e-05, + "loss": 0.2058, + "step": 22486 + }, + { + "epoch": 3.0, + "grad_norm": 0.5703125, + "learning_rate": 3.567763762607119e-05, + "loss": 0.1714, + "step": 22487 + }, + { + "epoch": 3.0, + "grad_norm": 0.470703125, + "learning_rate": 3.5668721724647925e-05, + "loss": 0.2165, + "step": 22488 + }, + { + "epoch": 3.0, + "grad_norm": 0.7265625, + "learning_rate": 3.5659806695598076e-05, + "loss": 0.2752, + "step": 22489 + }, + { + "epoch": 3.0, + "grad_norm": 0.7421875, + "learning_rate": 3.565089253904258e-05, + "loss": 0.4431, + "step": 22490 + }, + { + "epoch": 3.0, + "grad_norm": 0.5078125, + "learning_rate": 3.564197925510227e-05, + "loss": 0.2287, + "step": 22491 + }, + { + "epoch": 3.0, + "grad_norm": 0.58984375, + "learning_rate": 3.5633066843898014e-05, + "loss": 0.5038, + "step": 22492 + }, + { + "epoch": 3.0, + "grad_norm": 0.734375, + "learning_rate": 3.562415530555066e-05, + "loss": 0.572, + "step": 22493 + }, + { + "epoch": 3.0, + "grad_norm": 0.53515625, + "learning_rate": 3.561524464018109e-05, + "loss": 0.322, + "step": 22494 + }, + { + "epoch": 3.0, + "grad_norm": 0.37890625, + "learning_rate": 3.560633484791015e-05, + "loss": 0.1734, + "step": 22495 + }, + { + "epoch": 3.0, + "grad_norm": 0.455078125, + "learning_rate": 3.559742592885861e-05, + "loss": 0.2116, + "step": 22496 + }, + { + "epoch": 3.0, + "grad_norm": 0.609375, + "learning_rate": 3.55885178831473e-05, + "loss": 0.3205, + "step": 22497 + }, + { + "epoch": 3.0, + "grad_norm": 0.7421875, + "learning_rate": 3.557961071089703e-05, + "loss": 0.3034, + "step": 22498 + }, + { + "epoch": 3.0, + "grad_norm": 0.640625, + "learning_rate": 3.557070441222862e-05, + "loss": 0.3159, + "step": 22499 + }, + { + "epoch": 3.0, + "grad_norm": 0.6953125, + "learning_rate": 3.556179898726277e-05, + "loss": 0.2139, + "step": 22500 + }, + { + "epoch": 3.0, + "grad_norm": 0.466796875, + "learning_rate": 3.555289443612029e-05, + "loss": 0.2593, + "step": 22501 + }, + { + "epoch": 3.0, + "grad_norm": 0.8046875, + "learning_rate": 3.554399075892192e-05, + "loss": 0.4766, + "step": 22502 + }, + { + "epoch": 3.0, + "grad_norm": 0.34375, + "learning_rate": 3.5535087955788396e-05, + "loss": 0.1402, + "step": 22503 + }, + { + "epoch": 3.0, + "grad_norm": 0.5234375, + "learning_rate": 3.552618602684049e-05, + "loss": 0.2435, + "step": 22504 + }, + { + "epoch": 3.0, + "grad_norm": 0.57421875, + "learning_rate": 3.5517284972198886e-05, + "loss": 0.517, + "step": 22505 + }, + { + "epoch": 3.0, + "grad_norm": 0.5546875, + "learning_rate": 3.550838479198424e-05, + "loss": 0.3052, + "step": 22506 + }, + { + "epoch": 3.0, + "grad_norm": 0.50390625, + "learning_rate": 3.54994854863173e-05, + "loss": 0.3408, + "step": 22507 + }, + { + "epoch": 3.0, + "grad_norm": 0.62109375, + "learning_rate": 3.549058705531871e-05, + "loss": 0.2551, + "step": 22508 + }, + { + "epoch": 3.0, + "grad_norm": 0.5390625, + "learning_rate": 3.548168949910922e-05, + "loss": 0.2217, + "step": 22509 + }, + { + "epoch": 3.0, + "grad_norm": 0.61328125, + "learning_rate": 3.547279281780938e-05, + "loss": 0.2264, + "step": 22510 + }, + { + "epoch": 3.0, + "grad_norm": 0.73046875, + "learning_rate": 3.5463897011539885e-05, + "loss": 0.4123, + "step": 22511 + }, + { + "epoch": 3.0, + "grad_norm": 0.375, + "learning_rate": 3.5455002080421364e-05, + "loss": 0.1406, + "step": 22512 + }, + { + "epoch": 3.0, + "grad_norm": 0.396484375, + "learning_rate": 3.5446108024574473e-05, + "loss": 0.1117, + "step": 22513 + }, + { + "epoch": 3.0, + "grad_norm": 0.7890625, + "learning_rate": 3.543721484411976e-05, + "loss": 0.6659, + "step": 22514 + }, + { + "epoch": 3.0, + "grad_norm": 0.515625, + "learning_rate": 3.542832253917785e-05, + "loss": 0.1681, + "step": 22515 + }, + { + "epoch": 3.0, + "grad_norm": 0.4453125, + "learning_rate": 3.541943110986936e-05, + "loss": 0.1507, + "step": 22516 + }, + { + "epoch": 3.0, + "grad_norm": 0.703125, + "learning_rate": 3.54105405563148e-05, + "loss": 0.4712, + "step": 22517 + }, + { + "epoch": 3.0, + "grad_norm": 0.51953125, + "learning_rate": 3.5401650878634784e-05, + "loss": 0.1622, + "step": 22518 + }, + { + "epoch": 3.0, + "grad_norm": 0.53515625, + "learning_rate": 3.5392762076949827e-05, + "loss": 0.3374, + "step": 22519 + }, + { + "epoch": 3.01, + "grad_norm": 0.703125, + "learning_rate": 3.538387415138047e-05, + "loss": 0.4683, + "step": 22520 + }, + { + "epoch": 3.01, + "grad_norm": 0.60546875, + "learning_rate": 3.537498710204724e-05, + "loss": 0.3512, + "step": 22521 + }, + { + "epoch": 3.01, + "grad_norm": 0.65625, + "learning_rate": 3.536610092907071e-05, + "loss": 0.3346, + "step": 22522 + }, + { + "epoch": 3.01, + "grad_norm": 0.4375, + "learning_rate": 3.5357215632571295e-05, + "loss": 0.2626, + "step": 22523 + }, + { + "epoch": 3.01, + "grad_norm": 0.62890625, + "learning_rate": 3.534833121266953e-05, + "loss": 0.5047, + "step": 22524 + }, + { + "epoch": 3.01, + "grad_norm": 0.60546875, + "learning_rate": 3.533944766948587e-05, + "loss": 0.3296, + "step": 22525 + }, + { + "epoch": 3.01, + "grad_norm": 0.51171875, + "learning_rate": 3.533056500314081e-05, + "loss": 0.2246, + "step": 22526 + }, + { + "epoch": 3.01, + "grad_norm": 0.52734375, + "learning_rate": 3.5321683213754834e-05, + "loss": 0.1399, + "step": 22527 + }, + { + "epoch": 3.01, + "grad_norm": 0.439453125, + "learning_rate": 3.531280230144833e-05, + "loss": 0.1392, + "step": 22528 + }, + { + "epoch": 3.01, + "grad_norm": 0.6484375, + "learning_rate": 3.530392226634172e-05, + "loss": 0.5064, + "step": 22529 + }, + { + "epoch": 3.01, + "grad_norm": 0.609375, + "learning_rate": 3.5295043108555435e-05, + "loss": 0.3737, + "step": 22530 + }, + { + "epoch": 3.01, + "grad_norm": 0.546875, + "learning_rate": 3.52861648282099e-05, + "loss": 0.216, + "step": 22531 + }, + { + "epoch": 3.01, + "grad_norm": 0.5390625, + "learning_rate": 3.527728742542553e-05, + "loss": 0.2804, + "step": 22532 + }, + { + "epoch": 3.01, + "grad_norm": 0.61328125, + "learning_rate": 3.526841090032266e-05, + "loss": 0.4071, + "step": 22533 + }, + { + "epoch": 3.01, + "grad_norm": 0.7578125, + "learning_rate": 3.525953525302167e-05, + "loss": 0.4643, + "step": 22534 + }, + { + "epoch": 3.01, + "grad_norm": 0.5390625, + "learning_rate": 3.525066048364294e-05, + "loss": 0.3512, + "step": 22535 + }, + { + "epoch": 3.01, + "grad_norm": 0.55078125, + "learning_rate": 3.524178659230684e-05, + "loss": 0.4717, + "step": 22536 + }, + { + "epoch": 3.01, + "grad_norm": 0.546875, + "learning_rate": 3.523291357913363e-05, + "loss": 0.2647, + "step": 22537 + }, + { + "epoch": 3.01, + "grad_norm": 0.5625, + "learning_rate": 3.5224041444243694e-05, + "loss": 0.1778, + "step": 22538 + }, + { + "epoch": 3.01, + "grad_norm": 0.65625, + "learning_rate": 3.521517018775733e-05, + "loss": 0.3531, + "step": 22539 + }, + { + "epoch": 3.01, + "grad_norm": 0.69140625, + "learning_rate": 3.5206299809794864e-05, + "loss": 0.5408, + "step": 22540 + }, + { + "epoch": 3.01, + "grad_norm": 0.55859375, + "learning_rate": 3.5197430310476564e-05, + "loss": 0.2762, + "step": 22541 + }, + { + "epoch": 3.01, + "grad_norm": 0.48046875, + "learning_rate": 3.5188561689922664e-05, + "loss": 0.3246, + "step": 22542 + }, + { + "epoch": 3.01, + "grad_norm": 0.41796875, + "learning_rate": 3.517969394825346e-05, + "loss": 0.2024, + "step": 22543 + }, + { + "epoch": 3.01, + "grad_norm": 0.671875, + "learning_rate": 3.5170827085589196e-05, + "loss": 0.405, + "step": 22544 + }, + { + "epoch": 3.01, + "grad_norm": 0.58203125, + "learning_rate": 3.516196110205018e-05, + "loss": 0.1751, + "step": 22545 + }, + { + "epoch": 3.01, + "grad_norm": 0.7109375, + "learning_rate": 3.515309599775654e-05, + "loss": 0.481, + "step": 22546 + }, + { + "epoch": 3.01, + "grad_norm": 0.48828125, + "learning_rate": 3.514423177282853e-05, + "loss": 0.2579, + "step": 22547 + }, + { + "epoch": 3.01, + "grad_norm": 0.5859375, + "learning_rate": 3.513536842738636e-05, + "loss": 0.1885, + "step": 22548 + }, + { + "epoch": 3.01, + "grad_norm": 0.490234375, + "learning_rate": 3.512650596155024e-05, + "loss": 0.2159, + "step": 22549 + }, + { + "epoch": 3.01, + "grad_norm": 0.65234375, + "learning_rate": 3.511764437544036e-05, + "loss": 0.3088, + "step": 22550 + }, + { + "epoch": 3.01, + "grad_norm": 0.70703125, + "learning_rate": 3.510878366917682e-05, + "loss": 0.2071, + "step": 22551 + }, + { + "epoch": 3.01, + "grad_norm": 0.4375, + "learning_rate": 3.5099923842879866e-05, + "loss": 0.1689, + "step": 22552 + }, + { + "epoch": 3.01, + "grad_norm": 0.48046875, + "learning_rate": 3.509106489666957e-05, + "loss": 0.3178, + "step": 22553 + }, + { + "epoch": 3.01, + "grad_norm": 0.5390625, + "learning_rate": 3.508220683066607e-05, + "loss": 0.335, + "step": 22554 + }, + { + "epoch": 3.01, + "grad_norm": 0.6328125, + "learning_rate": 3.5073349644989564e-05, + "loss": 0.3937, + "step": 22555 + }, + { + "epoch": 3.01, + "grad_norm": 0.61328125, + "learning_rate": 3.5064493339760064e-05, + "loss": 0.4028, + "step": 22556 + }, + { + "epoch": 3.01, + "grad_norm": 0.53125, + "learning_rate": 3.5055637915097714e-05, + "loss": 0.1701, + "step": 22557 + }, + { + "epoch": 3.01, + "grad_norm": 0.625, + "learning_rate": 3.504678337112259e-05, + "loss": 0.3127, + "step": 22558 + }, + { + "epoch": 3.01, + "grad_norm": 0.7734375, + "learning_rate": 3.503792970795481e-05, + "loss": 0.4887, + "step": 22559 + }, + { + "epoch": 3.01, + "grad_norm": 0.46875, + "learning_rate": 3.5029076925714354e-05, + "loss": 0.1447, + "step": 22560 + }, + { + "epoch": 3.01, + "grad_norm": 0.4765625, + "learning_rate": 3.5020225024521315e-05, + "loss": 0.1219, + "step": 22561 + }, + { + "epoch": 3.01, + "grad_norm": 0.462890625, + "learning_rate": 3.5011374004495736e-05, + "loss": 0.2304, + "step": 22562 + }, + { + "epoch": 3.01, + "grad_norm": 0.62109375, + "learning_rate": 3.5002523865757666e-05, + "loss": 0.2205, + "step": 22563 + }, + { + "epoch": 3.01, + "grad_norm": 0.5703125, + "learning_rate": 3.499367460842707e-05, + "loss": 0.3268, + "step": 22564 + }, + { + "epoch": 3.01, + "grad_norm": 0.50390625, + "learning_rate": 3.498482623262396e-05, + "loss": 0.2004, + "step": 22565 + }, + { + "epoch": 3.01, + "grad_norm": 0.5625, + "learning_rate": 3.497597873846832e-05, + "loss": 0.4231, + "step": 22566 + }, + { + "epoch": 3.01, + "grad_norm": 0.56640625, + "learning_rate": 3.4967132126080136e-05, + "loss": 0.5447, + "step": 22567 + }, + { + "epoch": 3.01, + "grad_norm": 0.4765625, + "learning_rate": 3.495828639557941e-05, + "loss": 0.3679, + "step": 22568 + }, + { + "epoch": 3.01, + "grad_norm": 0.59765625, + "learning_rate": 3.494944154708604e-05, + "loss": 0.2058, + "step": 22569 + }, + { + "epoch": 3.01, + "grad_norm": 0.65625, + "learning_rate": 3.494059758071999e-05, + "loss": 0.4814, + "step": 22570 + }, + { + "epoch": 3.01, + "grad_norm": 0.5546875, + "learning_rate": 3.493175449660119e-05, + "loss": 0.2379, + "step": 22571 + }, + { + "epoch": 3.01, + "grad_norm": 0.453125, + "learning_rate": 3.49229122948496e-05, + "loss": 0.1739, + "step": 22572 + }, + { + "epoch": 3.01, + "grad_norm": 0.36328125, + "learning_rate": 3.491407097558503e-05, + "loss": 0.1013, + "step": 22573 + }, + { + "epoch": 3.01, + "grad_norm": 0.4375, + "learning_rate": 3.490523053892746e-05, + "loss": 0.309, + "step": 22574 + }, + { + "epoch": 3.01, + "grad_norm": 0.51171875, + "learning_rate": 3.489639098499672e-05, + "loss": 0.326, + "step": 22575 + }, + { + "epoch": 3.01, + "grad_norm": 0.5234375, + "learning_rate": 3.4887552313912745e-05, + "loss": 0.323, + "step": 22576 + }, + { + "epoch": 3.01, + "grad_norm": 0.43359375, + "learning_rate": 3.487871452579531e-05, + "loss": 0.1708, + "step": 22577 + }, + { + "epoch": 3.01, + "grad_norm": 0.56640625, + "learning_rate": 3.486987762076434e-05, + "loss": 0.2086, + "step": 22578 + }, + { + "epoch": 3.01, + "grad_norm": 0.46875, + "learning_rate": 3.486104159893958e-05, + "loss": 0.2318, + "step": 22579 + }, + { + "epoch": 3.01, + "grad_norm": 0.64453125, + "learning_rate": 3.485220646044092e-05, + "loss": 0.3095, + "step": 22580 + }, + { + "epoch": 3.01, + "grad_norm": 0.59375, + "learning_rate": 3.4843372205388145e-05, + "loss": 0.2801, + "step": 22581 + }, + { + "epoch": 3.01, + "grad_norm": 0.57421875, + "learning_rate": 3.48345388339011e-05, + "loss": 0.3369, + "step": 22582 + }, + { + "epoch": 3.01, + "grad_norm": 0.69140625, + "learning_rate": 3.482570634609948e-05, + "loss": 0.2106, + "step": 22583 + }, + { + "epoch": 3.01, + "grad_norm": 0.51953125, + "learning_rate": 3.4816874742103133e-05, + "loss": 0.2187, + "step": 22584 + }, + { + "epoch": 3.01, + "grad_norm": 0.51171875, + "learning_rate": 3.4808044022031784e-05, + "loss": 0.3127, + "step": 22585 + }, + { + "epoch": 3.01, + "grad_norm": 0.52734375, + "learning_rate": 3.479921418600524e-05, + "loss": 0.3318, + "step": 22586 + }, + { + "epoch": 3.01, + "grad_norm": 0.7109375, + "learning_rate": 3.479038523414317e-05, + "loss": 0.4965, + "step": 22587 + }, + { + "epoch": 3.01, + "grad_norm": 0.546875, + "learning_rate": 3.4781557166565304e-05, + "loss": 0.3273, + "step": 22588 + }, + { + "epoch": 3.01, + "grad_norm": 0.494140625, + "learning_rate": 3.477272998339143e-05, + "loss": 0.3148, + "step": 22589 + }, + { + "epoch": 3.01, + "grad_norm": 0.640625, + "learning_rate": 3.476390368474116e-05, + "loss": 0.3883, + "step": 22590 + }, + { + "epoch": 3.01, + "grad_norm": 0.515625, + "learning_rate": 3.475507827073426e-05, + "loss": 0.1503, + "step": 22591 + }, + { + "epoch": 3.01, + "grad_norm": 0.51171875, + "learning_rate": 3.474625374149034e-05, + "loss": 0.3428, + "step": 22592 + }, + { + "epoch": 3.01, + "grad_norm": 0.48046875, + "learning_rate": 3.47374300971291e-05, + "loss": 0.2413, + "step": 22593 + }, + { + "epoch": 3.01, + "grad_norm": 0.55078125, + "learning_rate": 3.4728607337770194e-05, + "loss": 0.204, + "step": 22594 + }, + { + "epoch": 3.02, + "grad_norm": 0.671875, + "learning_rate": 3.4719785463533297e-05, + "loss": 0.2736, + "step": 22595 + }, + { + "epoch": 3.02, + "grad_norm": 0.490234375, + "learning_rate": 3.4710964474537966e-05, + "loss": 0.2345, + "step": 22596 + }, + { + "epoch": 3.02, + "grad_norm": 0.734375, + "learning_rate": 3.4702144370903864e-05, + "loss": 0.3382, + "step": 22597 + }, + { + "epoch": 3.02, + "grad_norm": 0.5, + "learning_rate": 3.469332515275059e-05, + "loss": 0.2351, + "step": 22598 + }, + { + "epoch": 3.02, + "grad_norm": 0.419921875, + "learning_rate": 3.468450682019775e-05, + "loss": 0.2608, + "step": 22599 + }, + { + "epoch": 3.02, + "grad_norm": 0.462890625, + "learning_rate": 3.467568937336494e-05, + "loss": 0.1973, + "step": 22600 + }, + { + "epoch": 3.02, + "grad_norm": 0.4765625, + "learning_rate": 3.466687281237171e-05, + "loss": 0.1978, + "step": 22601 + }, + { + "epoch": 3.02, + "grad_norm": 0.59375, + "learning_rate": 3.465805713733757e-05, + "loss": 0.3204, + "step": 22602 + }, + { + "epoch": 3.02, + "grad_norm": 0.482421875, + "learning_rate": 3.4649242348382125e-05, + "loss": 0.2044, + "step": 22603 + }, + { + "epoch": 3.02, + "grad_norm": 0.52734375, + "learning_rate": 3.464042844562489e-05, + "loss": 0.266, + "step": 22604 + }, + { + "epoch": 3.02, + "grad_norm": 0.5703125, + "learning_rate": 3.463161542918544e-05, + "loss": 0.2514, + "step": 22605 + }, + { + "epoch": 3.02, + "grad_norm": 0.55859375, + "learning_rate": 3.462280329918318e-05, + "loss": 0.2696, + "step": 22606 + }, + { + "epoch": 3.02, + "grad_norm": 0.46484375, + "learning_rate": 3.4613992055737684e-05, + "loss": 0.1734, + "step": 22607 + }, + { + "epoch": 3.02, + "grad_norm": 0.50390625, + "learning_rate": 3.460518169896842e-05, + "loss": 0.1916, + "step": 22608 + }, + { + "epoch": 3.02, + "grad_norm": 0.53125, + "learning_rate": 3.4596372228994886e-05, + "loss": 0.2002, + "step": 22609 + }, + { + "epoch": 3.02, + "grad_norm": 0.65625, + "learning_rate": 3.4587563645936497e-05, + "loss": 0.535, + "step": 22610 + }, + { + "epoch": 3.02, + "grad_norm": 0.486328125, + "learning_rate": 3.457875594991272e-05, + "loss": 0.3147, + "step": 22611 + }, + { + "epoch": 3.02, + "grad_norm": 0.51953125, + "learning_rate": 3.4569949141043e-05, + "loss": 0.1608, + "step": 22612 + }, + { + "epoch": 3.02, + "grad_norm": 0.412109375, + "learning_rate": 3.4561143219446803e-05, + "loss": 0.1814, + "step": 22613 + }, + { + "epoch": 3.02, + "grad_norm": 0.77734375, + "learning_rate": 3.455233818524349e-05, + "loss": 0.3869, + "step": 22614 + }, + { + "epoch": 3.02, + "grad_norm": 0.51171875, + "learning_rate": 3.454353403855245e-05, + "loss": 0.1805, + "step": 22615 + }, + { + "epoch": 3.02, + "grad_norm": 0.58984375, + "learning_rate": 3.45347307794931e-05, + "loss": 0.253, + "step": 22616 + }, + { + "epoch": 3.02, + "grad_norm": 0.59375, + "learning_rate": 3.45259284081848e-05, + "loss": 0.22, + "step": 22617 + }, + { + "epoch": 3.02, + "grad_norm": 0.408203125, + "learning_rate": 3.451712692474698e-05, + "loss": 0.1656, + "step": 22618 + }, + { + "epoch": 3.02, + "grad_norm": 0.53125, + "learning_rate": 3.450832632929891e-05, + "loss": 0.1094, + "step": 22619 + }, + { + "epoch": 3.02, + "grad_norm": 0.7421875, + "learning_rate": 3.4499526621959964e-05, + "loss": 0.7886, + "step": 22620 + }, + { + "epoch": 3.02, + "grad_norm": 0.7109375, + "learning_rate": 3.449072780284948e-05, + "loss": 0.3556, + "step": 22621 + }, + { + "epoch": 3.02, + "grad_norm": 0.796875, + "learning_rate": 3.448192987208677e-05, + "loss": 0.3711, + "step": 22622 + }, + { + "epoch": 3.02, + "grad_norm": 0.5703125, + "learning_rate": 3.4473132829791166e-05, + "loss": 0.1717, + "step": 22623 + }, + { + "epoch": 3.02, + "grad_norm": 0.54296875, + "learning_rate": 3.446433667608191e-05, + "loss": 0.2615, + "step": 22624 + }, + { + "epoch": 3.02, + "grad_norm": 0.5, + "learning_rate": 3.445554141107834e-05, + "loss": 0.1896, + "step": 22625 + }, + { + "epoch": 3.02, + "grad_norm": 0.453125, + "learning_rate": 3.444674703489966e-05, + "loss": 0.3073, + "step": 22626 + }, + { + "epoch": 3.02, + "grad_norm": 0.61328125, + "learning_rate": 3.443795354766517e-05, + "loss": 0.285, + "step": 22627 + }, + { + "epoch": 3.02, + "grad_norm": 0.7265625, + "learning_rate": 3.4429160949494135e-05, + "loss": 0.5379, + "step": 22628 + }, + { + "epoch": 3.02, + "grad_norm": 0.5234375, + "learning_rate": 3.4420369240505724e-05, + "loss": 0.1555, + "step": 22629 + }, + { + "epoch": 3.02, + "grad_norm": 0.66015625, + "learning_rate": 3.441157842081919e-05, + "loss": 0.3527, + "step": 22630 + }, + { + "epoch": 3.02, + "grad_norm": 0.498046875, + "learning_rate": 3.440278849055376e-05, + "loss": 0.316, + "step": 22631 + }, + { + "epoch": 3.02, + "grad_norm": 0.73046875, + "learning_rate": 3.439399944982865e-05, + "loss": 0.5081, + "step": 22632 + }, + { + "epoch": 3.02, + "grad_norm": 0.765625, + "learning_rate": 3.4385211298762975e-05, + "loss": 0.3845, + "step": 22633 + }, + { + "epoch": 3.02, + "grad_norm": 0.5546875, + "learning_rate": 3.437642403747594e-05, + "loss": 0.4199, + "step": 22634 + }, + { + "epoch": 3.02, + "grad_norm": 0.455078125, + "learning_rate": 3.4367637666086714e-05, + "loss": 0.2121, + "step": 22635 + }, + { + "epoch": 3.02, + "grad_norm": 0.61328125, + "learning_rate": 3.435885218471448e-05, + "loss": 0.3725, + "step": 22636 + }, + { + "epoch": 3.02, + "grad_norm": 0.53515625, + "learning_rate": 3.4350067593478356e-05, + "loss": 0.4729, + "step": 22637 + }, + { + "epoch": 3.02, + "grad_norm": 0.6328125, + "learning_rate": 3.434128389249739e-05, + "loss": 0.3923, + "step": 22638 + }, + { + "epoch": 3.02, + "grad_norm": 0.6796875, + "learning_rate": 3.4332501081890766e-05, + "loss": 0.2533, + "step": 22639 + }, + { + "epoch": 3.02, + "grad_norm": 0.6484375, + "learning_rate": 3.432371916177757e-05, + "loss": 0.3737, + "step": 22640 + }, + { + "epoch": 3.02, + "grad_norm": 0.76953125, + "learning_rate": 3.431493813227693e-05, + "loss": 0.4859, + "step": 22641 + }, + { + "epoch": 3.02, + "grad_norm": 0.8671875, + "learning_rate": 3.430615799350785e-05, + "loss": 0.4831, + "step": 22642 + }, + { + "epoch": 3.02, + "grad_norm": 0.423828125, + "learning_rate": 3.429737874558942e-05, + "loss": 0.1915, + "step": 22643 + }, + { + "epoch": 3.02, + "grad_norm": 0.59375, + "learning_rate": 3.4288600388640714e-05, + "loss": 0.175, + "step": 22644 + }, + { + "epoch": 3.02, + "grad_norm": 0.60546875, + "learning_rate": 3.427982292278079e-05, + "loss": 0.3862, + "step": 22645 + }, + { + "epoch": 3.02, + "grad_norm": 0.69140625, + "learning_rate": 3.4271046348128614e-05, + "loss": 0.3783, + "step": 22646 + }, + { + "epoch": 3.02, + "grad_norm": 0.65234375, + "learning_rate": 3.426227066480323e-05, + "loss": 0.33, + "step": 22647 + }, + { + "epoch": 3.02, + "grad_norm": 0.5078125, + "learning_rate": 3.425349587292365e-05, + "loss": 0.3858, + "step": 22648 + }, + { + "epoch": 3.02, + "grad_norm": 0.60546875, + "learning_rate": 3.4244721972608894e-05, + "loss": 0.4233, + "step": 22649 + }, + { + "epoch": 3.02, + "grad_norm": 0.703125, + "learning_rate": 3.4235948963977873e-05, + "loss": 0.5346, + "step": 22650 + }, + { + "epoch": 3.02, + "grad_norm": 0.54296875, + "learning_rate": 3.422717684714963e-05, + "loss": 0.273, + "step": 22651 + }, + { + "epoch": 3.02, + "grad_norm": 0.6640625, + "learning_rate": 3.4218405622243045e-05, + "loss": 0.3533, + "step": 22652 + }, + { + "epoch": 3.02, + "grad_norm": 0.578125, + "learning_rate": 3.42096352893771e-05, + "loss": 0.331, + "step": 22653 + }, + { + "epoch": 3.02, + "grad_norm": 0.7890625, + "learning_rate": 3.420086584867073e-05, + "loss": 0.5011, + "step": 22654 + }, + { + "epoch": 3.02, + "grad_norm": 0.53125, + "learning_rate": 3.419209730024286e-05, + "loss": 0.2998, + "step": 22655 + }, + { + "epoch": 3.02, + "grad_norm": 0.53125, + "learning_rate": 3.418332964421237e-05, + "loss": 0.5604, + "step": 22656 + }, + { + "epoch": 3.02, + "grad_norm": 0.8125, + "learning_rate": 3.417456288069817e-05, + "loss": 0.3271, + "step": 22657 + }, + { + "epoch": 3.02, + "grad_norm": 0.4921875, + "learning_rate": 3.416579700981915e-05, + "loss": 0.1799, + "step": 22658 + }, + { + "epoch": 3.02, + "grad_norm": 0.60546875, + "learning_rate": 3.415703203169419e-05, + "loss": 0.2223, + "step": 22659 + }, + { + "epoch": 3.02, + "grad_norm": 0.57421875, + "learning_rate": 3.414826794644211e-05, + "loss": 0.1897, + "step": 22660 + }, + { + "epoch": 3.02, + "grad_norm": 0.6171875, + "learning_rate": 3.413950475418182e-05, + "loss": 0.3631, + "step": 22661 + }, + { + "epoch": 3.02, + "grad_norm": 0.5546875, + "learning_rate": 3.413074245503207e-05, + "loss": 0.2969, + "step": 22662 + }, + { + "epoch": 3.02, + "grad_norm": 0.55859375, + "learning_rate": 3.412198104911174e-05, + "loss": 0.3375, + "step": 22663 + }, + { + "epoch": 3.02, + "grad_norm": 0.56640625, + "learning_rate": 3.411322053653965e-05, + "loss": 0.2423, + "step": 22664 + }, + { + "epoch": 3.02, + "grad_norm": 0.609375, + "learning_rate": 3.410446091743455e-05, + "loss": 0.3291, + "step": 22665 + }, + { + "epoch": 3.02, + "grad_norm": 0.427734375, + "learning_rate": 3.4095702191915235e-05, + "loss": 0.1561, + "step": 22666 + }, + { + "epoch": 3.02, + "grad_norm": 0.486328125, + "learning_rate": 3.4086944360100514e-05, + "loss": 0.4103, + "step": 22667 + }, + { + "epoch": 3.02, + "grad_norm": 0.6875, + "learning_rate": 3.407818742210915e-05, + "loss": 0.2545, + "step": 22668 + }, + { + "epoch": 3.02, + "grad_norm": 0.546875, + "learning_rate": 3.406943137805985e-05, + "loss": 0.2286, + "step": 22669 + }, + { + "epoch": 3.03, + "grad_norm": 0.80859375, + "learning_rate": 3.4060676228071365e-05, + "loss": 0.4513, + "step": 22670 + }, + { + "epoch": 3.03, + "grad_norm": 0.482421875, + "learning_rate": 3.4051921972262434e-05, + "loss": 0.2716, + "step": 22671 + }, + { + "epoch": 3.03, + "grad_norm": 0.7265625, + "learning_rate": 3.4043168610751764e-05, + "loss": 0.5299, + "step": 22672 + }, + { + "epoch": 3.03, + "grad_norm": 0.56640625, + "learning_rate": 3.4034416143658134e-05, + "loss": 0.3116, + "step": 22673 + }, + { + "epoch": 3.03, + "grad_norm": 0.91015625, + "learning_rate": 3.402566457110008e-05, + "loss": 0.2458, + "step": 22674 + }, + { + "epoch": 3.03, + "grad_norm": 0.61328125, + "learning_rate": 3.401691389319637e-05, + "loss": 0.3452, + "step": 22675 + }, + { + "epoch": 3.03, + "grad_norm": 0.66796875, + "learning_rate": 3.400816411006564e-05, + "loss": 0.364, + "step": 22676 + }, + { + "epoch": 3.03, + "grad_norm": 0.58203125, + "learning_rate": 3.399941522182657e-05, + "loss": 0.2289, + "step": 22677 + }, + { + "epoch": 3.03, + "grad_norm": 0.451171875, + "learning_rate": 3.399066722859782e-05, + "loss": 0.3102, + "step": 22678 + }, + { + "epoch": 3.03, + "grad_norm": 0.494140625, + "learning_rate": 3.398192013049794e-05, + "loss": 0.1573, + "step": 22679 + }, + { + "epoch": 3.03, + "grad_norm": 0.55078125, + "learning_rate": 3.397317392764561e-05, + "loss": 0.2192, + "step": 22680 + }, + { + "epoch": 3.03, + "grad_norm": 0.55078125, + "learning_rate": 3.396442862015941e-05, + "loss": 0.318, + "step": 22681 + }, + { + "epoch": 3.03, + "grad_norm": 0.625, + "learning_rate": 3.395568420815797e-05, + "loss": 0.4037, + "step": 22682 + }, + { + "epoch": 3.03, + "grad_norm": 0.625, + "learning_rate": 3.394694069175981e-05, + "loss": 0.2135, + "step": 22683 + }, + { + "epoch": 3.03, + "grad_norm": 0.81640625, + "learning_rate": 3.393819807108353e-05, + "loss": 0.7142, + "step": 22684 + }, + { + "epoch": 3.03, + "grad_norm": 0.439453125, + "learning_rate": 3.392945634624767e-05, + "loss": 0.2507, + "step": 22685 + }, + { + "epoch": 3.03, + "grad_norm": 0.5859375, + "learning_rate": 3.3920715517370824e-05, + "loss": 0.2292, + "step": 22686 + }, + { + "epoch": 3.03, + "grad_norm": 0.4921875, + "learning_rate": 3.391197558457149e-05, + "loss": 0.3955, + "step": 22687 + }, + { + "epoch": 3.03, + "grad_norm": 0.71875, + "learning_rate": 3.390323654796814e-05, + "loss": 0.3178, + "step": 22688 + }, + { + "epoch": 3.03, + "grad_norm": 0.55859375, + "learning_rate": 3.389449840767933e-05, + "loss": 0.1688, + "step": 22689 + }, + { + "epoch": 3.03, + "grad_norm": 0.498046875, + "learning_rate": 3.388576116382355e-05, + "loss": 0.1933, + "step": 22690 + }, + { + "epoch": 3.03, + "grad_norm": 0.58984375, + "learning_rate": 3.387702481651931e-05, + "loss": 0.4193, + "step": 22691 + }, + { + "epoch": 3.03, + "grad_norm": 0.62890625, + "learning_rate": 3.386828936588502e-05, + "loss": 0.4577, + "step": 22692 + }, + { + "epoch": 3.03, + "grad_norm": 0.68359375, + "learning_rate": 3.3859554812039175e-05, + "loss": 0.3697, + "step": 22693 + }, + { + "epoch": 3.03, + "grad_norm": 0.640625, + "learning_rate": 3.385082115510021e-05, + "loss": 0.2186, + "step": 22694 + }, + { + "epoch": 3.03, + "grad_norm": 0.5, + "learning_rate": 3.3842088395186555e-05, + "loss": 0.2978, + "step": 22695 + }, + { + "epoch": 3.03, + "grad_norm": 0.4609375, + "learning_rate": 3.383335653241668e-05, + "loss": 0.1467, + "step": 22696 + }, + { + "epoch": 3.03, + "grad_norm": 0.65234375, + "learning_rate": 3.382462556690893e-05, + "loss": 0.3878, + "step": 22697 + }, + { + "epoch": 3.03, + "grad_norm": 0.54296875, + "learning_rate": 3.381589549878176e-05, + "loss": 0.2025, + "step": 22698 + }, + { + "epoch": 3.03, + "grad_norm": 0.63671875, + "learning_rate": 3.3807166328153484e-05, + "loss": 0.4475, + "step": 22699 + }, + { + "epoch": 3.03, + "grad_norm": 0.625, + "learning_rate": 3.379843805514251e-05, + "loss": 0.3772, + "step": 22700 + }, + { + "epoch": 3.03, + "grad_norm": 0.6796875, + "learning_rate": 3.378971067986725e-05, + "loss": 0.2898, + "step": 22701 + }, + { + "epoch": 3.03, + "grad_norm": 0.64453125, + "learning_rate": 3.378098420244597e-05, + "loss": 0.4968, + "step": 22702 + }, + { + "epoch": 3.03, + "grad_norm": 0.72265625, + "learning_rate": 3.377225862299704e-05, + "loss": 0.4547, + "step": 22703 + }, + { + "epoch": 3.03, + "grad_norm": 0.6953125, + "learning_rate": 3.376353394163879e-05, + "loss": 0.3713, + "step": 22704 + }, + { + "epoch": 3.03, + "grad_norm": 0.462890625, + "learning_rate": 3.375481015848956e-05, + "loss": 0.2957, + "step": 22705 + }, + { + "epoch": 3.03, + "grad_norm": 0.7734375, + "learning_rate": 3.3746087273667593e-05, + "loss": 0.6765, + "step": 22706 + }, + { + "epoch": 3.03, + "grad_norm": 0.99609375, + "learning_rate": 3.37373652872912e-05, + "loss": 0.6082, + "step": 22707 + }, + { + "epoch": 3.03, + "grad_norm": 0.640625, + "learning_rate": 3.3728644199478666e-05, + "loss": 0.4362, + "step": 22708 + }, + { + "epoch": 3.03, + "grad_norm": 0.53515625, + "learning_rate": 3.371992401034827e-05, + "loss": 0.1298, + "step": 22709 + }, + { + "epoch": 3.03, + "grad_norm": 0.5859375, + "learning_rate": 3.371120472001825e-05, + "loss": 0.3142, + "step": 22710 + }, + { + "epoch": 3.03, + "grad_norm": 0.353515625, + "learning_rate": 3.370248632860681e-05, + "loss": 0.1643, + "step": 22711 + }, + { + "epoch": 3.03, + "grad_norm": 0.69921875, + "learning_rate": 3.36937688362322e-05, + "loss": 0.458, + "step": 22712 + }, + { + "epoch": 3.03, + "grad_norm": 0.6875, + "learning_rate": 3.368505224301264e-05, + "loss": 0.1386, + "step": 22713 + }, + { + "epoch": 3.03, + "grad_norm": 0.474609375, + "learning_rate": 3.367633654906637e-05, + "loss": 0.2137, + "step": 22714 + }, + { + "epoch": 3.03, + "grad_norm": 0.46484375, + "learning_rate": 3.3667621754511505e-05, + "loss": 0.3782, + "step": 22715 + }, + { + "epoch": 3.03, + "grad_norm": 0.57421875, + "learning_rate": 3.365890785946626e-05, + "loss": 0.4305, + "step": 22716 + }, + { + "epoch": 3.03, + "grad_norm": 0.79296875, + "learning_rate": 3.36501948640488e-05, + "loss": 0.505, + "step": 22717 + }, + { + "epoch": 3.03, + "grad_norm": 0.62890625, + "learning_rate": 3.364148276837731e-05, + "loss": 0.2691, + "step": 22718 + }, + { + "epoch": 3.03, + "grad_norm": 0.5078125, + "learning_rate": 3.363277157256988e-05, + "loss": 0.2791, + "step": 22719 + }, + { + "epoch": 3.03, + "grad_norm": 0.6484375, + "learning_rate": 3.362406127674465e-05, + "loss": 0.3768, + "step": 22720 + }, + { + "epoch": 3.03, + "grad_norm": 0.6171875, + "learning_rate": 3.361535188101976e-05, + "loss": 0.2514, + "step": 22721 + }, + { + "epoch": 3.03, + "grad_norm": 0.546875, + "learning_rate": 3.360664338551334e-05, + "loss": 0.217, + "step": 22722 + }, + { + "epoch": 3.03, + "grad_norm": 0.46875, + "learning_rate": 3.3597935790343405e-05, + "loss": 0.3149, + "step": 22723 + }, + { + "epoch": 3.03, + "grad_norm": 0.5625, + "learning_rate": 3.358922909562811e-05, + "loss": 0.2981, + "step": 22724 + }, + { + "epoch": 3.03, + "grad_norm": 0.435546875, + "learning_rate": 3.358052330148546e-05, + "loss": 0.2174, + "step": 22725 + }, + { + "epoch": 3.03, + "grad_norm": 0.48046875, + "learning_rate": 3.3571818408033537e-05, + "loss": 0.1637, + "step": 22726 + }, + { + "epoch": 3.03, + "grad_norm": 0.74609375, + "learning_rate": 3.3563114415390394e-05, + "loss": 0.3655, + "step": 22727 + }, + { + "epoch": 3.03, + "grad_norm": 0.66796875, + "learning_rate": 3.355441132367409e-05, + "loss": 0.3944, + "step": 22728 + }, + { + "epoch": 3.03, + "grad_norm": 0.87109375, + "learning_rate": 3.354570913300258e-05, + "loss": 0.2463, + "step": 22729 + }, + { + "epoch": 3.03, + "grad_norm": 0.78125, + "learning_rate": 3.353700784349391e-05, + "loss": 0.5259, + "step": 22730 + }, + { + "epoch": 3.03, + "grad_norm": 0.765625, + "learning_rate": 3.352830745526606e-05, + "loss": 0.6285, + "step": 22731 + }, + { + "epoch": 3.03, + "grad_norm": 0.625, + "learning_rate": 3.3519607968437064e-05, + "loss": 0.4945, + "step": 22732 + }, + { + "epoch": 3.03, + "grad_norm": 0.69921875, + "learning_rate": 3.351090938312481e-05, + "loss": 0.4312, + "step": 22733 + }, + { + "epoch": 3.03, + "grad_norm": 0.81640625, + "learning_rate": 3.350221169944734e-05, + "loss": 0.3778, + "step": 22734 + }, + { + "epoch": 3.03, + "grad_norm": 0.455078125, + "learning_rate": 3.349351491752252e-05, + "loss": 0.2067, + "step": 22735 + }, + { + "epoch": 3.03, + "grad_norm": 0.5625, + "learning_rate": 3.3484819037468306e-05, + "loss": 0.2283, + "step": 22736 + }, + { + "epoch": 3.03, + "grad_norm": 0.45703125, + "learning_rate": 3.3476124059402694e-05, + "loss": 0.2148, + "step": 22737 + }, + { + "epoch": 3.03, + "grad_norm": 0.49609375, + "learning_rate": 3.346742998344348e-05, + "loss": 0.274, + "step": 22738 + }, + { + "epoch": 3.03, + "grad_norm": 0.671875, + "learning_rate": 3.345873680970863e-05, + "loss": 0.2781, + "step": 22739 + }, + { + "epoch": 3.03, + "grad_norm": 0.5625, + "learning_rate": 3.345004453831601e-05, + "loss": 0.3578, + "step": 22740 + }, + { + "epoch": 3.03, + "grad_norm": 0.5546875, + "learning_rate": 3.344135316938353e-05, + "loss": 0.1763, + "step": 22741 + }, + { + "epoch": 3.03, + "grad_norm": 0.65234375, + "learning_rate": 3.343266270302898e-05, + "loss": 0.2901, + "step": 22742 + }, + { + "epoch": 3.03, + "grad_norm": 0.625, + "learning_rate": 3.342397313937025e-05, + "loss": 0.3728, + "step": 22743 + }, + { + "epoch": 3.03, + "grad_norm": 0.54296875, + "learning_rate": 3.341528447852518e-05, + "loss": 0.4809, + "step": 22744 + }, + { + "epoch": 3.04, + "grad_norm": 0.5546875, + "learning_rate": 3.3406596720611574e-05, + "loss": 0.1738, + "step": 22745 + }, + { + "epoch": 3.04, + "grad_norm": 0.51953125, + "learning_rate": 3.339790986574733e-05, + "loss": 0.2616, + "step": 22746 + }, + { + "epoch": 3.04, + "grad_norm": 0.52734375, + "learning_rate": 3.3389223914050104e-05, + "loss": 0.1581, + "step": 22747 + }, + { + "epoch": 3.04, + "grad_norm": 0.69140625, + "learning_rate": 3.3380538865637756e-05, + "loss": 0.2977, + "step": 22748 + }, + { + "epoch": 3.04, + "grad_norm": 0.5546875, + "learning_rate": 3.337185472062805e-05, + "loss": 0.2597, + "step": 22749 + }, + { + "epoch": 3.04, + "grad_norm": 0.765625, + "learning_rate": 3.336317147913877e-05, + "loss": 0.4961, + "step": 22750 + }, + { + "epoch": 3.04, + "grad_norm": 0.7109375, + "learning_rate": 3.335448914128767e-05, + "loss": 0.3624, + "step": 22751 + }, + { + "epoch": 3.04, + "grad_norm": 0.40234375, + "learning_rate": 3.334580770719244e-05, + "loss": 0.2254, + "step": 22752 + }, + { + "epoch": 3.04, + "grad_norm": 0.5078125, + "learning_rate": 3.333712717697084e-05, + "loss": 0.2397, + "step": 22753 + }, + { + "epoch": 3.04, + "grad_norm": 0.78125, + "learning_rate": 3.332844755074057e-05, + "loss": 0.3508, + "step": 22754 + }, + { + "epoch": 3.04, + "grad_norm": 0.443359375, + "learning_rate": 3.331976882861938e-05, + "loss": 0.1581, + "step": 22755 + }, + { + "epoch": 3.04, + "grad_norm": 0.5078125, + "learning_rate": 3.3311091010724896e-05, + "loss": 0.2365, + "step": 22756 + }, + { + "epoch": 3.04, + "grad_norm": 0.63671875, + "learning_rate": 3.3302414097174804e-05, + "loss": 0.3601, + "step": 22757 + }, + { + "epoch": 3.04, + "grad_norm": 0.54296875, + "learning_rate": 3.3293738088086825e-05, + "loss": 0.3214, + "step": 22758 + }, + { + "epoch": 3.04, + "grad_norm": 0.546875, + "learning_rate": 3.328506298357853e-05, + "loss": 0.219, + "step": 22759 + }, + { + "epoch": 3.04, + "grad_norm": 0.56640625, + "learning_rate": 3.327638878376764e-05, + "loss": 0.395, + "step": 22760 + }, + { + "epoch": 3.04, + "grad_norm": 0.5234375, + "learning_rate": 3.32677154887717e-05, + "loss": 0.3393, + "step": 22761 + }, + { + "epoch": 3.04, + "grad_norm": 0.69921875, + "learning_rate": 3.325904309870837e-05, + "loss": 0.405, + "step": 22762 + }, + { + "epoch": 3.04, + "grad_norm": 0.59375, + "learning_rate": 3.3250371613695255e-05, + "loss": 0.1884, + "step": 22763 + }, + { + "epoch": 3.04, + "grad_norm": 0.515625, + "learning_rate": 3.324170103384996e-05, + "loss": 0.2826, + "step": 22764 + }, + { + "epoch": 3.04, + "grad_norm": 0.6015625, + "learning_rate": 3.323303135929002e-05, + "loss": 0.3157, + "step": 22765 + }, + { + "epoch": 3.04, + "grad_norm": 0.68359375, + "learning_rate": 3.322436259013303e-05, + "loss": 0.3528, + "step": 22766 + }, + { + "epoch": 3.04, + "grad_norm": 0.703125, + "learning_rate": 3.3215694726496525e-05, + "loss": 0.3942, + "step": 22767 + }, + { + "epoch": 3.04, + "grad_norm": 0.76953125, + "learning_rate": 3.3207027768498075e-05, + "loss": 0.3242, + "step": 22768 + }, + { + "epoch": 3.04, + "grad_norm": 0.671875, + "learning_rate": 3.3198361716255225e-05, + "loss": 0.3197, + "step": 22769 + }, + { + "epoch": 3.04, + "grad_norm": 0.7109375, + "learning_rate": 3.318969656988543e-05, + "loss": 0.3658, + "step": 22770 + }, + { + "epoch": 3.04, + "grad_norm": 0.73046875, + "learning_rate": 3.318103232950626e-05, + "loss": 0.3032, + "step": 22771 + }, + { + "epoch": 3.04, + "grad_norm": 0.578125, + "learning_rate": 3.317236899523514e-05, + "loss": 0.3282, + "step": 22772 + }, + { + "epoch": 3.04, + "grad_norm": 0.515625, + "learning_rate": 3.316370656718959e-05, + "loss": 0.1654, + "step": 22773 + }, + { + "epoch": 3.04, + "grad_norm": 0.68359375, + "learning_rate": 3.315504504548711e-05, + "loss": 0.4235, + "step": 22774 + }, + { + "epoch": 3.04, + "grad_norm": 0.470703125, + "learning_rate": 3.314638443024508e-05, + "loss": 0.2125, + "step": 22775 + }, + { + "epoch": 3.04, + "grad_norm": 0.6875, + "learning_rate": 3.313772472158099e-05, + "loss": 0.3697, + "step": 22776 + }, + { + "epoch": 3.04, + "grad_norm": 0.47265625, + "learning_rate": 3.312906591961226e-05, + "loss": 0.3479, + "step": 22777 + }, + { + "epoch": 3.04, + "grad_norm": 0.65234375, + "learning_rate": 3.312040802445635e-05, + "loss": 0.1519, + "step": 22778 + }, + { + "epoch": 3.04, + "grad_norm": 0.5078125, + "learning_rate": 3.311175103623059e-05, + "loss": 0.2537, + "step": 22779 + }, + { + "epoch": 3.04, + "grad_norm": 0.7265625, + "learning_rate": 3.310309495505242e-05, + "loss": 0.2133, + "step": 22780 + }, + { + "epoch": 3.04, + "grad_norm": 0.78125, + "learning_rate": 3.309443978103922e-05, + "loss": 0.4229, + "step": 22781 + }, + { + "epoch": 3.04, + "grad_norm": 0.62109375, + "learning_rate": 3.308578551430839e-05, + "loss": 0.342, + "step": 22782 + }, + { + "epoch": 3.04, + "grad_norm": 0.7578125, + "learning_rate": 3.3077132154977244e-05, + "loss": 0.4288, + "step": 22783 + }, + { + "epoch": 3.04, + "grad_norm": 0.703125, + "learning_rate": 3.306847970316311e-05, + "loss": 0.271, + "step": 22784 + }, + { + "epoch": 3.04, + "grad_norm": 0.6171875, + "learning_rate": 3.305982815898334e-05, + "loss": 0.2443, + "step": 22785 + }, + { + "epoch": 3.04, + "grad_norm": 0.6640625, + "learning_rate": 3.3051177522555276e-05, + "loss": 0.3229, + "step": 22786 + }, + { + "epoch": 3.04, + "grad_norm": 0.73828125, + "learning_rate": 3.304252779399624e-05, + "loss": 0.5162, + "step": 22787 + }, + { + "epoch": 3.04, + "grad_norm": 0.49609375, + "learning_rate": 3.303387897342346e-05, + "loss": 0.3911, + "step": 22788 + }, + { + "epoch": 3.04, + "grad_norm": 0.49609375, + "learning_rate": 3.302523106095427e-05, + "loss": 0.152, + "step": 22789 + }, + { + "epoch": 3.04, + "grad_norm": 0.435546875, + "learning_rate": 3.301658405670592e-05, + "loss": 0.2325, + "step": 22790 + }, + { + "epoch": 3.04, + "grad_norm": 0.54296875, + "learning_rate": 3.300793796079572e-05, + "loss": 0.2523, + "step": 22791 + }, + { + "epoch": 3.04, + "grad_norm": 0.52734375, + "learning_rate": 3.299929277334084e-05, + "loss": 0.3415, + "step": 22792 + }, + { + "epoch": 3.04, + "grad_norm": 0.6328125, + "learning_rate": 3.299064849445855e-05, + "loss": 0.5618, + "step": 22793 + }, + { + "epoch": 3.04, + "grad_norm": 0.48046875, + "learning_rate": 3.298200512426608e-05, + "loss": 0.174, + "step": 22794 + }, + { + "epoch": 3.04, + "grad_norm": 0.6640625, + "learning_rate": 3.297336266288066e-05, + "loss": 0.4987, + "step": 22795 + }, + { + "epoch": 3.04, + "grad_norm": 0.64453125, + "learning_rate": 3.296472111041943e-05, + "loss": 0.5698, + "step": 22796 + }, + { + "epoch": 3.04, + "grad_norm": 0.63671875, + "learning_rate": 3.295608046699964e-05, + "loss": 0.5592, + "step": 22797 + }, + { + "epoch": 3.04, + "grad_norm": 0.6484375, + "learning_rate": 3.2947440732738386e-05, + "loss": 0.3663, + "step": 22798 + }, + { + "epoch": 3.04, + "grad_norm": 0.54296875, + "learning_rate": 3.293880190775287e-05, + "loss": 0.3412, + "step": 22799 + }, + { + "epoch": 3.04, + "grad_norm": 0.7890625, + "learning_rate": 3.293016399216024e-05, + "loss": 0.4966, + "step": 22800 + }, + { + "epoch": 3.04, + "grad_norm": 0.546875, + "learning_rate": 3.292152698607768e-05, + "loss": 0.3307, + "step": 22801 + }, + { + "epoch": 3.04, + "grad_norm": 0.474609375, + "learning_rate": 3.291289088962222e-05, + "loss": 0.257, + "step": 22802 + }, + { + "epoch": 3.04, + "grad_norm": 0.40625, + "learning_rate": 3.2904255702911e-05, + "loss": 0.1686, + "step": 22803 + }, + { + "epoch": 3.04, + "grad_norm": 0.640625, + "learning_rate": 3.289562142606115e-05, + "loss": 0.2718, + "step": 22804 + }, + { + "epoch": 3.04, + "grad_norm": 0.72265625, + "learning_rate": 3.288698805918978e-05, + "loss": 0.5613, + "step": 22805 + }, + { + "epoch": 3.04, + "grad_norm": 0.4765625, + "learning_rate": 3.287835560241389e-05, + "loss": 0.1767, + "step": 22806 + }, + { + "epoch": 3.04, + "grad_norm": 0.65625, + "learning_rate": 3.28697240558506e-05, + "loss": 0.3005, + "step": 22807 + }, + { + "epoch": 3.04, + "grad_norm": 0.53125, + "learning_rate": 3.286109341961691e-05, + "loss": 0.1362, + "step": 22808 + }, + { + "epoch": 3.04, + "grad_norm": 0.375, + "learning_rate": 3.2852463693829884e-05, + "loss": 0.093, + "step": 22809 + }, + { + "epoch": 3.04, + "grad_norm": 0.53125, + "learning_rate": 3.284383487860657e-05, + "loss": 0.2504, + "step": 22810 + }, + { + "epoch": 3.04, + "grad_norm": 0.65625, + "learning_rate": 3.2835206974063925e-05, + "loss": 0.2406, + "step": 22811 + }, + { + "epoch": 3.04, + "grad_norm": 0.69140625, + "learning_rate": 3.2826579980318987e-05, + "loss": 0.2658, + "step": 22812 + }, + { + "epoch": 3.04, + "grad_norm": 0.65625, + "learning_rate": 3.2817953897488716e-05, + "loss": 0.3006, + "step": 22813 + }, + { + "epoch": 3.04, + "grad_norm": 0.60546875, + "learning_rate": 3.280932872569016e-05, + "loss": 0.2142, + "step": 22814 + }, + { + "epoch": 3.04, + "grad_norm": 0.6328125, + "learning_rate": 3.280070446504019e-05, + "loss": 0.3149, + "step": 22815 + }, + { + "epoch": 3.04, + "grad_norm": 0.625, + "learning_rate": 3.2792081115655805e-05, + "loss": 0.2538, + "step": 22816 + }, + { + "epoch": 3.04, + "grad_norm": 0.69140625, + "learning_rate": 3.2783458677653924e-05, + "loss": 0.3835, + "step": 22817 + }, + { + "epoch": 3.04, + "grad_norm": 0.66015625, + "learning_rate": 3.277483715115148e-05, + "loss": 0.3511, + "step": 22818 + }, + { + "epoch": 3.04, + "grad_norm": 0.5078125, + "learning_rate": 3.276621653626546e-05, + "loss": 0.1726, + "step": 22819 + }, + { + "epoch": 3.05, + "grad_norm": 0.66796875, + "learning_rate": 3.275759683311263e-05, + "loss": 0.2975, + "step": 22820 + }, + { + "epoch": 3.05, + "grad_norm": 0.5625, + "learning_rate": 3.274897804180993e-05, + "loss": 0.2955, + "step": 22821 + }, + { + "epoch": 3.05, + "grad_norm": 0.53125, + "learning_rate": 3.274036016247426e-05, + "loss": 0.391, + "step": 22822 + }, + { + "epoch": 3.05, + "grad_norm": 0.5234375, + "learning_rate": 3.2731743195222475e-05, + "loss": 0.2922, + "step": 22823 + }, + { + "epoch": 3.05, + "grad_norm": 0.50390625, + "learning_rate": 3.272312714017146e-05, + "loss": 0.2861, + "step": 22824 + }, + { + "epoch": 3.05, + "grad_norm": 0.6875, + "learning_rate": 3.271451199743798e-05, + "loss": 0.3641, + "step": 22825 + }, + { + "epoch": 3.05, + "grad_norm": 0.490234375, + "learning_rate": 3.27058977671389e-05, + "loss": 0.3795, + "step": 22826 + }, + { + "epoch": 3.05, + "grad_norm": 0.72265625, + "learning_rate": 3.269728444939104e-05, + "loss": 0.4227, + "step": 22827 + }, + { + "epoch": 3.05, + "grad_norm": 0.45703125, + "learning_rate": 3.268867204431122e-05, + "loss": 0.2358, + "step": 22828 + }, + { + "epoch": 3.05, + "grad_norm": 0.5546875, + "learning_rate": 3.268006055201619e-05, + "loss": 0.1838, + "step": 22829 + }, + { + "epoch": 3.05, + "grad_norm": 0.703125, + "learning_rate": 3.2671449972622733e-05, + "loss": 0.354, + "step": 22830 + }, + { + "epoch": 3.05, + "grad_norm": 0.5859375, + "learning_rate": 3.266284030624767e-05, + "loss": 0.2818, + "step": 22831 + }, + { + "epoch": 3.05, + "grad_norm": 0.5625, + "learning_rate": 3.265423155300767e-05, + "loss": 0.2913, + "step": 22832 + }, + { + "epoch": 3.05, + "grad_norm": 0.55859375, + "learning_rate": 3.2645623713019536e-05, + "loss": 0.2264, + "step": 22833 + }, + { + "epoch": 3.05, + "grad_norm": 0.458984375, + "learning_rate": 3.263701678639994e-05, + "loss": 0.2697, + "step": 22834 + }, + { + "epoch": 3.05, + "grad_norm": 0.76953125, + "learning_rate": 3.2628410773265635e-05, + "loss": 0.3112, + "step": 22835 + }, + { + "epoch": 3.05, + "grad_norm": 0.67578125, + "learning_rate": 3.261980567373333e-05, + "loss": 0.2239, + "step": 22836 + }, + { + "epoch": 3.05, + "grad_norm": 0.6953125, + "learning_rate": 3.2611201487919716e-05, + "loss": 0.4045, + "step": 22837 + }, + { + "epoch": 3.05, + "grad_norm": 0.63671875, + "learning_rate": 3.260259821594145e-05, + "loss": 0.2073, + "step": 22838 + }, + { + "epoch": 3.05, + "grad_norm": 0.71875, + "learning_rate": 3.259399585791519e-05, + "loss": 0.2106, + "step": 22839 + }, + { + "epoch": 3.05, + "grad_norm": 0.62109375, + "learning_rate": 3.258539441395762e-05, + "loss": 0.3329, + "step": 22840 + }, + { + "epoch": 3.05, + "grad_norm": 0.56640625, + "learning_rate": 3.2576793884185354e-05, + "loss": 0.1768, + "step": 22841 + }, + { + "epoch": 3.05, + "grad_norm": 0.6875, + "learning_rate": 3.256819426871507e-05, + "loss": 0.3704, + "step": 22842 + }, + { + "epoch": 3.05, + "grad_norm": 0.6328125, + "learning_rate": 3.255959556766336e-05, + "loss": 0.4522, + "step": 22843 + }, + { + "epoch": 3.05, + "grad_norm": 0.64453125, + "learning_rate": 3.255099778114677e-05, + "loss": 0.2899, + "step": 22844 + }, + { + "epoch": 3.05, + "grad_norm": 0.6484375, + "learning_rate": 3.2542400909281956e-05, + "loss": 0.4079, + "step": 22845 + }, + { + "epoch": 3.05, + "grad_norm": 0.50390625, + "learning_rate": 3.253380495218546e-05, + "loss": 0.4147, + "step": 22846 + }, + { + "epoch": 3.05, + "grad_norm": 0.58984375, + "learning_rate": 3.252520990997392e-05, + "loss": 0.2271, + "step": 22847 + }, + { + "epoch": 3.05, + "grad_norm": 0.5625, + "learning_rate": 3.2516615782763795e-05, + "loss": 0.1212, + "step": 22848 + }, + { + "epoch": 3.05, + "grad_norm": 0.60546875, + "learning_rate": 3.250802257067167e-05, + "loss": 0.6079, + "step": 22849 + }, + { + "epoch": 3.05, + "grad_norm": 0.4765625, + "learning_rate": 3.249943027381408e-05, + "loss": 0.2023, + "step": 22850 + }, + { + "epoch": 3.05, + "grad_norm": 0.6640625, + "learning_rate": 3.2490838892307564e-05, + "loss": 0.3062, + "step": 22851 + }, + { + "epoch": 3.05, + "grad_norm": 0.70703125, + "learning_rate": 3.248224842626857e-05, + "loss": 0.3464, + "step": 22852 + }, + { + "epoch": 3.05, + "grad_norm": 0.59375, + "learning_rate": 3.247365887581363e-05, + "loss": 0.3447, + "step": 22853 + }, + { + "epoch": 3.05, + "grad_norm": 0.4765625, + "learning_rate": 3.2465070241059193e-05, + "loss": 0.1624, + "step": 22854 + }, + { + "epoch": 3.05, + "grad_norm": 0.478515625, + "learning_rate": 3.24564825221218e-05, + "loss": 0.2451, + "step": 22855 + }, + { + "epoch": 3.05, + "grad_norm": 0.51171875, + "learning_rate": 3.244789571911784e-05, + "loss": 0.2085, + "step": 22856 + }, + { + "epoch": 3.05, + "grad_norm": 0.59375, + "learning_rate": 3.243930983216374e-05, + "loss": 0.2346, + "step": 22857 + }, + { + "epoch": 3.05, + "grad_norm": 0.435546875, + "learning_rate": 3.243072486137596e-05, + "loss": 0.1136, + "step": 22858 + }, + { + "epoch": 3.05, + "grad_norm": 0.546875, + "learning_rate": 3.242214080687091e-05, + "loss": 0.3274, + "step": 22859 + }, + { + "epoch": 3.05, + "grad_norm": 0.734375, + "learning_rate": 3.241355766876505e-05, + "loss": 0.2247, + "step": 22860 + }, + { + "epoch": 3.05, + "grad_norm": 0.58984375, + "learning_rate": 3.240497544717468e-05, + "loss": 0.4311, + "step": 22861 + }, + { + "epoch": 3.05, + "grad_norm": 0.5234375, + "learning_rate": 3.2396394142216226e-05, + "loss": 0.3389, + "step": 22862 + }, + { + "epoch": 3.05, + "grad_norm": 0.74609375, + "learning_rate": 3.238781375400606e-05, + "loss": 0.3471, + "step": 22863 + }, + { + "epoch": 3.05, + "grad_norm": 0.5078125, + "learning_rate": 3.2379234282660556e-05, + "loss": 0.3607, + "step": 22864 + }, + { + "epoch": 3.05, + "grad_norm": 0.79296875, + "learning_rate": 3.2370655728296006e-05, + "loss": 0.4161, + "step": 22865 + }, + { + "epoch": 3.05, + "grad_norm": 0.47265625, + "learning_rate": 3.236207809102877e-05, + "loss": 0.2222, + "step": 22866 + }, + { + "epoch": 3.05, + "grad_norm": 0.5859375, + "learning_rate": 3.235350137097516e-05, + "loss": 0.3692, + "step": 22867 + }, + { + "epoch": 3.05, + "grad_norm": 0.5625, + "learning_rate": 3.2344925568251525e-05, + "loss": 0.2627, + "step": 22868 + }, + { + "epoch": 3.05, + "grad_norm": 0.671875, + "learning_rate": 3.233635068297409e-05, + "loss": 0.4462, + "step": 22869 + }, + { + "epoch": 3.05, + "grad_norm": 0.72265625, + "learning_rate": 3.232777671525919e-05, + "loss": 0.599, + "step": 22870 + }, + { + "epoch": 3.05, + "grad_norm": 0.640625, + "learning_rate": 3.231920366522304e-05, + "loss": 0.2327, + "step": 22871 + }, + { + "epoch": 3.05, + "grad_norm": 0.7265625, + "learning_rate": 3.231063153298193e-05, + "loss": 0.4039, + "step": 22872 + }, + { + "epoch": 3.05, + "grad_norm": 0.609375, + "learning_rate": 3.2302060318652095e-05, + "loss": 0.3364, + "step": 22873 + }, + { + "epoch": 3.05, + "grad_norm": 0.69921875, + "learning_rate": 3.22934900223498e-05, + "loss": 0.4641, + "step": 22874 + }, + { + "epoch": 3.05, + "grad_norm": 0.578125, + "learning_rate": 3.228492064419121e-05, + "loss": 0.176, + "step": 22875 + }, + { + "epoch": 3.05, + "grad_norm": 0.54296875, + "learning_rate": 3.2276352184292547e-05, + "loss": 0.2516, + "step": 22876 + }, + { + "epoch": 3.05, + "grad_norm": 0.53125, + "learning_rate": 3.226778464277003e-05, + "loss": 0.332, + "step": 22877 + }, + { + "epoch": 3.05, + "grad_norm": 0.578125, + "learning_rate": 3.225921801973983e-05, + "loss": 0.1993, + "step": 22878 + }, + { + "epoch": 3.05, + "grad_norm": 0.490234375, + "learning_rate": 3.22506523153181e-05, + "loss": 0.1586, + "step": 22879 + }, + { + "epoch": 3.05, + "grad_norm": 0.80078125, + "learning_rate": 3.224208752962102e-05, + "loss": 0.5471, + "step": 22880 + }, + { + "epoch": 3.05, + "grad_norm": 0.54296875, + "learning_rate": 3.2233523662764696e-05, + "loss": 0.3527, + "step": 22881 + }, + { + "epoch": 3.05, + "grad_norm": 0.6015625, + "learning_rate": 3.222496071486528e-05, + "loss": 0.3183, + "step": 22882 + }, + { + "epoch": 3.05, + "grad_norm": 0.58203125, + "learning_rate": 3.2216398686038926e-05, + "loss": 0.2277, + "step": 22883 + }, + { + "epoch": 3.05, + "grad_norm": 0.78125, + "learning_rate": 3.220783757640169e-05, + "loss": 0.2575, + "step": 22884 + }, + { + "epoch": 3.05, + "grad_norm": 0.59765625, + "learning_rate": 3.219927738606967e-05, + "loss": 0.3441, + "step": 22885 + }, + { + "epoch": 3.05, + "grad_norm": 0.41015625, + "learning_rate": 3.219071811515897e-05, + "loss": 0.101, + "step": 22886 + }, + { + "epoch": 3.05, + "grad_norm": 0.6484375, + "learning_rate": 3.218215976378568e-05, + "loss": 0.1214, + "step": 22887 + }, + { + "epoch": 3.05, + "grad_norm": 0.6484375, + "learning_rate": 3.21736023320658e-05, + "loss": 0.4876, + "step": 22888 + }, + { + "epoch": 3.05, + "grad_norm": 0.71875, + "learning_rate": 3.21650458201154e-05, + "loss": 0.3446, + "step": 22889 + }, + { + "epoch": 3.05, + "grad_norm": 0.50390625, + "learning_rate": 3.215649022805053e-05, + "loss": 0.2677, + "step": 22890 + }, + { + "epoch": 3.05, + "grad_norm": 0.59375, + "learning_rate": 3.2147935555987175e-05, + "loss": 0.3934, + "step": 22891 + }, + { + "epoch": 3.05, + "grad_norm": 0.58203125, + "learning_rate": 3.213938180404144e-05, + "loss": 0.3891, + "step": 22892 + }, + { + "epoch": 3.05, + "grad_norm": 0.67578125, + "learning_rate": 3.213082897232916e-05, + "loss": 0.3249, + "step": 22893 + }, + { + "epoch": 3.05, + "grad_norm": 0.63671875, + "learning_rate": 3.21222770609664e-05, + "loss": 0.3934, + "step": 22894 + }, + { + "epoch": 3.06, + "grad_norm": 0.60546875, + "learning_rate": 3.211372607006914e-05, + "loss": 0.1578, + "step": 22895 + }, + { + "epoch": 3.06, + "grad_norm": 0.427734375, + "learning_rate": 3.210517599975331e-05, + "loss": 0.2193, + "step": 22896 + }, + { + "epoch": 3.06, + "grad_norm": 0.453125, + "learning_rate": 3.20966268501349e-05, + "loss": 0.12, + "step": 22897 + }, + { + "epoch": 3.06, + "grad_norm": 0.515625, + "learning_rate": 3.2088078621329785e-05, + "loss": 0.3127, + "step": 22898 + }, + { + "epoch": 3.06, + "grad_norm": 0.62109375, + "learning_rate": 3.2079531313453915e-05, + "loss": 0.1743, + "step": 22899 + }, + { + "epoch": 3.06, + "grad_norm": 0.625, + "learning_rate": 3.207098492662318e-05, + "loss": 0.2503, + "step": 22900 + }, + { + "epoch": 3.06, + "grad_norm": 0.44140625, + "learning_rate": 3.206243946095353e-05, + "loss": 0.2902, + "step": 22901 + }, + { + "epoch": 3.06, + "grad_norm": 0.52734375, + "learning_rate": 3.205389491656076e-05, + "loss": 0.1855, + "step": 22902 + }, + { + "epoch": 3.06, + "grad_norm": 0.5, + "learning_rate": 3.2045351293560785e-05, + "loss": 0.2547, + "step": 22903 + }, + { + "epoch": 3.06, + "grad_norm": 0.5390625, + "learning_rate": 3.203680859206949e-05, + "loss": 0.3273, + "step": 22904 + }, + { + "epoch": 3.06, + "grad_norm": 0.50390625, + "learning_rate": 3.2028266812202654e-05, + "loss": 0.2626, + "step": 22905 + }, + { + "epoch": 3.06, + "grad_norm": 0.6015625, + "learning_rate": 3.201972595407618e-05, + "loss": 0.3163, + "step": 22906 + }, + { + "epoch": 3.06, + "grad_norm": 0.5703125, + "learning_rate": 3.201118601780583e-05, + "loss": 0.3065, + "step": 22907 + }, + { + "epoch": 3.06, + "grad_norm": 0.62109375, + "learning_rate": 3.200264700350743e-05, + "loss": 0.2935, + "step": 22908 + }, + { + "epoch": 3.06, + "grad_norm": 0.54296875, + "learning_rate": 3.199410891129676e-05, + "loss": 0.2185, + "step": 22909 + }, + { + "epoch": 3.06, + "grad_norm": 0.8125, + "learning_rate": 3.198557174128967e-05, + "loss": 0.3503, + "step": 22910 + }, + { + "epoch": 3.06, + "grad_norm": 0.69140625, + "learning_rate": 3.197703549360183e-05, + "loss": 0.3199, + "step": 22911 + }, + { + "epoch": 3.06, + "grad_norm": 0.37109375, + "learning_rate": 3.1968500168349056e-05, + "loss": 0.1797, + "step": 22912 + }, + { + "epoch": 3.06, + "grad_norm": 0.54296875, + "learning_rate": 3.1959965765647084e-05, + "loss": 0.2208, + "step": 22913 + }, + { + "epoch": 3.06, + "grad_norm": 0.4765625, + "learning_rate": 3.1951432285611624e-05, + "loss": 0.3459, + "step": 22914 + }, + { + "epoch": 3.06, + "grad_norm": 0.546875, + "learning_rate": 3.194289972835847e-05, + "loss": 0.2768, + "step": 22915 + }, + { + "epoch": 3.06, + "grad_norm": 0.57421875, + "learning_rate": 3.1934368094003265e-05, + "loss": 0.1581, + "step": 22916 + }, + { + "epoch": 3.06, + "grad_norm": 0.68359375, + "learning_rate": 3.192583738266168e-05, + "loss": 0.2958, + "step": 22917 + }, + { + "epoch": 3.06, + "grad_norm": 0.6953125, + "learning_rate": 3.191730759444944e-05, + "loss": 0.41, + "step": 22918 + }, + { + "epoch": 3.06, + "grad_norm": 0.6328125, + "learning_rate": 3.190877872948219e-05, + "loss": 0.2664, + "step": 22919 + }, + { + "epoch": 3.06, + "grad_norm": 0.48828125, + "learning_rate": 3.190025078787565e-05, + "loss": 0.2246, + "step": 22920 + }, + { + "epoch": 3.06, + "grad_norm": 0.388671875, + "learning_rate": 3.189172376974537e-05, + "loss": 0.112, + "step": 22921 + }, + { + "epoch": 3.06, + "grad_norm": 0.51953125, + "learning_rate": 3.1883197675207034e-05, + "loss": 0.2783, + "step": 22922 + }, + { + "epoch": 3.06, + "grad_norm": 0.703125, + "learning_rate": 3.187467250437624e-05, + "loss": 0.2817, + "step": 22923 + }, + { + "epoch": 3.06, + "grad_norm": 0.296875, + "learning_rate": 3.1866148257368665e-05, + "loss": 0.0816, + "step": 22924 + }, + { + "epoch": 3.06, + "grad_norm": 0.73046875, + "learning_rate": 3.18576249342998e-05, + "loss": 0.3022, + "step": 22925 + }, + { + "epoch": 3.06, + "grad_norm": 0.71484375, + "learning_rate": 3.184910253528528e-05, + "loss": 0.4129, + "step": 22926 + }, + { + "epoch": 3.06, + "grad_norm": 0.65234375, + "learning_rate": 3.184058106044067e-05, + "loss": 0.377, + "step": 22927 + }, + { + "epoch": 3.06, + "grad_norm": 0.5390625, + "learning_rate": 3.1832060509881546e-05, + "loss": 0.2754, + "step": 22928 + }, + { + "epoch": 3.06, + "grad_norm": 0.65625, + "learning_rate": 3.182354088372345e-05, + "loss": 0.3221, + "step": 22929 + }, + { + "epoch": 3.06, + "grad_norm": 0.79296875, + "learning_rate": 3.181502218208184e-05, + "loss": 0.4212, + "step": 22930 + }, + { + "epoch": 3.06, + "grad_norm": 0.71484375, + "learning_rate": 3.18065044050723e-05, + "loss": 0.2322, + "step": 22931 + }, + { + "epoch": 3.06, + "grad_norm": 0.59375, + "learning_rate": 3.1797987552810334e-05, + "loss": 0.2685, + "step": 22932 + }, + { + "epoch": 3.06, + "grad_norm": 0.6484375, + "learning_rate": 3.178947162541146e-05, + "loss": 0.3012, + "step": 22933 + }, + { + "epoch": 3.06, + "grad_norm": 0.703125, + "learning_rate": 3.1780956622991085e-05, + "loss": 0.3568, + "step": 22934 + }, + { + "epoch": 3.06, + "grad_norm": 0.78125, + "learning_rate": 3.1772442545664726e-05, + "loss": 0.3657, + "step": 22935 + }, + { + "epoch": 3.06, + "grad_norm": 0.6796875, + "learning_rate": 3.176392939354784e-05, + "loss": 0.3552, + "step": 22936 + }, + { + "epoch": 3.06, + "grad_norm": 0.8359375, + "learning_rate": 3.1755417166755895e-05, + "loss": 0.5358, + "step": 22937 + }, + { + "epoch": 3.06, + "grad_norm": 0.65625, + "learning_rate": 3.174690586540425e-05, + "loss": 0.3718, + "step": 22938 + }, + { + "epoch": 3.06, + "grad_norm": 0.3671875, + "learning_rate": 3.1738395489608375e-05, + "loss": 0.2131, + "step": 22939 + }, + { + "epoch": 3.06, + "grad_norm": 0.56640625, + "learning_rate": 3.172988603948367e-05, + "loss": 0.2563, + "step": 22940 + }, + { + "epoch": 3.06, + "grad_norm": 0.67578125, + "learning_rate": 3.172137751514556e-05, + "loss": 0.5325, + "step": 22941 + }, + { + "epoch": 3.06, + "grad_norm": 0.51953125, + "learning_rate": 3.171286991670934e-05, + "loss": 0.4333, + "step": 22942 + }, + { + "epoch": 3.06, + "grad_norm": 0.55078125, + "learning_rate": 3.170436324429048e-05, + "loss": 0.5141, + "step": 22943 + }, + { + "epoch": 3.06, + "grad_norm": 0.48046875, + "learning_rate": 3.169585749800426e-05, + "loss": 0.2902, + "step": 22944 + }, + { + "epoch": 3.06, + "grad_norm": 0.55078125, + "learning_rate": 3.168735267796603e-05, + "loss": 0.2606, + "step": 22945 + }, + { + "epoch": 3.06, + "grad_norm": 0.59765625, + "learning_rate": 3.167884878429115e-05, + "loss": 0.4075, + "step": 22946 + }, + { + "epoch": 3.06, + "grad_norm": 0.6015625, + "learning_rate": 3.167034581709496e-05, + "loss": 0.3311, + "step": 22947 + }, + { + "epoch": 3.06, + "grad_norm": 0.66015625, + "learning_rate": 3.166184377649271e-05, + "loss": 0.2644, + "step": 22948 + }, + { + "epoch": 3.06, + "grad_norm": 0.50390625, + "learning_rate": 3.165334266259971e-05, + "loss": 0.353, + "step": 22949 + }, + { + "epoch": 3.06, + "grad_norm": 0.69140625, + "learning_rate": 3.164484247553124e-05, + "loss": 0.4922, + "step": 22950 + }, + { + "epoch": 3.06, + "grad_norm": 0.69921875, + "learning_rate": 3.163634321540262e-05, + "loss": 0.167, + "step": 22951 + }, + { + "epoch": 3.06, + "grad_norm": 0.57421875, + "learning_rate": 3.162784488232903e-05, + "loss": 0.1503, + "step": 22952 + }, + { + "epoch": 3.06, + "grad_norm": 0.5703125, + "learning_rate": 3.1619347476425776e-05, + "loss": 0.227, + "step": 22953 + }, + { + "epoch": 3.06, + "grad_norm": 0.59765625, + "learning_rate": 3.161085099780802e-05, + "loss": 0.308, + "step": 22954 + }, + { + "epoch": 3.06, + "grad_norm": 0.609375, + "learning_rate": 3.160235544659102e-05, + "loss": 0.4398, + "step": 22955 + }, + { + "epoch": 3.06, + "grad_norm": 0.45703125, + "learning_rate": 3.159386082289e-05, + "loss": 0.1466, + "step": 22956 + }, + { + "epoch": 3.06, + "grad_norm": 0.56640625, + "learning_rate": 3.158536712682011e-05, + "loss": 0.2697, + "step": 22957 + }, + { + "epoch": 3.06, + "grad_norm": 0.78515625, + "learning_rate": 3.1576874358496545e-05, + "loss": 0.3561, + "step": 22958 + }, + { + "epoch": 3.06, + "grad_norm": 0.640625, + "learning_rate": 3.1568382518034466e-05, + "loss": 0.1635, + "step": 22959 + }, + { + "epoch": 3.06, + "grad_norm": 0.59765625, + "learning_rate": 3.155989160554908e-05, + "loss": 0.2207, + "step": 22960 + }, + { + "epoch": 3.06, + "grad_norm": 0.484375, + "learning_rate": 3.1551401621155464e-05, + "loss": 0.3317, + "step": 22961 + }, + { + "epoch": 3.06, + "grad_norm": 0.671875, + "learning_rate": 3.154291256496877e-05, + "loss": 0.4135, + "step": 22962 + }, + { + "epoch": 3.06, + "grad_norm": 0.6875, + "learning_rate": 3.1534424437104115e-05, + "loss": 0.4511, + "step": 22963 + }, + { + "epoch": 3.06, + "grad_norm": 0.5078125, + "learning_rate": 3.152593723767661e-05, + "loss": 0.2335, + "step": 22964 + }, + { + "epoch": 3.06, + "grad_norm": 0.51953125, + "learning_rate": 3.15174509668014e-05, + "loss": 0.27, + "step": 22965 + }, + { + "epoch": 3.06, + "grad_norm": 0.59375, + "learning_rate": 3.1508965624593446e-05, + "loss": 0.1974, + "step": 22966 + }, + { + "epoch": 3.06, + "grad_norm": 0.6640625, + "learning_rate": 3.1500481211167864e-05, + "loss": 0.3264, + "step": 22967 + }, + { + "epoch": 3.06, + "grad_norm": 0.6953125, + "learning_rate": 3.149199772663973e-05, + "loss": 0.3573, + "step": 22968 + }, + { + "epoch": 3.06, + "grad_norm": 0.478515625, + "learning_rate": 3.148351517112408e-05, + "loss": 0.2546, + "step": 22969 + }, + { + "epoch": 3.07, + "grad_norm": 0.69140625, + "learning_rate": 3.147503354473595e-05, + "loss": 0.3728, + "step": 22970 + }, + { + "epoch": 3.07, + "grad_norm": 0.65625, + "learning_rate": 3.146655284759034e-05, + "loss": 0.2691, + "step": 22971 + }, + { + "epoch": 3.07, + "grad_norm": 0.5078125, + "learning_rate": 3.145807307980224e-05, + "loss": 0.1065, + "step": 22972 + }, + { + "epoch": 3.07, + "grad_norm": 0.8984375, + "learning_rate": 3.1449594241486655e-05, + "loss": 0.2082, + "step": 22973 + }, + { + "epoch": 3.07, + "grad_norm": 0.46484375, + "learning_rate": 3.144111633275861e-05, + "loss": 0.3621, + "step": 22974 + }, + { + "epoch": 3.07, + "grad_norm": 0.5, + "learning_rate": 3.1432639353732995e-05, + "loss": 0.4315, + "step": 22975 + }, + { + "epoch": 3.07, + "grad_norm": 0.6640625, + "learning_rate": 3.142416330452479e-05, + "loss": 0.4137, + "step": 22976 + }, + { + "epoch": 3.07, + "grad_norm": 0.396484375, + "learning_rate": 3.1415688185248984e-05, + "loss": 0.2267, + "step": 22977 + }, + { + "epoch": 3.07, + "grad_norm": 0.56640625, + "learning_rate": 3.140721399602042e-05, + "loss": 0.221, + "step": 22978 + }, + { + "epoch": 3.07, + "grad_norm": 0.609375, + "learning_rate": 3.13987407369541e-05, + "loss": 0.2127, + "step": 22979 + }, + { + "epoch": 3.07, + "grad_norm": 0.67578125, + "learning_rate": 3.1390268408164844e-05, + "loss": 0.4901, + "step": 22980 + }, + { + "epoch": 3.07, + "grad_norm": 0.515625, + "learning_rate": 3.138179700976759e-05, + "loss": 0.2827, + "step": 22981 + }, + { + "epoch": 3.07, + "grad_norm": 0.58984375, + "learning_rate": 3.13733265418772e-05, + "loss": 0.3414, + "step": 22982 + }, + { + "epoch": 3.07, + "grad_norm": 0.62890625, + "learning_rate": 3.136485700460858e-05, + "loss": 0.7108, + "step": 22983 + }, + { + "epoch": 3.07, + "grad_norm": 0.58203125, + "learning_rate": 3.135638839807652e-05, + "loss": 0.4275, + "step": 22984 + }, + { + "epoch": 3.07, + "grad_norm": 0.49609375, + "learning_rate": 3.1347920722395884e-05, + "loss": 0.1409, + "step": 22985 + }, + { + "epoch": 3.07, + "grad_norm": 0.484375, + "learning_rate": 3.133945397768151e-05, + "loss": 0.3507, + "step": 22986 + }, + { + "epoch": 3.07, + "grad_norm": 0.5078125, + "learning_rate": 3.1330988164048194e-05, + "loss": 0.3055, + "step": 22987 + }, + { + "epoch": 3.07, + "grad_norm": 0.671875, + "learning_rate": 3.1322523281610794e-05, + "loss": 0.4701, + "step": 22988 + }, + { + "epoch": 3.07, + "grad_norm": 0.357421875, + "learning_rate": 3.1314059330484047e-05, + "loss": 0.228, + "step": 22989 + }, + { + "epoch": 3.07, + "grad_norm": 0.8515625, + "learning_rate": 3.1305596310782714e-05, + "loss": 0.6844, + "step": 22990 + }, + { + "epoch": 3.07, + "grad_norm": 0.5546875, + "learning_rate": 3.129713422262157e-05, + "loss": 0.3046, + "step": 22991 + }, + { + "epoch": 3.07, + "grad_norm": 0.482421875, + "learning_rate": 3.128867306611539e-05, + "loss": 0.1974, + "step": 22992 + }, + { + "epoch": 3.07, + "grad_norm": 0.671875, + "learning_rate": 3.128021284137893e-05, + "loss": 0.3607, + "step": 22993 + }, + { + "epoch": 3.07, + "grad_norm": 0.72265625, + "learning_rate": 3.127175354852686e-05, + "loss": 0.6643, + "step": 22994 + }, + { + "epoch": 3.07, + "grad_norm": 0.58984375, + "learning_rate": 3.1263295187673914e-05, + "loss": 0.3913, + "step": 22995 + }, + { + "epoch": 3.07, + "grad_norm": 0.7421875, + "learning_rate": 3.1254837758934804e-05, + "loss": 0.4323, + "step": 22996 + }, + { + "epoch": 3.07, + "grad_norm": 0.7265625, + "learning_rate": 3.124638126242425e-05, + "loss": 0.4742, + "step": 22997 + }, + { + "epoch": 3.07, + "grad_norm": 0.70703125, + "learning_rate": 3.123792569825685e-05, + "loss": 0.1644, + "step": 22998 + }, + { + "epoch": 3.07, + "grad_norm": 0.58984375, + "learning_rate": 3.1229471066547323e-05, + "loss": 0.3546, + "step": 22999 + }, + { + "epoch": 3.07, + "grad_norm": 0.6796875, + "learning_rate": 3.1221017367410297e-05, + "loss": 0.3054, + "step": 23000 + }, + { + "epoch": 3.07, + "grad_norm": 0.63671875, + "learning_rate": 3.1212564600960445e-05, + "loss": 0.1929, + "step": 23001 + }, + { + "epoch": 3.07, + "grad_norm": 0.6484375, + "learning_rate": 3.1204112767312376e-05, + "loss": 0.2494, + "step": 23002 + }, + { + "epoch": 3.07, + "grad_norm": 0.62890625, + "learning_rate": 3.119566186658065e-05, + "loss": 0.3383, + "step": 23003 + }, + { + "epoch": 3.07, + "grad_norm": 0.74609375, + "learning_rate": 3.118721189887991e-05, + "loss": 0.272, + "step": 23004 + }, + { + "epoch": 3.07, + "grad_norm": 0.71875, + "learning_rate": 3.117876286432474e-05, + "loss": 0.2223, + "step": 23005 + }, + { + "epoch": 3.07, + "grad_norm": 0.83203125, + "learning_rate": 3.117031476302975e-05, + "loss": 0.4231, + "step": 23006 + }, + { + "epoch": 3.07, + "grad_norm": 0.71875, + "learning_rate": 3.116186759510943e-05, + "loss": 0.479, + "step": 23007 + }, + { + "epoch": 3.07, + "grad_norm": 0.640625, + "learning_rate": 3.115342136067837e-05, + "loss": 0.415, + "step": 23008 + }, + { + "epoch": 3.07, + "grad_norm": 0.490234375, + "learning_rate": 3.114497605985109e-05, + "loss": 0.1812, + "step": 23009 + }, + { + "epoch": 3.07, + "grad_norm": 0.8203125, + "learning_rate": 3.113653169274216e-05, + "loss": 0.4042, + "step": 23010 + }, + { + "epoch": 3.07, + "grad_norm": 0.443359375, + "learning_rate": 3.112808825946604e-05, + "loss": 0.1768, + "step": 23011 + }, + { + "epoch": 3.07, + "grad_norm": 0.4921875, + "learning_rate": 3.111964576013724e-05, + "loss": 0.3162, + "step": 23012 + }, + { + "epoch": 3.07, + "grad_norm": 0.61328125, + "learning_rate": 3.111120419487027e-05, + "loss": 0.3783, + "step": 23013 + }, + { + "epoch": 3.07, + "grad_norm": 0.43359375, + "learning_rate": 3.110276356377957e-05, + "loss": 0.2165, + "step": 23014 + }, + { + "epoch": 3.07, + "grad_norm": 0.796875, + "learning_rate": 3.1094323866979605e-05, + "loss": 0.2801, + "step": 23015 + }, + { + "epoch": 3.07, + "grad_norm": 0.71484375, + "learning_rate": 3.108588510458486e-05, + "loss": 0.3119, + "step": 23016 + }, + { + "epoch": 3.07, + "grad_norm": 0.50390625, + "learning_rate": 3.10774472767097e-05, + "loss": 0.2303, + "step": 23017 + }, + { + "epoch": 3.07, + "grad_norm": 0.5078125, + "learning_rate": 3.106901038346861e-05, + "loss": 0.2356, + "step": 23018 + }, + { + "epoch": 3.07, + "grad_norm": 0.6640625, + "learning_rate": 3.1060574424975967e-05, + "loss": 0.4477, + "step": 23019 + }, + { + "epoch": 3.07, + "grad_norm": 0.60546875, + "learning_rate": 3.105213940134621e-05, + "loss": 0.2751, + "step": 23020 + }, + { + "epoch": 3.07, + "grad_norm": 0.6328125, + "learning_rate": 3.1043705312693674e-05, + "loss": 0.2051, + "step": 23021 + }, + { + "epoch": 3.07, + "grad_norm": 0.67578125, + "learning_rate": 3.1035272159132745e-05, + "loss": 0.3862, + "step": 23022 + }, + { + "epoch": 3.07, + "grad_norm": 0.72265625, + "learning_rate": 3.1026839940777776e-05, + "loss": 0.383, + "step": 23023 + }, + { + "epoch": 3.07, + "grad_norm": 0.5390625, + "learning_rate": 3.101840865774317e-05, + "loss": 0.2917, + "step": 23024 + }, + { + "epoch": 3.07, + "grad_norm": 0.64453125, + "learning_rate": 3.100997831014317e-05, + "loss": 0.3309, + "step": 23025 + }, + { + "epoch": 3.07, + "grad_norm": 0.62109375, + "learning_rate": 3.1001548898092184e-05, + "loss": 0.3129, + "step": 23026 + }, + { + "epoch": 3.07, + "grad_norm": 0.400390625, + "learning_rate": 3.0993120421704434e-05, + "loss": 0.1133, + "step": 23027 + }, + { + "epoch": 3.07, + "grad_norm": 0.443359375, + "learning_rate": 3.0984692881094276e-05, + "loss": 0.1875, + "step": 23028 + }, + { + "epoch": 3.07, + "grad_norm": 0.625, + "learning_rate": 3.0976266276376e-05, + "loss": 0.3605, + "step": 23029 + }, + { + "epoch": 3.07, + "grad_norm": 0.640625, + "learning_rate": 3.096784060766382e-05, + "loss": 0.4604, + "step": 23030 + }, + { + "epoch": 3.07, + "grad_norm": 0.5, + "learning_rate": 3.095941587507203e-05, + "loss": 0.3798, + "step": 23031 + }, + { + "epoch": 3.07, + "grad_norm": 0.625, + "learning_rate": 3.095099207871486e-05, + "loss": 0.2927, + "step": 23032 + }, + { + "epoch": 3.07, + "grad_norm": 0.578125, + "learning_rate": 3.0942569218706597e-05, + "loss": 0.2522, + "step": 23033 + }, + { + "epoch": 3.07, + "grad_norm": 0.52734375, + "learning_rate": 3.093414729516138e-05, + "loss": 0.1953, + "step": 23034 + }, + { + "epoch": 3.07, + "grad_norm": 0.734375, + "learning_rate": 3.0925726308193445e-05, + "loss": 0.3085, + "step": 23035 + }, + { + "epoch": 3.07, + "grad_norm": 0.51171875, + "learning_rate": 3.0917306257917e-05, + "loss": 0.3268, + "step": 23036 + }, + { + "epoch": 3.07, + "grad_norm": 0.578125, + "learning_rate": 3.0908887144446205e-05, + "loss": 0.2794, + "step": 23037 + }, + { + "epoch": 3.07, + "grad_norm": 0.53515625, + "learning_rate": 3.0900468967895325e-05, + "loss": 0.2398, + "step": 23038 + }, + { + "epoch": 3.07, + "grad_norm": 0.60546875, + "learning_rate": 3.089205172837837e-05, + "loss": 0.2092, + "step": 23039 + }, + { + "epoch": 3.07, + "grad_norm": 0.6171875, + "learning_rate": 3.088363542600954e-05, + "loss": 0.2386, + "step": 23040 + }, + { + "epoch": 3.07, + "grad_norm": 0.57421875, + "learning_rate": 3.087522006090296e-05, + "loss": 0.137, + "step": 23041 + }, + { + "epoch": 3.07, + "grad_norm": 0.6484375, + "learning_rate": 3.086680563317278e-05, + "loss": 0.344, + "step": 23042 + }, + { + "epoch": 3.07, + "grad_norm": 0.5703125, + "learning_rate": 3.085839214293311e-05, + "loss": 0.2162, + "step": 23043 + }, + { + "epoch": 3.07, + "grad_norm": 0.73046875, + "learning_rate": 3.0849979590297986e-05, + "loss": 0.5322, + "step": 23044 + }, + { + "epoch": 3.08, + "grad_norm": 0.5390625, + "learning_rate": 3.0841567975381515e-05, + "loss": 0.2733, + "step": 23045 + }, + { + "epoch": 3.08, + "grad_norm": 0.6484375, + "learning_rate": 3.0833157298297774e-05, + "loss": 0.2874, + "step": 23046 + }, + { + "epoch": 3.08, + "grad_norm": 0.57421875, + "learning_rate": 3.0824747559160836e-05, + "loss": 0.3294, + "step": 23047 + }, + { + "epoch": 3.08, + "grad_norm": 0.80078125, + "learning_rate": 3.08163387580847e-05, + "loss": 0.6117, + "step": 23048 + }, + { + "epoch": 3.08, + "grad_norm": 0.484375, + "learning_rate": 3.080793089518339e-05, + "loss": 0.234, + "step": 23049 + }, + { + "epoch": 3.08, + "grad_norm": 0.671875, + "learning_rate": 3.0799523970570987e-05, + "loss": 0.3902, + "step": 23050 + }, + { + "epoch": 3.08, + "grad_norm": 0.45703125, + "learning_rate": 3.079111798436142e-05, + "loss": 0.1947, + "step": 23051 + }, + { + "epoch": 3.08, + "grad_norm": 0.63671875, + "learning_rate": 3.0782712936668744e-05, + "loss": 0.4205, + "step": 23052 + }, + { + "epoch": 3.08, + "grad_norm": 0.578125, + "learning_rate": 3.0774308827606866e-05, + "loss": 0.2859, + "step": 23053 + }, + { + "epoch": 3.08, + "grad_norm": 0.58984375, + "learning_rate": 3.07659056572898e-05, + "loss": 0.3712, + "step": 23054 + }, + { + "epoch": 3.08, + "grad_norm": 0.44921875, + "learning_rate": 3.075750342583148e-05, + "loss": 0.1202, + "step": 23055 + }, + { + "epoch": 3.08, + "grad_norm": 0.5625, + "learning_rate": 3.074910213334589e-05, + "loss": 0.3612, + "step": 23056 + }, + { + "epoch": 3.08, + "grad_norm": 0.6171875, + "learning_rate": 3.074070177994688e-05, + "loss": 0.2189, + "step": 23057 + }, + { + "epoch": 3.08, + "grad_norm": 0.59375, + "learning_rate": 3.073230236574841e-05, + "loss": 0.1695, + "step": 23058 + }, + { + "epoch": 3.08, + "grad_norm": 0.546875, + "learning_rate": 3.072390389086438e-05, + "loss": 0.2241, + "step": 23059 + }, + { + "epoch": 3.08, + "grad_norm": 0.703125, + "learning_rate": 3.0715506355408666e-05, + "loss": 0.2707, + "step": 23060 + }, + { + "epoch": 3.08, + "grad_norm": 0.4609375, + "learning_rate": 3.070710975949519e-05, + "loss": 0.2162, + "step": 23061 + }, + { + "epoch": 3.08, + "grad_norm": 0.54296875, + "learning_rate": 3.069871410323777e-05, + "loss": 0.2157, + "step": 23062 + }, + { + "epoch": 3.08, + "grad_norm": 0.4140625, + "learning_rate": 3.0690319386750235e-05, + "loss": 0.2025, + "step": 23063 + }, + { + "epoch": 3.08, + "grad_norm": 0.63671875, + "learning_rate": 3.068192561014646e-05, + "loss": 0.3493, + "step": 23064 + }, + { + "epoch": 3.08, + "grad_norm": 0.578125, + "learning_rate": 3.0673532773540247e-05, + "loss": 0.1818, + "step": 23065 + }, + { + "epoch": 3.08, + "grad_norm": 0.625, + "learning_rate": 3.066514087704546e-05, + "loss": 0.3655, + "step": 23066 + }, + { + "epoch": 3.08, + "grad_norm": 0.79296875, + "learning_rate": 3.065674992077584e-05, + "loss": 0.4451, + "step": 23067 + }, + { + "epoch": 3.08, + "grad_norm": 0.5078125, + "learning_rate": 3.064835990484518e-05, + "loss": 0.3263, + "step": 23068 + }, + { + "epoch": 3.08, + "grad_norm": 0.61328125, + "learning_rate": 3.063997082936728e-05, + "loss": 0.2935, + "step": 23069 + }, + { + "epoch": 3.08, + "grad_norm": 0.4140625, + "learning_rate": 3.063158269445592e-05, + "loss": 0.1712, + "step": 23070 + }, + { + "epoch": 3.08, + "grad_norm": 0.431640625, + "learning_rate": 3.062319550022479e-05, + "loss": 0.1911, + "step": 23071 + }, + { + "epoch": 3.08, + "grad_norm": 0.609375, + "learning_rate": 3.0614809246787645e-05, + "loss": 0.3998, + "step": 23072 + }, + { + "epoch": 3.08, + "grad_norm": 0.6953125, + "learning_rate": 3.0606423934258234e-05, + "loss": 0.1783, + "step": 23073 + }, + { + "epoch": 3.08, + "grad_norm": 0.69921875, + "learning_rate": 3.0598039562750266e-05, + "loss": 0.367, + "step": 23074 + }, + { + "epoch": 3.08, + "grad_norm": 0.60546875, + "learning_rate": 3.058965613237743e-05, + "loss": 0.2704, + "step": 23075 + }, + { + "epoch": 3.08, + "grad_norm": 0.6171875, + "learning_rate": 3.058127364325337e-05, + "loss": 0.5083, + "step": 23076 + }, + { + "epoch": 3.08, + "grad_norm": 0.625, + "learning_rate": 3.05728920954918e-05, + "loss": 0.428, + "step": 23077 + }, + { + "epoch": 3.08, + "grad_norm": 0.796875, + "learning_rate": 3.056451148920637e-05, + "loss": 0.3512, + "step": 23078 + }, + { + "epoch": 3.08, + "grad_norm": 0.53515625, + "learning_rate": 3.055613182451075e-05, + "loss": 0.3578, + "step": 23079 + }, + { + "epoch": 3.08, + "grad_norm": 0.69140625, + "learning_rate": 3.054775310151853e-05, + "loss": 0.5614, + "step": 23080 + }, + { + "epoch": 3.08, + "grad_norm": 0.6015625, + "learning_rate": 3.053937532034335e-05, + "loss": 0.2853, + "step": 23081 + }, + { + "epoch": 3.08, + "grad_norm": 0.470703125, + "learning_rate": 3.0530998481098814e-05, + "loss": 0.2037, + "step": 23082 + }, + { + "epoch": 3.08, + "grad_norm": 0.58984375, + "learning_rate": 3.0522622583898555e-05, + "loss": 0.2014, + "step": 23083 + }, + { + "epoch": 3.08, + "grad_norm": 0.51953125, + "learning_rate": 3.0514247628856097e-05, + "loss": 0.2088, + "step": 23084 + }, + { + "epoch": 3.08, + "grad_norm": 0.68359375, + "learning_rate": 3.0505873616085035e-05, + "loss": 0.566, + "step": 23085 + }, + { + "epoch": 3.08, + "grad_norm": 0.49609375, + "learning_rate": 3.0497500545698964e-05, + "loss": 0.3623, + "step": 23086 + }, + { + "epoch": 3.08, + "grad_norm": 0.953125, + "learning_rate": 3.0489128417811353e-05, + "loss": 0.535, + "step": 23087 + }, + { + "epoch": 3.08, + "grad_norm": 0.640625, + "learning_rate": 3.0480757232535772e-05, + "loss": 0.5763, + "step": 23088 + }, + { + "epoch": 3.08, + "grad_norm": 0.51171875, + "learning_rate": 3.047238698998578e-05, + "loss": 0.3146, + "step": 23089 + }, + { + "epoch": 3.08, + "grad_norm": 0.59765625, + "learning_rate": 3.0464017690274804e-05, + "loss": 0.4189, + "step": 23090 + }, + { + "epoch": 3.08, + "grad_norm": 0.5078125, + "learning_rate": 3.0455649333516378e-05, + "loss": 0.216, + "step": 23091 + }, + { + "epoch": 3.08, + "grad_norm": 0.6796875, + "learning_rate": 3.0447281919823978e-05, + "loss": 0.2326, + "step": 23092 + }, + { + "epoch": 3.08, + "grad_norm": 0.60546875, + "learning_rate": 3.0438915449311112e-05, + "loss": 0.1838, + "step": 23093 + }, + { + "epoch": 3.08, + "grad_norm": 0.59375, + "learning_rate": 3.0430549922091166e-05, + "loss": 0.1841, + "step": 23094 + }, + { + "epoch": 3.08, + "grad_norm": 0.52734375, + "learning_rate": 3.0422185338277597e-05, + "loss": 0.5362, + "step": 23095 + }, + { + "epoch": 3.08, + "grad_norm": 0.92578125, + "learning_rate": 3.0413821697983867e-05, + "loss": 0.5633, + "step": 23096 + }, + { + "epoch": 3.08, + "grad_norm": 0.60546875, + "learning_rate": 3.04054590013234e-05, + "loss": 0.3534, + "step": 23097 + }, + { + "epoch": 3.08, + "grad_norm": 0.62890625, + "learning_rate": 3.039709724840958e-05, + "loss": 0.4194, + "step": 23098 + }, + { + "epoch": 3.08, + "grad_norm": 0.400390625, + "learning_rate": 3.0388736439355747e-05, + "loss": 0.131, + "step": 23099 + }, + { + "epoch": 3.08, + "grad_norm": 0.515625, + "learning_rate": 3.0380376574275337e-05, + "loss": 0.1742, + "step": 23100 + }, + { + "epoch": 3.08, + "grad_norm": 0.640625, + "learning_rate": 3.0372017653281692e-05, + "loss": 0.2897, + "step": 23101 + }, + { + "epoch": 3.08, + "grad_norm": 0.63671875, + "learning_rate": 3.036365967648821e-05, + "loss": 0.3588, + "step": 23102 + }, + { + "epoch": 3.08, + "grad_norm": 0.68359375, + "learning_rate": 3.0355302644008165e-05, + "loss": 0.3797, + "step": 23103 + }, + { + "epoch": 3.08, + "grad_norm": 0.71875, + "learning_rate": 3.0346946555954915e-05, + "loss": 0.3591, + "step": 23104 + }, + { + "epoch": 3.08, + "grad_norm": 0.5703125, + "learning_rate": 3.0338591412441775e-05, + "loss": 0.1573, + "step": 23105 + }, + { + "epoch": 3.08, + "grad_norm": 0.76171875, + "learning_rate": 3.033023721358207e-05, + "loss": 0.5207, + "step": 23106 + }, + { + "epoch": 3.08, + "grad_norm": 0.53515625, + "learning_rate": 3.0321883959489028e-05, + "loss": 0.3644, + "step": 23107 + }, + { + "epoch": 3.08, + "grad_norm": 0.58203125, + "learning_rate": 3.031353165027596e-05, + "loss": 0.3196, + "step": 23108 + }, + { + "epoch": 3.08, + "grad_norm": 0.52734375, + "learning_rate": 3.0305180286056133e-05, + "loss": 0.1334, + "step": 23109 + }, + { + "epoch": 3.08, + "grad_norm": 0.82421875, + "learning_rate": 3.029682986694279e-05, + "loss": 0.3259, + "step": 23110 + }, + { + "epoch": 3.08, + "grad_norm": 0.7421875, + "learning_rate": 3.0288480393049236e-05, + "loss": 0.2979, + "step": 23111 + }, + { + "epoch": 3.08, + "grad_norm": 0.484375, + "learning_rate": 3.0280131864488558e-05, + "loss": 0.3474, + "step": 23112 + }, + { + "epoch": 3.08, + "grad_norm": 0.546875, + "learning_rate": 3.027178428137404e-05, + "loss": 0.4498, + "step": 23113 + }, + { + "epoch": 3.08, + "grad_norm": 0.6875, + "learning_rate": 3.026343764381887e-05, + "loss": 0.2882, + "step": 23114 + }, + { + "epoch": 3.08, + "grad_norm": 0.53125, + "learning_rate": 3.0255091951936253e-05, + "loss": 0.1372, + "step": 23115 + }, + { + "epoch": 3.08, + "grad_norm": 0.69140625, + "learning_rate": 3.024674720583939e-05, + "loss": 0.3644, + "step": 23116 + }, + { + "epoch": 3.08, + "grad_norm": 0.61328125, + "learning_rate": 3.0238403405641358e-05, + "loss": 0.3312, + "step": 23117 + }, + { + "epoch": 3.08, + "grad_norm": 0.65625, + "learning_rate": 3.0230060551455343e-05, + "loss": 0.3238, + "step": 23118 + }, + { + "epoch": 3.09, + "grad_norm": 0.72265625, + "learning_rate": 3.0221718643394492e-05, + "loss": 0.4123, + "step": 23119 + }, + { + "epoch": 3.09, + "grad_norm": 0.4609375, + "learning_rate": 3.0213377681571954e-05, + "loss": 0.2158, + "step": 23120 + }, + { + "epoch": 3.09, + "grad_norm": 0.6015625, + "learning_rate": 3.0205037666100778e-05, + "loss": 0.4273, + "step": 23121 + }, + { + "epoch": 3.09, + "grad_norm": 0.65234375, + "learning_rate": 3.0196698597094077e-05, + "loss": 0.1643, + "step": 23122 + }, + { + "epoch": 3.09, + "grad_norm": 0.5859375, + "learning_rate": 3.0188360474664966e-05, + "loss": 0.1782, + "step": 23123 + }, + { + "epoch": 3.09, + "grad_norm": 0.498046875, + "learning_rate": 3.0180023298926474e-05, + "loss": 0.2974, + "step": 23124 + }, + { + "epoch": 3.09, + "grad_norm": 0.6640625, + "learning_rate": 3.0171687069991707e-05, + "loss": 0.3919, + "step": 23125 + }, + { + "epoch": 3.09, + "grad_norm": 0.65625, + "learning_rate": 3.016335178797364e-05, + "loss": 0.4069, + "step": 23126 + }, + { + "epoch": 3.09, + "grad_norm": 0.4609375, + "learning_rate": 3.0155017452985344e-05, + "loss": 0.4546, + "step": 23127 + }, + { + "epoch": 3.09, + "grad_norm": 0.6484375, + "learning_rate": 3.0146684065139842e-05, + "loss": 0.2377, + "step": 23128 + }, + { + "epoch": 3.09, + "grad_norm": 0.640625, + "learning_rate": 3.0138351624550164e-05, + "loss": 0.4912, + "step": 23129 + }, + { + "epoch": 3.09, + "grad_norm": 0.70703125, + "learning_rate": 3.013002013132924e-05, + "loss": 0.4332, + "step": 23130 + }, + { + "epoch": 3.09, + "grad_norm": 0.73046875, + "learning_rate": 3.0121689585590095e-05, + "loss": 0.4918, + "step": 23131 + }, + { + "epoch": 3.09, + "grad_norm": 0.640625, + "learning_rate": 3.0113359987445677e-05, + "loss": 0.3665, + "step": 23132 + }, + { + "epoch": 3.09, + "grad_norm": 0.63671875, + "learning_rate": 3.010503133700896e-05, + "loss": 0.3089, + "step": 23133 + }, + { + "epoch": 3.09, + "grad_norm": 0.76171875, + "learning_rate": 3.00967036343929e-05, + "loss": 0.2222, + "step": 23134 + }, + { + "epoch": 3.09, + "grad_norm": 0.58984375, + "learning_rate": 3.008837687971041e-05, + "loss": 0.49, + "step": 23135 + }, + { + "epoch": 3.09, + "grad_norm": 0.60546875, + "learning_rate": 3.0080051073074355e-05, + "loss": 0.3918, + "step": 23136 + }, + { + "epoch": 3.09, + "grad_norm": 0.7578125, + "learning_rate": 3.0071726214597694e-05, + "loss": 0.4321, + "step": 23137 + }, + { + "epoch": 3.09, + "grad_norm": 0.63671875, + "learning_rate": 3.0063402304393297e-05, + "loss": 0.2573, + "step": 23138 + }, + { + "epoch": 3.09, + "grad_norm": 0.7109375, + "learning_rate": 3.005507934257409e-05, + "loss": 0.3356, + "step": 23139 + }, + { + "epoch": 3.09, + "grad_norm": 0.73046875, + "learning_rate": 3.004675732925286e-05, + "loss": 0.3018, + "step": 23140 + }, + { + "epoch": 3.09, + "grad_norm": 0.54296875, + "learning_rate": 3.003843626454249e-05, + "loss": 0.3088, + "step": 23141 + }, + { + "epoch": 3.09, + "grad_norm": 0.7578125, + "learning_rate": 3.0030116148555833e-05, + "loss": 0.5087, + "step": 23142 + }, + { + "epoch": 3.09, + "grad_norm": 0.44921875, + "learning_rate": 3.0021796981405736e-05, + "loss": 0.134, + "step": 23143 + }, + { + "epoch": 3.09, + "grad_norm": 0.6171875, + "learning_rate": 3.001347876320496e-05, + "loss": 0.1895, + "step": 23144 + }, + { + "epoch": 3.09, + "grad_norm": 0.49609375, + "learning_rate": 3.0005161494066324e-05, + "loss": 0.195, + "step": 23145 + }, + { + "epoch": 3.09, + "grad_norm": 0.486328125, + "learning_rate": 2.999684517410262e-05, + "loss": 0.3441, + "step": 23146 + }, + { + "epoch": 3.09, + "grad_norm": 0.56640625, + "learning_rate": 2.9988529803426646e-05, + "loss": 0.4545, + "step": 23147 + }, + { + "epoch": 3.09, + "grad_norm": 0.52734375, + "learning_rate": 2.9980215382151155e-05, + "loss": 0.1822, + "step": 23148 + }, + { + "epoch": 3.09, + "grad_norm": 0.62890625, + "learning_rate": 2.9971901910388843e-05, + "loss": 0.3034, + "step": 23149 + }, + { + "epoch": 3.09, + "grad_norm": 0.6953125, + "learning_rate": 2.996358938825249e-05, + "loss": 0.5633, + "step": 23150 + }, + { + "epoch": 3.09, + "grad_norm": 0.5078125, + "learning_rate": 2.995527781585481e-05, + "loss": 0.2735, + "step": 23151 + }, + { + "epoch": 3.09, + "grad_norm": 0.57421875, + "learning_rate": 2.994696719330855e-05, + "loss": 0.1845, + "step": 23152 + }, + { + "epoch": 3.09, + "grad_norm": 0.50390625, + "learning_rate": 2.9938657520726343e-05, + "loss": 0.1721, + "step": 23153 + }, + { + "epoch": 3.09, + "grad_norm": 0.60546875, + "learning_rate": 2.9930348798220897e-05, + "loss": 0.5283, + "step": 23154 + }, + { + "epoch": 3.09, + "grad_norm": 0.65625, + "learning_rate": 2.99220410259049e-05, + "loss": 0.5092, + "step": 23155 + }, + { + "epoch": 3.09, + "grad_norm": 0.65625, + "learning_rate": 2.991373420389103e-05, + "loss": 0.3179, + "step": 23156 + }, + { + "epoch": 3.09, + "grad_norm": 0.58203125, + "learning_rate": 2.9905428332291886e-05, + "loss": 0.3595, + "step": 23157 + }, + { + "epoch": 3.09, + "grad_norm": 0.62109375, + "learning_rate": 2.9897123411220108e-05, + "loss": 0.3398, + "step": 23158 + }, + { + "epoch": 3.09, + "grad_norm": 0.640625, + "learning_rate": 2.9888819440788363e-05, + "loss": 0.6061, + "step": 23159 + }, + { + "epoch": 3.09, + "grad_norm": 0.51171875, + "learning_rate": 2.9880516421109182e-05, + "loss": 0.2888, + "step": 23160 + }, + { + "epoch": 3.09, + "grad_norm": 0.53125, + "learning_rate": 2.9872214352295213e-05, + "loss": 0.3678, + "step": 23161 + }, + { + "epoch": 3.09, + "grad_norm": 0.58203125, + "learning_rate": 2.986391323445906e-05, + "loss": 0.4102, + "step": 23162 + }, + { + "epoch": 3.09, + "grad_norm": 0.51953125, + "learning_rate": 2.9855613067713218e-05, + "loss": 0.3037, + "step": 23163 + }, + { + "epoch": 3.09, + "grad_norm": 0.57421875, + "learning_rate": 2.9847313852170277e-05, + "loss": 0.2173, + "step": 23164 + }, + { + "epoch": 3.09, + "grad_norm": 0.73828125, + "learning_rate": 2.983901558794279e-05, + "loss": 0.5851, + "step": 23165 + }, + { + "epoch": 3.09, + "grad_norm": 0.640625, + "learning_rate": 2.983071827514331e-05, + "loss": 0.3241, + "step": 23166 + }, + { + "epoch": 3.09, + "grad_norm": 0.453125, + "learning_rate": 2.9822421913884303e-05, + "loss": 0.1774, + "step": 23167 + }, + { + "epoch": 3.09, + "grad_norm": 0.68359375, + "learning_rate": 2.9814126504278284e-05, + "loss": 0.2528, + "step": 23168 + }, + { + "epoch": 3.09, + "grad_norm": 0.61328125, + "learning_rate": 2.9805832046437754e-05, + "loss": 0.4329, + "step": 23169 + }, + { + "epoch": 3.09, + "grad_norm": 0.61328125, + "learning_rate": 2.979753854047522e-05, + "loss": 0.3292, + "step": 23170 + }, + { + "epoch": 3.09, + "grad_norm": 0.64453125, + "learning_rate": 2.978924598650312e-05, + "loss": 0.372, + "step": 23171 + }, + { + "epoch": 3.09, + "grad_norm": 0.6875, + "learning_rate": 2.9780954384633886e-05, + "loss": 0.3479, + "step": 23172 + }, + { + "epoch": 3.09, + "grad_norm": 0.640625, + "learning_rate": 2.9772663734979968e-05, + "loss": 0.6355, + "step": 23173 + }, + { + "epoch": 3.09, + "grad_norm": 0.65234375, + "learning_rate": 2.9764374037653808e-05, + "loss": 0.3544, + "step": 23174 + }, + { + "epoch": 3.09, + "grad_norm": 0.6640625, + "learning_rate": 2.975608529276783e-05, + "loss": 0.4746, + "step": 23175 + }, + { + "epoch": 3.09, + "grad_norm": 0.5234375, + "learning_rate": 2.9747797500434406e-05, + "loss": 0.1546, + "step": 23176 + }, + { + "epoch": 3.09, + "grad_norm": 0.53515625, + "learning_rate": 2.9739510660765923e-05, + "loss": 0.3053, + "step": 23177 + }, + { + "epoch": 3.09, + "grad_norm": 0.53125, + "learning_rate": 2.9731224773874766e-05, + "loss": 0.1367, + "step": 23178 + }, + { + "epoch": 3.09, + "grad_norm": 0.60546875, + "learning_rate": 2.9722939839873333e-05, + "loss": 0.3761, + "step": 23179 + }, + { + "epoch": 3.09, + "grad_norm": 0.69921875, + "learning_rate": 2.9714655858873908e-05, + "loss": 0.373, + "step": 23180 + }, + { + "epoch": 3.09, + "grad_norm": 0.466796875, + "learning_rate": 2.970637283098886e-05, + "loss": 0.2905, + "step": 23181 + }, + { + "epoch": 3.09, + "grad_norm": 0.5625, + "learning_rate": 2.969809075633051e-05, + "loss": 0.1577, + "step": 23182 + }, + { + "epoch": 3.09, + "grad_norm": 0.3984375, + "learning_rate": 2.96898096350112e-05, + "loss": 0.2136, + "step": 23183 + }, + { + "epoch": 3.09, + "grad_norm": 0.578125, + "learning_rate": 2.96815294671432e-05, + "loss": 0.384, + "step": 23184 + }, + { + "epoch": 3.09, + "grad_norm": 0.609375, + "learning_rate": 2.9673250252838758e-05, + "loss": 0.3358, + "step": 23185 + }, + { + "epoch": 3.09, + "grad_norm": 0.69140625, + "learning_rate": 2.9664971992210178e-05, + "loss": 0.4473, + "step": 23186 + }, + { + "epoch": 3.09, + "grad_norm": 0.57421875, + "learning_rate": 2.9656694685369713e-05, + "loss": 0.207, + "step": 23187 + }, + { + "epoch": 3.09, + "grad_norm": 0.53515625, + "learning_rate": 2.9648418332429618e-05, + "loss": 0.2198, + "step": 23188 + }, + { + "epoch": 3.09, + "grad_norm": 0.59765625, + "learning_rate": 2.9640142933502146e-05, + "loss": 0.3064, + "step": 23189 + }, + { + "epoch": 3.09, + "grad_norm": 0.6640625, + "learning_rate": 2.9631868488699476e-05, + "loss": 0.3893, + "step": 23190 + }, + { + "epoch": 3.09, + "grad_norm": 0.4765625, + "learning_rate": 2.962359499813382e-05, + "loss": 0.3665, + "step": 23191 + }, + { + "epoch": 3.09, + "grad_norm": 0.75390625, + "learning_rate": 2.9615322461917384e-05, + "loss": 0.2961, + "step": 23192 + }, + { + "epoch": 3.09, + "grad_norm": 0.5703125, + "learning_rate": 2.960705088016238e-05, + "loss": 0.2795, + "step": 23193 + }, + { + "epoch": 3.1, + "grad_norm": 0.546875, + "learning_rate": 2.959878025298092e-05, + "loss": 0.1892, + "step": 23194 + }, + { + "epoch": 3.1, + "grad_norm": 0.94921875, + "learning_rate": 2.9590510580485175e-05, + "loss": 0.3713, + "step": 23195 + }, + { + "epoch": 3.1, + "grad_norm": 0.49609375, + "learning_rate": 2.958224186278733e-05, + "loss": 0.2316, + "step": 23196 + }, + { + "epoch": 3.1, + "grad_norm": 0.51953125, + "learning_rate": 2.9573974099999447e-05, + "loss": 0.2904, + "step": 23197 + }, + { + "epoch": 3.1, + "grad_norm": 0.451171875, + "learning_rate": 2.9565707292233702e-05, + "loss": 0.2995, + "step": 23198 + }, + { + "epoch": 3.1, + "grad_norm": 0.609375, + "learning_rate": 2.955744143960214e-05, + "loss": 0.2702, + "step": 23199 + }, + { + "epoch": 3.1, + "grad_norm": 0.5859375, + "learning_rate": 2.954917654221688e-05, + "loss": 0.3282, + "step": 23200 + }, + { + "epoch": 3.1, + "grad_norm": 0.60546875, + "learning_rate": 2.9540912600190006e-05, + "loss": 0.4071, + "step": 23201 + }, + { + "epoch": 3.1, + "grad_norm": 0.6953125, + "learning_rate": 2.953264961363361e-05, + "loss": 0.2605, + "step": 23202 + }, + { + "epoch": 3.1, + "grad_norm": 0.6796875, + "learning_rate": 2.9524387582659672e-05, + "loss": 0.3319, + "step": 23203 + }, + { + "epoch": 3.1, + "grad_norm": 0.84375, + "learning_rate": 2.9516126507380282e-05, + "loss": 0.4156, + "step": 23204 + }, + { + "epoch": 3.1, + "grad_norm": 0.6171875, + "learning_rate": 2.9507866387907445e-05, + "loss": 0.3359, + "step": 23205 + }, + { + "epoch": 3.1, + "grad_norm": 0.58203125, + "learning_rate": 2.9499607224353177e-05, + "loss": 0.5006, + "step": 23206 + }, + { + "epoch": 3.1, + "grad_norm": 0.48046875, + "learning_rate": 2.9491349016829515e-05, + "loss": 0.1398, + "step": 23207 + }, + { + "epoch": 3.1, + "grad_norm": 0.4609375, + "learning_rate": 2.9483091765448422e-05, + "loss": 0.2148, + "step": 23208 + }, + { + "epoch": 3.1, + "grad_norm": 0.455078125, + "learning_rate": 2.9474835470321827e-05, + "loss": 0.1994, + "step": 23209 + }, + { + "epoch": 3.1, + "grad_norm": 0.51953125, + "learning_rate": 2.9466580131561715e-05, + "loss": 0.3008, + "step": 23210 + }, + { + "epoch": 3.1, + "grad_norm": 0.5234375, + "learning_rate": 2.9458325749280057e-05, + "loss": 0.1824, + "step": 23211 + }, + { + "epoch": 3.1, + "grad_norm": 0.65625, + "learning_rate": 2.9450072323588806e-05, + "loss": 0.3572, + "step": 23212 + }, + { + "epoch": 3.1, + "grad_norm": 0.5859375, + "learning_rate": 2.9441819854599816e-05, + "loss": 0.3635, + "step": 23213 + }, + { + "epoch": 3.1, + "grad_norm": 0.62109375, + "learning_rate": 2.9433568342425034e-05, + "loss": 0.2298, + "step": 23214 + }, + { + "epoch": 3.1, + "grad_norm": 0.63671875, + "learning_rate": 2.9425317787176355e-05, + "loss": 0.2853, + "step": 23215 + }, + { + "epoch": 3.1, + "grad_norm": 0.6875, + "learning_rate": 2.941706818896569e-05, + "loss": 0.1901, + "step": 23216 + }, + { + "epoch": 3.1, + "grad_norm": 0.484375, + "learning_rate": 2.9408819547904855e-05, + "loss": 0.4619, + "step": 23217 + }, + { + "epoch": 3.1, + "grad_norm": 0.76171875, + "learning_rate": 2.940057186410573e-05, + "loss": 0.235, + "step": 23218 + }, + { + "epoch": 3.1, + "grad_norm": 0.484375, + "learning_rate": 2.9392325137680155e-05, + "loss": 0.3549, + "step": 23219 + }, + { + "epoch": 3.1, + "grad_norm": 0.6484375, + "learning_rate": 2.9384079368740003e-05, + "loss": 0.2547, + "step": 23220 + }, + { + "epoch": 3.1, + "grad_norm": 0.6328125, + "learning_rate": 2.937583455739705e-05, + "loss": 0.3882, + "step": 23221 + }, + { + "epoch": 3.1, + "grad_norm": 0.515625, + "learning_rate": 2.9367590703763083e-05, + "loss": 0.1994, + "step": 23222 + }, + { + "epoch": 3.1, + "grad_norm": 0.5234375, + "learning_rate": 2.9359347807949923e-05, + "loss": 0.3076, + "step": 23223 + }, + { + "epoch": 3.1, + "grad_norm": 0.72265625, + "learning_rate": 2.9351105870069328e-05, + "loss": 0.391, + "step": 23224 + }, + { + "epoch": 3.1, + "grad_norm": 0.6328125, + "learning_rate": 2.9342864890233123e-05, + "loss": 0.253, + "step": 23225 + }, + { + "epoch": 3.1, + "grad_norm": 0.5625, + "learning_rate": 2.9334624868552972e-05, + "loss": 0.2474, + "step": 23226 + }, + { + "epoch": 3.1, + "grad_norm": 0.62109375, + "learning_rate": 2.9326385805140676e-05, + "loss": 0.3386, + "step": 23227 + }, + { + "epoch": 3.1, + "grad_norm": 0.62109375, + "learning_rate": 2.9318147700107945e-05, + "loss": 0.3358, + "step": 23228 + }, + { + "epoch": 3.1, + "grad_norm": 0.625, + "learning_rate": 2.9309910553566524e-05, + "loss": 0.3063, + "step": 23229 + }, + { + "epoch": 3.1, + "grad_norm": 0.625, + "learning_rate": 2.930167436562804e-05, + "loss": 0.2297, + "step": 23230 + }, + { + "epoch": 3.1, + "grad_norm": 0.53515625, + "learning_rate": 2.9293439136404244e-05, + "loss": 0.2757, + "step": 23231 + }, + { + "epoch": 3.1, + "grad_norm": 0.7109375, + "learning_rate": 2.928520486600681e-05, + "loss": 0.551, + "step": 23232 + }, + { + "epoch": 3.1, + "grad_norm": 0.81640625, + "learning_rate": 2.927697155454736e-05, + "loss": 0.3204, + "step": 23233 + }, + { + "epoch": 3.1, + "grad_norm": 0.6875, + "learning_rate": 2.9268739202137562e-05, + "loss": 0.2717, + "step": 23234 + }, + { + "epoch": 3.1, + "grad_norm": 0.91796875, + "learning_rate": 2.9260507808889093e-05, + "loss": 0.8856, + "step": 23235 + }, + { + "epoch": 3.1, + "grad_norm": 0.58984375, + "learning_rate": 2.9252277374913494e-05, + "loss": 0.2665, + "step": 23236 + }, + { + "epoch": 3.1, + "grad_norm": 0.7578125, + "learning_rate": 2.9244047900322435e-05, + "loss": 0.2943, + "step": 23237 + }, + { + "epoch": 3.1, + "grad_norm": 0.53125, + "learning_rate": 2.9235819385227482e-05, + "loss": 0.1648, + "step": 23238 + }, + { + "epoch": 3.1, + "grad_norm": 0.63671875, + "learning_rate": 2.9227591829740275e-05, + "loss": 0.3019, + "step": 23239 + }, + { + "epoch": 3.1, + "grad_norm": 0.67578125, + "learning_rate": 2.9219365233972307e-05, + "loss": 0.3254, + "step": 23240 + }, + { + "epoch": 3.1, + "grad_norm": 0.54296875, + "learning_rate": 2.9211139598035187e-05, + "loss": 0.3235, + "step": 23241 + }, + { + "epoch": 3.1, + "grad_norm": 0.796875, + "learning_rate": 2.9202914922040425e-05, + "loss": 0.5826, + "step": 23242 + }, + { + "epoch": 3.1, + "grad_norm": 0.5234375, + "learning_rate": 2.9194691206099622e-05, + "loss": 0.1384, + "step": 23243 + }, + { + "epoch": 3.1, + "grad_norm": 0.6953125, + "learning_rate": 2.9186468450324245e-05, + "loss": 0.3744, + "step": 23244 + }, + { + "epoch": 3.1, + "grad_norm": 0.69921875, + "learning_rate": 2.917824665482577e-05, + "loss": 0.4407, + "step": 23245 + }, + { + "epoch": 3.1, + "grad_norm": 0.5625, + "learning_rate": 2.917002581971572e-05, + "loss": 0.346, + "step": 23246 + }, + { + "epoch": 3.1, + "grad_norm": 0.5234375, + "learning_rate": 2.916180594510558e-05, + "loss": 0.1768, + "step": 23247 + }, + { + "epoch": 3.1, + "grad_norm": 0.78515625, + "learning_rate": 2.9153587031106854e-05, + "loss": 0.218, + "step": 23248 + }, + { + "epoch": 3.1, + "grad_norm": 0.5859375, + "learning_rate": 2.914536907783092e-05, + "loss": 0.4401, + "step": 23249 + }, + { + "epoch": 3.1, + "grad_norm": 0.578125, + "learning_rate": 2.913715208538925e-05, + "loss": 0.3971, + "step": 23250 + }, + { + "epoch": 3.1, + "grad_norm": 0.6328125, + "learning_rate": 2.9128936053893284e-05, + "loss": 0.3872, + "step": 23251 + }, + { + "epoch": 3.1, + "grad_norm": 0.625, + "learning_rate": 2.9120720983454463e-05, + "loss": 0.2872, + "step": 23252 + }, + { + "epoch": 3.1, + "grad_norm": 0.56640625, + "learning_rate": 2.911250687418412e-05, + "loss": 0.2506, + "step": 23253 + }, + { + "epoch": 3.1, + "grad_norm": 0.609375, + "learning_rate": 2.9104293726193678e-05, + "loss": 0.3186, + "step": 23254 + }, + { + "epoch": 3.1, + "grad_norm": 0.55078125, + "learning_rate": 2.909608153959451e-05, + "loss": 0.2855, + "step": 23255 + }, + { + "epoch": 3.1, + "grad_norm": 0.46875, + "learning_rate": 2.9087870314498022e-05, + "loss": 0.1182, + "step": 23256 + }, + { + "epoch": 3.1, + "grad_norm": 0.5390625, + "learning_rate": 2.9079660051015523e-05, + "loss": 0.166, + "step": 23257 + }, + { + "epoch": 3.1, + "grad_norm": 0.84765625, + "learning_rate": 2.9071450749258312e-05, + "loss": 0.4015, + "step": 23258 + }, + { + "epoch": 3.1, + "grad_norm": 0.63671875, + "learning_rate": 2.9063242409337753e-05, + "loss": 0.3262, + "step": 23259 + }, + { + "epoch": 3.1, + "grad_norm": 0.69140625, + "learning_rate": 2.9055035031365164e-05, + "loss": 0.3081, + "step": 23260 + }, + { + "epoch": 3.1, + "grad_norm": 0.44921875, + "learning_rate": 2.9046828615451826e-05, + "loss": 0.3481, + "step": 23261 + }, + { + "epoch": 3.1, + "grad_norm": 0.5546875, + "learning_rate": 2.9038623161709055e-05, + "loss": 0.5286, + "step": 23262 + }, + { + "epoch": 3.1, + "grad_norm": 0.52734375, + "learning_rate": 2.9030418670248072e-05, + "loss": 0.2525, + "step": 23263 + }, + { + "epoch": 3.1, + "grad_norm": 0.5390625, + "learning_rate": 2.9022215141180154e-05, + "loss": 0.2141, + "step": 23264 + }, + { + "epoch": 3.1, + "grad_norm": 0.48828125, + "learning_rate": 2.9014012574616557e-05, + "loss": 0.3161, + "step": 23265 + }, + { + "epoch": 3.1, + "grad_norm": 0.47265625, + "learning_rate": 2.9005810970668536e-05, + "loss": 0.1539, + "step": 23266 + }, + { + "epoch": 3.1, + "grad_norm": 0.7109375, + "learning_rate": 2.8997610329447243e-05, + "loss": 0.3855, + "step": 23267 + }, + { + "epoch": 3.1, + "grad_norm": 0.71484375, + "learning_rate": 2.8989410651063965e-05, + "loss": 0.5112, + "step": 23268 + }, + { + "epoch": 3.11, + "grad_norm": 0.5625, + "learning_rate": 2.8981211935629825e-05, + "loss": 0.1555, + "step": 23269 + }, + { + "epoch": 3.11, + "grad_norm": 0.66796875, + "learning_rate": 2.897301418325602e-05, + "loss": 0.4637, + "step": 23270 + }, + { + "epoch": 3.11, + "grad_norm": 0.478515625, + "learning_rate": 2.8964817394053768e-05, + "loss": 0.2495, + "step": 23271 + }, + { + "epoch": 3.11, + "grad_norm": 0.6328125, + "learning_rate": 2.8956621568134146e-05, + "loss": 0.2378, + "step": 23272 + }, + { + "epoch": 3.11, + "grad_norm": 0.671875, + "learning_rate": 2.8948426705608335e-05, + "loss": 0.3743, + "step": 23273 + }, + { + "epoch": 3.11, + "grad_norm": 0.55859375, + "learning_rate": 2.894023280658745e-05, + "loss": 0.1421, + "step": 23274 + }, + { + "epoch": 3.11, + "grad_norm": 0.765625, + "learning_rate": 2.893203987118265e-05, + "loss": 0.3899, + "step": 23275 + }, + { + "epoch": 3.11, + "grad_norm": 0.59375, + "learning_rate": 2.8923847899504976e-05, + "loss": 0.2161, + "step": 23276 + }, + { + "epoch": 3.11, + "grad_norm": 0.7109375, + "learning_rate": 2.8915656891665534e-05, + "loss": 0.2722, + "step": 23277 + }, + { + "epoch": 3.11, + "grad_norm": 0.53125, + "learning_rate": 2.89074668477754e-05, + "loss": 0.3225, + "step": 23278 + }, + { + "epoch": 3.11, + "grad_norm": 0.625, + "learning_rate": 2.8899277767945654e-05, + "loss": 0.3579, + "step": 23279 + }, + { + "epoch": 3.11, + "grad_norm": 0.58203125, + "learning_rate": 2.8891089652287352e-05, + "loss": 0.3671, + "step": 23280 + }, + { + "epoch": 3.11, + "grad_norm": 0.671875, + "learning_rate": 2.8882902500911524e-05, + "loss": 0.343, + "step": 23281 + }, + { + "epoch": 3.11, + "grad_norm": 0.6015625, + "learning_rate": 2.887471631392914e-05, + "loss": 0.2447, + "step": 23282 + }, + { + "epoch": 3.11, + "grad_norm": 0.75, + "learning_rate": 2.8866531091451264e-05, + "loss": 0.5267, + "step": 23283 + }, + { + "epoch": 3.11, + "grad_norm": 0.625, + "learning_rate": 2.8858346833588868e-05, + "loss": 0.2318, + "step": 23284 + }, + { + "epoch": 3.11, + "grad_norm": 0.69921875, + "learning_rate": 2.885016354045298e-05, + "loss": 0.4006, + "step": 23285 + }, + { + "epoch": 3.11, + "grad_norm": 0.50390625, + "learning_rate": 2.8841981212154513e-05, + "loss": 0.2466, + "step": 23286 + }, + { + "epoch": 3.11, + "grad_norm": 0.466796875, + "learning_rate": 2.8833799848804444e-05, + "loss": 0.4133, + "step": 23287 + }, + { + "epoch": 3.11, + "grad_norm": 0.56640625, + "learning_rate": 2.882561945051372e-05, + "loss": 0.4343, + "step": 23288 + }, + { + "epoch": 3.11, + "grad_norm": 0.63671875, + "learning_rate": 2.8817440017393316e-05, + "loss": 0.4958, + "step": 23289 + }, + { + "epoch": 3.11, + "grad_norm": 0.6640625, + "learning_rate": 2.8809261549554067e-05, + "loss": 0.5027, + "step": 23290 + }, + { + "epoch": 3.11, + "grad_norm": 0.640625, + "learning_rate": 2.8801084047106928e-05, + "loss": 0.3597, + "step": 23291 + }, + { + "epoch": 3.11, + "grad_norm": 0.65625, + "learning_rate": 2.8792907510162793e-05, + "loss": 0.6202, + "step": 23292 + }, + { + "epoch": 3.11, + "grad_norm": 0.7265625, + "learning_rate": 2.8784731938832556e-05, + "loss": 0.2713, + "step": 23293 + }, + { + "epoch": 3.11, + "grad_norm": 0.64453125, + "learning_rate": 2.877655733322706e-05, + "loss": 0.4266, + "step": 23294 + }, + { + "epoch": 3.11, + "grad_norm": 0.435546875, + "learning_rate": 2.8768383693457123e-05, + "loss": 0.1142, + "step": 23295 + }, + { + "epoch": 3.11, + "grad_norm": 0.546875, + "learning_rate": 2.876021101963362e-05, + "loss": 0.2721, + "step": 23296 + }, + { + "epoch": 3.11, + "grad_norm": 0.330078125, + "learning_rate": 2.8752039311867386e-05, + "loss": 0.1362, + "step": 23297 + }, + { + "epoch": 3.11, + "grad_norm": 0.66015625, + "learning_rate": 2.8743868570269238e-05, + "loss": 0.5396, + "step": 23298 + }, + { + "epoch": 3.11, + "grad_norm": 0.56640625, + "learning_rate": 2.873569879494995e-05, + "loss": 0.3424, + "step": 23299 + }, + { + "epoch": 3.11, + "grad_norm": 0.484375, + "learning_rate": 2.872752998602032e-05, + "loss": 0.1967, + "step": 23300 + }, + { + "epoch": 3.11, + "grad_norm": 0.62109375, + "learning_rate": 2.8719362143591123e-05, + "loss": 0.331, + "step": 23301 + }, + { + "epoch": 3.11, + "grad_norm": 0.490234375, + "learning_rate": 2.871119526777315e-05, + "loss": 0.2722, + "step": 23302 + }, + { + "epoch": 3.11, + "grad_norm": 0.6484375, + "learning_rate": 2.8703029358677103e-05, + "loss": 0.2546, + "step": 23303 + }, + { + "epoch": 3.11, + "grad_norm": 0.60546875, + "learning_rate": 2.8694864416413726e-05, + "loss": 0.1505, + "step": 23304 + }, + { + "epoch": 3.11, + "grad_norm": 0.78125, + "learning_rate": 2.8686700441093784e-05, + "loss": 0.277, + "step": 23305 + }, + { + "epoch": 3.11, + "grad_norm": 0.69140625, + "learning_rate": 2.8678537432827925e-05, + "loss": 0.4128, + "step": 23306 + }, + { + "epoch": 3.11, + "grad_norm": 0.74609375, + "learning_rate": 2.8670375391726867e-05, + "loss": 0.2191, + "step": 23307 + }, + { + "epoch": 3.11, + "grad_norm": 0.734375, + "learning_rate": 2.8662214317901324e-05, + "loss": 0.2866, + "step": 23308 + }, + { + "epoch": 3.11, + "grad_norm": 0.70703125, + "learning_rate": 2.8654054211461922e-05, + "loss": 0.4029, + "step": 23309 + }, + { + "epoch": 3.11, + "grad_norm": 0.67578125, + "learning_rate": 2.8645895072519325e-05, + "loss": 0.3219, + "step": 23310 + }, + { + "epoch": 3.11, + "grad_norm": 0.78515625, + "learning_rate": 2.8637736901184188e-05, + "loss": 0.242, + "step": 23311 + }, + { + "epoch": 3.11, + "grad_norm": 0.50390625, + "learning_rate": 2.8629579697567167e-05, + "loss": 0.1972, + "step": 23312 + }, + { + "epoch": 3.11, + "grad_norm": 0.4140625, + "learning_rate": 2.862142346177883e-05, + "loss": 0.1138, + "step": 23313 + }, + { + "epoch": 3.11, + "grad_norm": 0.458984375, + "learning_rate": 2.8613268193929787e-05, + "loss": 0.1208, + "step": 23314 + }, + { + "epoch": 3.11, + "grad_norm": 0.64453125, + "learning_rate": 2.8605113894130653e-05, + "loss": 0.3778, + "step": 23315 + }, + { + "epoch": 3.11, + "grad_norm": 0.640625, + "learning_rate": 2.8596960562492027e-05, + "loss": 0.3684, + "step": 23316 + }, + { + "epoch": 3.11, + "grad_norm": 0.435546875, + "learning_rate": 2.858880819912444e-05, + "loss": 0.3239, + "step": 23317 + }, + { + "epoch": 3.11, + "grad_norm": 0.94140625, + "learning_rate": 2.8580656804138405e-05, + "loss": 0.1497, + "step": 23318 + }, + { + "epoch": 3.11, + "grad_norm": 0.35546875, + "learning_rate": 2.8572506377644504e-05, + "loss": 0.1364, + "step": 23319 + }, + { + "epoch": 3.11, + "grad_norm": 0.5546875, + "learning_rate": 2.8564356919753267e-05, + "loss": 0.2198, + "step": 23320 + }, + { + "epoch": 3.11, + "grad_norm": 0.875, + "learning_rate": 2.8556208430575227e-05, + "loss": 0.398, + "step": 23321 + }, + { + "epoch": 3.11, + "grad_norm": 0.51953125, + "learning_rate": 2.8548060910220818e-05, + "loss": 0.2972, + "step": 23322 + }, + { + "epoch": 3.11, + "grad_norm": 0.69140625, + "learning_rate": 2.8539914358800557e-05, + "loss": 0.3929, + "step": 23323 + }, + { + "epoch": 3.11, + "grad_norm": 0.66796875, + "learning_rate": 2.8531768776424927e-05, + "loss": 0.3098, + "step": 23324 + }, + { + "epoch": 3.11, + "grad_norm": 0.58984375, + "learning_rate": 2.852362416320441e-05, + "loss": 0.2157, + "step": 23325 + }, + { + "epoch": 3.11, + "grad_norm": 0.51953125, + "learning_rate": 2.851548051924938e-05, + "loss": 0.4781, + "step": 23326 + }, + { + "epoch": 3.11, + "grad_norm": 0.6171875, + "learning_rate": 2.8507337844670324e-05, + "loss": 0.3046, + "step": 23327 + }, + { + "epoch": 3.11, + "grad_norm": 0.4765625, + "learning_rate": 2.8499196139577656e-05, + "loss": 0.2263, + "step": 23328 + }, + { + "epoch": 3.11, + "grad_norm": 0.62109375, + "learning_rate": 2.84910554040818e-05, + "loss": 0.3336, + "step": 23329 + }, + { + "epoch": 3.11, + "grad_norm": 0.70703125, + "learning_rate": 2.8482915638293128e-05, + "loss": 0.4352, + "step": 23330 + }, + { + "epoch": 3.11, + "grad_norm": 0.56640625, + "learning_rate": 2.8474776842321982e-05, + "loss": 0.3661, + "step": 23331 + }, + { + "epoch": 3.11, + "grad_norm": 0.53125, + "learning_rate": 2.8466639016278773e-05, + "loss": 0.1973, + "step": 23332 + }, + { + "epoch": 3.11, + "grad_norm": 0.68359375, + "learning_rate": 2.8458502160273847e-05, + "loss": 0.2366, + "step": 23333 + }, + { + "epoch": 3.11, + "grad_norm": 0.6484375, + "learning_rate": 2.845036627441755e-05, + "loss": 0.3143, + "step": 23334 + }, + { + "epoch": 3.11, + "grad_norm": 0.5625, + "learning_rate": 2.8442231358820238e-05, + "loss": 0.2771, + "step": 23335 + }, + { + "epoch": 3.11, + "grad_norm": 0.578125, + "learning_rate": 2.8434097413592166e-05, + "loss": 0.3123, + "step": 23336 + }, + { + "epoch": 3.11, + "grad_norm": 0.54296875, + "learning_rate": 2.8425964438843657e-05, + "loss": 0.368, + "step": 23337 + }, + { + "epoch": 3.11, + "grad_norm": 0.65234375, + "learning_rate": 2.841783243468501e-05, + "loss": 0.4435, + "step": 23338 + }, + { + "epoch": 3.11, + "grad_norm": 0.63671875, + "learning_rate": 2.840970140122652e-05, + "loss": 0.1934, + "step": 23339 + }, + { + "epoch": 3.11, + "grad_norm": 0.58203125, + "learning_rate": 2.84015713385784e-05, + "loss": 0.2495, + "step": 23340 + }, + { + "epoch": 3.11, + "grad_norm": 0.6875, + "learning_rate": 2.8393442246850954e-05, + "loss": 0.2992, + "step": 23341 + }, + { + "epoch": 3.11, + "grad_norm": 0.7578125, + "learning_rate": 2.8385314126154362e-05, + "loss": 0.3763, + "step": 23342 + }, + { + "epoch": 3.11, + "grad_norm": 0.494140625, + "learning_rate": 2.837718697659887e-05, + "loss": 0.197, + "step": 23343 + }, + { + "epoch": 3.12, + "grad_norm": 0.55078125, + "learning_rate": 2.8369060798294722e-05, + "loss": 0.3432, + "step": 23344 + }, + { + "epoch": 3.12, + "grad_norm": 0.6796875, + "learning_rate": 2.8360935591352045e-05, + "loss": 0.2432, + "step": 23345 + }, + { + "epoch": 3.12, + "grad_norm": 0.478515625, + "learning_rate": 2.835281135588106e-05, + "loss": 0.4098, + "step": 23346 + }, + { + "epoch": 3.12, + "grad_norm": 0.69921875, + "learning_rate": 2.8344688091991932e-05, + "loss": 0.1828, + "step": 23347 + }, + { + "epoch": 3.12, + "grad_norm": 0.5, + "learning_rate": 2.833656579979487e-05, + "loss": 0.1867, + "step": 23348 + }, + { + "epoch": 3.12, + "grad_norm": 0.53125, + "learning_rate": 2.8328444479399918e-05, + "loss": 0.4533, + "step": 23349 + }, + { + "epoch": 3.12, + "grad_norm": 0.68359375, + "learning_rate": 2.8320324130917276e-05, + "loss": 0.3701, + "step": 23350 + }, + { + "epoch": 3.12, + "grad_norm": 0.59765625, + "learning_rate": 2.8312204754457027e-05, + "loss": 0.2597, + "step": 23351 + }, + { + "epoch": 3.12, + "grad_norm": 0.55078125, + "learning_rate": 2.83040863501293e-05, + "loss": 0.3592, + "step": 23352 + }, + { + "epoch": 3.12, + "grad_norm": 0.6015625, + "learning_rate": 2.8295968918044226e-05, + "loss": 0.6222, + "step": 23353 + }, + { + "epoch": 3.12, + "grad_norm": 0.5859375, + "learning_rate": 2.828785245831178e-05, + "loss": 0.416, + "step": 23354 + }, + { + "epoch": 3.12, + "grad_norm": 0.5234375, + "learning_rate": 2.8279736971042082e-05, + "loss": 0.3408, + "step": 23355 + }, + { + "epoch": 3.12, + "grad_norm": 0.734375, + "learning_rate": 2.827162245634517e-05, + "loss": 0.508, + "step": 23356 + }, + { + "epoch": 3.12, + "grad_norm": 0.5546875, + "learning_rate": 2.8263508914331104e-05, + "loss": 0.3114, + "step": 23357 + }, + { + "epoch": 3.12, + "grad_norm": 0.7265625, + "learning_rate": 2.825539634510992e-05, + "loss": 0.6418, + "step": 23358 + }, + { + "epoch": 3.12, + "grad_norm": 0.57421875, + "learning_rate": 2.8247284748791582e-05, + "loss": 0.2453, + "step": 23359 + }, + { + "epoch": 3.12, + "grad_norm": 0.453125, + "learning_rate": 2.8239174125486113e-05, + "loss": 0.2098, + "step": 23360 + }, + { + "epoch": 3.12, + "grad_norm": 0.6171875, + "learning_rate": 2.823106447530349e-05, + "loss": 0.3407, + "step": 23361 + }, + { + "epoch": 3.12, + "grad_norm": 0.5, + "learning_rate": 2.8222955798353734e-05, + "loss": 0.3226, + "step": 23362 + }, + { + "epoch": 3.12, + "grad_norm": 0.61328125, + "learning_rate": 2.8214848094746728e-05, + "loss": 0.33, + "step": 23363 + }, + { + "epoch": 3.12, + "grad_norm": 0.6171875, + "learning_rate": 2.8206741364592458e-05, + "loss": 0.2831, + "step": 23364 + }, + { + "epoch": 3.12, + "grad_norm": 0.66796875, + "learning_rate": 2.8198635608000846e-05, + "loss": 0.4051, + "step": 23365 + }, + { + "epoch": 3.12, + "grad_norm": 0.470703125, + "learning_rate": 2.819053082508185e-05, + "loss": 0.1733, + "step": 23366 + }, + { + "epoch": 3.12, + "grad_norm": 0.765625, + "learning_rate": 2.8182427015945345e-05, + "loss": 0.4279, + "step": 23367 + }, + { + "epoch": 3.12, + "grad_norm": 0.62109375, + "learning_rate": 2.8174324180701196e-05, + "loss": 0.1237, + "step": 23368 + }, + { + "epoch": 3.12, + "grad_norm": 0.54296875, + "learning_rate": 2.81662223194593e-05, + "loss": 0.379, + "step": 23369 + }, + { + "epoch": 3.12, + "grad_norm": 0.51953125, + "learning_rate": 2.815812143232953e-05, + "loss": 0.1845, + "step": 23370 + }, + { + "epoch": 3.12, + "grad_norm": 0.7421875, + "learning_rate": 2.8150021519421776e-05, + "loss": 0.4667, + "step": 23371 + }, + { + "epoch": 3.12, + "grad_norm": 0.5390625, + "learning_rate": 2.8141922580845816e-05, + "loss": 0.1549, + "step": 23372 + }, + { + "epoch": 3.12, + "grad_norm": 0.546875, + "learning_rate": 2.8133824616711503e-05, + "loss": 0.3293, + "step": 23373 + }, + { + "epoch": 3.12, + "grad_norm": 0.54296875, + "learning_rate": 2.812572762712864e-05, + "loss": 0.3815, + "step": 23374 + }, + { + "epoch": 3.12, + "grad_norm": 0.546875, + "learning_rate": 2.8117631612207084e-05, + "loss": 0.3396, + "step": 23375 + }, + { + "epoch": 3.12, + "grad_norm": 0.73828125, + "learning_rate": 2.810953657205654e-05, + "loss": 0.3356, + "step": 23376 + }, + { + "epoch": 3.12, + "grad_norm": 0.67578125, + "learning_rate": 2.8101442506786822e-05, + "loss": 0.3597, + "step": 23377 + }, + { + "epoch": 3.12, + "grad_norm": 0.48046875, + "learning_rate": 2.8093349416507708e-05, + "loss": 0.1252, + "step": 23378 + }, + { + "epoch": 3.12, + "grad_norm": 0.546875, + "learning_rate": 2.8085257301328904e-05, + "loss": 0.4661, + "step": 23379 + }, + { + "epoch": 3.12, + "grad_norm": 0.494140625, + "learning_rate": 2.807716616136017e-05, + "loss": 0.2592, + "step": 23380 + }, + { + "epoch": 3.12, + "grad_norm": 0.50390625, + "learning_rate": 2.806907599671125e-05, + "loss": 0.1357, + "step": 23381 + }, + { + "epoch": 3.12, + "grad_norm": 0.66015625, + "learning_rate": 2.8060986807491785e-05, + "loss": 0.3612, + "step": 23382 + }, + { + "epoch": 3.12, + "grad_norm": 0.63671875, + "learning_rate": 2.8052898593811518e-05, + "loss": 0.3958, + "step": 23383 + }, + { + "epoch": 3.12, + "grad_norm": 0.7109375, + "learning_rate": 2.804481135578012e-05, + "loss": 0.2411, + "step": 23384 + }, + { + "epoch": 3.12, + "grad_norm": 0.78125, + "learning_rate": 2.8036725093507288e-05, + "loss": 0.2984, + "step": 23385 + }, + { + "epoch": 3.12, + "grad_norm": 0.54296875, + "learning_rate": 2.802863980710262e-05, + "loss": 0.29, + "step": 23386 + }, + { + "epoch": 3.12, + "grad_norm": 0.6484375, + "learning_rate": 2.8020555496675794e-05, + "loss": 0.372, + "step": 23387 + }, + { + "epoch": 3.12, + "grad_norm": 0.70703125, + "learning_rate": 2.8012472162336422e-05, + "loss": 0.2263, + "step": 23388 + }, + { + "epoch": 3.12, + "grad_norm": 0.6484375, + "learning_rate": 2.8004389804194163e-05, + "loss": 0.2018, + "step": 23389 + }, + { + "epoch": 3.12, + "grad_norm": 0.53515625, + "learning_rate": 2.7996308422358587e-05, + "loss": 0.2034, + "step": 23390 + }, + { + "epoch": 3.12, + "grad_norm": 0.490234375, + "learning_rate": 2.7988228016939243e-05, + "loss": 0.1764, + "step": 23391 + }, + { + "epoch": 3.12, + "grad_norm": 0.63671875, + "learning_rate": 2.798014858804575e-05, + "loss": 0.2067, + "step": 23392 + }, + { + "epoch": 3.12, + "grad_norm": 0.5703125, + "learning_rate": 2.7972070135787666e-05, + "loss": 0.2067, + "step": 23393 + }, + { + "epoch": 3.12, + "grad_norm": 0.7109375, + "learning_rate": 2.7963992660274563e-05, + "loss": 0.2744, + "step": 23394 + }, + { + "epoch": 3.12, + "grad_norm": 0.578125, + "learning_rate": 2.7955916161615924e-05, + "loss": 0.2988, + "step": 23395 + }, + { + "epoch": 3.12, + "grad_norm": 0.765625, + "learning_rate": 2.794784063992131e-05, + "loss": 0.4049, + "step": 23396 + }, + { + "epoch": 3.12, + "grad_norm": 0.52734375, + "learning_rate": 2.7939766095300202e-05, + "loss": 0.3808, + "step": 23397 + }, + { + "epoch": 3.12, + "grad_norm": 0.671875, + "learning_rate": 2.793169252786215e-05, + "loss": 0.231, + "step": 23398 + }, + { + "epoch": 3.12, + "grad_norm": 0.52734375, + "learning_rate": 2.7923619937716572e-05, + "loss": 0.2469, + "step": 23399 + }, + { + "epoch": 3.12, + "grad_norm": 0.73046875, + "learning_rate": 2.791554832497296e-05, + "loss": 0.3026, + "step": 23400 + }, + { + "epoch": 3.12, + "grad_norm": 0.8046875, + "learning_rate": 2.790747768974079e-05, + "loss": 0.537, + "step": 23401 + }, + { + "epoch": 3.12, + "grad_norm": 0.455078125, + "learning_rate": 2.7899408032129516e-05, + "loss": 0.2362, + "step": 23402 + }, + { + "epoch": 3.12, + "grad_norm": 0.578125, + "learning_rate": 2.7891339352248536e-05, + "loss": 0.3883, + "step": 23403 + }, + { + "epoch": 3.12, + "grad_norm": 0.75390625, + "learning_rate": 2.7883271650207242e-05, + "loss": 0.3683, + "step": 23404 + }, + { + "epoch": 3.12, + "grad_norm": 0.6328125, + "learning_rate": 2.7875204926115085e-05, + "loss": 0.2147, + "step": 23405 + }, + { + "epoch": 3.12, + "grad_norm": 0.6015625, + "learning_rate": 2.7867139180081427e-05, + "loss": 0.2961, + "step": 23406 + }, + { + "epoch": 3.12, + "grad_norm": 0.70703125, + "learning_rate": 2.7859074412215646e-05, + "loss": 0.2643, + "step": 23407 + }, + { + "epoch": 3.12, + "grad_norm": 0.75390625, + "learning_rate": 2.785101062262716e-05, + "loss": 0.3827, + "step": 23408 + }, + { + "epoch": 3.12, + "grad_norm": 0.62890625, + "learning_rate": 2.7842947811425247e-05, + "loss": 0.4103, + "step": 23409 + }, + { + "epoch": 3.12, + "grad_norm": 0.484375, + "learning_rate": 2.783488597871926e-05, + "loss": 0.193, + "step": 23410 + }, + { + "epoch": 3.12, + "grad_norm": 0.62890625, + "learning_rate": 2.7826825124618538e-05, + "loss": 0.3915, + "step": 23411 + }, + { + "epoch": 3.12, + "grad_norm": 0.68359375, + "learning_rate": 2.7818765249232424e-05, + "loss": 0.5564, + "step": 23412 + }, + { + "epoch": 3.12, + "grad_norm": 0.49609375, + "learning_rate": 2.7810706352670148e-05, + "loss": 0.3479, + "step": 23413 + }, + { + "epoch": 3.12, + "grad_norm": 0.53515625, + "learning_rate": 2.7802648435041046e-05, + "loss": 0.2732, + "step": 23414 + }, + { + "epoch": 3.12, + "grad_norm": 0.498046875, + "learning_rate": 2.7794591496454347e-05, + "loss": 0.1653, + "step": 23415 + }, + { + "epoch": 3.12, + "grad_norm": 0.50390625, + "learning_rate": 2.778653553701932e-05, + "loss": 0.2583, + "step": 23416 + }, + { + "epoch": 3.12, + "grad_norm": 0.6328125, + "learning_rate": 2.7778480556845254e-05, + "loss": 0.4203, + "step": 23417 + }, + { + "epoch": 3.12, + "grad_norm": 0.53125, + "learning_rate": 2.7770426556041306e-05, + "loss": 0.2103, + "step": 23418 + }, + { + "epoch": 3.13, + "grad_norm": 0.73828125, + "learning_rate": 2.7762373534716734e-05, + "loss": 0.4312, + "step": 23419 + }, + { + "epoch": 3.13, + "grad_norm": 0.84375, + "learning_rate": 2.775432149298075e-05, + "loss": 0.3753, + "step": 23420 + }, + { + "epoch": 3.13, + "grad_norm": 0.6796875, + "learning_rate": 2.7746270430942544e-05, + "loss": 0.297, + "step": 23421 + }, + { + "epoch": 3.13, + "grad_norm": 0.640625, + "learning_rate": 2.7738220348711263e-05, + "loss": 0.2552, + "step": 23422 + }, + { + "epoch": 3.13, + "grad_norm": 0.63671875, + "learning_rate": 2.7730171246396096e-05, + "loss": 0.2124, + "step": 23423 + }, + { + "epoch": 3.13, + "grad_norm": 0.6171875, + "learning_rate": 2.772212312410618e-05, + "loss": 0.2855, + "step": 23424 + }, + { + "epoch": 3.13, + "grad_norm": 0.57421875, + "learning_rate": 2.771407598195067e-05, + "loss": 0.3385, + "step": 23425 + }, + { + "epoch": 3.13, + "grad_norm": 0.578125, + "learning_rate": 2.7706029820038737e-05, + "loss": 0.2207, + "step": 23426 + }, + { + "epoch": 3.13, + "grad_norm": 0.515625, + "learning_rate": 2.769798463847938e-05, + "loss": 0.2656, + "step": 23427 + }, + { + "epoch": 3.13, + "grad_norm": 0.6171875, + "learning_rate": 2.7689940437381756e-05, + "loss": 0.3472, + "step": 23428 + }, + { + "epoch": 3.13, + "grad_norm": 0.58203125, + "learning_rate": 2.7681897216854948e-05, + "loss": 0.327, + "step": 23429 + }, + { + "epoch": 3.13, + "grad_norm": 0.42578125, + "learning_rate": 2.767385497700802e-05, + "loss": 0.2068, + "step": 23430 + }, + { + "epoch": 3.13, + "grad_norm": 0.73046875, + "learning_rate": 2.7665813717950074e-05, + "loss": 0.1678, + "step": 23431 + }, + { + "epoch": 3.13, + "grad_norm": 0.56640625, + "learning_rate": 2.765777343979008e-05, + "loss": 0.3115, + "step": 23432 + }, + { + "epoch": 3.13, + "grad_norm": 0.6015625, + "learning_rate": 2.764973414263712e-05, + "loss": 0.2072, + "step": 23433 + }, + { + "epoch": 3.13, + "grad_norm": 0.6171875, + "learning_rate": 2.7641695826600188e-05, + "loss": 0.5919, + "step": 23434 + }, + { + "epoch": 3.13, + "grad_norm": 0.66015625, + "learning_rate": 2.763365849178834e-05, + "loss": 0.2436, + "step": 23435 + }, + { + "epoch": 3.13, + "grad_norm": 0.5703125, + "learning_rate": 2.762562213831049e-05, + "loss": 0.2413, + "step": 23436 + }, + { + "epoch": 3.13, + "grad_norm": 0.310546875, + "learning_rate": 2.761758676627565e-05, + "loss": 0.0971, + "step": 23437 + }, + { + "epoch": 3.13, + "grad_norm": 0.6015625, + "learning_rate": 2.7609552375792824e-05, + "loss": 0.2181, + "step": 23438 + }, + { + "epoch": 3.13, + "grad_norm": 0.6171875, + "learning_rate": 2.7601518966970897e-05, + "loss": 0.2104, + "step": 23439 + }, + { + "epoch": 3.13, + "grad_norm": 0.65234375, + "learning_rate": 2.7593486539918866e-05, + "loss": 0.2358, + "step": 23440 + }, + { + "epoch": 3.13, + "grad_norm": 0.5703125, + "learning_rate": 2.7585455094745605e-05, + "loss": 0.3458, + "step": 23441 + }, + { + "epoch": 3.13, + "grad_norm": 0.5703125, + "learning_rate": 2.7577424631560034e-05, + "loss": 0.3717, + "step": 23442 + }, + { + "epoch": 3.13, + "grad_norm": 0.68359375, + "learning_rate": 2.7569395150471077e-05, + "loss": 0.3591, + "step": 23443 + }, + { + "epoch": 3.13, + "grad_norm": 0.59765625, + "learning_rate": 2.7561366651587627e-05, + "loss": 0.3296, + "step": 23444 + }, + { + "epoch": 3.13, + "grad_norm": 0.61328125, + "learning_rate": 2.7553339135018518e-05, + "loss": 0.3637, + "step": 23445 + }, + { + "epoch": 3.13, + "grad_norm": 0.5546875, + "learning_rate": 2.7545312600872618e-05, + "loss": 0.3566, + "step": 23446 + }, + { + "epoch": 3.13, + "grad_norm": 0.52734375, + "learning_rate": 2.7537287049258787e-05, + "loss": 0.3355, + "step": 23447 + }, + { + "epoch": 3.13, + "grad_norm": 0.52734375, + "learning_rate": 2.752926248028588e-05, + "loss": 0.2751, + "step": 23448 + }, + { + "epoch": 3.13, + "grad_norm": 0.40625, + "learning_rate": 2.752123889406265e-05, + "loss": 0.1557, + "step": 23449 + }, + { + "epoch": 3.13, + "grad_norm": 0.60546875, + "learning_rate": 2.751321629069794e-05, + "loss": 0.332, + "step": 23450 + }, + { + "epoch": 3.13, + "grad_norm": 0.7109375, + "learning_rate": 2.750519467030056e-05, + "loss": 0.3655, + "step": 23451 + }, + { + "epoch": 3.13, + "grad_norm": 0.5390625, + "learning_rate": 2.7497174032979245e-05, + "loss": 0.2146, + "step": 23452 + }, + { + "epoch": 3.13, + "grad_norm": 0.578125, + "learning_rate": 2.7489154378842786e-05, + "loss": 0.3155, + "step": 23453 + }, + { + "epoch": 3.13, + "grad_norm": 0.609375, + "learning_rate": 2.7481135707999962e-05, + "loss": 0.182, + "step": 23454 + }, + { + "epoch": 3.13, + "grad_norm": 0.5234375, + "learning_rate": 2.747311802055944e-05, + "loss": 0.3312, + "step": 23455 + }, + { + "epoch": 3.13, + "grad_norm": 0.81640625, + "learning_rate": 2.7465101316629994e-05, + "loss": 0.4573, + "step": 23456 + }, + { + "epoch": 3.13, + "grad_norm": 0.8203125, + "learning_rate": 2.745708559632032e-05, + "loss": 0.4217, + "step": 23457 + }, + { + "epoch": 3.13, + "grad_norm": 0.5, + "learning_rate": 2.7449070859739155e-05, + "loss": 0.1934, + "step": 23458 + }, + { + "epoch": 3.13, + "grad_norm": 0.640625, + "learning_rate": 2.7441057106995118e-05, + "loss": 0.4079, + "step": 23459 + }, + { + "epoch": 3.13, + "grad_norm": 0.462890625, + "learning_rate": 2.7433044338196922e-05, + "loss": 0.1271, + "step": 23460 + }, + { + "epoch": 3.13, + "grad_norm": 0.55859375, + "learning_rate": 2.742503255345321e-05, + "loss": 0.5049, + "step": 23461 + }, + { + "epoch": 3.13, + "grad_norm": 0.5234375, + "learning_rate": 2.7417021752872675e-05, + "loss": 0.1964, + "step": 23462 + }, + { + "epoch": 3.13, + "grad_norm": 0.5859375, + "learning_rate": 2.74090119365639e-05, + "loss": 0.3823, + "step": 23463 + }, + { + "epoch": 3.13, + "grad_norm": 0.68359375, + "learning_rate": 2.7401003104635493e-05, + "loss": 0.3112, + "step": 23464 + }, + { + "epoch": 3.13, + "grad_norm": 0.52734375, + "learning_rate": 2.7392995257196062e-05, + "loss": 0.1672, + "step": 23465 + }, + { + "epoch": 3.13, + "grad_norm": 0.546875, + "learning_rate": 2.7384988394354227e-05, + "loss": 0.4136, + "step": 23466 + }, + { + "epoch": 3.13, + "grad_norm": 0.5234375, + "learning_rate": 2.7376982516218575e-05, + "loss": 0.1863, + "step": 23467 + }, + { + "epoch": 3.13, + "grad_norm": 0.4765625, + "learning_rate": 2.7368977622897616e-05, + "loss": 0.2629, + "step": 23468 + }, + { + "epoch": 3.13, + "grad_norm": 0.70703125, + "learning_rate": 2.736097371449995e-05, + "loss": 0.1574, + "step": 23469 + }, + { + "epoch": 3.13, + "grad_norm": 0.6953125, + "learning_rate": 2.735297079113409e-05, + "loss": 0.2566, + "step": 23470 + }, + { + "epoch": 3.13, + "grad_norm": 0.62109375, + "learning_rate": 2.7344968852908604e-05, + "loss": 0.2717, + "step": 23471 + }, + { + "epoch": 3.13, + "grad_norm": 0.69140625, + "learning_rate": 2.7336967899931933e-05, + "loss": 0.2071, + "step": 23472 + }, + { + "epoch": 3.13, + "grad_norm": 0.62109375, + "learning_rate": 2.7328967932312623e-05, + "loss": 0.428, + "step": 23473 + }, + { + "epoch": 3.13, + "grad_norm": 0.5625, + "learning_rate": 2.732096895015913e-05, + "loss": 0.3746, + "step": 23474 + }, + { + "epoch": 3.13, + "grad_norm": 0.58203125, + "learning_rate": 2.7312970953579987e-05, + "loss": 0.4163, + "step": 23475 + }, + { + "epoch": 3.13, + "grad_norm": 0.60546875, + "learning_rate": 2.730497394268361e-05, + "loss": 0.2141, + "step": 23476 + }, + { + "epoch": 3.13, + "grad_norm": 0.66796875, + "learning_rate": 2.7296977917578405e-05, + "loss": 0.3325, + "step": 23477 + }, + { + "epoch": 3.13, + "grad_norm": 0.68359375, + "learning_rate": 2.7288982878372848e-05, + "loss": 0.352, + "step": 23478 + }, + { + "epoch": 3.13, + "grad_norm": 0.7265625, + "learning_rate": 2.7280988825175336e-05, + "loss": 0.3897, + "step": 23479 + }, + { + "epoch": 3.13, + "grad_norm": 0.404296875, + "learning_rate": 2.72729957580943e-05, + "loss": 0.2236, + "step": 23480 + }, + { + "epoch": 3.13, + "grad_norm": 0.66015625, + "learning_rate": 2.7265003677238145e-05, + "loss": 0.6009, + "step": 23481 + }, + { + "epoch": 3.13, + "grad_norm": 0.37890625, + "learning_rate": 2.725701258271518e-05, + "loss": 0.1621, + "step": 23482 + }, + { + "epoch": 3.13, + "grad_norm": 0.515625, + "learning_rate": 2.7249022474633824e-05, + "loss": 0.168, + "step": 23483 + }, + { + "epoch": 3.13, + "grad_norm": 0.53515625, + "learning_rate": 2.7241033353102397e-05, + "loss": 0.1811, + "step": 23484 + }, + { + "epoch": 3.13, + "grad_norm": 0.72265625, + "learning_rate": 2.7233045218229302e-05, + "loss": 0.4968, + "step": 23485 + }, + { + "epoch": 3.13, + "grad_norm": 0.6796875, + "learning_rate": 2.7225058070122778e-05, + "loss": 0.4171, + "step": 23486 + }, + { + "epoch": 3.13, + "grad_norm": 0.71484375, + "learning_rate": 2.7217071908891202e-05, + "loss": 0.3201, + "step": 23487 + }, + { + "epoch": 3.13, + "grad_norm": 0.51171875, + "learning_rate": 2.720908673464282e-05, + "loss": 0.2448, + "step": 23488 + }, + { + "epoch": 3.13, + "grad_norm": 0.7421875, + "learning_rate": 2.7201102547485924e-05, + "loss": 0.417, + "step": 23489 + }, + { + "epoch": 3.13, + "grad_norm": 0.515625, + "learning_rate": 2.719311934752884e-05, + "loss": 0.2858, + "step": 23490 + }, + { + "epoch": 3.13, + "grad_norm": 0.58984375, + "learning_rate": 2.7185137134879757e-05, + "loss": 0.2148, + "step": 23491 + }, + { + "epoch": 3.13, + "grad_norm": 0.75, + "learning_rate": 2.7177155909646946e-05, + "loss": 0.25, + "step": 23492 + }, + { + "epoch": 3.13, + "grad_norm": 0.73828125, + "learning_rate": 2.716917567193863e-05, + "loss": 0.7764, + "step": 23493 + }, + { + "epoch": 3.14, + "grad_norm": 0.6640625, + "learning_rate": 2.7161196421863068e-05, + "loss": 0.3175, + "step": 23494 + }, + { + "epoch": 3.14, + "grad_norm": 0.57421875, + "learning_rate": 2.7153218159528405e-05, + "loss": 0.3114, + "step": 23495 + }, + { + "epoch": 3.14, + "grad_norm": 0.63671875, + "learning_rate": 2.7145240885042854e-05, + "loss": 0.2985, + "step": 23496 + }, + { + "epoch": 3.14, + "grad_norm": 0.62109375, + "learning_rate": 2.713726459851459e-05, + "loss": 0.4334, + "step": 23497 + }, + { + "epoch": 3.14, + "grad_norm": 0.66796875, + "learning_rate": 2.7129289300051787e-05, + "loss": 0.5488, + "step": 23498 + }, + { + "epoch": 3.14, + "grad_norm": 1.4921875, + "learning_rate": 2.7121314989762646e-05, + "loss": 0.1643, + "step": 23499 + }, + { + "epoch": 3.14, + "grad_norm": 0.51953125, + "learning_rate": 2.7113341667755187e-05, + "loss": 0.2786, + "step": 23500 + }, + { + "epoch": 3.14, + "grad_norm": 0.7421875, + "learning_rate": 2.71053693341376e-05, + "loss": 0.2548, + "step": 23501 + }, + { + "epoch": 3.14, + "grad_norm": 0.71875, + "learning_rate": 2.709739798901798e-05, + "loss": 0.3774, + "step": 23502 + }, + { + "epoch": 3.14, + "grad_norm": 0.734375, + "learning_rate": 2.7089427632504437e-05, + "loss": 0.3715, + "step": 23503 + }, + { + "epoch": 3.14, + "grad_norm": 0.64453125, + "learning_rate": 2.7081458264705074e-05, + "loss": 0.4171, + "step": 23504 + }, + { + "epoch": 3.14, + "grad_norm": 0.6484375, + "learning_rate": 2.707348988572791e-05, + "loss": 0.4059, + "step": 23505 + }, + { + "epoch": 3.14, + "grad_norm": 0.66015625, + "learning_rate": 2.706552249568103e-05, + "loss": 0.2764, + "step": 23506 + }, + { + "epoch": 3.14, + "grad_norm": 0.71484375, + "learning_rate": 2.7057556094672466e-05, + "loss": 0.6782, + "step": 23507 + }, + { + "epoch": 3.14, + "grad_norm": 0.56640625, + "learning_rate": 2.704959068281029e-05, + "loss": 0.2988, + "step": 23508 + }, + { + "epoch": 3.14, + "grad_norm": 0.58984375, + "learning_rate": 2.7041626260202446e-05, + "loss": 0.2936, + "step": 23509 + }, + { + "epoch": 3.14, + "grad_norm": 0.6875, + "learning_rate": 2.7033662826956983e-05, + "loss": 0.2175, + "step": 23510 + }, + { + "epoch": 3.14, + "grad_norm": 0.65234375, + "learning_rate": 2.7025700383181908e-05, + "loss": 0.4263, + "step": 23511 + }, + { + "epoch": 3.14, + "grad_norm": 0.7265625, + "learning_rate": 2.7017738928985148e-05, + "loss": 0.2189, + "step": 23512 + }, + { + "epoch": 3.14, + "grad_norm": 0.53125, + "learning_rate": 2.700977846447471e-05, + "loss": 0.2927, + "step": 23513 + }, + { + "epoch": 3.14, + "grad_norm": 0.9765625, + "learning_rate": 2.700181898975849e-05, + "loss": 0.4815, + "step": 23514 + }, + { + "epoch": 3.14, + "grad_norm": 0.515625, + "learning_rate": 2.6993860504944458e-05, + "loss": 0.2078, + "step": 23515 + }, + { + "epoch": 3.14, + "grad_norm": 0.6171875, + "learning_rate": 2.6985903010140533e-05, + "loss": 0.433, + "step": 23516 + }, + { + "epoch": 3.14, + "grad_norm": 0.72265625, + "learning_rate": 2.6977946505454643e-05, + "loss": 0.4048, + "step": 23517 + }, + { + "epoch": 3.14, + "grad_norm": 0.48828125, + "learning_rate": 2.6969990990994644e-05, + "loss": 0.2585, + "step": 23518 + }, + { + "epoch": 3.14, + "grad_norm": 0.55859375, + "learning_rate": 2.6962036466868435e-05, + "loss": 0.2224, + "step": 23519 + }, + { + "epoch": 3.14, + "grad_norm": 0.67578125, + "learning_rate": 2.695408293318389e-05, + "loss": 0.4278, + "step": 23520 + }, + { + "epoch": 3.14, + "grad_norm": 0.5859375, + "learning_rate": 2.6946130390048896e-05, + "loss": 0.2211, + "step": 23521 + }, + { + "epoch": 3.14, + "grad_norm": 0.58203125, + "learning_rate": 2.6938178837571215e-05, + "loss": 0.234, + "step": 23522 + }, + { + "epoch": 3.14, + "grad_norm": 0.8125, + "learning_rate": 2.693022827585876e-05, + "loss": 0.6177, + "step": 23523 + }, + { + "epoch": 3.14, + "grad_norm": 0.59765625, + "learning_rate": 2.6922278705019266e-05, + "loss": 0.365, + "step": 23524 + }, + { + "epoch": 3.14, + "grad_norm": 0.67578125, + "learning_rate": 2.691433012516058e-05, + "loss": 0.3485, + "step": 23525 + }, + { + "epoch": 3.14, + "grad_norm": 0.80078125, + "learning_rate": 2.6906382536390483e-05, + "loss": 0.4981, + "step": 23526 + }, + { + "epoch": 3.14, + "grad_norm": 0.58984375, + "learning_rate": 2.689843593881679e-05, + "loss": 0.3276, + "step": 23527 + }, + { + "epoch": 3.14, + "grad_norm": 0.63671875, + "learning_rate": 2.689049033254718e-05, + "loss": 0.436, + "step": 23528 + }, + { + "epoch": 3.14, + "grad_norm": 0.5625, + "learning_rate": 2.6882545717689446e-05, + "loss": 0.2385, + "step": 23529 + }, + { + "epoch": 3.14, + "grad_norm": 0.5078125, + "learning_rate": 2.687460209435132e-05, + "loss": 0.3665, + "step": 23530 + }, + { + "epoch": 3.14, + "grad_norm": 0.51953125, + "learning_rate": 2.6866659462640542e-05, + "loss": 0.3004, + "step": 23531 + }, + { + "epoch": 3.14, + "grad_norm": 0.83203125, + "learning_rate": 2.685871782266478e-05, + "loss": 0.533, + "step": 23532 + }, + { + "epoch": 3.14, + "grad_norm": 0.92578125, + "learning_rate": 2.685077717453174e-05, + "loss": 0.458, + "step": 23533 + }, + { + "epoch": 3.14, + "grad_norm": 0.609375, + "learning_rate": 2.6842837518349108e-05, + "loss": 0.447, + "step": 23534 + }, + { + "epoch": 3.14, + "grad_norm": 0.6015625, + "learning_rate": 2.683489885422459e-05, + "loss": 0.2251, + "step": 23535 + }, + { + "epoch": 3.14, + "grad_norm": 0.65234375, + "learning_rate": 2.6826961182265785e-05, + "loss": 0.3355, + "step": 23536 + }, + { + "epoch": 3.14, + "grad_norm": 1.078125, + "learning_rate": 2.6819024502580326e-05, + "loss": 0.4363, + "step": 23537 + }, + { + "epoch": 3.14, + "grad_norm": 0.419921875, + "learning_rate": 2.6811088815275865e-05, + "loss": 0.1295, + "step": 23538 + }, + { + "epoch": 3.14, + "grad_norm": 0.64453125, + "learning_rate": 2.6803154120460007e-05, + "loss": 0.1673, + "step": 23539 + }, + { + "epoch": 3.14, + "grad_norm": 0.46875, + "learning_rate": 2.6795220418240395e-05, + "loss": 0.2785, + "step": 23540 + }, + { + "epoch": 3.14, + "grad_norm": 0.61328125, + "learning_rate": 2.678728770872454e-05, + "loss": 0.4419, + "step": 23541 + }, + { + "epoch": 3.14, + "grad_norm": 0.7265625, + "learning_rate": 2.677935599202005e-05, + "loss": 0.216, + "step": 23542 + }, + { + "epoch": 3.14, + "grad_norm": 0.7109375, + "learning_rate": 2.6771425268234485e-05, + "loss": 0.1959, + "step": 23543 + }, + { + "epoch": 3.14, + "grad_norm": 0.796875, + "learning_rate": 2.6763495537475424e-05, + "loss": 0.6655, + "step": 23544 + }, + { + "epoch": 3.14, + "grad_norm": 0.77734375, + "learning_rate": 2.675556679985034e-05, + "loss": 0.4107, + "step": 23545 + }, + { + "epoch": 3.14, + "grad_norm": 0.76171875, + "learning_rate": 2.6747639055466778e-05, + "loss": 0.3769, + "step": 23546 + }, + { + "epoch": 3.14, + "grad_norm": 0.640625, + "learning_rate": 2.673971230443224e-05, + "loss": 0.3837, + "step": 23547 + }, + { + "epoch": 3.14, + "grad_norm": 0.75, + "learning_rate": 2.6731786546854244e-05, + "loss": 0.4876, + "step": 23548 + }, + { + "epoch": 3.14, + "grad_norm": 0.50390625, + "learning_rate": 2.6723861782840254e-05, + "loss": 0.2726, + "step": 23549 + }, + { + "epoch": 3.14, + "grad_norm": 0.470703125, + "learning_rate": 2.67159380124977e-05, + "loss": 0.1763, + "step": 23550 + }, + { + "epoch": 3.14, + "grad_norm": 0.4375, + "learning_rate": 2.6708015235934048e-05, + "loss": 0.1021, + "step": 23551 + }, + { + "epoch": 3.14, + "grad_norm": 0.515625, + "learning_rate": 2.670009345325676e-05, + "loss": 0.4165, + "step": 23552 + }, + { + "epoch": 3.14, + "grad_norm": 0.74609375, + "learning_rate": 2.6692172664573246e-05, + "loss": 0.408, + "step": 23553 + }, + { + "epoch": 3.14, + "grad_norm": 0.515625, + "learning_rate": 2.6684252869990945e-05, + "loss": 0.2412, + "step": 23554 + }, + { + "epoch": 3.14, + "grad_norm": 0.73828125, + "learning_rate": 2.6676334069617205e-05, + "loss": 0.5064, + "step": 23555 + }, + { + "epoch": 3.14, + "grad_norm": 0.67578125, + "learning_rate": 2.6668416263559425e-05, + "loss": 0.4305, + "step": 23556 + }, + { + "epoch": 3.14, + "grad_norm": 0.54296875, + "learning_rate": 2.666049945192499e-05, + "loss": 0.1424, + "step": 23557 + }, + { + "epoch": 3.14, + "grad_norm": 0.5625, + "learning_rate": 2.6652583634821283e-05, + "loss": 0.3307, + "step": 23558 + }, + { + "epoch": 3.14, + "grad_norm": 0.62890625, + "learning_rate": 2.664466881235558e-05, + "loss": 0.3629, + "step": 23559 + }, + { + "epoch": 3.14, + "grad_norm": 0.64453125, + "learning_rate": 2.6636754984635282e-05, + "loss": 0.4015, + "step": 23560 + }, + { + "epoch": 3.14, + "grad_norm": 0.6875, + "learning_rate": 2.662884215176763e-05, + "loss": 0.3867, + "step": 23561 + }, + { + "epoch": 3.14, + "grad_norm": 0.57421875, + "learning_rate": 2.662093031385998e-05, + "loss": 0.1749, + "step": 23562 + }, + { + "epoch": 3.14, + "grad_norm": 0.55078125, + "learning_rate": 2.661301947101964e-05, + "loss": 0.169, + "step": 23563 + }, + { + "epoch": 3.14, + "grad_norm": 0.51171875, + "learning_rate": 2.6605109623353818e-05, + "loss": 0.2686, + "step": 23564 + }, + { + "epoch": 3.14, + "grad_norm": 0.65625, + "learning_rate": 2.6597200770969822e-05, + "loss": 0.291, + "step": 23565 + }, + { + "epoch": 3.14, + "grad_norm": 0.66796875, + "learning_rate": 2.6589292913974884e-05, + "loss": 0.2405, + "step": 23566 + }, + { + "epoch": 3.14, + "grad_norm": 0.87109375, + "learning_rate": 2.6581386052476287e-05, + "loss": 0.2775, + "step": 23567 + }, + { + "epoch": 3.14, + "grad_norm": 0.68359375, + "learning_rate": 2.657348018658118e-05, + "loss": 0.6671, + "step": 23568 + }, + { + "epoch": 3.15, + "grad_norm": 0.8203125, + "learning_rate": 2.6565575316396808e-05, + "loss": 0.1965, + "step": 23569 + }, + { + "epoch": 3.15, + "grad_norm": 0.5703125, + "learning_rate": 2.6557671442030363e-05, + "loss": 0.3758, + "step": 23570 + }, + { + "epoch": 3.15, + "grad_norm": 0.54296875, + "learning_rate": 2.6549768563589028e-05, + "loss": 0.2714, + "step": 23571 + }, + { + "epoch": 3.15, + "grad_norm": 0.64453125, + "learning_rate": 2.6541866681180028e-05, + "loss": 0.368, + "step": 23572 + }, + { + "epoch": 3.15, + "grad_norm": 0.4921875, + "learning_rate": 2.653396579491041e-05, + "loss": 0.2282, + "step": 23573 + }, + { + "epoch": 3.15, + "grad_norm": 0.7421875, + "learning_rate": 2.6526065904887365e-05, + "loss": 0.2566, + "step": 23574 + }, + { + "epoch": 3.15, + "grad_norm": 0.71875, + "learning_rate": 2.651816701121801e-05, + "loss": 0.368, + "step": 23575 + }, + { + "epoch": 3.15, + "grad_norm": 0.4921875, + "learning_rate": 2.6510269114009477e-05, + "loss": 0.3277, + "step": 23576 + }, + { + "epoch": 3.15, + "grad_norm": 0.6171875, + "learning_rate": 2.65023722133689e-05, + "loss": 0.4214, + "step": 23577 + }, + { + "epoch": 3.15, + "grad_norm": 0.578125, + "learning_rate": 2.6494476309403283e-05, + "loss": 0.241, + "step": 23578 + }, + { + "epoch": 3.15, + "grad_norm": 0.515625, + "learning_rate": 2.6486581402219758e-05, + "loss": 0.3249, + "step": 23579 + }, + { + "epoch": 3.15, + "grad_norm": 0.671875, + "learning_rate": 2.6478687491925357e-05, + "loss": 0.4666, + "step": 23580 + }, + { + "epoch": 3.15, + "grad_norm": 0.51171875, + "learning_rate": 2.6470794578627167e-05, + "loss": 0.46, + "step": 23581 + }, + { + "epoch": 3.15, + "grad_norm": 0.734375, + "learning_rate": 2.6462902662432165e-05, + "loss": 0.5682, + "step": 23582 + }, + { + "epoch": 3.15, + "grad_norm": 0.53125, + "learning_rate": 2.645501174344741e-05, + "loss": 0.1611, + "step": 23583 + }, + { + "epoch": 3.15, + "grad_norm": 0.66015625, + "learning_rate": 2.6447121821779917e-05, + "loss": 0.258, + "step": 23584 + }, + { + "epoch": 3.15, + "grad_norm": 0.423828125, + "learning_rate": 2.6439232897536626e-05, + "loss": 0.1367, + "step": 23585 + }, + { + "epoch": 3.15, + "grad_norm": 0.484375, + "learning_rate": 2.643134497082458e-05, + "loss": 0.2661, + "step": 23586 + }, + { + "epoch": 3.15, + "grad_norm": 0.49609375, + "learning_rate": 2.642345804175069e-05, + "loss": 0.2317, + "step": 23587 + }, + { + "epoch": 3.15, + "grad_norm": 0.5234375, + "learning_rate": 2.6415572110421927e-05, + "loss": 0.2904, + "step": 23588 + }, + { + "epoch": 3.15, + "grad_norm": 0.546875, + "learning_rate": 2.640768717694523e-05, + "loss": 0.4102, + "step": 23589 + }, + { + "epoch": 3.15, + "grad_norm": 0.5078125, + "learning_rate": 2.6399803241427547e-05, + "loss": 0.2018, + "step": 23590 + }, + { + "epoch": 3.15, + "grad_norm": 0.75390625, + "learning_rate": 2.639192030397575e-05, + "loss": 0.2097, + "step": 23591 + }, + { + "epoch": 3.15, + "grad_norm": 0.625, + "learning_rate": 2.638403836469675e-05, + "loss": 0.301, + "step": 23592 + }, + { + "epoch": 3.15, + "grad_norm": 0.66015625, + "learning_rate": 2.6376157423697433e-05, + "loss": 0.2801, + "step": 23593 + }, + { + "epoch": 3.15, + "grad_norm": 0.51171875, + "learning_rate": 2.6368277481084712e-05, + "loss": 0.1465, + "step": 23594 + }, + { + "epoch": 3.15, + "grad_norm": 0.671875, + "learning_rate": 2.636039853696537e-05, + "loss": 0.193, + "step": 23595 + }, + { + "epoch": 3.15, + "grad_norm": 0.51953125, + "learning_rate": 2.6352520591446316e-05, + "loss": 0.304, + "step": 23596 + }, + { + "epoch": 3.15, + "grad_norm": 0.578125, + "learning_rate": 2.6344643644634326e-05, + "loss": 0.2838, + "step": 23597 + }, + { + "epoch": 3.15, + "grad_norm": 0.4140625, + "learning_rate": 2.633676769663623e-05, + "loss": 0.1724, + "step": 23598 + }, + { + "epoch": 3.15, + "grad_norm": 0.5859375, + "learning_rate": 2.632889274755884e-05, + "loss": 0.2215, + "step": 23599 + }, + { + "epoch": 3.15, + "grad_norm": 0.62890625, + "learning_rate": 2.6321018797508977e-05, + "loss": 0.2576, + "step": 23600 + }, + { + "epoch": 3.15, + "grad_norm": 0.6328125, + "learning_rate": 2.6313145846593368e-05, + "loss": 0.4176, + "step": 23601 + }, + { + "epoch": 3.15, + "grad_norm": 0.68359375, + "learning_rate": 2.630527389491878e-05, + "loss": 0.2027, + "step": 23602 + }, + { + "epoch": 3.15, + "grad_norm": 0.7578125, + "learning_rate": 2.6297402942591974e-05, + "loss": 0.4, + "step": 23603 + }, + { + "epoch": 3.15, + "grad_norm": 0.625, + "learning_rate": 2.6289532989719724e-05, + "loss": 0.4391, + "step": 23604 + }, + { + "epoch": 3.15, + "grad_norm": 0.53515625, + "learning_rate": 2.6281664036408682e-05, + "loss": 0.3259, + "step": 23605 + }, + { + "epoch": 3.15, + "grad_norm": 0.85546875, + "learning_rate": 2.627379608276559e-05, + "loss": 0.2809, + "step": 23606 + }, + { + "epoch": 3.15, + "grad_norm": 0.7421875, + "learning_rate": 2.626592912889714e-05, + "loss": 0.2438, + "step": 23607 + }, + { + "epoch": 3.15, + "grad_norm": 0.625, + "learning_rate": 2.625806317491004e-05, + "loss": 0.4531, + "step": 23608 + }, + { + "epoch": 3.15, + "grad_norm": 0.458984375, + "learning_rate": 2.6250198220910925e-05, + "loss": 0.2606, + "step": 23609 + }, + { + "epoch": 3.15, + "grad_norm": 0.7578125, + "learning_rate": 2.624233426700643e-05, + "loss": 0.5527, + "step": 23610 + }, + { + "epoch": 3.15, + "grad_norm": 0.8671875, + "learning_rate": 2.623447131330322e-05, + "loss": 0.5283, + "step": 23611 + }, + { + "epoch": 3.15, + "grad_norm": 0.78515625, + "learning_rate": 2.6226609359907926e-05, + "loss": 0.2519, + "step": 23612 + }, + { + "epoch": 3.15, + "grad_norm": 0.8984375, + "learning_rate": 2.6218748406927186e-05, + "loss": 0.5021, + "step": 23613 + }, + { + "epoch": 3.15, + "grad_norm": 0.4375, + "learning_rate": 2.621088845446753e-05, + "loss": 0.1915, + "step": 23614 + }, + { + "epoch": 3.15, + "grad_norm": 0.625, + "learning_rate": 2.620302950263559e-05, + "loss": 0.3132, + "step": 23615 + }, + { + "epoch": 3.15, + "grad_norm": 0.703125, + "learning_rate": 2.6195171551537935e-05, + "loss": 0.5835, + "step": 23616 + }, + { + "epoch": 3.15, + "grad_norm": 0.7109375, + "learning_rate": 2.6187314601281145e-05, + "loss": 0.4286, + "step": 23617 + }, + { + "epoch": 3.15, + "grad_norm": 0.52734375, + "learning_rate": 2.6179458651971723e-05, + "loss": 0.2531, + "step": 23618 + }, + { + "epoch": 3.15, + "grad_norm": 0.486328125, + "learning_rate": 2.6171603703716206e-05, + "loss": 0.1617, + "step": 23619 + }, + { + "epoch": 3.15, + "grad_norm": 0.52734375, + "learning_rate": 2.6163749756621137e-05, + "loss": 0.2382, + "step": 23620 + }, + { + "epoch": 3.15, + "grad_norm": 0.6875, + "learning_rate": 2.6155896810793036e-05, + "loss": 0.3299, + "step": 23621 + }, + { + "epoch": 3.15, + "grad_norm": 0.734375, + "learning_rate": 2.614804486633836e-05, + "loss": 0.3901, + "step": 23622 + }, + { + "epoch": 3.15, + "grad_norm": 0.447265625, + "learning_rate": 2.6140193923363564e-05, + "loss": 0.2537, + "step": 23623 + }, + { + "epoch": 3.15, + "grad_norm": 0.6328125, + "learning_rate": 2.613234398197515e-05, + "loss": 0.3234, + "step": 23624 + }, + { + "epoch": 3.15, + "grad_norm": 0.51953125, + "learning_rate": 2.612449504227955e-05, + "loss": 0.4217, + "step": 23625 + }, + { + "epoch": 3.15, + "grad_norm": 0.75, + "learning_rate": 2.611664710438322e-05, + "loss": 0.2796, + "step": 23626 + }, + { + "epoch": 3.15, + "grad_norm": 0.53515625, + "learning_rate": 2.61088001683926e-05, + "loss": 0.1919, + "step": 23627 + }, + { + "epoch": 3.15, + "grad_norm": 0.66015625, + "learning_rate": 2.6100954234414044e-05, + "loss": 0.272, + "step": 23628 + }, + { + "epoch": 3.15, + "grad_norm": 0.443359375, + "learning_rate": 2.6093109302553966e-05, + "loss": 0.174, + "step": 23629 + }, + { + "epoch": 3.15, + "grad_norm": 0.58203125, + "learning_rate": 2.6085265372918764e-05, + "loss": 0.1845, + "step": 23630 + }, + { + "epoch": 3.15, + "grad_norm": 0.73046875, + "learning_rate": 2.607742244561484e-05, + "loss": 0.3746, + "step": 23631 + }, + { + "epoch": 3.15, + "grad_norm": 0.5703125, + "learning_rate": 2.606958052074847e-05, + "loss": 0.2696, + "step": 23632 + }, + { + "epoch": 3.15, + "grad_norm": 0.71875, + "learning_rate": 2.6061739598426082e-05, + "loss": 0.3481, + "step": 23633 + }, + { + "epoch": 3.15, + "grad_norm": 0.62890625, + "learning_rate": 2.6053899678753913e-05, + "loss": 0.3496, + "step": 23634 + }, + { + "epoch": 3.15, + "grad_norm": 0.55859375, + "learning_rate": 2.604606076183833e-05, + "loss": 0.3659, + "step": 23635 + }, + { + "epoch": 3.15, + "grad_norm": 0.53125, + "learning_rate": 2.6038222847785652e-05, + "loss": 0.2218, + "step": 23636 + }, + { + "epoch": 3.15, + "grad_norm": 0.54296875, + "learning_rate": 2.6030385936702118e-05, + "loss": 0.2588, + "step": 23637 + }, + { + "epoch": 3.15, + "grad_norm": 0.57421875, + "learning_rate": 2.6022550028694014e-05, + "loss": 0.3522, + "step": 23638 + }, + { + "epoch": 3.15, + "grad_norm": 0.65625, + "learning_rate": 2.6014715123867616e-05, + "loss": 0.3247, + "step": 23639 + }, + { + "epoch": 3.15, + "grad_norm": 0.55078125, + "learning_rate": 2.6006881222329192e-05, + "loss": 0.1711, + "step": 23640 + }, + { + "epoch": 3.15, + "grad_norm": 0.68359375, + "learning_rate": 2.5999048324184917e-05, + "loss": 0.4542, + "step": 23641 + }, + { + "epoch": 3.15, + "grad_norm": 0.69921875, + "learning_rate": 2.5991216429541043e-05, + "loss": 0.1745, + "step": 23642 + }, + { + "epoch": 3.15, + "grad_norm": 0.671875, + "learning_rate": 2.598338553850377e-05, + "loss": 0.1805, + "step": 23643 + }, + { + "epoch": 3.16, + "grad_norm": 0.62109375, + "learning_rate": 2.5975555651179295e-05, + "loss": 0.4059, + "step": 23644 + }, + { + "epoch": 3.16, + "grad_norm": 0.71484375, + "learning_rate": 2.596772676767385e-05, + "loss": 0.4273, + "step": 23645 + }, + { + "epoch": 3.16, + "grad_norm": 0.51171875, + "learning_rate": 2.5959898888093493e-05, + "loss": 0.3157, + "step": 23646 + }, + { + "epoch": 3.16, + "grad_norm": 0.7890625, + "learning_rate": 2.5952072012544414e-05, + "loss": 0.5626, + "step": 23647 + }, + { + "epoch": 3.16, + "grad_norm": 0.609375, + "learning_rate": 2.5944246141132766e-05, + "loss": 0.3326, + "step": 23648 + }, + { + "epoch": 3.16, + "grad_norm": 0.474609375, + "learning_rate": 2.5936421273964674e-05, + "loss": 0.2884, + "step": 23649 + }, + { + "epoch": 3.16, + "grad_norm": 0.65234375, + "learning_rate": 2.5928597411146272e-05, + "loss": 0.3626, + "step": 23650 + }, + { + "epoch": 3.16, + "grad_norm": 0.73046875, + "learning_rate": 2.5920774552783588e-05, + "loss": 0.3308, + "step": 23651 + }, + { + "epoch": 3.16, + "grad_norm": 0.6171875, + "learning_rate": 2.591295269898276e-05, + "loss": 0.3359, + "step": 23652 + }, + { + "epoch": 3.16, + "grad_norm": 0.8125, + "learning_rate": 2.590513184984983e-05, + "loss": 0.2425, + "step": 23653 + }, + { + "epoch": 3.16, + "grad_norm": 0.490234375, + "learning_rate": 2.5897312005490904e-05, + "loss": 0.2858, + "step": 23654 + }, + { + "epoch": 3.16, + "grad_norm": 0.45703125, + "learning_rate": 2.5889493166011947e-05, + "loss": 0.212, + "step": 23655 + }, + { + "epoch": 3.16, + "grad_norm": 0.375, + "learning_rate": 2.5881675331519027e-05, + "loss": 0.144, + "step": 23656 + }, + { + "epoch": 3.16, + "grad_norm": 0.74609375, + "learning_rate": 2.58738585021182e-05, + "loss": 0.2957, + "step": 23657 + }, + { + "epoch": 3.16, + "grad_norm": 0.55859375, + "learning_rate": 2.5866042677915393e-05, + "loss": 0.4491, + "step": 23658 + }, + { + "epoch": 3.16, + "grad_norm": 0.703125, + "learning_rate": 2.5858227859016647e-05, + "loss": 0.263, + "step": 23659 + }, + { + "epoch": 3.16, + "grad_norm": 0.65625, + "learning_rate": 2.5850414045527893e-05, + "loss": 0.2847, + "step": 23660 + }, + { + "epoch": 3.16, + "grad_norm": 0.9375, + "learning_rate": 2.5842601237555108e-05, + "loss": 0.2375, + "step": 23661 + }, + { + "epoch": 3.16, + "grad_norm": 0.478515625, + "learning_rate": 2.5834789435204243e-05, + "loss": 0.1297, + "step": 23662 + }, + { + "epoch": 3.16, + "grad_norm": 0.73828125, + "learning_rate": 2.5826978638581268e-05, + "loss": 0.3136, + "step": 23663 + }, + { + "epoch": 3.16, + "grad_norm": 0.578125, + "learning_rate": 2.581916884779203e-05, + "loss": 0.4081, + "step": 23664 + }, + { + "epoch": 3.16, + "grad_norm": 0.625, + "learning_rate": 2.5811360062942482e-05, + "loss": 0.327, + "step": 23665 + }, + { + "epoch": 3.16, + "grad_norm": 0.6953125, + "learning_rate": 2.58035522841385e-05, + "loss": 0.2832, + "step": 23666 + }, + { + "epoch": 3.16, + "grad_norm": 0.6640625, + "learning_rate": 2.5795745511485993e-05, + "loss": 0.4372, + "step": 23667 + }, + { + "epoch": 3.16, + "grad_norm": 0.57421875, + "learning_rate": 2.578793974509077e-05, + "loss": 0.1841, + "step": 23668 + }, + { + "epoch": 3.16, + "grad_norm": 0.5390625, + "learning_rate": 2.5780134985058745e-05, + "loss": 0.2382, + "step": 23669 + }, + { + "epoch": 3.16, + "grad_norm": 0.484375, + "learning_rate": 2.5772331231495706e-05, + "loss": 0.1785, + "step": 23670 + }, + { + "epoch": 3.16, + "grad_norm": 0.515625, + "learning_rate": 2.5764528484507478e-05, + "loss": 0.3556, + "step": 23671 + }, + { + "epoch": 3.16, + "grad_norm": 0.470703125, + "learning_rate": 2.57567267441999e-05, + "loss": 0.1244, + "step": 23672 + }, + { + "epoch": 3.16, + "grad_norm": 0.5703125, + "learning_rate": 2.5748926010678776e-05, + "loss": 0.1993, + "step": 23673 + }, + { + "epoch": 3.16, + "grad_norm": 0.57421875, + "learning_rate": 2.5741126284049844e-05, + "loss": 0.3181, + "step": 23674 + }, + { + "epoch": 3.16, + "grad_norm": 0.52734375, + "learning_rate": 2.5733327564418897e-05, + "loss": 0.1895, + "step": 23675 + }, + { + "epoch": 3.16, + "grad_norm": 0.451171875, + "learning_rate": 2.5725529851891693e-05, + "loss": 0.1193, + "step": 23676 + }, + { + "epoch": 3.16, + "grad_norm": 0.59765625, + "learning_rate": 2.5717733146574006e-05, + "loss": 0.3009, + "step": 23677 + }, + { + "epoch": 3.16, + "grad_norm": 0.73046875, + "learning_rate": 2.570993744857151e-05, + "loss": 0.5161, + "step": 23678 + }, + { + "epoch": 3.16, + "grad_norm": 0.69921875, + "learning_rate": 2.570214275798992e-05, + "loss": 0.3518, + "step": 23679 + }, + { + "epoch": 3.16, + "grad_norm": 0.62109375, + "learning_rate": 2.5694349074934975e-05, + "loss": 0.1521, + "step": 23680 + }, + { + "epoch": 3.16, + "grad_norm": 0.6953125, + "learning_rate": 2.5686556399512375e-05, + "loss": 0.3656, + "step": 23681 + }, + { + "epoch": 3.16, + "grad_norm": 0.796875, + "learning_rate": 2.5678764731827753e-05, + "loss": 0.5817, + "step": 23682 + }, + { + "epoch": 3.16, + "grad_norm": 0.5390625, + "learning_rate": 2.5670974071986753e-05, + "loss": 0.3135, + "step": 23683 + }, + { + "epoch": 3.16, + "grad_norm": 0.64453125, + "learning_rate": 2.5663184420095055e-05, + "loss": 0.4282, + "step": 23684 + }, + { + "epoch": 3.16, + "grad_norm": 0.70703125, + "learning_rate": 2.5655395776258284e-05, + "loss": 0.4232, + "step": 23685 + }, + { + "epoch": 3.16, + "grad_norm": 0.640625, + "learning_rate": 2.5647608140582092e-05, + "loss": 0.3926, + "step": 23686 + }, + { + "epoch": 3.16, + "grad_norm": 0.6796875, + "learning_rate": 2.5639821513172013e-05, + "loss": 0.3523, + "step": 23687 + }, + { + "epoch": 3.16, + "grad_norm": 0.7109375, + "learning_rate": 2.563203589413369e-05, + "loss": 0.6164, + "step": 23688 + }, + { + "epoch": 3.16, + "grad_norm": 0.48828125, + "learning_rate": 2.5624251283572686e-05, + "loss": 0.2132, + "step": 23689 + }, + { + "epoch": 3.16, + "grad_norm": 0.58984375, + "learning_rate": 2.5616467681594593e-05, + "loss": 0.3642, + "step": 23690 + }, + { + "epoch": 3.16, + "grad_norm": 0.609375, + "learning_rate": 2.5608685088304906e-05, + "loss": 0.4356, + "step": 23691 + }, + { + "epoch": 3.16, + "grad_norm": 0.55078125, + "learning_rate": 2.5600903503809202e-05, + "loss": 0.1872, + "step": 23692 + }, + { + "epoch": 3.16, + "grad_norm": 0.515625, + "learning_rate": 2.559312292821302e-05, + "loss": 0.2075, + "step": 23693 + }, + { + "epoch": 3.16, + "grad_norm": 0.75, + "learning_rate": 2.5585343361621816e-05, + "loss": 0.3118, + "step": 23694 + }, + { + "epoch": 3.16, + "grad_norm": 0.546875, + "learning_rate": 2.5577564804141152e-05, + "loss": 0.1714, + "step": 23695 + }, + { + "epoch": 3.16, + "grad_norm": 0.578125, + "learning_rate": 2.556978725587643e-05, + "loss": 0.3981, + "step": 23696 + }, + { + "epoch": 3.16, + "grad_norm": 0.640625, + "learning_rate": 2.556201071693317e-05, + "loss": 0.2325, + "step": 23697 + }, + { + "epoch": 3.16, + "grad_norm": 0.76953125, + "learning_rate": 2.555423518741682e-05, + "loss": 0.3686, + "step": 23698 + }, + { + "epoch": 3.16, + "grad_norm": 0.55859375, + "learning_rate": 2.5546460667432815e-05, + "loss": 0.2732, + "step": 23699 + }, + { + "epoch": 3.16, + "grad_norm": 0.6875, + "learning_rate": 2.553868715708663e-05, + "loss": 0.2424, + "step": 23700 + }, + { + "epoch": 3.16, + "grad_norm": 0.75390625, + "learning_rate": 2.5530914656483583e-05, + "loss": 0.6616, + "step": 23701 + }, + { + "epoch": 3.16, + "grad_norm": 0.7421875, + "learning_rate": 2.5523143165729146e-05, + "loss": 0.36, + "step": 23702 + }, + { + "epoch": 3.16, + "grad_norm": 0.765625, + "learning_rate": 2.5515372684928683e-05, + "loss": 0.3773, + "step": 23703 + }, + { + "epoch": 3.16, + "grad_norm": 0.53125, + "learning_rate": 2.5507603214187593e-05, + "loss": 0.2927, + "step": 23704 + }, + { + "epoch": 3.16, + "grad_norm": 0.56640625, + "learning_rate": 2.5499834753611185e-05, + "loss": 0.3575, + "step": 23705 + }, + { + "epoch": 3.16, + "grad_norm": 0.5625, + "learning_rate": 2.5492067303304866e-05, + "loss": 0.293, + "step": 23706 + }, + { + "epoch": 3.16, + "grad_norm": 0.71484375, + "learning_rate": 2.54843008633739e-05, + "loss": 0.2985, + "step": 23707 + }, + { + "epoch": 3.16, + "grad_norm": 0.77734375, + "learning_rate": 2.5476535433923644e-05, + "loss": 0.3745, + "step": 23708 + }, + { + "epoch": 3.16, + "grad_norm": 0.640625, + "learning_rate": 2.5468771015059422e-05, + "loss": 0.3163, + "step": 23709 + }, + { + "epoch": 3.16, + "grad_norm": 0.75390625, + "learning_rate": 2.5461007606886467e-05, + "loss": 0.4049, + "step": 23710 + }, + { + "epoch": 3.16, + "grad_norm": 0.6015625, + "learning_rate": 2.545324520951009e-05, + "loss": 0.5061, + "step": 23711 + }, + { + "epoch": 3.16, + "grad_norm": 0.51171875, + "learning_rate": 2.5445483823035544e-05, + "loss": 0.1312, + "step": 23712 + }, + { + "epoch": 3.16, + "grad_norm": 0.59765625, + "learning_rate": 2.543772344756813e-05, + "loss": 0.2837, + "step": 23713 + }, + { + "epoch": 3.16, + "grad_norm": 0.5390625, + "learning_rate": 2.5429964083212998e-05, + "loss": 0.2844, + "step": 23714 + }, + { + "epoch": 3.16, + "grad_norm": 0.46875, + "learning_rate": 2.5422205730075423e-05, + "loss": 0.1695, + "step": 23715 + }, + { + "epoch": 3.16, + "grad_norm": 0.625, + "learning_rate": 2.5414448388260594e-05, + "loss": 0.3064, + "step": 23716 + }, + { + "epoch": 3.16, + "grad_norm": 0.62890625, + "learning_rate": 2.5406692057873716e-05, + "loss": 0.3553, + "step": 23717 + }, + { + "epoch": 3.16, + "grad_norm": 0.5625, + "learning_rate": 2.5398936739020018e-05, + "loss": 0.1919, + "step": 23718 + }, + { + "epoch": 3.17, + "grad_norm": 0.56640625, + "learning_rate": 2.539118243180457e-05, + "loss": 0.2806, + "step": 23719 + }, + { + "epoch": 3.17, + "grad_norm": 0.47265625, + "learning_rate": 2.538342913633256e-05, + "loss": 0.2667, + "step": 23720 + }, + { + "epoch": 3.17, + "grad_norm": 0.53125, + "learning_rate": 2.537567685270915e-05, + "loss": 0.441, + "step": 23721 + }, + { + "epoch": 3.17, + "grad_norm": 0.578125, + "learning_rate": 2.5367925581039455e-05, + "loss": 0.2145, + "step": 23722 + }, + { + "epoch": 3.17, + "grad_norm": 0.71875, + "learning_rate": 2.5360175321428614e-05, + "loss": 0.1966, + "step": 23723 + }, + { + "epoch": 3.17, + "grad_norm": 0.439453125, + "learning_rate": 2.5352426073981673e-05, + "loss": 0.1682, + "step": 23724 + }, + { + "epoch": 3.17, + "grad_norm": 0.83984375, + "learning_rate": 2.5344677838803733e-05, + "loss": 0.2313, + "step": 23725 + }, + { + "epoch": 3.17, + "grad_norm": 0.69140625, + "learning_rate": 2.5336930615999886e-05, + "loss": 0.3207, + "step": 23726 + }, + { + "epoch": 3.17, + "grad_norm": 0.640625, + "learning_rate": 2.53291844056752e-05, + "loss": 0.4864, + "step": 23727 + }, + { + "epoch": 3.17, + "grad_norm": 0.412109375, + "learning_rate": 2.532143920793467e-05, + "loss": 0.1501, + "step": 23728 + }, + { + "epoch": 3.17, + "grad_norm": 0.48046875, + "learning_rate": 2.5313695022883355e-05, + "loss": 0.1654, + "step": 23729 + }, + { + "epoch": 3.17, + "grad_norm": 0.78515625, + "learning_rate": 2.53059518506263e-05, + "loss": 0.235, + "step": 23730 + }, + { + "epoch": 3.17, + "grad_norm": 0.640625, + "learning_rate": 2.5298209691268436e-05, + "loss": 0.2511, + "step": 23731 + }, + { + "epoch": 3.17, + "grad_norm": 0.6640625, + "learning_rate": 2.5290468544914835e-05, + "loss": 0.2628, + "step": 23732 + }, + { + "epoch": 3.17, + "grad_norm": 0.61328125, + "learning_rate": 2.5282728411670387e-05, + "loss": 0.1516, + "step": 23733 + }, + { + "epoch": 3.17, + "grad_norm": 0.48828125, + "learning_rate": 2.5274989291640106e-05, + "loss": 0.2118, + "step": 23734 + }, + { + "epoch": 3.17, + "grad_norm": 0.78515625, + "learning_rate": 2.526725118492892e-05, + "loss": 0.3798, + "step": 23735 + }, + { + "epoch": 3.17, + "grad_norm": 0.78125, + "learning_rate": 2.52595140916418e-05, + "loss": 0.3709, + "step": 23736 + }, + { + "epoch": 3.17, + "grad_norm": 0.609375, + "learning_rate": 2.525177801188362e-05, + "loss": 0.3208, + "step": 23737 + }, + { + "epoch": 3.17, + "grad_norm": 0.482421875, + "learning_rate": 2.5244042945759295e-05, + "loss": 0.166, + "step": 23738 + }, + { + "epoch": 3.17, + "grad_norm": 0.58203125, + "learning_rate": 2.5236308893373716e-05, + "loss": 0.4203, + "step": 23739 + }, + { + "epoch": 3.17, + "grad_norm": 0.7734375, + "learning_rate": 2.5228575854831816e-05, + "loss": 0.3868, + "step": 23740 + }, + { + "epoch": 3.17, + "grad_norm": 0.48046875, + "learning_rate": 2.5220843830238383e-05, + "loss": 0.176, + "step": 23741 + }, + { + "epoch": 3.17, + "grad_norm": 0.80078125, + "learning_rate": 2.5213112819698327e-05, + "loss": 0.3011, + "step": 23742 + }, + { + "epoch": 3.17, + "grad_norm": 0.5625, + "learning_rate": 2.5205382823316426e-05, + "loss": 0.4466, + "step": 23743 + }, + { + "epoch": 3.17, + "grad_norm": 0.7421875, + "learning_rate": 2.5197653841197543e-05, + "loss": 0.6215, + "step": 23744 + }, + { + "epoch": 3.17, + "grad_norm": 0.4453125, + "learning_rate": 2.5189925873446473e-05, + "loss": 0.2682, + "step": 23745 + }, + { + "epoch": 3.17, + "grad_norm": 0.53515625, + "learning_rate": 2.5182198920168064e-05, + "loss": 0.4793, + "step": 23746 + }, + { + "epoch": 3.17, + "grad_norm": 0.82421875, + "learning_rate": 2.517447298146701e-05, + "loss": 0.5249, + "step": 23747 + }, + { + "epoch": 3.17, + "grad_norm": 0.76953125, + "learning_rate": 2.516674805744814e-05, + "loss": 0.1515, + "step": 23748 + }, + { + "epoch": 3.17, + "grad_norm": 0.55078125, + "learning_rate": 2.5159024148216193e-05, + "loss": 0.1985, + "step": 23749 + }, + { + "epoch": 3.17, + "grad_norm": 0.578125, + "learning_rate": 2.5151301253875937e-05, + "loss": 0.2061, + "step": 23750 + }, + { + "epoch": 3.17, + "grad_norm": 0.60546875, + "learning_rate": 2.5143579374532033e-05, + "loss": 0.4536, + "step": 23751 + }, + { + "epoch": 3.17, + "grad_norm": 0.63671875, + "learning_rate": 2.5135858510289245e-05, + "loss": 0.2699, + "step": 23752 + }, + { + "epoch": 3.17, + "grad_norm": 0.63671875, + "learning_rate": 2.5128138661252264e-05, + "loss": 0.2489, + "step": 23753 + }, + { + "epoch": 3.17, + "grad_norm": 0.7265625, + "learning_rate": 2.512041982752581e-05, + "loss": 0.2507, + "step": 23754 + }, + { + "epoch": 3.17, + "grad_norm": 0.6640625, + "learning_rate": 2.5112702009214505e-05, + "loss": 0.2572, + "step": 23755 + }, + { + "epoch": 3.17, + "grad_norm": 0.640625, + "learning_rate": 2.5104985206423003e-05, + "loss": 0.3293, + "step": 23756 + }, + { + "epoch": 3.17, + "grad_norm": 0.5546875, + "learning_rate": 2.5097269419255975e-05, + "loss": 0.1587, + "step": 23757 + }, + { + "epoch": 3.17, + "grad_norm": 0.61328125, + "learning_rate": 2.5089554647818037e-05, + "loss": 0.3191, + "step": 23758 + }, + { + "epoch": 3.17, + "grad_norm": 0.6328125, + "learning_rate": 2.5081840892213848e-05, + "loss": 0.4629, + "step": 23759 + }, + { + "epoch": 3.17, + "grad_norm": 0.734375, + "learning_rate": 2.5074128152547948e-05, + "loss": 0.3326, + "step": 23760 + }, + { + "epoch": 3.17, + "grad_norm": 0.64453125, + "learning_rate": 2.5066416428924953e-05, + "loss": 0.3454, + "step": 23761 + }, + { + "epoch": 3.17, + "grad_norm": 0.640625, + "learning_rate": 2.505870572144945e-05, + "loss": 0.2367, + "step": 23762 + }, + { + "epoch": 3.17, + "grad_norm": 0.5390625, + "learning_rate": 2.5050996030226025e-05, + "loss": 0.3953, + "step": 23763 + }, + { + "epoch": 3.17, + "grad_norm": 0.69140625, + "learning_rate": 2.5043287355359157e-05, + "loss": 0.7418, + "step": 23764 + }, + { + "epoch": 3.17, + "grad_norm": 0.59765625, + "learning_rate": 2.5035579696953428e-05, + "loss": 0.2335, + "step": 23765 + }, + { + "epoch": 3.17, + "grad_norm": 0.671875, + "learning_rate": 2.5027873055113382e-05, + "loss": 0.2958, + "step": 23766 + }, + { + "epoch": 3.17, + "grad_norm": 0.6015625, + "learning_rate": 2.5020167429943452e-05, + "loss": 0.1691, + "step": 23767 + }, + { + "epoch": 3.17, + "grad_norm": 0.6953125, + "learning_rate": 2.501246282154821e-05, + "loss": 0.4409, + "step": 23768 + }, + { + "epoch": 3.17, + "grad_norm": 0.63671875, + "learning_rate": 2.500475923003207e-05, + "loss": 0.224, + "step": 23769 + }, + { + "epoch": 3.17, + "grad_norm": 0.46484375, + "learning_rate": 2.4997056655499527e-05, + "loss": 0.2088, + "step": 23770 + }, + { + "epoch": 3.17, + "grad_norm": 0.734375, + "learning_rate": 2.4989355098055033e-05, + "loss": 0.5632, + "step": 23771 + }, + { + "epoch": 3.17, + "grad_norm": 0.51171875, + "learning_rate": 2.4981654557803026e-05, + "loss": 0.2365, + "step": 23772 + }, + { + "epoch": 3.17, + "grad_norm": 0.69140625, + "learning_rate": 2.497395503484796e-05, + "loss": 0.3744, + "step": 23773 + }, + { + "epoch": 3.17, + "grad_norm": 0.68359375, + "learning_rate": 2.4966256529294187e-05, + "loss": 0.3576, + "step": 23774 + }, + { + "epoch": 3.17, + "grad_norm": 0.8046875, + "learning_rate": 2.495855904124612e-05, + "loss": 0.7042, + "step": 23775 + }, + { + "epoch": 3.17, + "grad_norm": 0.83984375, + "learning_rate": 2.4950862570808166e-05, + "loss": 0.3247, + "step": 23776 + }, + { + "epoch": 3.17, + "grad_norm": 0.6171875, + "learning_rate": 2.494316711808471e-05, + "loss": 0.2821, + "step": 23777 + }, + { + "epoch": 3.17, + "grad_norm": 0.490234375, + "learning_rate": 2.4935472683180074e-05, + "loss": 0.2364, + "step": 23778 + }, + { + "epoch": 3.17, + "grad_norm": 0.515625, + "learning_rate": 2.492777926619857e-05, + "loss": 0.4216, + "step": 23779 + }, + { + "epoch": 3.17, + "grad_norm": 0.5, + "learning_rate": 2.4920086867244574e-05, + "loss": 0.2607, + "step": 23780 + }, + { + "epoch": 3.17, + "grad_norm": 0.5546875, + "learning_rate": 2.491239548642238e-05, + "loss": 0.3393, + "step": 23781 + }, + { + "epoch": 3.17, + "grad_norm": 0.515625, + "learning_rate": 2.4904705123836314e-05, + "loss": 0.3587, + "step": 23782 + }, + { + "epoch": 3.17, + "grad_norm": 0.62109375, + "learning_rate": 2.4897015779590625e-05, + "loss": 0.4684, + "step": 23783 + }, + { + "epoch": 3.17, + "grad_norm": 0.75, + "learning_rate": 2.48893274537896e-05, + "loss": 0.4826, + "step": 23784 + }, + { + "epoch": 3.17, + "grad_norm": 0.59375, + "learning_rate": 2.4881640146537498e-05, + "loss": 0.1703, + "step": 23785 + }, + { + "epoch": 3.17, + "grad_norm": 0.50390625, + "learning_rate": 2.4873953857938592e-05, + "loss": 0.3908, + "step": 23786 + }, + { + "epoch": 3.17, + "grad_norm": 0.703125, + "learning_rate": 2.4866268588097064e-05, + "loss": 0.2464, + "step": 23787 + }, + { + "epoch": 3.17, + "grad_norm": 0.5546875, + "learning_rate": 2.4858584337117164e-05, + "loss": 0.2043, + "step": 23788 + }, + { + "epoch": 3.17, + "grad_norm": 0.5703125, + "learning_rate": 2.4850901105103076e-05, + "loss": 0.2904, + "step": 23789 + }, + { + "epoch": 3.17, + "grad_norm": 0.65234375, + "learning_rate": 2.4843218892158992e-05, + "loss": 0.3763, + "step": 23790 + }, + { + "epoch": 3.17, + "grad_norm": 0.47265625, + "learning_rate": 2.4835537698389166e-05, + "loss": 0.1457, + "step": 23791 + }, + { + "epoch": 3.17, + "grad_norm": 0.486328125, + "learning_rate": 2.4827857523897625e-05, + "loss": 0.2696, + "step": 23792 + }, + { + "epoch": 3.17, + "grad_norm": 0.71875, + "learning_rate": 2.482017836878857e-05, + "loss": 0.418, + "step": 23793 + }, + { + "epoch": 3.18, + "grad_norm": 0.765625, + "learning_rate": 2.4812500233166158e-05, + "loss": 0.2271, + "step": 23794 + }, + { + "epoch": 3.18, + "grad_norm": 0.53125, + "learning_rate": 2.4804823117134502e-05, + "loss": 0.2346, + "step": 23795 + }, + { + "epoch": 3.18, + "grad_norm": 0.546875, + "learning_rate": 2.4797147020797727e-05, + "loss": 0.2327, + "step": 23796 + }, + { + "epoch": 3.18, + "grad_norm": 0.58203125, + "learning_rate": 2.478947194425988e-05, + "loss": 0.3478, + "step": 23797 + }, + { + "epoch": 3.18, + "grad_norm": 0.609375, + "learning_rate": 2.478179788762506e-05, + "loss": 0.3009, + "step": 23798 + }, + { + "epoch": 3.18, + "grad_norm": 0.484375, + "learning_rate": 2.477412485099734e-05, + "loss": 0.3114, + "step": 23799 + }, + { + "epoch": 3.18, + "grad_norm": 0.69140625, + "learning_rate": 2.476645283448078e-05, + "loss": 0.2435, + "step": 23800 + }, + { + "epoch": 3.18, + "grad_norm": 0.734375, + "learning_rate": 2.475878183817939e-05, + "loss": 0.4394, + "step": 23801 + }, + { + "epoch": 3.18, + "grad_norm": 0.5625, + "learning_rate": 2.4751111862197206e-05, + "loss": 0.5165, + "step": 23802 + }, + { + "epoch": 3.18, + "grad_norm": 0.57421875, + "learning_rate": 2.474344290663826e-05, + "loss": 0.3444, + "step": 23803 + }, + { + "epoch": 3.18, + "grad_norm": 0.71875, + "learning_rate": 2.4735774971606506e-05, + "loss": 0.3753, + "step": 23804 + }, + { + "epoch": 3.18, + "grad_norm": 0.5390625, + "learning_rate": 2.472810805720598e-05, + "loss": 0.2374, + "step": 23805 + }, + { + "epoch": 3.18, + "grad_norm": 0.75, + "learning_rate": 2.472044216354058e-05, + "loss": 0.3336, + "step": 23806 + }, + { + "epoch": 3.18, + "grad_norm": 0.78125, + "learning_rate": 2.4712777290714307e-05, + "loss": 0.5135, + "step": 23807 + }, + { + "epoch": 3.18, + "grad_norm": 0.63671875, + "learning_rate": 2.4705113438831084e-05, + "loss": 0.2298, + "step": 23808 + }, + { + "epoch": 3.18, + "grad_norm": 0.46875, + "learning_rate": 2.4697450607994878e-05, + "loss": 0.2405, + "step": 23809 + }, + { + "epoch": 3.18, + "grad_norm": 0.66015625, + "learning_rate": 2.4689788798309545e-05, + "loss": 0.3562, + "step": 23810 + }, + { + "epoch": 3.18, + "grad_norm": 0.84375, + "learning_rate": 2.4682128009879e-05, + "loss": 0.4414, + "step": 23811 + }, + { + "epoch": 3.18, + "grad_norm": 0.5625, + "learning_rate": 2.467446824280716e-05, + "loss": 0.1707, + "step": 23812 + }, + { + "epoch": 3.18, + "grad_norm": 0.431640625, + "learning_rate": 2.4666809497197885e-05, + "loss": 0.1123, + "step": 23813 + }, + { + "epoch": 3.18, + "grad_norm": 0.55078125, + "learning_rate": 2.4659151773154996e-05, + "loss": 0.1823, + "step": 23814 + }, + { + "epoch": 3.18, + "grad_norm": 0.671875, + "learning_rate": 2.4651495070782392e-05, + "loss": 0.3346, + "step": 23815 + }, + { + "epoch": 3.18, + "grad_norm": 0.578125, + "learning_rate": 2.4643839390183843e-05, + "loss": 0.292, + "step": 23816 + }, + { + "epoch": 3.18, + "grad_norm": 0.6484375, + "learning_rate": 2.4636184731463195e-05, + "loss": 0.2901, + "step": 23817 + }, + { + "epoch": 3.18, + "grad_norm": 0.8125, + "learning_rate": 2.462853109472425e-05, + "loss": 0.274, + "step": 23818 + }, + { + "epoch": 3.18, + "grad_norm": 0.6328125, + "learning_rate": 2.462087848007083e-05, + "loss": 0.5407, + "step": 23819 + }, + { + "epoch": 3.18, + "grad_norm": 0.6328125, + "learning_rate": 2.4613226887606643e-05, + "loss": 0.1995, + "step": 23820 + }, + { + "epoch": 3.18, + "grad_norm": 0.50390625, + "learning_rate": 2.4605576317435485e-05, + "loss": 0.337, + "step": 23821 + }, + { + "epoch": 3.18, + "grad_norm": 0.63671875, + "learning_rate": 2.4597926769661094e-05, + "loss": 0.2574, + "step": 23822 + }, + { + "epoch": 3.18, + "grad_norm": 0.67578125, + "learning_rate": 2.4590278244387244e-05, + "loss": 0.5952, + "step": 23823 + }, + { + "epoch": 3.18, + "grad_norm": 0.67578125, + "learning_rate": 2.4582630741717593e-05, + "loss": 0.3386, + "step": 23824 + }, + { + "epoch": 3.18, + "grad_norm": 0.5234375, + "learning_rate": 2.457498426175586e-05, + "loss": 0.3326, + "step": 23825 + }, + { + "epoch": 3.18, + "grad_norm": 0.609375, + "learning_rate": 2.4567338804605756e-05, + "loss": 0.2447, + "step": 23826 + }, + { + "epoch": 3.18, + "grad_norm": 0.5859375, + "learning_rate": 2.455969437037098e-05, + "loss": 0.5203, + "step": 23827 + }, + { + "epoch": 3.18, + "grad_norm": 0.703125, + "learning_rate": 2.455205095915516e-05, + "loss": 0.3333, + "step": 23828 + }, + { + "epoch": 3.18, + "grad_norm": 0.546875, + "learning_rate": 2.454440857106193e-05, + "loss": 0.3838, + "step": 23829 + }, + { + "epoch": 3.18, + "grad_norm": 0.69140625, + "learning_rate": 2.4536767206194943e-05, + "loss": 0.3048, + "step": 23830 + }, + { + "epoch": 3.18, + "grad_norm": 0.484375, + "learning_rate": 2.4529126864657826e-05, + "loss": 0.1318, + "step": 23831 + }, + { + "epoch": 3.18, + "grad_norm": 0.53125, + "learning_rate": 2.4521487546554213e-05, + "loss": 0.2485, + "step": 23832 + }, + { + "epoch": 3.18, + "grad_norm": 0.71484375, + "learning_rate": 2.451384925198763e-05, + "loss": 0.3953, + "step": 23833 + }, + { + "epoch": 3.18, + "grad_norm": 0.69140625, + "learning_rate": 2.450621198106171e-05, + "loss": 0.2579, + "step": 23834 + }, + { + "epoch": 3.18, + "grad_norm": 0.498046875, + "learning_rate": 2.4498575733880013e-05, + "loss": 0.1985, + "step": 23835 + }, + { + "epoch": 3.18, + "grad_norm": 0.7265625, + "learning_rate": 2.44909405105461e-05, + "loss": 0.399, + "step": 23836 + }, + { + "epoch": 3.18, + "grad_norm": 0.8046875, + "learning_rate": 2.448330631116348e-05, + "loss": 0.3692, + "step": 23837 + }, + { + "epoch": 3.18, + "grad_norm": 0.625, + "learning_rate": 2.4475673135835685e-05, + "loss": 0.3536, + "step": 23838 + }, + { + "epoch": 3.18, + "grad_norm": 0.76953125, + "learning_rate": 2.4468040984666263e-05, + "loss": 0.436, + "step": 23839 + }, + { + "epoch": 3.18, + "grad_norm": 0.63671875, + "learning_rate": 2.446040985775865e-05, + "loss": 0.5067, + "step": 23840 + }, + { + "epoch": 3.18, + "grad_norm": 0.625, + "learning_rate": 2.4452779755216392e-05, + "loss": 0.3158, + "step": 23841 + }, + { + "epoch": 3.18, + "grad_norm": 0.65234375, + "learning_rate": 2.4445150677142903e-05, + "loss": 0.3724, + "step": 23842 + }, + { + "epoch": 3.18, + "grad_norm": 0.58984375, + "learning_rate": 2.4437522623641653e-05, + "loss": 0.2902, + "step": 23843 + }, + { + "epoch": 3.18, + "grad_norm": 0.6484375, + "learning_rate": 2.4429895594816098e-05, + "loss": 0.2654, + "step": 23844 + }, + { + "epoch": 3.18, + "grad_norm": 0.515625, + "learning_rate": 2.442226959076965e-05, + "loss": 0.1869, + "step": 23845 + }, + { + "epoch": 3.18, + "grad_norm": 0.7578125, + "learning_rate": 2.4414644611605776e-05, + "loss": 0.4007, + "step": 23846 + }, + { + "epoch": 3.18, + "grad_norm": 0.84375, + "learning_rate": 2.4407020657427793e-05, + "loss": 0.301, + "step": 23847 + }, + { + "epoch": 3.18, + "grad_norm": 0.52734375, + "learning_rate": 2.4399397728339122e-05, + "loss": 0.1763, + "step": 23848 + }, + { + "epoch": 3.18, + "grad_norm": 0.71484375, + "learning_rate": 2.4391775824443142e-05, + "loss": 0.2341, + "step": 23849 + }, + { + "epoch": 3.18, + "grad_norm": 0.55078125, + "learning_rate": 2.438415494584324e-05, + "loss": 0.2106, + "step": 23850 + }, + { + "epoch": 3.18, + "grad_norm": 0.58203125, + "learning_rate": 2.4376535092642716e-05, + "loss": 0.359, + "step": 23851 + }, + { + "epoch": 3.18, + "grad_norm": 0.5546875, + "learning_rate": 2.4368916264944896e-05, + "loss": 0.2792, + "step": 23852 + }, + { + "epoch": 3.18, + "grad_norm": 0.6015625, + "learning_rate": 2.4361298462853098e-05, + "loss": 0.2243, + "step": 23853 + }, + { + "epoch": 3.18, + "grad_norm": 0.59765625, + "learning_rate": 2.435368168647064e-05, + "loss": 0.3163, + "step": 23854 + }, + { + "epoch": 3.18, + "grad_norm": 0.703125, + "learning_rate": 2.434606593590084e-05, + "loss": 0.441, + "step": 23855 + }, + { + "epoch": 3.18, + "grad_norm": 0.443359375, + "learning_rate": 2.4338451211246917e-05, + "loss": 0.1712, + "step": 23856 + }, + { + "epoch": 3.18, + "grad_norm": 0.56640625, + "learning_rate": 2.4330837512612148e-05, + "loss": 0.3344, + "step": 23857 + }, + { + "epoch": 3.18, + "grad_norm": 0.4140625, + "learning_rate": 2.432322484009979e-05, + "loss": 0.136, + "step": 23858 + }, + { + "epoch": 3.18, + "grad_norm": 0.61328125, + "learning_rate": 2.4315613193813092e-05, + "loss": 0.3552, + "step": 23859 + }, + { + "epoch": 3.18, + "grad_norm": 0.5078125, + "learning_rate": 2.4308002573855237e-05, + "loss": 0.2641, + "step": 23860 + }, + { + "epoch": 3.18, + "grad_norm": 0.5390625, + "learning_rate": 2.4300392980329435e-05, + "loss": 0.3877, + "step": 23861 + }, + { + "epoch": 3.18, + "grad_norm": 0.859375, + "learning_rate": 2.4292784413338897e-05, + "loss": 0.2837, + "step": 23862 + }, + { + "epoch": 3.18, + "grad_norm": 0.6796875, + "learning_rate": 2.4285176872986816e-05, + "loss": 0.2287, + "step": 23863 + }, + { + "epoch": 3.18, + "grad_norm": 0.6328125, + "learning_rate": 2.4277570359376324e-05, + "loss": 0.3346, + "step": 23864 + }, + { + "epoch": 3.18, + "grad_norm": 0.5625, + "learning_rate": 2.4269964872610552e-05, + "loss": 0.3909, + "step": 23865 + }, + { + "epoch": 3.18, + "grad_norm": 0.71484375, + "learning_rate": 2.426236041279266e-05, + "loss": 0.2361, + "step": 23866 + }, + { + "epoch": 3.18, + "grad_norm": 0.7265625, + "learning_rate": 2.4254756980025773e-05, + "loss": 0.4449, + "step": 23867 + }, + { + "epoch": 3.18, + "grad_norm": 0.53125, + "learning_rate": 2.424715457441299e-05, + "loss": 0.1853, + "step": 23868 + }, + { + "epoch": 3.19, + "grad_norm": 0.53125, + "learning_rate": 2.423955319605744e-05, + "loss": 0.1381, + "step": 23869 + }, + { + "epoch": 3.19, + "grad_norm": 0.64453125, + "learning_rate": 2.4231952845062135e-05, + "loss": 0.3943, + "step": 23870 + }, + { + "epoch": 3.19, + "grad_norm": 0.4921875, + "learning_rate": 2.4224353521530186e-05, + "loss": 0.3396, + "step": 23871 + }, + { + "epoch": 3.19, + "grad_norm": 0.421875, + "learning_rate": 2.4216755225564635e-05, + "loss": 0.1865, + "step": 23872 + }, + { + "epoch": 3.19, + "grad_norm": 0.5703125, + "learning_rate": 2.4209157957268547e-05, + "loss": 0.279, + "step": 23873 + }, + { + "epoch": 3.19, + "grad_norm": 0.83203125, + "learning_rate": 2.4201561716744893e-05, + "loss": 0.155, + "step": 23874 + }, + { + "epoch": 3.19, + "grad_norm": 0.515625, + "learning_rate": 2.4193966504096733e-05, + "loss": 0.1956, + "step": 23875 + }, + { + "epoch": 3.19, + "grad_norm": 0.48046875, + "learning_rate": 2.418637231942701e-05, + "loss": 0.26, + "step": 23876 + }, + { + "epoch": 3.19, + "grad_norm": 0.55859375, + "learning_rate": 2.417877916283874e-05, + "loss": 0.3217, + "step": 23877 + }, + { + "epoch": 3.19, + "grad_norm": 0.609375, + "learning_rate": 2.4171187034434906e-05, + "loss": 0.426, + "step": 23878 + }, + { + "epoch": 3.19, + "grad_norm": 0.52734375, + "learning_rate": 2.416359593431843e-05, + "loss": 0.347, + "step": 23879 + }, + { + "epoch": 3.19, + "grad_norm": 0.87890625, + "learning_rate": 2.4156005862592245e-05, + "loss": 0.4257, + "step": 23880 + }, + { + "epoch": 3.19, + "grad_norm": 0.8984375, + "learning_rate": 2.4148416819359308e-05, + "loss": 0.3288, + "step": 23881 + }, + { + "epoch": 3.19, + "grad_norm": 0.62890625, + "learning_rate": 2.4140828804722548e-05, + "loss": 0.1912, + "step": 23882 + }, + { + "epoch": 3.19, + "grad_norm": 0.58984375, + "learning_rate": 2.4133241818784812e-05, + "loss": 0.2146, + "step": 23883 + }, + { + "epoch": 3.19, + "grad_norm": 0.7109375, + "learning_rate": 2.4125655861649e-05, + "loss": 0.2645, + "step": 23884 + }, + { + "epoch": 3.19, + "grad_norm": 0.51953125, + "learning_rate": 2.4118070933417992e-05, + "loss": 0.396, + "step": 23885 + }, + { + "epoch": 3.19, + "grad_norm": 0.875, + "learning_rate": 2.4110487034194673e-05, + "loss": 0.4182, + "step": 23886 + }, + { + "epoch": 3.19, + "grad_norm": 0.68359375, + "learning_rate": 2.410290416408183e-05, + "loss": 0.2327, + "step": 23887 + }, + { + "epoch": 3.19, + "grad_norm": 0.765625, + "learning_rate": 2.4095322323182334e-05, + "loss": 0.6201, + "step": 23888 + }, + { + "epoch": 3.19, + "grad_norm": 0.58984375, + "learning_rate": 2.408774151159897e-05, + "loss": 0.33, + "step": 23889 + }, + { + "epoch": 3.19, + "grad_norm": 0.4140625, + "learning_rate": 2.4080161729434535e-05, + "loss": 0.1159, + "step": 23890 + }, + { + "epoch": 3.19, + "grad_norm": 0.62890625, + "learning_rate": 2.4072582976791847e-05, + "loss": 0.4227, + "step": 23891 + }, + { + "epoch": 3.19, + "grad_norm": 0.41796875, + "learning_rate": 2.4065005253773688e-05, + "loss": 0.2394, + "step": 23892 + }, + { + "epoch": 3.19, + "grad_norm": 0.703125, + "learning_rate": 2.4057428560482753e-05, + "loss": 0.3889, + "step": 23893 + }, + { + "epoch": 3.19, + "grad_norm": 0.6484375, + "learning_rate": 2.404985289702184e-05, + "loss": 0.3993, + "step": 23894 + }, + { + "epoch": 3.19, + "grad_norm": 0.69921875, + "learning_rate": 2.4042278263493656e-05, + "loss": 0.4508, + "step": 23895 + }, + { + "epoch": 3.19, + "grad_norm": 0.431640625, + "learning_rate": 2.403470466000096e-05, + "loss": 0.1601, + "step": 23896 + }, + { + "epoch": 3.19, + "grad_norm": 0.5859375, + "learning_rate": 2.40271320866464e-05, + "loss": 0.3081, + "step": 23897 + }, + { + "epoch": 3.19, + "grad_norm": 0.5703125, + "learning_rate": 2.401956054353268e-05, + "loss": 0.3136, + "step": 23898 + }, + { + "epoch": 3.19, + "grad_norm": 0.609375, + "learning_rate": 2.4011990030762487e-05, + "loss": 0.5293, + "step": 23899 + }, + { + "epoch": 3.19, + "grad_norm": 0.77734375, + "learning_rate": 2.400442054843851e-05, + "loss": 0.2936, + "step": 23900 + }, + { + "epoch": 3.19, + "grad_norm": 0.56640625, + "learning_rate": 2.399685209666336e-05, + "loss": 0.3442, + "step": 23901 + }, + { + "epoch": 3.19, + "grad_norm": 0.69140625, + "learning_rate": 2.398928467553965e-05, + "loss": 0.2756, + "step": 23902 + }, + { + "epoch": 3.19, + "grad_norm": 0.69921875, + "learning_rate": 2.3981718285170017e-05, + "loss": 0.2133, + "step": 23903 + }, + { + "epoch": 3.19, + "grad_norm": 0.6484375, + "learning_rate": 2.3974152925657066e-05, + "loss": 0.2343, + "step": 23904 + }, + { + "epoch": 3.19, + "grad_norm": 0.75, + "learning_rate": 2.3966588597103434e-05, + "loss": 0.3126, + "step": 23905 + }, + { + "epoch": 3.19, + "grad_norm": 0.5234375, + "learning_rate": 2.3959025299611627e-05, + "loss": 0.3932, + "step": 23906 + }, + { + "epoch": 3.19, + "grad_norm": 0.65625, + "learning_rate": 2.395146303328424e-05, + "loss": 0.4593, + "step": 23907 + }, + { + "epoch": 3.19, + "grad_norm": 0.451171875, + "learning_rate": 2.394390179822382e-05, + "loss": 0.2823, + "step": 23908 + }, + { + "epoch": 3.19, + "grad_norm": 0.72265625, + "learning_rate": 2.393634159453294e-05, + "loss": 0.2624, + "step": 23909 + }, + { + "epoch": 3.19, + "grad_norm": 0.6328125, + "learning_rate": 2.3928782422314044e-05, + "loss": 0.3288, + "step": 23910 + }, + { + "epoch": 3.19, + "grad_norm": 0.443359375, + "learning_rate": 2.3921224281669697e-05, + "loss": 0.1584, + "step": 23911 + }, + { + "epoch": 3.19, + "grad_norm": 0.69140625, + "learning_rate": 2.3913667172702392e-05, + "loss": 0.4239, + "step": 23912 + }, + { + "epoch": 3.19, + "grad_norm": 0.75, + "learning_rate": 2.390611109551456e-05, + "loss": 0.4134, + "step": 23913 + }, + { + "epoch": 3.19, + "grad_norm": 0.7109375, + "learning_rate": 2.3898556050208743e-05, + "loss": 0.261, + "step": 23914 + }, + { + "epoch": 3.19, + "grad_norm": 0.671875, + "learning_rate": 2.38910020368873e-05, + "loss": 0.4702, + "step": 23915 + }, + { + "epoch": 3.19, + "grad_norm": 0.6640625, + "learning_rate": 2.3883449055652727e-05, + "loss": 0.6425, + "step": 23916 + }, + { + "epoch": 3.19, + "grad_norm": 0.423828125, + "learning_rate": 2.3875897106607436e-05, + "loss": 0.1946, + "step": 23917 + }, + { + "epoch": 3.19, + "grad_norm": 0.408203125, + "learning_rate": 2.3868346189853842e-05, + "loss": 0.2096, + "step": 23918 + }, + { + "epoch": 3.19, + "grad_norm": 0.64453125, + "learning_rate": 2.3860796305494348e-05, + "loss": 0.3919, + "step": 23919 + }, + { + "epoch": 3.19, + "grad_norm": 0.65625, + "learning_rate": 2.38532474536313e-05, + "loss": 0.4005, + "step": 23920 + }, + { + "epoch": 3.19, + "grad_norm": 0.68359375, + "learning_rate": 2.3845699634367093e-05, + "loss": 0.2869, + "step": 23921 + }, + { + "epoch": 3.19, + "grad_norm": 0.51953125, + "learning_rate": 2.3838152847804063e-05, + "loss": 0.298, + "step": 23922 + }, + { + "epoch": 3.19, + "grad_norm": 0.828125, + "learning_rate": 2.38306070940446e-05, + "loss": 0.3881, + "step": 23923 + }, + { + "epoch": 3.19, + "grad_norm": 0.4296875, + "learning_rate": 2.3823062373190985e-05, + "loss": 0.1557, + "step": 23924 + }, + { + "epoch": 3.19, + "grad_norm": 0.65234375, + "learning_rate": 2.3815518685345506e-05, + "loss": 0.4962, + "step": 23925 + }, + { + "epoch": 3.19, + "grad_norm": 0.48046875, + "learning_rate": 2.380797603061049e-05, + "loss": 0.2305, + "step": 23926 + }, + { + "epoch": 3.19, + "grad_norm": 0.83203125, + "learning_rate": 2.3800434409088212e-05, + "loss": 0.5661, + "step": 23927 + }, + { + "epoch": 3.19, + "grad_norm": 0.765625, + "learning_rate": 2.379289382088099e-05, + "loss": 0.3341, + "step": 23928 + }, + { + "epoch": 3.19, + "grad_norm": 0.7578125, + "learning_rate": 2.3785354266090998e-05, + "loss": 0.4789, + "step": 23929 + }, + { + "epoch": 3.19, + "grad_norm": 0.6328125, + "learning_rate": 2.3777815744820518e-05, + "loss": 0.6029, + "step": 23930 + }, + { + "epoch": 3.19, + "grad_norm": 0.55078125, + "learning_rate": 2.3770278257171774e-05, + "loss": 0.2644, + "step": 23931 + }, + { + "epoch": 3.19, + "grad_norm": 0.7421875, + "learning_rate": 2.3762741803247013e-05, + "loss": 0.4669, + "step": 23932 + }, + { + "epoch": 3.19, + "grad_norm": 0.498046875, + "learning_rate": 2.3755206383148376e-05, + "loss": 0.2711, + "step": 23933 + }, + { + "epoch": 3.19, + "grad_norm": 0.59765625, + "learning_rate": 2.374767199697807e-05, + "loss": 0.3211, + "step": 23934 + }, + { + "epoch": 3.19, + "grad_norm": 0.58984375, + "learning_rate": 2.3740138644838272e-05, + "loss": 0.2185, + "step": 23935 + }, + { + "epoch": 3.19, + "grad_norm": 0.47265625, + "learning_rate": 2.3732606326831163e-05, + "loss": 0.2392, + "step": 23936 + }, + { + "epoch": 3.19, + "grad_norm": 0.470703125, + "learning_rate": 2.3725075043058865e-05, + "loss": 0.2836, + "step": 23937 + }, + { + "epoch": 3.19, + "grad_norm": 0.61328125, + "learning_rate": 2.371754479362347e-05, + "loss": 0.2431, + "step": 23938 + }, + { + "epoch": 3.19, + "grad_norm": 0.70703125, + "learning_rate": 2.371001557862712e-05, + "loss": 0.2525, + "step": 23939 + }, + { + "epoch": 3.19, + "grad_norm": 0.52734375, + "learning_rate": 2.3702487398171937e-05, + "loss": 0.1214, + "step": 23940 + }, + { + "epoch": 3.19, + "grad_norm": 0.50390625, + "learning_rate": 2.369496025235998e-05, + "loss": 0.1937, + "step": 23941 + }, + { + "epoch": 3.19, + "grad_norm": 0.76171875, + "learning_rate": 2.3687434141293362e-05, + "loss": 0.3148, + "step": 23942 + }, + { + "epoch": 3.19, + "grad_norm": 0.458984375, + "learning_rate": 2.367990906507409e-05, + "loss": 0.1671, + "step": 23943 + }, + { + "epoch": 3.2, + "grad_norm": 0.75390625, + "learning_rate": 2.3672385023804234e-05, + "loss": 0.3447, + "step": 23944 + }, + { + "epoch": 3.2, + "grad_norm": 0.466796875, + "learning_rate": 2.366486201758582e-05, + "loss": 0.2189, + "step": 23945 + }, + { + "epoch": 3.2, + "grad_norm": 0.86328125, + "learning_rate": 2.3657340046520893e-05, + "loss": 0.4441, + "step": 23946 + }, + { + "epoch": 3.2, + "grad_norm": 0.7109375, + "learning_rate": 2.3649819110711415e-05, + "loss": 0.4605, + "step": 23947 + }, + { + "epoch": 3.2, + "grad_norm": 0.546875, + "learning_rate": 2.364229921025941e-05, + "loss": 0.3371, + "step": 23948 + }, + { + "epoch": 3.2, + "grad_norm": 0.515625, + "learning_rate": 2.3634780345266806e-05, + "loss": 0.1533, + "step": 23949 + }, + { + "epoch": 3.2, + "grad_norm": 0.6015625, + "learning_rate": 2.3627262515835604e-05, + "loss": 0.2377, + "step": 23950 + }, + { + "epoch": 3.2, + "grad_norm": 0.671875, + "learning_rate": 2.3619745722067754e-05, + "loss": 0.2899, + "step": 23951 + }, + { + "epoch": 3.2, + "grad_norm": 0.59375, + "learning_rate": 2.361222996406515e-05, + "loss": 0.1539, + "step": 23952 + }, + { + "epoch": 3.2, + "grad_norm": 0.6484375, + "learning_rate": 2.360471524192972e-05, + "loss": 0.3811, + "step": 23953 + }, + { + "epoch": 3.2, + "grad_norm": 0.5859375, + "learning_rate": 2.3597201555763393e-05, + "loss": 0.2872, + "step": 23954 + }, + { + "epoch": 3.2, + "grad_norm": 0.46875, + "learning_rate": 2.3589688905668074e-05, + "loss": 0.1553, + "step": 23955 + }, + { + "epoch": 3.2, + "grad_norm": 0.74609375, + "learning_rate": 2.3582177291745587e-05, + "loss": 0.4459, + "step": 23956 + }, + { + "epoch": 3.2, + "grad_norm": 0.5546875, + "learning_rate": 2.3574666714097814e-05, + "loss": 0.2664, + "step": 23957 + }, + { + "epoch": 3.2, + "grad_norm": 0.75390625, + "learning_rate": 2.3567157172826616e-05, + "loss": 0.4171, + "step": 23958 + }, + { + "epoch": 3.2, + "grad_norm": 0.56640625, + "learning_rate": 2.355964866803385e-05, + "loss": 0.2378, + "step": 23959 + }, + { + "epoch": 3.2, + "grad_norm": 0.6328125, + "learning_rate": 2.35521411998213e-05, + "loss": 0.3295, + "step": 23960 + }, + { + "epoch": 3.2, + "grad_norm": 0.7265625, + "learning_rate": 2.3544634768290763e-05, + "loss": 0.5222, + "step": 23961 + }, + { + "epoch": 3.2, + "grad_norm": 0.7578125, + "learning_rate": 2.353712937354403e-05, + "loss": 0.4005, + "step": 23962 + }, + { + "epoch": 3.2, + "grad_norm": 0.57421875, + "learning_rate": 2.352962501568291e-05, + "loss": 0.3574, + "step": 23963 + }, + { + "epoch": 3.2, + "grad_norm": 0.54296875, + "learning_rate": 2.352212169480915e-05, + "loss": 0.2505, + "step": 23964 + }, + { + "epoch": 3.2, + "grad_norm": 0.458984375, + "learning_rate": 2.351461941102453e-05, + "loss": 0.2793, + "step": 23965 + }, + { + "epoch": 3.2, + "grad_norm": 0.6796875, + "learning_rate": 2.350711816443073e-05, + "loss": 0.1994, + "step": 23966 + }, + { + "epoch": 3.2, + "grad_norm": 0.57421875, + "learning_rate": 2.3499617955129504e-05, + "loss": 0.2855, + "step": 23967 + }, + { + "epoch": 3.2, + "grad_norm": 0.7890625, + "learning_rate": 2.349211878322255e-05, + "loss": 0.7084, + "step": 23968 + }, + { + "epoch": 3.2, + "grad_norm": 0.578125, + "learning_rate": 2.3484620648811605e-05, + "loss": 0.347, + "step": 23969 + }, + { + "epoch": 3.2, + "grad_norm": 0.5703125, + "learning_rate": 2.3477123551998282e-05, + "loss": 0.2421, + "step": 23970 + }, + { + "epoch": 3.2, + "grad_norm": 0.69921875, + "learning_rate": 2.346962749288427e-05, + "loss": 0.5171, + "step": 23971 + }, + { + "epoch": 3.2, + "grad_norm": 0.65625, + "learning_rate": 2.346213247157122e-05, + "loss": 0.4421, + "step": 23972 + }, + { + "epoch": 3.2, + "grad_norm": 0.625, + "learning_rate": 2.345463848816082e-05, + "loss": 0.3312, + "step": 23973 + }, + { + "epoch": 3.2, + "grad_norm": 0.5078125, + "learning_rate": 2.3447145542754634e-05, + "loss": 0.2751, + "step": 23974 + }, + { + "epoch": 3.2, + "grad_norm": 0.478515625, + "learning_rate": 2.343965363545426e-05, + "loss": 0.3306, + "step": 23975 + }, + { + "epoch": 3.2, + "grad_norm": 0.796875, + "learning_rate": 2.343216276636132e-05, + "loss": 0.5869, + "step": 23976 + }, + { + "epoch": 3.2, + "grad_norm": 0.5625, + "learning_rate": 2.3424672935577407e-05, + "loss": 0.2335, + "step": 23977 + }, + { + "epoch": 3.2, + "grad_norm": 0.61328125, + "learning_rate": 2.3417184143204086e-05, + "loss": 0.1985, + "step": 23978 + }, + { + "epoch": 3.2, + "grad_norm": 0.59765625, + "learning_rate": 2.340969638934287e-05, + "loss": 0.3372, + "step": 23979 + }, + { + "epoch": 3.2, + "grad_norm": 0.75, + "learning_rate": 2.3402209674095344e-05, + "loss": 0.6596, + "step": 23980 + }, + { + "epoch": 3.2, + "grad_norm": 0.67578125, + "learning_rate": 2.3394723997562995e-05, + "loss": 0.3685, + "step": 23981 + }, + { + "epoch": 3.2, + "grad_norm": 0.494140625, + "learning_rate": 2.338723935984739e-05, + "loss": 0.1897, + "step": 23982 + }, + { + "epoch": 3.2, + "grad_norm": 0.55859375, + "learning_rate": 2.3379755761049958e-05, + "loss": 0.1329, + "step": 23983 + }, + { + "epoch": 3.2, + "grad_norm": 0.52734375, + "learning_rate": 2.3372273201272222e-05, + "loss": 0.291, + "step": 23984 + }, + { + "epoch": 3.2, + "grad_norm": 0.56640625, + "learning_rate": 2.336479168061566e-05, + "loss": 0.2605, + "step": 23985 + }, + { + "epoch": 3.2, + "grad_norm": 0.57421875, + "learning_rate": 2.3357311199181686e-05, + "loss": 0.2894, + "step": 23986 + }, + { + "epoch": 3.2, + "grad_norm": 0.6640625, + "learning_rate": 2.3349831757071783e-05, + "loss": 0.2191, + "step": 23987 + }, + { + "epoch": 3.2, + "grad_norm": 0.8671875, + "learning_rate": 2.3342353354387326e-05, + "loss": 0.5859, + "step": 23988 + }, + { + "epoch": 3.2, + "grad_norm": 0.69140625, + "learning_rate": 2.3334875991229753e-05, + "loss": 0.3821, + "step": 23989 + }, + { + "epoch": 3.2, + "grad_norm": 0.462890625, + "learning_rate": 2.3327399667700477e-05, + "loss": 0.2408, + "step": 23990 + }, + { + "epoch": 3.2, + "grad_norm": 0.58203125, + "learning_rate": 2.331992438390086e-05, + "loss": 0.1326, + "step": 23991 + }, + { + "epoch": 3.2, + "grad_norm": 0.43359375, + "learning_rate": 2.3312450139932317e-05, + "loss": 0.1346, + "step": 23992 + }, + { + "epoch": 3.2, + "grad_norm": 0.78125, + "learning_rate": 2.3304976935896128e-05, + "loss": 0.3295, + "step": 23993 + }, + { + "epoch": 3.2, + "grad_norm": 0.85546875, + "learning_rate": 2.3297504771893687e-05, + "loss": 0.7944, + "step": 23994 + }, + { + "epoch": 3.2, + "grad_norm": 0.8203125, + "learning_rate": 2.32900336480263e-05, + "loss": 0.2933, + "step": 23995 + }, + { + "epoch": 3.2, + "grad_norm": 0.6640625, + "learning_rate": 2.328256356439532e-05, + "loss": 0.5613, + "step": 23996 + }, + { + "epoch": 3.2, + "grad_norm": 0.61328125, + "learning_rate": 2.3275094521102015e-05, + "loss": 0.3279, + "step": 23997 + }, + { + "epoch": 3.2, + "grad_norm": 0.55859375, + "learning_rate": 2.3267626518247644e-05, + "loss": 0.299, + "step": 23998 + }, + { + "epoch": 3.2, + "grad_norm": 0.7421875, + "learning_rate": 2.3260159555933503e-05, + "loss": 0.3368, + "step": 23999 + }, + { + "epoch": 3.2, + "grad_norm": 0.578125, + "learning_rate": 2.325269363426085e-05, + "loss": 0.1683, + "step": 24000 + }, + { + "epoch": 3.2, + "grad_norm": 0.7109375, + "learning_rate": 2.324522875333095e-05, + "loss": 0.3106, + "step": 24001 + }, + { + "epoch": 3.2, + "grad_norm": 0.58984375, + "learning_rate": 2.3237764913244985e-05, + "loss": 0.1884, + "step": 24002 + }, + { + "epoch": 3.2, + "grad_norm": 0.490234375, + "learning_rate": 2.3230302114104187e-05, + "loss": 0.2159, + "step": 24003 + }, + { + "epoch": 3.2, + "grad_norm": 0.81640625, + "learning_rate": 2.3222840356009767e-05, + "loss": 0.4721, + "step": 24004 + }, + { + "epoch": 3.2, + "grad_norm": 0.6015625, + "learning_rate": 2.321537963906294e-05, + "loss": 0.2882, + "step": 24005 + }, + { + "epoch": 3.2, + "grad_norm": 0.451171875, + "learning_rate": 2.3207919963364822e-05, + "loss": 0.3626, + "step": 24006 + }, + { + "epoch": 3.2, + "grad_norm": 0.45703125, + "learning_rate": 2.3200461329016575e-05, + "loss": 0.1444, + "step": 24007 + }, + { + "epoch": 3.2, + "grad_norm": 0.62890625, + "learning_rate": 2.3193003736119378e-05, + "loss": 0.3315, + "step": 24008 + }, + { + "epoch": 3.2, + "grad_norm": 0.69140625, + "learning_rate": 2.318554718477437e-05, + "loss": 0.7972, + "step": 24009 + }, + { + "epoch": 3.2, + "grad_norm": 0.5234375, + "learning_rate": 2.3178091675082636e-05, + "loss": 0.2843, + "step": 24010 + }, + { + "epoch": 3.2, + "grad_norm": 0.65625, + "learning_rate": 2.3170637207145264e-05, + "loss": 0.396, + "step": 24011 + }, + { + "epoch": 3.2, + "grad_norm": 0.74609375, + "learning_rate": 2.3163183781063347e-05, + "loss": 0.6887, + "step": 24012 + }, + { + "epoch": 3.2, + "grad_norm": 0.56640625, + "learning_rate": 2.3155731396937987e-05, + "loss": 0.1859, + "step": 24013 + }, + { + "epoch": 3.2, + "grad_norm": 0.59765625, + "learning_rate": 2.314828005487022e-05, + "loss": 0.2202, + "step": 24014 + }, + { + "epoch": 3.2, + "grad_norm": 0.640625, + "learning_rate": 2.3140829754961123e-05, + "loss": 0.2374, + "step": 24015 + }, + { + "epoch": 3.2, + "grad_norm": 0.49609375, + "learning_rate": 2.3133380497311674e-05, + "loss": 0.3115, + "step": 24016 + }, + { + "epoch": 3.2, + "grad_norm": 0.890625, + "learning_rate": 2.3125932282022922e-05, + "loss": 0.3023, + "step": 24017 + }, + { + "epoch": 3.2, + "grad_norm": 0.6796875, + "learning_rate": 2.3118485109195864e-05, + "loss": 0.3502, + "step": 24018 + }, + { + "epoch": 3.21, + "grad_norm": 0.48828125, + "learning_rate": 2.3111038978931522e-05, + "loss": 0.2873, + "step": 24019 + }, + { + "epoch": 3.21, + "grad_norm": 0.69921875, + "learning_rate": 2.310359389133081e-05, + "loss": 0.3189, + "step": 24020 + }, + { + "epoch": 3.21, + "grad_norm": 0.6015625, + "learning_rate": 2.309614984649474e-05, + "loss": 0.4071, + "step": 24021 + }, + { + "epoch": 3.21, + "grad_norm": 0.73046875, + "learning_rate": 2.308870684452421e-05, + "loss": 0.6171, + "step": 24022 + }, + { + "epoch": 3.21, + "grad_norm": 0.62890625, + "learning_rate": 2.308126488552017e-05, + "loss": 0.5155, + "step": 24023 + }, + { + "epoch": 3.21, + "grad_norm": 0.412109375, + "learning_rate": 2.3073823969583587e-05, + "loss": 0.1545, + "step": 24024 + }, + { + "epoch": 3.21, + "grad_norm": 0.71875, + "learning_rate": 2.3066384096815276e-05, + "loss": 0.4982, + "step": 24025 + }, + { + "epoch": 3.21, + "grad_norm": 0.66015625, + "learning_rate": 2.305894526731619e-05, + "loss": 0.4116, + "step": 24026 + }, + { + "epoch": 3.21, + "grad_norm": 0.45703125, + "learning_rate": 2.3051507481187173e-05, + "loss": 0.1664, + "step": 24027 + }, + { + "epoch": 3.21, + "grad_norm": 0.57421875, + "learning_rate": 2.3044070738529134e-05, + "loss": 0.3631, + "step": 24028 + }, + { + "epoch": 3.21, + "grad_norm": 0.625, + "learning_rate": 2.3036635039442856e-05, + "loss": 0.2803, + "step": 24029 + }, + { + "epoch": 3.21, + "grad_norm": 0.7109375, + "learning_rate": 2.3029200384029203e-05, + "loss": 0.2933, + "step": 24030 + }, + { + "epoch": 3.21, + "grad_norm": 0.5546875, + "learning_rate": 2.3021766772388986e-05, + "loss": 0.2172, + "step": 24031 + }, + { + "epoch": 3.21, + "grad_norm": 0.6015625, + "learning_rate": 2.3014334204623055e-05, + "loss": 0.4458, + "step": 24032 + }, + { + "epoch": 3.21, + "grad_norm": 0.60546875, + "learning_rate": 2.3006902680832154e-05, + "loss": 0.2295, + "step": 24033 + }, + { + "epoch": 3.21, + "grad_norm": 0.67578125, + "learning_rate": 2.2999472201117035e-05, + "loss": 0.2697, + "step": 24034 + }, + { + "epoch": 3.21, + "grad_norm": 0.6953125, + "learning_rate": 2.2992042765578503e-05, + "loss": 0.4342, + "step": 24035 + }, + { + "epoch": 3.21, + "grad_norm": 0.66015625, + "learning_rate": 2.298461437431728e-05, + "loss": 0.3324, + "step": 24036 + }, + { + "epoch": 3.21, + "grad_norm": 0.69921875, + "learning_rate": 2.2977187027434122e-05, + "loss": 0.2192, + "step": 24037 + }, + { + "epoch": 3.21, + "grad_norm": 0.5, + "learning_rate": 2.2969760725029776e-05, + "loss": 0.4283, + "step": 24038 + }, + { + "epoch": 3.21, + "grad_norm": 0.48828125, + "learning_rate": 2.2962335467204876e-05, + "loss": 0.2534, + "step": 24039 + }, + { + "epoch": 3.21, + "grad_norm": 0.62109375, + "learning_rate": 2.2954911254060153e-05, + "loss": 0.4553, + "step": 24040 + }, + { + "epoch": 3.21, + "grad_norm": 0.55078125, + "learning_rate": 2.294748808569627e-05, + "loss": 0.4064, + "step": 24041 + }, + { + "epoch": 3.21, + "grad_norm": 0.578125, + "learning_rate": 2.2940065962213943e-05, + "loss": 0.1731, + "step": 24042 + }, + { + "epoch": 3.21, + "grad_norm": 0.71484375, + "learning_rate": 2.2932644883713738e-05, + "loss": 0.6254, + "step": 24043 + }, + { + "epoch": 3.21, + "grad_norm": 0.67578125, + "learning_rate": 2.2925224850296332e-05, + "loss": 0.1659, + "step": 24044 + }, + { + "epoch": 3.21, + "grad_norm": 0.546875, + "learning_rate": 2.291780586206238e-05, + "loss": 0.2134, + "step": 24045 + }, + { + "epoch": 3.21, + "grad_norm": 0.72265625, + "learning_rate": 2.291038791911242e-05, + "loss": 0.2412, + "step": 24046 + }, + { + "epoch": 3.21, + "grad_norm": 0.62109375, + "learning_rate": 2.2902971021547103e-05, + "loss": 0.2471, + "step": 24047 + }, + { + "epoch": 3.21, + "grad_norm": 0.63671875, + "learning_rate": 2.2895555169466954e-05, + "loss": 0.2291, + "step": 24048 + }, + { + "epoch": 3.21, + "grad_norm": 0.6171875, + "learning_rate": 2.2888140362972556e-05, + "loss": 0.5103, + "step": 24049 + }, + { + "epoch": 3.21, + "grad_norm": 0.5546875, + "learning_rate": 2.288072660216446e-05, + "loss": 0.1756, + "step": 24050 + }, + { + "epoch": 3.21, + "grad_norm": 0.578125, + "learning_rate": 2.2873313887143243e-05, + "loss": 0.372, + "step": 24051 + }, + { + "epoch": 3.21, + "grad_norm": 0.60546875, + "learning_rate": 2.286590221800936e-05, + "loss": 0.2482, + "step": 24052 + }, + { + "epoch": 3.21, + "grad_norm": 0.578125, + "learning_rate": 2.285849159486335e-05, + "loss": 0.311, + "step": 24053 + }, + { + "epoch": 3.21, + "grad_norm": 0.408203125, + "learning_rate": 2.2851082017805703e-05, + "loss": 0.157, + "step": 24054 + }, + { + "epoch": 3.21, + "grad_norm": 0.77734375, + "learning_rate": 2.2843673486936923e-05, + "loss": 0.5812, + "step": 24055 + }, + { + "epoch": 3.21, + "grad_norm": 0.6875, + "learning_rate": 2.2836266002357422e-05, + "loss": 0.3772, + "step": 24056 + }, + { + "epoch": 3.21, + "grad_norm": 0.8203125, + "learning_rate": 2.2828859564167672e-05, + "loss": 0.3983, + "step": 24057 + }, + { + "epoch": 3.21, + "grad_norm": 0.6953125, + "learning_rate": 2.282145417246815e-05, + "loss": 0.3792, + "step": 24058 + }, + { + "epoch": 3.21, + "grad_norm": 0.73828125, + "learning_rate": 2.2814049827359207e-05, + "loss": 0.2523, + "step": 24059 + }, + { + "epoch": 3.21, + "grad_norm": 0.58203125, + "learning_rate": 2.2806646528941323e-05, + "loss": 0.184, + "step": 24060 + }, + { + "epoch": 3.21, + "grad_norm": 0.671875, + "learning_rate": 2.2799244277314823e-05, + "loss": 0.2425, + "step": 24061 + }, + { + "epoch": 3.21, + "grad_norm": 0.416015625, + "learning_rate": 2.279184307258011e-05, + "loss": 0.1081, + "step": 24062 + }, + { + "epoch": 3.21, + "grad_norm": 0.6796875, + "learning_rate": 2.278444291483757e-05, + "loss": 0.4115, + "step": 24063 + }, + { + "epoch": 3.21, + "grad_norm": 0.51953125, + "learning_rate": 2.2777043804187527e-05, + "loss": 0.2198, + "step": 24064 + }, + { + "epoch": 3.21, + "grad_norm": 0.53125, + "learning_rate": 2.2769645740730373e-05, + "loss": 0.3364, + "step": 24065 + }, + { + "epoch": 3.21, + "grad_norm": 0.515625, + "learning_rate": 2.2762248724566347e-05, + "loss": 0.2315, + "step": 24066 + }, + { + "epoch": 3.21, + "grad_norm": 0.5703125, + "learning_rate": 2.2754852755795806e-05, + "loss": 0.3615, + "step": 24067 + }, + { + "epoch": 3.21, + "grad_norm": 0.5390625, + "learning_rate": 2.2747457834519036e-05, + "loss": 0.2022, + "step": 24068 + }, + { + "epoch": 3.21, + "grad_norm": 0.7109375, + "learning_rate": 2.2740063960836355e-05, + "loss": 0.2172, + "step": 24069 + }, + { + "epoch": 3.21, + "grad_norm": 0.73828125, + "learning_rate": 2.273267113484798e-05, + "loss": 0.4281, + "step": 24070 + }, + { + "epoch": 3.21, + "grad_norm": 0.63671875, + "learning_rate": 2.272527935665415e-05, + "loss": 0.3233, + "step": 24071 + }, + { + "epoch": 3.21, + "grad_norm": 0.62890625, + "learning_rate": 2.2717888626355134e-05, + "loss": 0.3429, + "step": 24072 + }, + { + "epoch": 3.21, + "grad_norm": 0.416015625, + "learning_rate": 2.271049894405113e-05, + "loss": 0.1876, + "step": 24073 + }, + { + "epoch": 3.21, + "grad_norm": 0.50390625, + "learning_rate": 2.2703110309842403e-05, + "loss": 0.297, + "step": 24074 + }, + { + "epoch": 3.21, + "grad_norm": 0.62890625, + "learning_rate": 2.269572272382908e-05, + "loss": 0.2704, + "step": 24075 + }, + { + "epoch": 3.21, + "grad_norm": 0.5859375, + "learning_rate": 2.268833618611138e-05, + "loss": 0.3533, + "step": 24076 + }, + { + "epoch": 3.21, + "grad_norm": 0.5859375, + "learning_rate": 2.268095069678945e-05, + "loss": 0.1784, + "step": 24077 + }, + { + "epoch": 3.21, + "grad_norm": 0.515625, + "learning_rate": 2.2673566255963473e-05, + "loss": 0.302, + "step": 24078 + }, + { + "epoch": 3.21, + "grad_norm": 0.5234375, + "learning_rate": 2.2666182863733554e-05, + "loss": 0.1919, + "step": 24079 + }, + { + "epoch": 3.21, + "grad_norm": 0.765625, + "learning_rate": 2.265880052019982e-05, + "loss": 0.3065, + "step": 24080 + }, + { + "epoch": 3.21, + "grad_norm": 0.4609375, + "learning_rate": 2.265141922546238e-05, + "loss": 0.2055, + "step": 24081 + }, + { + "epoch": 3.21, + "grad_norm": 0.53125, + "learning_rate": 2.2644038979621375e-05, + "loss": 0.4251, + "step": 24082 + }, + { + "epoch": 3.21, + "grad_norm": 0.76171875, + "learning_rate": 2.263665978277685e-05, + "loss": 0.4247, + "step": 24083 + }, + { + "epoch": 3.21, + "grad_norm": 0.765625, + "learning_rate": 2.2629281635028832e-05, + "loss": 0.4673, + "step": 24084 + }, + { + "epoch": 3.21, + "grad_norm": 0.546875, + "learning_rate": 2.2621904536477413e-05, + "loss": 0.1764, + "step": 24085 + }, + { + "epoch": 3.21, + "grad_norm": 0.5078125, + "learning_rate": 2.261452848722263e-05, + "loss": 0.2053, + "step": 24086 + }, + { + "epoch": 3.21, + "grad_norm": 0.609375, + "learning_rate": 2.2607153487364517e-05, + "loss": 0.1282, + "step": 24087 + }, + { + "epoch": 3.21, + "grad_norm": 0.380859375, + "learning_rate": 2.2599779537003086e-05, + "loss": 0.1449, + "step": 24088 + }, + { + "epoch": 3.21, + "grad_norm": 0.6484375, + "learning_rate": 2.2592406636238306e-05, + "loss": 0.2606, + "step": 24089 + }, + { + "epoch": 3.21, + "grad_norm": 0.62890625, + "learning_rate": 2.2585034785170155e-05, + "loss": 0.4402, + "step": 24090 + }, + { + "epoch": 3.21, + "grad_norm": 0.74609375, + "learning_rate": 2.2577663983898633e-05, + "loss": 0.3341, + "step": 24091 + }, + { + "epoch": 3.21, + "grad_norm": 0.486328125, + "learning_rate": 2.2570294232523703e-05, + "loss": 0.2067, + "step": 24092 + }, + { + "epoch": 3.21, + "grad_norm": 0.5625, + "learning_rate": 2.256292553114524e-05, + "loss": 0.2653, + "step": 24093 + }, + { + "epoch": 3.22, + "grad_norm": 0.6328125, + "learning_rate": 2.2555557879863243e-05, + "loss": 0.4396, + "step": 24094 + }, + { + "epoch": 3.22, + "grad_norm": 0.69140625, + "learning_rate": 2.254819127877754e-05, + "loss": 0.5751, + "step": 24095 + }, + { + "epoch": 3.22, + "grad_norm": 0.6171875, + "learning_rate": 2.2540825727988092e-05, + "loss": 0.3842, + "step": 24096 + }, + { + "epoch": 3.22, + "grad_norm": 0.4765625, + "learning_rate": 2.2533461227594775e-05, + "loss": 0.1858, + "step": 24097 + }, + { + "epoch": 3.22, + "grad_norm": 0.66796875, + "learning_rate": 2.2526097777697418e-05, + "loss": 0.5755, + "step": 24098 + }, + { + "epoch": 3.22, + "grad_norm": 0.6171875, + "learning_rate": 2.2518735378395893e-05, + "loss": 0.3665, + "step": 24099 + }, + { + "epoch": 3.22, + "grad_norm": 0.921875, + "learning_rate": 2.2511374029790055e-05, + "loss": 0.259, + "step": 24100 + }, + { + "epoch": 3.22, + "grad_norm": 0.62890625, + "learning_rate": 2.2504013731979732e-05, + "loss": 0.3665, + "step": 24101 + }, + { + "epoch": 3.22, + "grad_norm": 0.55078125, + "learning_rate": 2.2496654485064705e-05, + "loss": 0.233, + "step": 24102 + }, + { + "epoch": 3.22, + "grad_norm": 0.609375, + "learning_rate": 2.248929628914478e-05, + "loss": 0.4192, + "step": 24103 + }, + { + "epoch": 3.22, + "grad_norm": 0.69140625, + "learning_rate": 2.248193914431973e-05, + "loss": 0.4612, + "step": 24104 + }, + { + "epoch": 3.22, + "grad_norm": 0.67578125, + "learning_rate": 2.247458305068938e-05, + "loss": 0.3806, + "step": 24105 + }, + { + "epoch": 3.22, + "grad_norm": 0.62109375, + "learning_rate": 2.2467228008353436e-05, + "loss": 0.3993, + "step": 24106 + }, + { + "epoch": 3.22, + "grad_norm": 0.53515625, + "learning_rate": 2.245987401741161e-05, + "loss": 0.284, + "step": 24107 + }, + { + "epoch": 3.22, + "grad_norm": 0.73828125, + "learning_rate": 2.245252107796365e-05, + "loss": 0.3167, + "step": 24108 + }, + { + "epoch": 3.22, + "grad_norm": 0.59765625, + "learning_rate": 2.2445169190109283e-05, + "loss": 0.3082, + "step": 24109 + }, + { + "epoch": 3.22, + "grad_norm": 0.640625, + "learning_rate": 2.2437818353948194e-05, + "loss": 0.4884, + "step": 24110 + }, + { + "epoch": 3.22, + "grad_norm": 0.46875, + "learning_rate": 2.24304685695801e-05, + "loss": 0.1947, + "step": 24111 + }, + { + "epoch": 3.22, + "grad_norm": 0.57421875, + "learning_rate": 2.24231198371046e-05, + "loss": 0.2087, + "step": 24112 + }, + { + "epoch": 3.22, + "grad_norm": 0.455078125, + "learning_rate": 2.2415772156621382e-05, + "loss": 0.2423, + "step": 24113 + }, + { + "epoch": 3.22, + "grad_norm": 0.6015625, + "learning_rate": 2.2408425528230094e-05, + "loss": 0.2411, + "step": 24114 + }, + { + "epoch": 3.22, + "grad_norm": 0.427734375, + "learning_rate": 2.240107995203038e-05, + "loss": 0.2302, + "step": 24115 + }, + { + "epoch": 3.22, + "grad_norm": 0.56640625, + "learning_rate": 2.23937354281218e-05, + "loss": 0.2214, + "step": 24116 + }, + { + "epoch": 3.22, + "grad_norm": 0.5703125, + "learning_rate": 2.238639195660397e-05, + "loss": 0.3619, + "step": 24117 + }, + { + "epoch": 3.22, + "grad_norm": 0.51953125, + "learning_rate": 2.23790495375765e-05, + "loss": 0.1357, + "step": 24118 + }, + { + "epoch": 3.22, + "grad_norm": 0.498046875, + "learning_rate": 2.2371708171138917e-05, + "loss": 0.2423, + "step": 24119 + }, + { + "epoch": 3.22, + "grad_norm": 0.5625, + "learning_rate": 2.236436785739081e-05, + "loss": 0.2655, + "step": 24120 + }, + { + "epoch": 3.22, + "grad_norm": 0.69140625, + "learning_rate": 2.2357028596431683e-05, + "loss": 0.4899, + "step": 24121 + }, + { + "epoch": 3.22, + "grad_norm": 0.78125, + "learning_rate": 2.234969038836108e-05, + "loss": 0.366, + "step": 24122 + }, + { + "epoch": 3.22, + "grad_norm": 0.62109375, + "learning_rate": 2.23423532332785e-05, + "loss": 0.3469, + "step": 24123 + }, + { + "epoch": 3.22, + "grad_norm": 0.625, + "learning_rate": 2.2335017131283485e-05, + "loss": 0.3457, + "step": 24124 + }, + { + "epoch": 3.22, + "grad_norm": 0.484375, + "learning_rate": 2.2327682082475453e-05, + "loss": 0.2631, + "step": 24125 + }, + { + "epoch": 3.22, + "grad_norm": 0.55859375, + "learning_rate": 2.2320348086953914e-05, + "loss": 0.2135, + "step": 24126 + }, + { + "epoch": 3.22, + "grad_norm": 0.61328125, + "learning_rate": 2.2313015144818296e-05, + "loss": 0.2362, + "step": 24127 + }, + { + "epoch": 3.22, + "grad_norm": 0.75, + "learning_rate": 2.2305683256168075e-05, + "loss": 0.2915, + "step": 24128 + }, + { + "epoch": 3.22, + "grad_norm": 0.578125, + "learning_rate": 2.229835242110264e-05, + "loss": 0.2278, + "step": 24129 + }, + { + "epoch": 3.22, + "grad_norm": 0.462890625, + "learning_rate": 2.2291022639721438e-05, + "loss": 0.1708, + "step": 24130 + }, + { + "epoch": 3.22, + "grad_norm": 0.53515625, + "learning_rate": 2.2283693912123803e-05, + "loss": 0.2423, + "step": 24131 + }, + { + "epoch": 3.22, + "grad_norm": 0.5546875, + "learning_rate": 2.2276366238409176e-05, + "loss": 0.259, + "step": 24132 + }, + { + "epoch": 3.22, + "grad_norm": 0.462890625, + "learning_rate": 2.2269039618676913e-05, + "loss": 0.1192, + "step": 24133 + }, + { + "epoch": 3.22, + "grad_norm": 0.439453125, + "learning_rate": 2.226171405302635e-05, + "loss": 0.1799, + "step": 24134 + }, + { + "epoch": 3.22, + "grad_norm": 0.60546875, + "learning_rate": 2.2254389541556818e-05, + "loss": 0.2844, + "step": 24135 + }, + { + "epoch": 3.22, + "grad_norm": 0.60546875, + "learning_rate": 2.2247066084367673e-05, + "loss": 0.1827, + "step": 24136 + }, + { + "epoch": 3.22, + "grad_norm": 0.69140625, + "learning_rate": 2.223974368155821e-05, + "loss": 0.3584, + "step": 24137 + }, + { + "epoch": 3.22, + "grad_norm": 0.66015625, + "learning_rate": 2.2232422333227763e-05, + "loss": 0.3497, + "step": 24138 + }, + { + "epoch": 3.22, + "grad_norm": 0.4765625, + "learning_rate": 2.2225102039475554e-05, + "loss": 0.2303, + "step": 24139 + }, + { + "epoch": 3.22, + "grad_norm": 0.51171875, + "learning_rate": 2.2217782800400866e-05, + "loss": 0.347, + "step": 24140 + }, + { + "epoch": 3.22, + "grad_norm": 0.48828125, + "learning_rate": 2.2210464616102965e-05, + "loss": 0.2762, + "step": 24141 + }, + { + "epoch": 3.22, + "grad_norm": 0.72265625, + "learning_rate": 2.220314748668112e-05, + "loss": 0.4096, + "step": 24142 + }, + { + "epoch": 3.22, + "grad_norm": 0.66015625, + "learning_rate": 2.2195831412234523e-05, + "loss": 0.2904, + "step": 24143 + }, + { + "epoch": 3.22, + "grad_norm": 0.72265625, + "learning_rate": 2.2188516392862368e-05, + "loss": 0.3787, + "step": 24144 + }, + { + "epoch": 3.22, + "grad_norm": 0.60546875, + "learning_rate": 2.218120242866386e-05, + "loss": 0.2472, + "step": 24145 + }, + { + "epoch": 3.22, + "grad_norm": 0.70703125, + "learning_rate": 2.2173889519738188e-05, + "loss": 0.4395, + "step": 24146 + }, + { + "epoch": 3.22, + "grad_norm": 0.890625, + "learning_rate": 2.216657766618456e-05, + "loss": 0.4171, + "step": 24147 + }, + { + "epoch": 3.22, + "grad_norm": 0.55859375, + "learning_rate": 2.215926686810206e-05, + "loss": 0.2606, + "step": 24148 + }, + { + "epoch": 3.22, + "grad_norm": 0.625, + "learning_rate": 2.215195712558985e-05, + "loss": 0.3127, + "step": 24149 + }, + { + "epoch": 3.22, + "grad_norm": 0.60546875, + "learning_rate": 2.2144648438747074e-05, + "loss": 0.4377, + "step": 24150 + }, + { + "epoch": 3.22, + "grad_norm": 0.765625, + "learning_rate": 2.2137340807672856e-05, + "loss": 0.4098, + "step": 24151 + }, + { + "epoch": 3.22, + "grad_norm": 0.73828125, + "learning_rate": 2.2130034232466247e-05, + "loss": 0.508, + "step": 24152 + }, + { + "epoch": 3.22, + "grad_norm": 0.54296875, + "learning_rate": 2.2122728713226347e-05, + "loss": 0.1698, + "step": 24153 + }, + { + "epoch": 3.22, + "grad_norm": 0.6484375, + "learning_rate": 2.211542425005223e-05, + "loss": 0.2468, + "step": 24154 + }, + { + "epoch": 3.22, + "grad_norm": 0.59765625, + "learning_rate": 2.210812084304297e-05, + "loss": 0.3437, + "step": 24155 + }, + { + "epoch": 3.22, + "grad_norm": 0.6328125, + "learning_rate": 2.210081849229758e-05, + "loss": 0.3839, + "step": 24156 + }, + { + "epoch": 3.22, + "grad_norm": 0.60546875, + "learning_rate": 2.209351719791506e-05, + "loss": 0.444, + "step": 24157 + }, + { + "epoch": 3.22, + "grad_norm": 0.490234375, + "learning_rate": 2.2086216959994456e-05, + "loss": 0.1065, + "step": 24158 + }, + { + "epoch": 3.22, + "grad_norm": 0.875, + "learning_rate": 2.2078917778634745e-05, + "loss": 0.2399, + "step": 24159 + }, + { + "epoch": 3.22, + "grad_norm": 0.60546875, + "learning_rate": 2.2071619653934917e-05, + "loss": 0.5994, + "step": 24160 + }, + { + "epoch": 3.22, + "grad_norm": 0.69140625, + "learning_rate": 2.206432258599397e-05, + "loss": 0.4296, + "step": 24161 + }, + { + "epoch": 3.22, + "grad_norm": 0.62109375, + "learning_rate": 2.20570265749108e-05, + "loss": 0.2829, + "step": 24162 + }, + { + "epoch": 3.22, + "grad_norm": 0.625, + "learning_rate": 2.2049731620784374e-05, + "loss": 0.1772, + "step": 24163 + }, + { + "epoch": 3.22, + "grad_norm": 0.3984375, + "learning_rate": 2.2042437723713616e-05, + "loss": 0.1523, + "step": 24164 + }, + { + "epoch": 3.22, + "grad_norm": 0.55859375, + "learning_rate": 2.2035144883797466e-05, + "loss": 0.2999, + "step": 24165 + }, + { + "epoch": 3.22, + "grad_norm": 0.62109375, + "learning_rate": 2.2027853101134756e-05, + "loss": 0.3013, + "step": 24166 + }, + { + "epoch": 3.22, + "grad_norm": 0.671875, + "learning_rate": 2.2020562375824426e-05, + "loss": 0.3688, + "step": 24167 + }, + { + "epoch": 3.22, + "grad_norm": 0.58203125, + "learning_rate": 2.2013272707965294e-05, + "loss": 0.3197, + "step": 24168 + }, + { + "epoch": 3.23, + "grad_norm": 0.486328125, + "learning_rate": 2.200598409765624e-05, + "loss": 0.2403, + "step": 24169 + }, + { + "epoch": 3.23, + "grad_norm": 0.66015625, + "learning_rate": 2.1998696544996122e-05, + "loss": 0.249, + "step": 24170 + }, + { + "epoch": 3.23, + "grad_norm": 0.703125, + "learning_rate": 2.1991410050083705e-05, + "loss": 0.396, + "step": 24171 + }, + { + "epoch": 3.23, + "grad_norm": 0.46484375, + "learning_rate": 2.1984124613017843e-05, + "loss": 0.1439, + "step": 24172 + }, + { + "epoch": 3.23, + "grad_norm": 0.69921875, + "learning_rate": 2.1976840233897312e-05, + "loss": 0.2433, + "step": 24173 + }, + { + "epoch": 3.23, + "grad_norm": 0.57421875, + "learning_rate": 2.1969556912820943e-05, + "loss": 0.2044, + "step": 24174 + }, + { + "epoch": 3.23, + "grad_norm": 0.5546875, + "learning_rate": 2.1962274649887425e-05, + "loss": 0.4216, + "step": 24175 + }, + { + "epoch": 3.23, + "grad_norm": 0.56640625, + "learning_rate": 2.195499344519555e-05, + "loss": 0.255, + "step": 24176 + }, + { + "epoch": 3.23, + "grad_norm": 0.3984375, + "learning_rate": 2.194771329884405e-05, + "loss": 0.1498, + "step": 24177 + }, + { + "epoch": 3.23, + "grad_norm": 0.59375, + "learning_rate": 2.1940434210931682e-05, + "loss": 0.2389, + "step": 24178 + }, + { + "epoch": 3.23, + "grad_norm": 0.56640625, + "learning_rate": 2.1933156181557123e-05, + "loss": 0.4356, + "step": 24179 + }, + { + "epoch": 3.23, + "grad_norm": 0.58984375, + "learning_rate": 2.1925879210819035e-05, + "loss": 0.1873, + "step": 24180 + }, + { + "epoch": 3.23, + "grad_norm": 0.71484375, + "learning_rate": 2.1918603298816143e-05, + "loss": 0.3229, + "step": 24181 + }, + { + "epoch": 3.23, + "grad_norm": 0.703125, + "learning_rate": 2.1911328445647095e-05, + "loss": 0.2577, + "step": 24182 + }, + { + "epoch": 3.23, + "grad_norm": 0.51171875, + "learning_rate": 2.1904054651410556e-05, + "loss": 0.4218, + "step": 24183 + }, + { + "epoch": 3.23, + "grad_norm": 0.66015625, + "learning_rate": 2.189678191620518e-05, + "loss": 0.2712, + "step": 24184 + }, + { + "epoch": 3.23, + "grad_norm": 0.56640625, + "learning_rate": 2.188951024012954e-05, + "loss": 0.5118, + "step": 24185 + }, + { + "epoch": 3.23, + "grad_norm": 0.62109375, + "learning_rate": 2.1882239623282274e-05, + "loss": 0.3443, + "step": 24186 + }, + { + "epoch": 3.23, + "grad_norm": 0.6171875, + "learning_rate": 2.187497006576198e-05, + "loss": 0.2802, + "step": 24187 + }, + { + "epoch": 3.23, + "grad_norm": 0.59375, + "learning_rate": 2.1867701567667255e-05, + "loss": 0.3846, + "step": 24188 + }, + { + "epoch": 3.23, + "grad_norm": 0.5625, + "learning_rate": 2.1860434129096618e-05, + "loss": 0.2429, + "step": 24189 + }, + { + "epoch": 3.23, + "grad_norm": 0.58203125, + "learning_rate": 2.185316775014864e-05, + "loss": 0.4347, + "step": 24190 + }, + { + "epoch": 3.23, + "grad_norm": 0.46484375, + "learning_rate": 2.1845902430921882e-05, + "loss": 0.2172, + "step": 24191 + }, + { + "epoch": 3.23, + "grad_norm": 0.83984375, + "learning_rate": 2.1838638171514826e-05, + "loss": 0.4539, + "step": 24192 + }, + { + "epoch": 3.23, + "grad_norm": 0.76171875, + "learning_rate": 2.183137497202603e-05, + "loss": 0.4169, + "step": 24193 + }, + { + "epoch": 3.23, + "grad_norm": 0.71484375, + "learning_rate": 2.182411283255392e-05, + "loss": 0.3918, + "step": 24194 + }, + { + "epoch": 3.23, + "grad_norm": 0.6171875, + "learning_rate": 2.181685175319702e-05, + "loss": 0.321, + "step": 24195 + }, + { + "epoch": 3.23, + "grad_norm": 0.765625, + "learning_rate": 2.1809591734053793e-05, + "loss": 0.3689, + "step": 24196 + }, + { + "epoch": 3.23, + "grad_norm": 0.5, + "learning_rate": 2.180233277522271e-05, + "loss": 0.1784, + "step": 24197 + }, + { + "epoch": 3.23, + "grad_norm": 0.765625, + "learning_rate": 2.1795074876802147e-05, + "loss": 0.5116, + "step": 24198 + }, + { + "epoch": 3.23, + "grad_norm": 0.61328125, + "learning_rate": 2.1787818038890563e-05, + "loss": 0.2218, + "step": 24199 + }, + { + "epoch": 3.23, + "grad_norm": 0.703125, + "learning_rate": 2.178056226158637e-05, + "loss": 0.2034, + "step": 24200 + }, + { + "epoch": 3.23, + "grad_norm": 0.625, + "learning_rate": 2.1773307544987974e-05, + "loss": 0.4044, + "step": 24201 + }, + { + "epoch": 3.23, + "grad_norm": 0.5546875, + "learning_rate": 2.1766053889193704e-05, + "loss": 0.2808, + "step": 24202 + }, + { + "epoch": 3.23, + "grad_norm": 0.466796875, + "learning_rate": 2.175880129430198e-05, + "loss": 0.2341, + "step": 24203 + }, + { + "epoch": 3.23, + "grad_norm": 0.51953125, + "learning_rate": 2.1751549760411107e-05, + "loss": 0.1869, + "step": 24204 + }, + { + "epoch": 3.23, + "grad_norm": 0.490234375, + "learning_rate": 2.174429928761943e-05, + "loss": 0.2206, + "step": 24205 + }, + { + "epoch": 3.23, + "grad_norm": 0.55078125, + "learning_rate": 2.1737049876025306e-05, + "loss": 0.2223, + "step": 24206 + }, + { + "epoch": 3.23, + "grad_norm": 0.65234375, + "learning_rate": 2.172980152572699e-05, + "loss": 0.2954, + "step": 24207 + }, + { + "epoch": 3.23, + "grad_norm": 0.58203125, + "learning_rate": 2.17225542368228e-05, + "loss": 0.1935, + "step": 24208 + }, + { + "epoch": 3.23, + "grad_norm": 0.41796875, + "learning_rate": 2.1715308009411018e-05, + "loss": 0.1338, + "step": 24209 + }, + { + "epoch": 3.23, + "grad_norm": 0.73046875, + "learning_rate": 2.1708062843589893e-05, + "loss": 0.4099, + "step": 24210 + }, + { + "epoch": 3.23, + "grad_norm": 0.6171875, + "learning_rate": 2.1700818739457716e-05, + "loss": 0.3937, + "step": 24211 + }, + { + "epoch": 3.23, + "grad_norm": 0.546875, + "learning_rate": 2.1693575697112656e-05, + "loss": 0.1678, + "step": 24212 + }, + { + "epoch": 3.23, + "grad_norm": 0.7109375, + "learning_rate": 2.1686333716652974e-05, + "loss": 0.3346, + "step": 24213 + }, + { + "epoch": 3.23, + "grad_norm": 0.77734375, + "learning_rate": 2.1679092798176858e-05, + "loss": 0.4918, + "step": 24214 + }, + { + "epoch": 3.23, + "grad_norm": 0.640625, + "learning_rate": 2.1671852941782544e-05, + "loss": 0.3364, + "step": 24215 + }, + { + "epoch": 3.23, + "grad_norm": 0.56640625, + "learning_rate": 2.1664614147568173e-05, + "loss": 0.2627, + "step": 24216 + }, + { + "epoch": 3.23, + "grad_norm": 0.6171875, + "learning_rate": 2.165737641563188e-05, + "loss": 0.2744, + "step": 24217 + }, + { + "epoch": 3.23, + "grad_norm": 0.546875, + "learning_rate": 2.1650139746071853e-05, + "loss": 0.1644, + "step": 24218 + }, + { + "epoch": 3.23, + "grad_norm": 0.52734375, + "learning_rate": 2.1642904138986208e-05, + "loss": 0.2427, + "step": 24219 + }, + { + "epoch": 3.23, + "grad_norm": 0.59375, + "learning_rate": 2.1635669594473095e-05, + "loss": 0.255, + "step": 24220 + }, + { + "epoch": 3.23, + "grad_norm": 0.53125, + "learning_rate": 2.162843611263058e-05, + "loss": 0.3573, + "step": 24221 + }, + { + "epoch": 3.23, + "grad_norm": 0.5703125, + "learning_rate": 2.1621203693556758e-05, + "loss": 0.1845, + "step": 24222 + }, + { + "epoch": 3.23, + "grad_norm": 0.7578125, + "learning_rate": 2.161397233734972e-05, + "loss": 0.5831, + "step": 24223 + }, + { + "epoch": 3.23, + "grad_norm": 0.482421875, + "learning_rate": 2.1606742044107553e-05, + "loss": 0.2099, + "step": 24224 + }, + { + "epoch": 3.23, + "grad_norm": 0.53515625, + "learning_rate": 2.159951281392826e-05, + "loss": 0.3135, + "step": 24225 + }, + { + "epoch": 3.23, + "grad_norm": 0.66796875, + "learning_rate": 2.1592284646909876e-05, + "loss": 0.1387, + "step": 24226 + }, + { + "epoch": 3.23, + "grad_norm": 0.85546875, + "learning_rate": 2.1585057543150435e-05, + "loss": 0.4418, + "step": 24227 + }, + { + "epoch": 3.23, + "grad_norm": 0.470703125, + "learning_rate": 2.157783150274797e-05, + "loss": 0.2931, + "step": 24228 + }, + { + "epoch": 3.23, + "grad_norm": 0.51171875, + "learning_rate": 2.157060652580045e-05, + "loss": 0.1776, + "step": 24229 + }, + { + "epoch": 3.23, + "grad_norm": 0.59375, + "learning_rate": 2.1563382612405802e-05, + "loss": 0.4087, + "step": 24230 + }, + { + "epoch": 3.23, + "grad_norm": 0.6171875, + "learning_rate": 2.155615976266203e-05, + "loss": 0.5293, + "step": 24231 + }, + { + "epoch": 3.23, + "grad_norm": 0.65234375, + "learning_rate": 2.154893797666707e-05, + "loss": 0.3678, + "step": 24232 + }, + { + "epoch": 3.23, + "grad_norm": 0.62890625, + "learning_rate": 2.154171725451888e-05, + "loss": 0.5079, + "step": 24233 + }, + { + "epoch": 3.23, + "grad_norm": 0.57421875, + "learning_rate": 2.1534497596315363e-05, + "loss": 0.2328, + "step": 24234 + }, + { + "epoch": 3.23, + "grad_norm": 0.765625, + "learning_rate": 2.15272790021544e-05, + "loss": 0.8829, + "step": 24235 + }, + { + "epoch": 3.23, + "grad_norm": 0.546875, + "learning_rate": 2.1520061472133902e-05, + "loss": 0.2746, + "step": 24236 + }, + { + "epoch": 3.23, + "grad_norm": 0.55078125, + "learning_rate": 2.1512845006351736e-05, + "loss": 0.3615, + "step": 24237 + }, + { + "epoch": 3.23, + "grad_norm": 0.54296875, + "learning_rate": 2.1505629604905786e-05, + "loss": 0.2736, + "step": 24238 + }, + { + "epoch": 3.23, + "grad_norm": 0.5703125, + "learning_rate": 2.1498415267893846e-05, + "loss": 0.2418, + "step": 24239 + }, + { + "epoch": 3.23, + "grad_norm": 0.71875, + "learning_rate": 2.1491201995413812e-05, + "loss": 0.2185, + "step": 24240 + }, + { + "epoch": 3.23, + "grad_norm": 0.62109375, + "learning_rate": 2.148398978756344e-05, + "loss": 0.3206, + "step": 24241 + }, + { + "epoch": 3.23, + "grad_norm": 0.453125, + "learning_rate": 2.1476778644440553e-05, + "loss": 0.2631, + "step": 24242 + }, + { + "epoch": 3.23, + "grad_norm": 0.6640625, + "learning_rate": 2.1469568566142972e-05, + "loss": 0.3725, + "step": 24243 + }, + { + "epoch": 3.24, + "grad_norm": 0.640625, + "learning_rate": 2.1462359552768417e-05, + "loss": 0.3343, + "step": 24244 + }, + { + "epoch": 3.24, + "grad_norm": 0.68359375, + "learning_rate": 2.145515160441467e-05, + "loss": 0.3969, + "step": 24245 + }, + { + "epoch": 3.24, + "grad_norm": 0.7421875, + "learning_rate": 2.144794472117947e-05, + "loss": 0.3381, + "step": 24246 + }, + { + "epoch": 3.24, + "grad_norm": 0.65234375, + "learning_rate": 2.1440738903160583e-05, + "loss": 0.5585, + "step": 24247 + }, + { + "epoch": 3.24, + "grad_norm": 0.55859375, + "learning_rate": 2.143353415045568e-05, + "loss": 0.243, + "step": 24248 + }, + { + "epoch": 3.24, + "grad_norm": 0.453125, + "learning_rate": 2.1426330463162468e-05, + "loss": 0.284, + "step": 24249 + }, + { + "epoch": 3.24, + "grad_norm": 0.6796875, + "learning_rate": 2.141912784137864e-05, + "loss": 0.1677, + "step": 24250 + }, + { + "epoch": 3.24, + "grad_norm": 0.6875, + "learning_rate": 2.141192628520191e-05, + "loss": 0.2849, + "step": 24251 + }, + { + "epoch": 3.24, + "grad_norm": 0.5625, + "learning_rate": 2.1404725794729886e-05, + "loss": 0.3136, + "step": 24252 + }, + { + "epoch": 3.24, + "grad_norm": 0.455078125, + "learning_rate": 2.1397526370060205e-05, + "loss": 0.2516, + "step": 24253 + }, + { + "epoch": 3.24, + "grad_norm": 0.72265625, + "learning_rate": 2.1390328011290506e-05, + "loss": 0.2233, + "step": 24254 + }, + { + "epoch": 3.24, + "grad_norm": 0.65234375, + "learning_rate": 2.1383130718518405e-05, + "loss": 0.4152, + "step": 24255 + }, + { + "epoch": 3.24, + "grad_norm": 0.625, + "learning_rate": 2.1375934491841522e-05, + "loss": 0.3241, + "step": 24256 + }, + { + "epoch": 3.24, + "grad_norm": 0.515625, + "learning_rate": 2.1368739331357447e-05, + "loss": 0.2015, + "step": 24257 + }, + { + "epoch": 3.24, + "grad_norm": 0.578125, + "learning_rate": 2.1361545237163704e-05, + "loss": 0.2609, + "step": 24258 + }, + { + "epoch": 3.24, + "grad_norm": 0.59375, + "learning_rate": 2.1354352209357876e-05, + "loss": 0.1836, + "step": 24259 + }, + { + "epoch": 3.24, + "grad_norm": 0.55078125, + "learning_rate": 2.1347160248037514e-05, + "loss": 0.3027, + "step": 24260 + }, + { + "epoch": 3.24, + "grad_norm": 0.50390625, + "learning_rate": 2.133996935330016e-05, + "loss": 0.1933, + "step": 24261 + }, + { + "epoch": 3.24, + "grad_norm": 0.48046875, + "learning_rate": 2.1332779525243273e-05, + "loss": 0.2962, + "step": 24262 + }, + { + "epoch": 3.24, + "grad_norm": 0.625, + "learning_rate": 2.13255907639644e-05, + "loss": 0.2916, + "step": 24263 + }, + { + "epoch": 3.24, + "grad_norm": 0.59765625, + "learning_rate": 2.1318403069561023e-05, + "loss": 0.4777, + "step": 24264 + }, + { + "epoch": 3.24, + "grad_norm": 0.703125, + "learning_rate": 2.1311216442130565e-05, + "loss": 0.5387, + "step": 24265 + }, + { + "epoch": 3.24, + "grad_norm": 0.67578125, + "learning_rate": 2.1304030881770555e-05, + "loss": 0.379, + "step": 24266 + }, + { + "epoch": 3.24, + "grad_norm": 0.3984375, + "learning_rate": 2.1296846388578362e-05, + "loss": 0.1122, + "step": 24267 + }, + { + "epoch": 3.24, + "grad_norm": 0.5390625, + "learning_rate": 2.1289662962651437e-05, + "loss": 0.2141, + "step": 24268 + }, + { + "epoch": 3.24, + "grad_norm": 0.52734375, + "learning_rate": 2.1282480604087208e-05, + "loss": 0.2821, + "step": 24269 + }, + { + "epoch": 3.24, + "grad_norm": 0.6875, + "learning_rate": 2.127529931298309e-05, + "loss": 0.2836, + "step": 24270 + }, + { + "epoch": 3.24, + "grad_norm": 0.75390625, + "learning_rate": 2.12681190894364e-05, + "loss": 0.2986, + "step": 24271 + }, + { + "epoch": 3.24, + "grad_norm": 0.796875, + "learning_rate": 2.1260939933544554e-05, + "loss": 0.4115, + "step": 24272 + }, + { + "epoch": 3.24, + "grad_norm": 0.55078125, + "learning_rate": 2.1253761845404895e-05, + "loss": 0.3342, + "step": 24273 + }, + { + "epoch": 3.24, + "grad_norm": 0.60546875, + "learning_rate": 2.124658482511479e-05, + "loss": 0.1846, + "step": 24274 + }, + { + "epoch": 3.24, + "grad_norm": 0.5703125, + "learning_rate": 2.1239408872771504e-05, + "loss": 0.1339, + "step": 24275 + }, + { + "epoch": 3.24, + "grad_norm": 0.61328125, + "learning_rate": 2.1232233988472416e-05, + "loss": 0.5278, + "step": 24276 + }, + { + "epoch": 3.24, + "grad_norm": 0.88671875, + "learning_rate": 2.122506017231477e-05, + "loss": 0.4802, + "step": 24277 + }, + { + "epoch": 3.24, + "grad_norm": 0.578125, + "learning_rate": 2.1217887424395853e-05, + "loss": 0.2683, + "step": 24278 + }, + { + "epoch": 3.24, + "grad_norm": 0.65234375, + "learning_rate": 2.1210715744812982e-05, + "loss": 0.3176, + "step": 24279 + }, + { + "epoch": 3.24, + "grad_norm": 0.5859375, + "learning_rate": 2.1203545133663338e-05, + "loss": 0.1596, + "step": 24280 + }, + { + "epoch": 3.24, + "grad_norm": 0.5625, + "learning_rate": 2.1196375591044192e-05, + "loss": 0.4387, + "step": 24281 + }, + { + "epoch": 3.24, + "grad_norm": 0.609375, + "learning_rate": 2.1189207117052767e-05, + "loss": 0.2119, + "step": 24282 + }, + { + "epoch": 3.24, + "grad_norm": 0.51171875, + "learning_rate": 2.1182039711786283e-05, + "loss": 0.1965, + "step": 24283 + }, + { + "epoch": 3.24, + "grad_norm": 0.6640625, + "learning_rate": 2.117487337534194e-05, + "loss": 0.2117, + "step": 24284 + }, + { + "epoch": 3.24, + "grad_norm": 0.61328125, + "learning_rate": 2.1167708107816885e-05, + "loss": 0.4611, + "step": 24285 + }, + { + "epoch": 3.24, + "grad_norm": 0.5078125, + "learning_rate": 2.1160543909308306e-05, + "loss": 0.2361, + "step": 24286 + }, + { + "epoch": 3.24, + "grad_norm": 0.6328125, + "learning_rate": 2.1153380779913347e-05, + "loss": 0.3245, + "step": 24287 + }, + { + "epoch": 3.24, + "grad_norm": 0.53125, + "learning_rate": 2.114621871972917e-05, + "loss": 0.1914, + "step": 24288 + }, + { + "epoch": 3.24, + "grad_norm": 0.63671875, + "learning_rate": 2.1139057728852886e-05, + "loss": 0.3196, + "step": 24289 + }, + { + "epoch": 3.24, + "grad_norm": 0.42578125, + "learning_rate": 2.113189780738155e-05, + "loss": 0.1505, + "step": 24290 + }, + { + "epoch": 3.24, + "grad_norm": 0.474609375, + "learning_rate": 2.1124738955412305e-05, + "loss": 0.3178, + "step": 24291 + }, + { + "epoch": 3.24, + "grad_norm": 0.62109375, + "learning_rate": 2.111758117304222e-05, + "loss": 0.5264, + "step": 24292 + }, + { + "epoch": 3.24, + "grad_norm": 0.60546875, + "learning_rate": 2.1110424460368393e-05, + "loss": 0.2117, + "step": 24293 + }, + { + "epoch": 3.24, + "grad_norm": 0.5546875, + "learning_rate": 2.110326881748781e-05, + "loss": 0.4201, + "step": 24294 + }, + { + "epoch": 3.24, + "grad_norm": 0.57421875, + "learning_rate": 2.1096114244497532e-05, + "loss": 0.2939, + "step": 24295 + }, + { + "epoch": 3.24, + "grad_norm": 0.443359375, + "learning_rate": 2.108896074149459e-05, + "loss": 0.136, + "step": 24296 + }, + { + "epoch": 3.24, + "grad_norm": 0.63671875, + "learning_rate": 2.108180830857601e-05, + "loss": 0.4376, + "step": 24297 + }, + { + "epoch": 3.24, + "grad_norm": 0.52734375, + "learning_rate": 2.1074656945838744e-05, + "loss": 0.2638, + "step": 24298 + }, + { + "epoch": 3.24, + "grad_norm": 0.515625, + "learning_rate": 2.1067506653379776e-05, + "loss": 0.468, + "step": 24299 + }, + { + "epoch": 3.24, + "grad_norm": 0.57421875, + "learning_rate": 2.1060357431296097e-05, + "loss": 0.5167, + "step": 24300 + }, + { + "epoch": 3.24, + "grad_norm": 0.59765625, + "learning_rate": 2.1053209279684616e-05, + "loss": 0.2738, + "step": 24301 + }, + { + "epoch": 3.24, + "grad_norm": 0.65625, + "learning_rate": 2.1046062198642302e-05, + "loss": 0.443, + "step": 24302 + }, + { + "epoch": 3.24, + "grad_norm": 0.83984375, + "learning_rate": 2.1038916188266032e-05, + "loss": 0.6701, + "step": 24303 + }, + { + "epoch": 3.24, + "grad_norm": 0.6640625, + "learning_rate": 2.1031771248652744e-05, + "loss": 0.4157, + "step": 24304 + }, + { + "epoch": 3.24, + "grad_norm": 0.5703125, + "learning_rate": 2.1024627379899307e-05, + "loss": 0.5086, + "step": 24305 + }, + { + "epoch": 3.24, + "grad_norm": 0.671875, + "learning_rate": 2.101748458210262e-05, + "loss": 0.3626, + "step": 24306 + }, + { + "epoch": 3.24, + "grad_norm": 0.5703125, + "learning_rate": 2.1010342855359555e-05, + "loss": 0.225, + "step": 24307 + }, + { + "epoch": 3.24, + "grad_norm": 0.5, + "learning_rate": 2.1003202199766915e-05, + "loss": 0.1645, + "step": 24308 + }, + { + "epoch": 3.24, + "grad_norm": 0.578125, + "learning_rate": 2.0996062615421542e-05, + "loss": 0.3893, + "step": 24309 + }, + { + "epoch": 3.24, + "grad_norm": 0.5703125, + "learning_rate": 2.098892410242027e-05, + "loss": 0.302, + "step": 24310 + }, + { + "epoch": 3.24, + "grad_norm": 0.6640625, + "learning_rate": 2.0981786660859925e-05, + "loss": 0.3395, + "step": 24311 + }, + { + "epoch": 3.24, + "grad_norm": 0.51953125, + "learning_rate": 2.097465029083724e-05, + "loss": 0.3624, + "step": 24312 + }, + { + "epoch": 3.24, + "grad_norm": 0.6640625, + "learning_rate": 2.0967514992449043e-05, + "loss": 0.3537, + "step": 24313 + }, + { + "epoch": 3.24, + "grad_norm": 0.62109375, + "learning_rate": 2.096038076579203e-05, + "loss": 0.7013, + "step": 24314 + }, + { + "epoch": 3.24, + "grad_norm": 0.65625, + "learning_rate": 2.095324761096299e-05, + "loss": 0.2703, + "step": 24315 + }, + { + "epoch": 3.24, + "grad_norm": 0.625, + "learning_rate": 2.0946115528058663e-05, + "loss": 0.2614, + "step": 24316 + }, + { + "epoch": 3.24, + "grad_norm": 0.69921875, + "learning_rate": 2.093898451717573e-05, + "loss": 0.2009, + "step": 24317 + }, + { + "epoch": 3.24, + "grad_norm": 0.6796875, + "learning_rate": 2.0931854578410905e-05, + "loss": 0.4364, + "step": 24318 + }, + { + "epoch": 3.25, + "grad_norm": 0.55859375, + "learning_rate": 2.092472571186087e-05, + "loss": 0.2321, + "step": 24319 + }, + { + "epoch": 3.25, + "grad_norm": 0.498046875, + "learning_rate": 2.0917597917622335e-05, + "loss": 0.2507, + "step": 24320 + }, + { + "epoch": 3.25, + "grad_norm": 0.5859375, + "learning_rate": 2.0910471195791903e-05, + "loss": 0.2444, + "step": 24321 + }, + { + "epoch": 3.25, + "grad_norm": 0.859375, + "learning_rate": 2.0903345546466237e-05, + "loss": 0.3546, + "step": 24322 + }, + { + "epoch": 3.25, + "grad_norm": 0.65234375, + "learning_rate": 2.0896220969741963e-05, + "loss": 0.2077, + "step": 24323 + }, + { + "epoch": 3.25, + "grad_norm": 0.6640625, + "learning_rate": 2.0889097465715734e-05, + "loss": 0.4712, + "step": 24324 + }, + { + "epoch": 3.25, + "grad_norm": 0.60546875, + "learning_rate": 2.0881975034484123e-05, + "loss": 0.1768, + "step": 24325 + }, + { + "epoch": 3.25, + "grad_norm": 0.75, + "learning_rate": 2.087485367614368e-05, + "loss": 0.4531, + "step": 24326 + }, + { + "epoch": 3.25, + "grad_norm": 0.67578125, + "learning_rate": 2.0867733390791e-05, + "loss": 0.4937, + "step": 24327 + }, + { + "epoch": 3.25, + "grad_norm": 0.53125, + "learning_rate": 2.0860614178522642e-05, + "loss": 0.45, + "step": 24328 + }, + { + "epoch": 3.25, + "grad_norm": 0.67578125, + "learning_rate": 2.085349603943514e-05, + "loss": 0.6094, + "step": 24329 + }, + { + "epoch": 3.25, + "grad_norm": 0.703125, + "learning_rate": 2.0846378973625068e-05, + "loss": 0.3837, + "step": 24330 + }, + { + "epoch": 3.25, + "grad_norm": 0.62890625, + "learning_rate": 2.0839262981188856e-05, + "loss": 0.3064, + "step": 24331 + }, + { + "epoch": 3.25, + "grad_norm": 0.6015625, + "learning_rate": 2.0832148062223055e-05, + "loss": 0.3908, + "step": 24332 + }, + { + "epoch": 3.25, + "grad_norm": 0.79296875, + "learning_rate": 2.0825034216824135e-05, + "loss": 0.2085, + "step": 24333 + }, + { + "epoch": 3.25, + "grad_norm": 0.64453125, + "learning_rate": 2.0817921445088584e-05, + "loss": 0.2442, + "step": 24334 + }, + { + "epoch": 3.25, + "grad_norm": 0.859375, + "learning_rate": 2.0810809747112813e-05, + "loss": 0.2894, + "step": 24335 + }, + { + "epoch": 3.25, + "grad_norm": 0.45703125, + "learning_rate": 2.0803699122993293e-05, + "loss": 0.1478, + "step": 24336 + }, + { + "epoch": 3.25, + "grad_norm": 0.62109375, + "learning_rate": 2.0796589572826464e-05, + "loss": 0.5796, + "step": 24337 + }, + { + "epoch": 3.25, + "grad_norm": 0.6015625, + "learning_rate": 2.0789481096708675e-05, + "loss": 0.3945, + "step": 24338 + }, + { + "epoch": 3.25, + "grad_norm": 0.66796875, + "learning_rate": 2.0782373694736402e-05, + "loss": 0.475, + "step": 24339 + }, + { + "epoch": 3.25, + "grad_norm": 0.640625, + "learning_rate": 2.0775267367005946e-05, + "loss": 0.265, + "step": 24340 + }, + { + "epoch": 3.25, + "grad_norm": 0.8828125, + "learning_rate": 2.076816211361371e-05, + "loss": 0.6132, + "step": 24341 + }, + { + "epoch": 3.25, + "grad_norm": 0.7265625, + "learning_rate": 2.076105793465605e-05, + "loss": 0.355, + "step": 24342 + }, + { + "epoch": 3.25, + "grad_norm": 0.5078125, + "learning_rate": 2.0753954830229327e-05, + "loss": 0.1197, + "step": 24343 + }, + { + "epoch": 3.25, + "grad_norm": 0.5078125, + "learning_rate": 2.0746852800429806e-05, + "loss": 0.2396, + "step": 24344 + }, + { + "epoch": 3.25, + "grad_norm": 0.58203125, + "learning_rate": 2.073975184535384e-05, + "loss": 0.2562, + "step": 24345 + }, + { + "epoch": 3.25, + "grad_norm": 0.51171875, + "learning_rate": 2.0732651965097693e-05, + "loss": 0.3053, + "step": 24346 + }, + { + "epoch": 3.25, + "grad_norm": 0.48046875, + "learning_rate": 2.0725553159757683e-05, + "loss": 0.3245, + "step": 24347 + }, + { + "epoch": 3.25, + "grad_norm": 0.458984375, + "learning_rate": 2.071845542943003e-05, + "loss": 0.369, + "step": 24348 + }, + { + "epoch": 3.25, + "grad_norm": 0.76171875, + "learning_rate": 2.0711358774211033e-05, + "loss": 0.4299, + "step": 24349 + }, + { + "epoch": 3.25, + "grad_norm": 0.84765625, + "learning_rate": 2.0704263194196872e-05, + "loss": 0.2301, + "step": 24350 + }, + { + "epoch": 3.25, + "grad_norm": 0.65234375, + "learning_rate": 2.069716868948378e-05, + "loss": 0.3877, + "step": 24351 + }, + { + "epoch": 3.25, + "grad_norm": 0.7890625, + "learning_rate": 2.0690075260168018e-05, + "loss": 0.3403, + "step": 24352 + }, + { + "epoch": 3.25, + "grad_norm": 0.8671875, + "learning_rate": 2.0682982906345703e-05, + "loss": 0.5984, + "step": 24353 + }, + { + "epoch": 3.25, + "grad_norm": 0.578125, + "learning_rate": 2.0675891628113052e-05, + "loss": 0.3476, + "step": 24354 + }, + { + "epoch": 3.25, + "grad_norm": 0.49609375, + "learning_rate": 2.066880142556621e-05, + "loss": 0.1143, + "step": 24355 + }, + { + "epoch": 3.25, + "grad_norm": 0.67578125, + "learning_rate": 2.066171229880134e-05, + "loss": 0.368, + "step": 24356 + }, + { + "epoch": 3.25, + "grad_norm": 0.62109375, + "learning_rate": 2.06546242479146e-05, + "loss": 0.2391, + "step": 24357 + }, + { + "epoch": 3.25, + "grad_norm": 0.84765625, + "learning_rate": 2.064753727300205e-05, + "loss": 0.37, + "step": 24358 + }, + { + "epoch": 3.25, + "grad_norm": 0.423828125, + "learning_rate": 2.064045137415982e-05, + "loss": 0.0895, + "step": 24359 + }, + { + "epoch": 3.25, + "grad_norm": 0.515625, + "learning_rate": 2.0633366551484e-05, + "loss": 0.1752, + "step": 24360 + }, + { + "epoch": 3.25, + "grad_norm": 0.40234375, + "learning_rate": 2.0626282805070708e-05, + "loss": 0.133, + "step": 24361 + }, + { + "epoch": 3.25, + "grad_norm": 0.70703125, + "learning_rate": 2.0619200135015948e-05, + "loss": 0.2814, + "step": 24362 + }, + { + "epoch": 3.25, + "grad_norm": 0.5, + "learning_rate": 2.061211854141576e-05, + "loss": 0.1549, + "step": 24363 + }, + { + "epoch": 3.25, + "grad_norm": 0.69921875, + "learning_rate": 2.0605038024366184e-05, + "loss": 0.4917, + "step": 24364 + }, + { + "epoch": 3.25, + "grad_norm": 0.484375, + "learning_rate": 2.059795858396326e-05, + "loss": 0.1898, + "step": 24365 + }, + { + "epoch": 3.25, + "grad_norm": 0.70703125, + "learning_rate": 2.0590880220303e-05, + "loss": 0.1986, + "step": 24366 + }, + { + "epoch": 3.25, + "grad_norm": 0.423828125, + "learning_rate": 2.0583802933481343e-05, + "loss": 0.1534, + "step": 24367 + }, + { + "epoch": 3.25, + "grad_norm": 0.65625, + "learning_rate": 2.0576726723594297e-05, + "loss": 0.4056, + "step": 24368 + }, + { + "epoch": 3.25, + "grad_norm": 0.5703125, + "learning_rate": 2.0569651590737792e-05, + "loss": 0.454, + "step": 24369 + }, + { + "epoch": 3.25, + "grad_norm": 0.66015625, + "learning_rate": 2.056257753500783e-05, + "loss": 0.1506, + "step": 24370 + }, + { + "epoch": 3.25, + "grad_norm": 0.8359375, + "learning_rate": 2.055550455650026e-05, + "loss": 0.3334, + "step": 24371 + }, + { + "epoch": 3.25, + "grad_norm": 0.51171875, + "learning_rate": 2.0548432655311047e-05, + "loss": 0.0939, + "step": 24372 + }, + { + "epoch": 3.25, + "grad_norm": 0.5, + "learning_rate": 2.05413618315361e-05, + "loss": 0.2025, + "step": 24373 + }, + { + "epoch": 3.25, + "grad_norm": 0.703125, + "learning_rate": 2.0534292085271256e-05, + "loss": 0.4795, + "step": 24374 + }, + { + "epoch": 3.25, + "grad_norm": 0.443359375, + "learning_rate": 2.052722341661244e-05, + "loss": 0.1289, + "step": 24375 + }, + { + "epoch": 3.25, + "grad_norm": 0.62109375, + "learning_rate": 2.0520155825655453e-05, + "loss": 0.2618, + "step": 24376 + }, + { + "epoch": 3.25, + "grad_norm": 0.52734375, + "learning_rate": 2.051308931249617e-05, + "loss": 0.3554, + "step": 24377 + }, + { + "epoch": 3.25, + "grad_norm": 0.54296875, + "learning_rate": 2.0506023877230396e-05, + "loss": 0.2713, + "step": 24378 + }, + { + "epoch": 3.25, + "grad_norm": 1.109375, + "learning_rate": 2.0498959519953966e-05, + "loss": 0.3123, + "step": 24379 + }, + { + "epoch": 3.25, + "grad_norm": 0.71875, + "learning_rate": 2.0491896240762697e-05, + "loss": 0.325, + "step": 24380 + }, + { + "epoch": 3.25, + "grad_norm": 0.6015625, + "learning_rate": 2.048483403975231e-05, + "loss": 0.4626, + "step": 24381 + }, + { + "epoch": 3.25, + "grad_norm": 0.5625, + "learning_rate": 2.0477772917018613e-05, + "loss": 0.4326, + "step": 24382 + }, + { + "epoch": 3.25, + "grad_norm": 0.73828125, + "learning_rate": 2.047071287265735e-05, + "loss": 0.3882, + "step": 24383 + }, + { + "epoch": 3.25, + "grad_norm": 0.63671875, + "learning_rate": 2.046365390676428e-05, + "loss": 0.2112, + "step": 24384 + }, + { + "epoch": 3.25, + "grad_norm": 0.80859375, + "learning_rate": 2.0456596019435116e-05, + "loss": 0.2173, + "step": 24385 + }, + { + "epoch": 3.25, + "grad_norm": 0.46484375, + "learning_rate": 2.0449539210765532e-05, + "loss": 0.1357, + "step": 24386 + }, + { + "epoch": 3.25, + "grad_norm": 0.578125, + "learning_rate": 2.044248348085126e-05, + "loss": 0.3895, + "step": 24387 + }, + { + "epoch": 3.25, + "grad_norm": 0.57421875, + "learning_rate": 2.043542882978796e-05, + "loss": 0.3456, + "step": 24388 + }, + { + "epoch": 3.25, + "grad_norm": 0.52734375, + "learning_rate": 2.042837525767134e-05, + "loss": 0.3769, + "step": 24389 + }, + { + "epoch": 3.25, + "grad_norm": 0.59375, + "learning_rate": 2.0421322764596994e-05, + "loss": 0.2902, + "step": 24390 + }, + { + "epoch": 3.25, + "grad_norm": 0.56640625, + "learning_rate": 2.0414271350660584e-05, + "loss": 0.2321, + "step": 24391 + }, + { + "epoch": 3.25, + "grad_norm": 0.7109375, + "learning_rate": 2.0407221015957732e-05, + "loss": 0.3843, + "step": 24392 + }, + { + "epoch": 3.26, + "grad_norm": 0.53125, + "learning_rate": 2.0400171760584076e-05, + "loss": 0.4304, + "step": 24393 + }, + { + "epoch": 3.26, + "grad_norm": 0.8046875, + "learning_rate": 2.0393123584635143e-05, + "loss": 0.6107, + "step": 24394 + }, + { + "epoch": 3.26, + "grad_norm": 0.73828125, + "learning_rate": 2.038607648820655e-05, + "loss": 0.3421, + "step": 24395 + }, + { + "epoch": 3.26, + "grad_norm": 0.6328125, + "learning_rate": 2.0379030471393857e-05, + "loss": 0.4062, + "step": 24396 + }, + { + "epoch": 3.26, + "grad_norm": 0.609375, + "learning_rate": 2.0371985534292638e-05, + "loss": 0.3044, + "step": 24397 + }, + { + "epoch": 3.26, + "grad_norm": 0.51953125, + "learning_rate": 2.0364941676998406e-05, + "loss": 0.2503, + "step": 24398 + }, + { + "epoch": 3.26, + "grad_norm": 0.4296875, + "learning_rate": 2.035789889960663e-05, + "loss": 0.1224, + "step": 24399 + }, + { + "epoch": 3.26, + "grad_norm": 0.5390625, + "learning_rate": 2.035085720221288e-05, + "loss": 0.2263, + "step": 24400 + }, + { + "epoch": 3.26, + "grad_norm": 0.6484375, + "learning_rate": 2.0343816584912612e-05, + "loss": 0.4744, + "step": 24401 + }, + { + "epoch": 3.26, + "grad_norm": 0.53515625, + "learning_rate": 2.0336777047801314e-05, + "loss": 0.31, + "step": 24402 + }, + { + "epoch": 3.26, + "grad_norm": 0.5703125, + "learning_rate": 2.0329738590974477e-05, + "loss": 0.2655, + "step": 24403 + }, + { + "epoch": 3.26, + "grad_norm": 0.63671875, + "learning_rate": 2.0322701214527483e-05, + "loss": 0.2767, + "step": 24404 + }, + { + "epoch": 3.26, + "grad_norm": 0.91015625, + "learning_rate": 2.031566491855581e-05, + "loss": 0.3785, + "step": 24405 + }, + { + "epoch": 3.26, + "grad_norm": 0.74609375, + "learning_rate": 2.0308629703154857e-05, + "loss": 0.5487, + "step": 24406 + }, + { + "epoch": 3.26, + "grad_norm": 0.62890625, + "learning_rate": 2.0301595568420052e-05, + "loss": 0.1915, + "step": 24407 + }, + { + "epoch": 3.26, + "grad_norm": 0.50390625, + "learning_rate": 2.0294562514446734e-05, + "loss": 0.2742, + "step": 24408 + }, + { + "epoch": 3.26, + "grad_norm": 0.6796875, + "learning_rate": 2.0287530541330314e-05, + "loss": 0.356, + "step": 24409 + }, + { + "epoch": 3.26, + "grad_norm": 0.67578125, + "learning_rate": 2.0280499649166153e-05, + "loss": 0.4226, + "step": 24410 + }, + { + "epoch": 3.26, + "grad_norm": 0.609375, + "learning_rate": 2.0273469838049563e-05, + "loss": 0.4334, + "step": 24411 + }, + { + "epoch": 3.26, + "grad_norm": 0.58203125, + "learning_rate": 2.0266441108075907e-05, + "loss": 0.169, + "step": 24412 + }, + { + "epoch": 3.26, + "grad_norm": 0.49609375, + "learning_rate": 2.0259413459340472e-05, + "loss": 0.3495, + "step": 24413 + }, + { + "epoch": 3.26, + "grad_norm": 0.7109375, + "learning_rate": 2.0252386891938546e-05, + "loss": 0.3831, + "step": 24414 + }, + { + "epoch": 3.26, + "grad_norm": 0.6328125, + "learning_rate": 2.0245361405965447e-05, + "loss": 0.4852, + "step": 24415 + }, + { + "epoch": 3.26, + "grad_norm": 0.71875, + "learning_rate": 2.023833700151646e-05, + "loss": 0.4515, + "step": 24416 + }, + { + "epoch": 3.26, + "grad_norm": 0.52734375, + "learning_rate": 2.0231313678686792e-05, + "loss": 0.2181, + "step": 24417 + }, + { + "epoch": 3.26, + "grad_norm": 0.56640625, + "learning_rate": 2.0224291437571695e-05, + "loss": 0.2887, + "step": 24418 + }, + { + "epoch": 3.26, + "grad_norm": 0.515625, + "learning_rate": 2.0217270278266422e-05, + "loss": 0.1528, + "step": 24419 + }, + { + "epoch": 3.26, + "grad_norm": 0.58984375, + "learning_rate": 2.0210250200866178e-05, + "loss": 0.3958, + "step": 24420 + }, + { + "epoch": 3.26, + "grad_norm": 0.490234375, + "learning_rate": 2.0203231205466132e-05, + "loss": 0.137, + "step": 24421 + }, + { + "epoch": 3.26, + "grad_norm": 0.625, + "learning_rate": 2.0196213292161505e-05, + "loss": 0.3228, + "step": 24422 + }, + { + "epoch": 3.26, + "grad_norm": 0.6953125, + "learning_rate": 2.018919646104742e-05, + "loss": 0.2778, + "step": 24423 + }, + { + "epoch": 3.26, + "grad_norm": 0.65625, + "learning_rate": 2.0182180712219058e-05, + "loss": 0.2412, + "step": 24424 + }, + { + "epoch": 3.26, + "grad_norm": 0.7265625, + "learning_rate": 2.0175166045771577e-05, + "loss": 0.6755, + "step": 24425 + }, + { + "epoch": 3.26, + "grad_norm": 0.62109375, + "learning_rate": 2.016815246180004e-05, + "loss": 0.5143, + "step": 24426 + }, + { + "epoch": 3.26, + "grad_norm": 0.62109375, + "learning_rate": 2.01611399603996e-05, + "loss": 0.2526, + "step": 24427 + }, + { + "epoch": 3.26, + "grad_norm": 0.7421875, + "learning_rate": 2.015412854166534e-05, + "loss": 0.5571, + "step": 24428 + }, + { + "epoch": 3.26, + "grad_norm": 0.53515625, + "learning_rate": 2.014711820569234e-05, + "loss": 0.1173, + "step": 24429 + }, + { + "epoch": 3.26, + "grad_norm": 0.77734375, + "learning_rate": 2.0140108952575698e-05, + "loss": 0.2465, + "step": 24430 + }, + { + "epoch": 3.26, + "grad_norm": 0.8203125, + "learning_rate": 2.0133100782410397e-05, + "loss": 0.2468, + "step": 24431 + }, + { + "epoch": 3.26, + "grad_norm": 0.62109375, + "learning_rate": 2.0126093695291516e-05, + "loss": 0.4375, + "step": 24432 + }, + { + "epoch": 3.26, + "grad_norm": 0.765625, + "learning_rate": 2.0119087691314064e-05, + "loss": 0.5147, + "step": 24433 + }, + { + "epoch": 3.26, + "grad_norm": 0.55859375, + "learning_rate": 2.0112082770573083e-05, + "loss": 0.4617, + "step": 24434 + }, + { + "epoch": 3.26, + "grad_norm": 0.447265625, + "learning_rate": 2.0105078933163525e-05, + "loss": 0.1965, + "step": 24435 + }, + { + "epoch": 3.26, + "grad_norm": 0.53125, + "learning_rate": 2.0098076179180348e-05, + "loss": 0.3504, + "step": 24436 + }, + { + "epoch": 3.26, + "grad_norm": 0.55078125, + "learning_rate": 2.0091074508718532e-05, + "loss": 0.1497, + "step": 24437 + }, + { + "epoch": 3.26, + "grad_norm": 0.6484375, + "learning_rate": 2.0084073921873037e-05, + "loss": 0.3047, + "step": 24438 + }, + { + "epoch": 3.26, + "grad_norm": 0.85546875, + "learning_rate": 2.007707441873882e-05, + "loss": 0.3792, + "step": 24439 + }, + { + "epoch": 3.26, + "grad_norm": 0.73046875, + "learning_rate": 2.007007599941073e-05, + "loss": 0.2166, + "step": 24440 + }, + { + "epoch": 3.26, + "grad_norm": 0.6875, + "learning_rate": 2.0063078663983714e-05, + "loss": 0.4853, + "step": 24441 + }, + { + "epoch": 3.26, + "grad_norm": 0.671875, + "learning_rate": 2.0056082412552655e-05, + "loss": 0.3132, + "step": 24442 + }, + { + "epoch": 3.26, + "grad_norm": 0.53125, + "learning_rate": 2.0049087245212462e-05, + "loss": 0.2314, + "step": 24443 + }, + { + "epoch": 3.26, + "grad_norm": 0.53125, + "learning_rate": 2.0042093162057918e-05, + "loss": 0.395, + "step": 24444 + }, + { + "epoch": 3.26, + "grad_norm": 0.64453125, + "learning_rate": 2.0035100163183907e-05, + "loss": 0.2117, + "step": 24445 + }, + { + "epoch": 3.26, + "grad_norm": 0.57421875, + "learning_rate": 2.0028108248685296e-05, + "loss": 0.2704, + "step": 24446 + }, + { + "epoch": 3.26, + "grad_norm": 0.76953125, + "learning_rate": 2.0021117418656832e-05, + "loss": 0.4524, + "step": 24447 + }, + { + "epoch": 3.26, + "grad_norm": 0.8359375, + "learning_rate": 2.0014127673193373e-05, + "loss": 0.3149, + "step": 24448 + }, + { + "epoch": 3.26, + "grad_norm": 0.6171875, + "learning_rate": 2.0007139012389643e-05, + "loss": 0.5034, + "step": 24449 + }, + { + "epoch": 3.26, + "grad_norm": 0.458984375, + "learning_rate": 2.0000151436340452e-05, + "loss": 0.3382, + "step": 24450 + }, + { + "epoch": 3.26, + "grad_norm": 0.73828125, + "learning_rate": 1.9993164945140554e-05, + "loss": 0.1915, + "step": 24451 + }, + { + "epoch": 3.26, + "grad_norm": 0.5546875, + "learning_rate": 1.9986179538884686e-05, + "loss": 0.2017, + "step": 24452 + }, + { + "epoch": 3.26, + "grad_norm": 0.78125, + "learning_rate": 1.9979195217667603e-05, + "loss": 0.2569, + "step": 24453 + }, + { + "epoch": 3.26, + "grad_norm": 0.6328125, + "learning_rate": 1.9972211981583967e-05, + "loss": 0.2475, + "step": 24454 + }, + { + "epoch": 3.26, + "grad_norm": 0.7734375, + "learning_rate": 1.9965229830728494e-05, + "loss": 0.4345, + "step": 24455 + }, + { + "epoch": 3.26, + "grad_norm": 0.54296875, + "learning_rate": 1.9958248765195862e-05, + "loss": 0.3417, + "step": 24456 + }, + { + "epoch": 3.26, + "grad_norm": 0.609375, + "learning_rate": 1.995126878508078e-05, + "loss": 0.4284, + "step": 24457 + }, + { + "epoch": 3.26, + "grad_norm": 0.67578125, + "learning_rate": 1.9944289890477875e-05, + "loss": 0.2824, + "step": 24458 + }, + { + "epoch": 3.26, + "grad_norm": 0.65625, + "learning_rate": 1.993731208148174e-05, + "loss": 0.3691, + "step": 24459 + }, + { + "epoch": 3.26, + "grad_norm": 0.671875, + "learning_rate": 1.9930335358187045e-05, + "loss": 0.3524, + "step": 24460 + }, + { + "epoch": 3.26, + "grad_norm": 0.62109375, + "learning_rate": 1.9923359720688396e-05, + "loss": 0.1683, + "step": 24461 + }, + { + "epoch": 3.26, + "grad_norm": 0.6640625, + "learning_rate": 1.9916385169080397e-05, + "loss": 0.5999, + "step": 24462 + }, + { + "epoch": 3.26, + "grad_norm": 0.53125, + "learning_rate": 1.9909411703457593e-05, + "loss": 0.2651, + "step": 24463 + }, + { + "epoch": 3.26, + "grad_norm": 0.70703125, + "learning_rate": 1.990243932391457e-05, + "loss": 0.2555, + "step": 24464 + }, + { + "epoch": 3.26, + "grad_norm": 0.5625, + "learning_rate": 1.989546803054587e-05, + "loss": 0.3348, + "step": 24465 + }, + { + "epoch": 3.26, + "grad_norm": 0.5625, + "learning_rate": 1.988849782344606e-05, + "loss": 0.4192, + "step": 24466 + }, + { + "epoch": 3.26, + "grad_norm": 0.5390625, + "learning_rate": 1.9881528702709618e-05, + "loss": 0.3116, + "step": 24467 + }, + { + "epoch": 3.27, + "grad_norm": 0.578125, + "learning_rate": 1.987456066843105e-05, + "loss": 0.2288, + "step": 24468 + }, + { + "epoch": 3.27, + "grad_norm": 0.50390625, + "learning_rate": 1.986759372070488e-05, + "loss": 0.3311, + "step": 24469 + }, + { + "epoch": 3.27, + "grad_norm": 0.68359375, + "learning_rate": 1.986062785962558e-05, + "loss": 0.3682, + "step": 24470 + }, + { + "epoch": 3.27, + "grad_norm": 0.6015625, + "learning_rate": 1.985366308528761e-05, + "loss": 0.295, + "step": 24471 + }, + { + "epoch": 3.27, + "grad_norm": 0.64453125, + "learning_rate": 1.9846699397785373e-05, + "loss": 0.3887, + "step": 24472 + }, + { + "epoch": 3.27, + "grad_norm": 0.6953125, + "learning_rate": 1.983973679721335e-05, + "loss": 0.4125, + "step": 24473 + }, + { + "epoch": 3.27, + "grad_norm": 0.486328125, + "learning_rate": 1.983277528366593e-05, + "loss": 0.2504, + "step": 24474 + }, + { + "epoch": 3.27, + "grad_norm": 0.671875, + "learning_rate": 1.9825814857237523e-05, + "loss": 0.3167, + "step": 24475 + }, + { + "epoch": 3.27, + "grad_norm": 0.474609375, + "learning_rate": 1.981885551802255e-05, + "loss": 0.3191, + "step": 24476 + }, + { + "epoch": 3.27, + "grad_norm": 0.8203125, + "learning_rate": 1.981189726611533e-05, + "loss": 0.3522, + "step": 24477 + }, + { + "epoch": 3.27, + "grad_norm": 0.77734375, + "learning_rate": 1.980494010161026e-05, + "loss": 0.1602, + "step": 24478 + }, + { + "epoch": 3.27, + "grad_norm": 0.7890625, + "learning_rate": 1.9797984024601657e-05, + "loss": 0.2887, + "step": 24479 + }, + { + "epoch": 3.27, + "grad_norm": 0.53515625, + "learning_rate": 1.9791029035183895e-05, + "loss": 0.2985, + "step": 24480 + }, + { + "epoch": 3.27, + "grad_norm": 0.466796875, + "learning_rate": 1.978407513345123e-05, + "loss": 0.1819, + "step": 24481 + }, + { + "epoch": 3.27, + "grad_norm": 0.578125, + "learning_rate": 1.9777122319497986e-05, + "loss": 0.2192, + "step": 24482 + }, + { + "epoch": 3.27, + "grad_norm": 0.578125, + "learning_rate": 1.977017059341849e-05, + "loss": 0.293, + "step": 24483 + }, + { + "epoch": 3.27, + "grad_norm": 0.44140625, + "learning_rate": 1.9763219955306932e-05, + "loss": 0.3081, + "step": 24484 + }, + { + "epoch": 3.27, + "grad_norm": 0.5703125, + "learning_rate": 1.975627040525764e-05, + "loss": 0.2175, + "step": 24485 + }, + { + "epoch": 3.27, + "grad_norm": 0.58984375, + "learning_rate": 1.9749321943364786e-05, + "loss": 0.1687, + "step": 24486 + }, + { + "epoch": 3.27, + "grad_norm": 0.62890625, + "learning_rate": 1.9742374569722642e-05, + "loss": 0.2665, + "step": 24487 + }, + { + "epoch": 3.27, + "grad_norm": 0.66015625, + "learning_rate": 1.9735428284425394e-05, + "loss": 0.3603, + "step": 24488 + }, + { + "epoch": 3.27, + "grad_norm": 0.474609375, + "learning_rate": 1.9728483087567295e-05, + "loss": 0.3717, + "step": 24489 + }, + { + "epoch": 3.27, + "grad_norm": 0.71484375, + "learning_rate": 1.972153897924245e-05, + "loss": 0.4844, + "step": 24490 + }, + { + "epoch": 3.27, + "grad_norm": 0.412109375, + "learning_rate": 1.9714595959545058e-05, + "loss": 0.1913, + "step": 24491 + }, + { + "epoch": 3.27, + "grad_norm": 0.5703125, + "learning_rate": 1.9707654028569268e-05, + "loss": 0.1759, + "step": 24492 + }, + { + "epoch": 3.27, + "grad_norm": 0.5546875, + "learning_rate": 1.9700713186409246e-05, + "loss": 0.4011, + "step": 24493 + }, + { + "epoch": 3.27, + "grad_norm": 0.54296875, + "learning_rate": 1.9693773433159067e-05, + "loss": 0.2472, + "step": 24494 + }, + { + "epoch": 3.27, + "grad_norm": 0.71875, + "learning_rate": 1.9686834768912886e-05, + "loss": 0.3174, + "step": 24495 + }, + { + "epoch": 3.27, + "grad_norm": 0.482421875, + "learning_rate": 1.9679897193764753e-05, + "loss": 0.2385, + "step": 24496 + }, + { + "epoch": 3.27, + "grad_norm": 0.421875, + "learning_rate": 1.967296070780875e-05, + "loss": 0.252, + "step": 24497 + }, + { + "epoch": 3.27, + "grad_norm": 0.84375, + "learning_rate": 1.966602531113898e-05, + "loss": 0.5324, + "step": 24498 + }, + { + "epoch": 3.27, + "grad_norm": 0.5234375, + "learning_rate": 1.9659091003849438e-05, + "loss": 0.3375, + "step": 24499 + }, + { + "epoch": 3.27, + "grad_norm": 0.6640625, + "learning_rate": 1.96521577860342e-05, + "loss": 0.451, + "step": 24500 + }, + { + "epoch": 3.27, + "grad_norm": 0.83203125, + "learning_rate": 1.964522565778726e-05, + "loss": 0.44, + "step": 24501 + }, + { + "epoch": 3.27, + "grad_norm": 0.69140625, + "learning_rate": 1.9638294619202624e-05, + "loss": 0.2971, + "step": 24502 + }, + { + "epoch": 3.27, + "grad_norm": 0.56640625, + "learning_rate": 1.963136467037432e-05, + "loss": 0.1782, + "step": 24503 + }, + { + "epoch": 3.27, + "grad_norm": 0.70703125, + "learning_rate": 1.962443581139627e-05, + "loss": 0.4942, + "step": 24504 + }, + { + "epoch": 3.27, + "grad_norm": 0.78515625, + "learning_rate": 1.9617508042362444e-05, + "loss": 0.378, + "step": 24505 + }, + { + "epoch": 3.27, + "grad_norm": 0.482421875, + "learning_rate": 1.961058136336681e-05, + "loss": 0.1962, + "step": 24506 + }, + { + "epoch": 3.27, + "grad_norm": 0.640625, + "learning_rate": 1.9603655774503305e-05, + "loss": 0.1938, + "step": 24507 + }, + { + "epoch": 3.27, + "grad_norm": 0.72265625, + "learning_rate": 1.959673127586583e-05, + "loss": 0.6046, + "step": 24508 + }, + { + "epoch": 3.27, + "grad_norm": 0.6640625, + "learning_rate": 1.958980786754825e-05, + "loss": 0.4353, + "step": 24509 + }, + { + "epoch": 3.27, + "grad_norm": 0.55859375, + "learning_rate": 1.958288554964449e-05, + "loss": 0.3528, + "step": 24510 + }, + { + "epoch": 3.27, + "grad_norm": 0.859375, + "learning_rate": 1.9575964322248408e-05, + "loss": 0.417, + "step": 24511 + }, + { + "epoch": 3.27, + "grad_norm": 0.6484375, + "learning_rate": 1.956904418545389e-05, + "loss": 0.3998, + "step": 24512 + }, + { + "epoch": 3.27, + "grad_norm": 0.79296875, + "learning_rate": 1.9562125139354726e-05, + "loss": 0.3998, + "step": 24513 + }, + { + "epoch": 3.27, + "grad_norm": 0.65234375, + "learning_rate": 1.9555207184044777e-05, + "loss": 0.4751, + "step": 24514 + }, + { + "epoch": 3.27, + "grad_norm": 0.65625, + "learning_rate": 1.9548290319617847e-05, + "loss": 0.4193, + "step": 24515 + }, + { + "epoch": 3.27, + "grad_norm": 0.486328125, + "learning_rate": 1.9541374546167757e-05, + "loss": 0.2694, + "step": 24516 + }, + { + "epoch": 3.27, + "grad_norm": 0.8125, + "learning_rate": 1.9534459863788233e-05, + "loss": 0.4066, + "step": 24517 + }, + { + "epoch": 3.27, + "grad_norm": 0.5859375, + "learning_rate": 1.9527546272573083e-05, + "loss": 0.2955, + "step": 24518 + }, + { + "epoch": 3.27, + "grad_norm": 0.53515625, + "learning_rate": 1.9520633772616083e-05, + "loss": 0.3769, + "step": 24519 + }, + { + "epoch": 3.27, + "grad_norm": 0.96484375, + "learning_rate": 1.951372236401091e-05, + "loss": 0.1974, + "step": 24520 + }, + { + "epoch": 3.27, + "grad_norm": 0.78515625, + "learning_rate": 1.950681204685134e-05, + "loss": 0.2595, + "step": 24521 + }, + { + "epoch": 3.27, + "grad_norm": 0.5703125, + "learning_rate": 1.949990282123104e-05, + "loss": 0.3319, + "step": 24522 + }, + { + "epoch": 3.27, + "grad_norm": 0.5625, + "learning_rate": 1.9492994687243714e-05, + "loss": 0.1997, + "step": 24523 + }, + { + "epoch": 3.27, + "grad_norm": 0.70703125, + "learning_rate": 1.9486087644983054e-05, + "loss": 0.4606, + "step": 24524 + }, + { + "epoch": 3.27, + "grad_norm": 0.609375, + "learning_rate": 1.9479181694542727e-05, + "loss": 0.3329, + "step": 24525 + }, + { + "epoch": 3.27, + "grad_norm": 0.498046875, + "learning_rate": 1.947227683601639e-05, + "loss": 0.318, + "step": 24526 + }, + { + "epoch": 3.27, + "grad_norm": 0.57421875, + "learning_rate": 1.9465373069497628e-05, + "loss": 0.2771, + "step": 24527 + }, + { + "epoch": 3.27, + "grad_norm": 0.53125, + "learning_rate": 1.945847039508011e-05, + "loss": 0.2741, + "step": 24528 + }, + { + "epoch": 3.27, + "grad_norm": 0.458984375, + "learning_rate": 1.9451568812857412e-05, + "loss": 0.2206, + "step": 24529 + }, + { + "epoch": 3.27, + "grad_norm": 0.50390625, + "learning_rate": 1.9444668322923166e-05, + "loss": 0.3843, + "step": 24530 + }, + { + "epoch": 3.27, + "grad_norm": 0.7578125, + "learning_rate": 1.9437768925370926e-05, + "loss": 0.3948, + "step": 24531 + }, + { + "epoch": 3.27, + "grad_norm": 0.6015625, + "learning_rate": 1.9430870620294206e-05, + "loss": 0.4757, + "step": 24532 + }, + { + "epoch": 3.27, + "grad_norm": 0.466796875, + "learning_rate": 1.94239734077866e-05, + "loss": 0.2035, + "step": 24533 + }, + { + "epoch": 3.27, + "grad_norm": 0.6484375, + "learning_rate": 1.941707728794162e-05, + "loss": 0.2426, + "step": 24534 + }, + { + "epoch": 3.27, + "grad_norm": 0.640625, + "learning_rate": 1.9410182260852815e-05, + "loss": 0.4867, + "step": 24535 + }, + { + "epoch": 3.27, + "grad_norm": 0.59765625, + "learning_rate": 1.9403288326613646e-05, + "loss": 0.1601, + "step": 24536 + }, + { + "epoch": 3.27, + "grad_norm": 0.5078125, + "learning_rate": 1.93963954853176e-05, + "loss": 0.1961, + "step": 24537 + }, + { + "epoch": 3.27, + "grad_norm": 0.6328125, + "learning_rate": 1.9389503737058167e-05, + "loss": 0.6039, + "step": 24538 + }, + { + "epoch": 3.27, + "grad_norm": 0.609375, + "learning_rate": 1.9382613081928837e-05, + "loss": 0.4058, + "step": 24539 + }, + { + "epoch": 3.27, + "grad_norm": 0.443359375, + "learning_rate": 1.937572352002297e-05, + "loss": 0.2304, + "step": 24540 + }, + { + "epoch": 3.27, + "grad_norm": 0.61328125, + "learning_rate": 1.9368835051434042e-05, + "loss": 0.5365, + "step": 24541 + }, + { + "epoch": 3.27, + "grad_norm": 0.59765625, + "learning_rate": 1.9361947676255466e-05, + "loss": 0.2711, + "step": 24542 + }, + { + "epoch": 3.28, + "grad_norm": 0.74609375, + "learning_rate": 1.9355061394580653e-05, + "loss": 0.3378, + "step": 24543 + }, + { + "epoch": 3.28, + "grad_norm": 0.478515625, + "learning_rate": 1.9348176206502966e-05, + "loss": 0.2704, + "step": 24544 + }, + { + "epoch": 3.28, + "grad_norm": 0.48828125, + "learning_rate": 1.934129211211574e-05, + "loss": 0.2184, + "step": 24545 + }, + { + "epoch": 3.28, + "grad_norm": 0.482421875, + "learning_rate": 1.9334409111512362e-05, + "loss": 0.1889, + "step": 24546 + }, + { + "epoch": 3.28, + "grad_norm": 0.73828125, + "learning_rate": 1.9327527204786178e-05, + "loss": 0.2455, + "step": 24547 + }, + { + "epoch": 3.28, + "grad_norm": 0.734375, + "learning_rate": 1.9320646392030482e-05, + "loss": 0.3845, + "step": 24548 + }, + { + "epoch": 3.28, + "grad_norm": 0.59375, + "learning_rate": 1.931376667333864e-05, + "loss": 0.4526, + "step": 24549 + }, + { + "epoch": 3.28, + "grad_norm": 0.67578125, + "learning_rate": 1.9306888048803874e-05, + "loss": 0.458, + "step": 24550 + }, + { + "epoch": 3.28, + "grad_norm": 0.62890625, + "learning_rate": 1.9300010518519495e-05, + "loss": 0.4276, + "step": 24551 + }, + { + "epoch": 3.28, + "grad_norm": 0.57421875, + "learning_rate": 1.9293134082578766e-05, + "loss": 0.4191, + "step": 24552 + }, + { + "epoch": 3.28, + "grad_norm": 0.66015625, + "learning_rate": 1.928625874107497e-05, + "loss": 0.2904, + "step": 24553 + }, + { + "epoch": 3.28, + "grad_norm": 0.6328125, + "learning_rate": 1.927938449410128e-05, + "loss": 0.4845, + "step": 24554 + }, + { + "epoch": 3.28, + "grad_norm": 0.6015625, + "learning_rate": 1.9272511341750966e-05, + "loss": 0.3165, + "step": 24555 + }, + { + "epoch": 3.28, + "grad_norm": 0.53125, + "learning_rate": 1.9265639284117176e-05, + "loss": 0.4827, + "step": 24556 + }, + { + "epoch": 3.28, + "grad_norm": 0.58203125, + "learning_rate": 1.925876832129314e-05, + "loss": 0.1319, + "step": 24557 + }, + { + "epoch": 3.28, + "grad_norm": 0.5859375, + "learning_rate": 1.9251898453372043e-05, + "loss": 0.3487, + "step": 24558 + }, + { + "epoch": 3.28, + "grad_norm": 0.62109375, + "learning_rate": 1.9245029680447014e-05, + "loss": 0.3754, + "step": 24559 + }, + { + "epoch": 3.28, + "grad_norm": 0.66015625, + "learning_rate": 1.9238162002611193e-05, + "loss": 0.3783, + "step": 24560 + }, + { + "epoch": 3.28, + "grad_norm": 0.498046875, + "learning_rate": 1.923129541995774e-05, + "loss": 0.1973, + "step": 24561 + }, + { + "epoch": 3.28, + "grad_norm": 0.65625, + "learning_rate": 1.922442993257978e-05, + "loss": 0.6169, + "step": 24562 + }, + { + "epoch": 3.28, + "grad_norm": 0.62109375, + "learning_rate": 1.921756554057037e-05, + "loss": 0.3589, + "step": 24563 + }, + { + "epoch": 3.28, + "grad_norm": 0.6015625, + "learning_rate": 1.9210702244022617e-05, + "loss": 0.3375, + "step": 24564 + }, + { + "epoch": 3.28, + "grad_norm": 0.65625, + "learning_rate": 1.9203840043029586e-05, + "loss": 0.2059, + "step": 24565 + }, + { + "epoch": 3.28, + "grad_norm": 0.58984375, + "learning_rate": 1.9196978937684364e-05, + "loss": 0.2815, + "step": 24566 + }, + { + "epoch": 3.28, + "grad_norm": 0.5859375, + "learning_rate": 1.9190118928079947e-05, + "loss": 0.3885, + "step": 24567 + }, + { + "epoch": 3.28, + "grad_norm": 0.6875, + "learning_rate": 1.9183260014309402e-05, + "loss": 0.4522, + "step": 24568 + }, + { + "epoch": 3.28, + "grad_norm": 0.59765625, + "learning_rate": 1.917640219646569e-05, + "loss": 0.2534, + "step": 24569 + }, + { + "epoch": 3.28, + "grad_norm": 0.58203125, + "learning_rate": 1.9169545474641847e-05, + "loss": 0.3049, + "step": 24570 + }, + { + "epoch": 3.28, + "grad_norm": 0.5078125, + "learning_rate": 1.916268984893086e-05, + "loss": 0.2965, + "step": 24571 + }, + { + "epoch": 3.28, + "grad_norm": 0.70703125, + "learning_rate": 1.9155835319425665e-05, + "loss": 0.3105, + "step": 24572 + }, + { + "epoch": 3.28, + "grad_norm": 0.60546875, + "learning_rate": 1.914898188621922e-05, + "loss": 0.2703, + "step": 24573 + }, + { + "epoch": 3.28, + "grad_norm": 0.6015625, + "learning_rate": 1.9142129549404476e-05, + "loss": 0.3378, + "step": 24574 + }, + { + "epoch": 3.28, + "grad_norm": 0.671875, + "learning_rate": 1.9135278309074346e-05, + "loss": 0.4227, + "step": 24575 + }, + { + "epoch": 3.28, + "grad_norm": 0.89453125, + "learning_rate": 1.9128428165321764e-05, + "loss": 0.1457, + "step": 24576 + }, + { + "epoch": 3.28, + "grad_norm": 0.60546875, + "learning_rate": 1.9121579118239575e-05, + "loss": 0.3017, + "step": 24577 + }, + { + "epoch": 3.28, + "grad_norm": 0.796875, + "learning_rate": 1.9114731167920673e-05, + "loss": 0.3685, + "step": 24578 + }, + { + "epoch": 3.28, + "grad_norm": 0.6953125, + "learning_rate": 1.9107884314457946e-05, + "loss": 0.5357, + "step": 24579 + }, + { + "epoch": 3.28, + "grad_norm": 0.546875, + "learning_rate": 1.910103855794424e-05, + "loss": 0.1782, + "step": 24580 + }, + { + "epoch": 3.28, + "grad_norm": 0.640625, + "learning_rate": 1.909419389847237e-05, + "loss": 0.5524, + "step": 24581 + }, + { + "epoch": 3.28, + "grad_norm": 0.58203125, + "learning_rate": 1.9087350336135124e-05, + "loss": 0.1416, + "step": 24582 + }, + { + "epoch": 3.28, + "grad_norm": 0.68359375, + "learning_rate": 1.9080507871025342e-05, + "loss": 0.4082, + "step": 24583 + }, + { + "epoch": 3.28, + "grad_norm": 0.57421875, + "learning_rate": 1.907366650323582e-05, + "loss": 0.2395, + "step": 24584 + }, + { + "epoch": 3.28, + "grad_norm": 0.60546875, + "learning_rate": 1.9066826232859325e-05, + "loss": 0.3095, + "step": 24585 + }, + { + "epoch": 3.28, + "grad_norm": 0.66015625, + "learning_rate": 1.9059987059988593e-05, + "loss": 0.2935, + "step": 24586 + }, + { + "epoch": 3.28, + "grad_norm": 0.55078125, + "learning_rate": 1.9053148984716384e-05, + "loss": 0.3326, + "step": 24587 + }, + { + "epoch": 3.28, + "grad_norm": 0.484375, + "learning_rate": 1.9046312007135426e-05, + "loss": 0.2556, + "step": 24588 + }, + { + "epoch": 3.28, + "grad_norm": 0.6796875, + "learning_rate": 1.903947612733846e-05, + "loss": 0.4488, + "step": 24589 + }, + { + "epoch": 3.28, + "grad_norm": 0.6484375, + "learning_rate": 1.9032641345418122e-05, + "loss": 0.2547, + "step": 24590 + }, + { + "epoch": 3.28, + "grad_norm": 0.78515625, + "learning_rate": 1.9025807661467144e-05, + "loss": 0.2527, + "step": 24591 + }, + { + "epoch": 3.28, + "grad_norm": 0.6171875, + "learning_rate": 1.9018975075578206e-05, + "loss": 0.6596, + "step": 24592 + }, + { + "epoch": 3.28, + "grad_norm": 0.5390625, + "learning_rate": 1.9012143587843913e-05, + "loss": 0.2719, + "step": 24593 + }, + { + "epoch": 3.28, + "grad_norm": 0.69140625, + "learning_rate": 1.9005313198356944e-05, + "loss": 0.4983, + "step": 24594 + }, + { + "epoch": 3.28, + "grad_norm": 0.546875, + "learning_rate": 1.8998483907209896e-05, + "loss": 0.1949, + "step": 24595 + }, + { + "epoch": 3.28, + "grad_norm": 0.5625, + "learning_rate": 1.8991655714495395e-05, + "loss": 0.4515, + "step": 24596 + }, + { + "epoch": 3.28, + "grad_norm": 0.5703125, + "learning_rate": 1.8984828620306027e-05, + "loss": 0.3494, + "step": 24597 + }, + { + "epoch": 3.28, + "grad_norm": 0.56640625, + "learning_rate": 1.897800262473438e-05, + "loss": 0.2958, + "step": 24598 + }, + { + "epoch": 3.28, + "grad_norm": 0.57421875, + "learning_rate": 1.8971177727873046e-05, + "loss": 0.5459, + "step": 24599 + }, + { + "epoch": 3.28, + "grad_norm": 0.7890625, + "learning_rate": 1.8964353929814516e-05, + "loss": 0.3396, + "step": 24600 + }, + { + "epoch": 3.28, + "grad_norm": 0.578125, + "learning_rate": 1.895753123065136e-05, + "loss": 0.1838, + "step": 24601 + }, + { + "epoch": 3.28, + "grad_norm": 0.57421875, + "learning_rate": 1.8950709630476092e-05, + "loss": 0.2808, + "step": 24602 + }, + { + "epoch": 3.28, + "grad_norm": 0.50390625, + "learning_rate": 1.8943889129381243e-05, + "loss": 0.2366, + "step": 24603 + }, + { + "epoch": 3.28, + "grad_norm": 0.62109375, + "learning_rate": 1.893706972745928e-05, + "loss": 0.3628, + "step": 24604 + }, + { + "epoch": 3.28, + "grad_norm": 0.55078125, + "learning_rate": 1.893025142480265e-05, + "loss": 0.1954, + "step": 24605 + }, + { + "epoch": 3.28, + "grad_norm": 0.478515625, + "learning_rate": 1.8923434221503834e-05, + "loss": 0.179, + "step": 24606 + }, + { + "epoch": 3.28, + "grad_norm": 0.66015625, + "learning_rate": 1.8916618117655292e-05, + "loss": 0.3936, + "step": 24607 + }, + { + "epoch": 3.28, + "grad_norm": 0.61328125, + "learning_rate": 1.8909803113349466e-05, + "loss": 0.3191, + "step": 24608 + }, + { + "epoch": 3.28, + "grad_norm": 0.6484375, + "learning_rate": 1.8902989208678745e-05, + "loss": 0.3815, + "step": 24609 + }, + { + "epoch": 3.28, + "grad_norm": 0.640625, + "learning_rate": 1.889617640373552e-05, + "loss": 0.308, + "step": 24610 + }, + { + "epoch": 3.28, + "grad_norm": 0.65234375, + "learning_rate": 1.88893646986122e-05, + "loss": 0.4363, + "step": 24611 + }, + { + "epoch": 3.28, + "grad_norm": 0.78515625, + "learning_rate": 1.8882554093401183e-05, + "loss": 0.3972, + "step": 24612 + }, + { + "epoch": 3.28, + "grad_norm": 0.7109375, + "learning_rate": 1.8875744588194766e-05, + "loss": 0.3974, + "step": 24613 + }, + { + "epoch": 3.28, + "grad_norm": 0.59375, + "learning_rate": 1.8868936183085308e-05, + "loss": 0.48, + "step": 24614 + }, + { + "epoch": 3.28, + "grad_norm": 0.71484375, + "learning_rate": 1.8862128878165152e-05, + "loss": 0.249, + "step": 24615 + }, + { + "epoch": 3.28, + "grad_norm": 0.65625, + "learning_rate": 1.8855322673526633e-05, + "loss": 0.2122, + "step": 24616 + }, + { + "epoch": 3.28, + "grad_norm": 0.55859375, + "learning_rate": 1.8848517569262e-05, + "loss": 0.3362, + "step": 24617 + }, + { + "epoch": 3.29, + "grad_norm": 0.609375, + "learning_rate": 1.8841713565463548e-05, + "loss": 0.1354, + "step": 24618 + }, + { + "epoch": 3.29, + "grad_norm": 0.48828125, + "learning_rate": 1.8834910662223537e-05, + "loss": 0.3351, + "step": 24619 + }, + { + "epoch": 3.29, + "grad_norm": 0.671875, + "learning_rate": 1.8828108859634218e-05, + "loss": 0.2671, + "step": 24620 + }, + { + "epoch": 3.29, + "grad_norm": 0.53125, + "learning_rate": 1.8821308157787853e-05, + "loss": 0.3562, + "step": 24621 + }, + { + "epoch": 3.29, + "grad_norm": 0.62109375, + "learning_rate": 1.8814508556776677e-05, + "loss": 0.3551, + "step": 24622 + }, + { + "epoch": 3.29, + "grad_norm": 0.4140625, + "learning_rate": 1.880771005669283e-05, + "loss": 0.1535, + "step": 24623 + }, + { + "epoch": 3.29, + "grad_norm": 0.67578125, + "learning_rate": 1.8800912657628555e-05, + "loss": 0.4031, + "step": 24624 + }, + { + "epoch": 3.29, + "grad_norm": 0.65625, + "learning_rate": 1.8794116359676018e-05, + "loss": 0.4131, + "step": 24625 + }, + { + "epoch": 3.29, + "grad_norm": 0.76953125, + "learning_rate": 1.8787321162927397e-05, + "loss": 0.2243, + "step": 24626 + }, + { + "epoch": 3.29, + "grad_norm": 0.68359375, + "learning_rate": 1.8780527067474805e-05, + "loss": 0.32, + "step": 24627 + }, + { + "epoch": 3.29, + "grad_norm": 0.58203125, + "learning_rate": 1.877373407341041e-05, + "loss": 0.2424, + "step": 24628 + }, + { + "epoch": 3.29, + "grad_norm": 0.640625, + "learning_rate": 1.8766942180826298e-05, + "loss": 0.2151, + "step": 24629 + }, + { + "epoch": 3.29, + "grad_norm": 0.4921875, + "learning_rate": 1.8760151389814574e-05, + "loss": 0.1053, + "step": 24630 + }, + { + "epoch": 3.29, + "grad_norm": 0.53515625, + "learning_rate": 1.8753361700467366e-05, + "loss": 0.3512, + "step": 24631 + }, + { + "epoch": 3.29, + "grad_norm": 0.73828125, + "learning_rate": 1.87465731128767e-05, + "loss": 0.3536, + "step": 24632 + }, + { + "epoch": 3.29, + "grad_norm": 0.494140625, + "learning_rate": 1.873978562713464e-05, + "loss": 0.215, + "step": 24633 + }, + { + "epoch": 3.29, + "grad_norm": 0.7734375, + "learning_rate": 1.873299924333324e-05, + "loss": 0.4947, + "step": 24634 + }, + { + "epoch": 3.29, + "grad_norm": 0.71875, + "learning_rate": 1.8726213961564554e-05, + "loss": 0.3616, + "step": 24635 + }, + { + "epoch": 3.29, + "grad_norm": 0.59375, + "learning_rate": 1.871942978192054e-05, + "loss": 0.3751, + "step": 24636 + }, + { + "epoch": 3.29, + "grad_norm": 0.7734375, + "learning_rate": 1.871264670449323e-05, + "loss": 0.2677, + "step": 24637 + }, + { + "epoch": 3.29, + "grad_norm": 0.6171875, + "learning_rate": 1.87058647293746e-05, + "loss": 0.1408, + "step": 24638 + }, + { + "epoch": 3.29, + "grad_norm": 0.70703125, + "learning_rate": 1.869908385665664e-05, + "loss": 0.4507, + "step": 24639 + }, + { + "epoch": 3.29, + "grad_norm": 0.6328125, + "learning_rate": 1.8692304086431277e-05, + "loss": 0.4511, + "step": 24640 + }, + { + "epoch": 3.29, + "grad_norm": 0.6875, + "learning_rate": 1.868552541879044e-05, + "loss": 0.3738, + "step": 24641 + }, + { + "epoch": 3.29, + "grad_norm": 0.435546875, + "learning_rate": 1.8678747853826063e-05, + "loss": 0.2299, + "step": 24642 + }, + { + "epoch": 3.29, + "grad_norm": 0.7734375, + "learning_rate": 1.867197139163005e-05, + "loss": 0.2008, + "step": 24643 + }, + { + "epoch": 3.29, + "grad_norm": 0.7890625, + "learning_rate": 1.866519603229433e-05, + "loss": 0.4606, + "step": 24644 + }, + { + "epoch": 3.29, + "grad_norm": 0.7734375, + "learning_rate": 1.865842177591073e-05, + "loss": 0.5032, + "step": 24645 + }, + { + "epoch": 3.29, + "grad_norm": 0.466796875, + "learning_rate": 1.8651648622571128e-05, + "loss": 0.2775, + "step": 24646 + }, + { + "epoch": 3.29, + "grad_norm": 0.5703125, + "learning_rate": 1.8644876572367375e-05, + "loss": 0.1099, + "step": 24647 + }, + { + "epoch": 3.29, + "grad_norm": 0.515625, + "learning_rate": 1.863810562539132e-05, + "loss": 0.2244, + "step": 24648 + }, + { + "epoch": 3.29, + "grad_norm": 0.53515625, + "learning_rate": 1.8631335781734793e-05, + "loss": 0.2474, + "step": 24649 + }, + { + "epoch": 3.29, + "grad_norm": 0.515625, + "learning_rate": 1.862456704148955e-05, + "loss": 0.5004, + "step": 24650 + }, + { + "epoch": 3.29, + "grad_norm": 0.53515625, + "learning_rate": 1.8617799404747403e-05, + "loss": 0.24, + "step": 24651 + }, + { + "epoch": 3.29, + "grad_norm": 0.5078125, + "learning_rate": 1.861103287160013e-05, + "loss": 0.2062, + "step": 24652 + }, + { + "epoch": 3.29, + "grad_norm": 0.578125, + "learning_rate": 1.860426744213951e-05, + "loss": 0.2619, + "step": 24653 + }, + { + "epoch": 3.29, + "grad_norm": 0.6796875, + "learning_rate": 1.859750311645726e-05, + "loss": 0.3705, + "step": 24654 + }, + { + "epoch": 3.29, + "grad_norm": 0.5859375, + "learning_rate": 1.8590739894645094e-05, + "loss": 0.3495, + "step": 24655 + }, + { + "epoch": 3.29, + "grad_norm": 0.72265625, + "learning_rate": 1.8583977776794736e-05, + "loss": 0.4125, + "step": 24656 + }, + { + "epoch": 3.29, + "grad_norm": 0.76171875, + "learning_rate": 1.8577216762997897e-05, + "loss": 0.2035, + "step": 24657 + }, + { + "epoch": 3.29, + "grad_norm": 0.5234375, + "learning_rate": 1.8570456853346274e-05, + "loss": 0.3182, + "step": 24658 + }, + { + "epoch": 3.29, + "grad_norm": 0.443359375, + "learning_rate": 1.8563698047931498e-05, + "loss": 0.2683, + "step": 24659 + }, + { + "epoch": 3.29, + "grad_norm": 0.72265625, + "learning_rate": 1.855694034684523e-05, + "loss": 0.4153, + "step": 24660 + }, + { + "epoch": 3.29, + "grad_norm": 0.49609375, + "learning_rate": 1.855018375017914e-05, + "loss": 0.1368, + "step": 24661 + }, + { + "epoch": 3.29, + "grad_norm": 0.625, + "learning_rate": 1.8543428258024843e-05, + "loss": 0.2514, + "step": 24662 + }, + { + "epoch": 3.29, + "grad_norm": 0.609375, + "learning_rate": 1.853667387047392e-05, + "loss": 0.1511, + "step": 24663 + }, + { + "epoch": 3.29, + "grad_norm": 0.64453125, + "learning_rate": 1.852992058761798e-05, + "loss": 0.2033, + "step": 24664 + }, + { + "epoch": 3.29, + "grad_norm": 0.66015625, + "learning_rate": 1.852316840954862e-05, + "loss": 0.1692, + "step": 24665 + }, + { + "epoch": 3.29, + "grad_norm": 0.54296875, + "learning_rate": 1.8516417336357374e-05, + "loss": 0.1669, + "step": 24666 + }, + { + "epoch": 3.29, + "grad_norm": 0.55078125, + "learning_rate": 1.850966736813583e-05, + "loss": 0.3073, + "step": 24667 + }, + { + "epoch": 3.29, + "grad_norm": 0.546875, + "learning_rate": 1.8502918504975464e-05, + "loss": 0.3453, + "step": 24668 + }, + { + "epoch": 3.29, + "grad_norm": 0.66796875, + "learning_rate": 1.8496170746967835e-05, + "loss": 0.5268, + "step": 24669 + }, + { + "epoch": 3.29, + "grad_norm": 0.5546875, + "learning_rate": 1.848942409420443e-05, + "loss": 0.2491, + "step": 24670 + }, + { + "epoch": 3.29, + "grad_norm": 0.82421875, + "learning_rate": 1.8482678546776754e-05, + "loss": 0.3367, + "step": 24671 + }, + { + "epoch": 3.29, + "grad_norm": 0.64453125, + "learning_rate": 1.8475934104776305e-05, + "loss": 0.2756, + "step": 24672 + }, + { + "epoch": 3.29, + "grad_norm": 0.4921875, + "learning_rate": 1.8469190768294474e-05, + "loss": 0.2246, + "step": 24673 + }, + { + "epoch": 3.29, + "grad_norm": 0.50390625, + "learning_rate": 1.846244853742275e-05, + "loss": 0.1173, + "step": 24674 + }, + { + "epoch": 3.29, + "grad_norm": 0.625, + "learning_rate": 1.8455707412252554e-05, + "loss": 0.1544, + "step": 24675 + }, + { + "epoch": 3.29, + "grad_norm": 0.62109375, + "learning_rate": 1.8448967392875327e-05, + "loss": 0.3767, + "step": 24676 + }, + { + "epoch": 3.29, + "grad_norm": 0.62109375, + "learning_rate": 1.844222847938244e-05, + "loss": 0.3435, + "step": 24677 + }, + { + "epoch": 3.29, + "grad_norm": 0.349609375, + "learning_rate": 1.8435490671865242e-05, + "loss": 0.0821, + "step": 24678 + }, + { + "epoch": 3.29, + "grad_norm": 0.5078125, + "learning_rate": 1.8428753970415148e-05, + "loss": 0.4148, + "step": 24679 + }, + { + "epoch": 3.29, + "grad_norm": 0.498046875, + "learning_rate": 1.8422018375123506e-05, + "loss": 0.3587, + "step": 24680 + }, + { + "epoch": 3.29, + "grad_norm": 0.70703125, + "learning_rate": 1.8415283886081667e-05, + "loss": 0.4193, + "step": 24681 + }, + { + "epoch": 3.29, + "grad_norm": 0.70703125, + "learning_rate": 1.840855050338093e-05, + "loss": 0.4416, + "step": 24682 + }, + { + "epoch": 3.29, + "grad_norm": 0.60546875, + "learning_rate": 1.84018182271126e-05, + "loss": 0.1868, + "step": 24683 + }, + { + "epoch": 3.29, + "grad_norm": 0.75390625, + "learning_rate": 1.8395087057367987e-05, + "loss": 0.4351, + "step": 24684 + }, + { + "epoch": 3.29, + "grad_norm": 0.546875, + "learning_rate": 1.8388356994238388e-05, + "loss": 0.3024, + "step": 24685 + }, + { + "epoch": 3.29, + "grad_norm": 0.95703125, + "learning_rate": 1.8381628037815024e-05, + "loss": 0.4172, + "step": 24686 + }, + { + "epoch": 3.29, + "grad_norm": 0.6328125, + "learning_rate": 1.837490018818917e-05, + "loss": 0.4936, + "step": 24687 + }, + { + "epoch": 3.29, + "grad_norm": 0.5234375, + "learning_rate": 1.8368173445452053e-05, + "loss": 0.2409, + "step": 24688 + }, + { + "epoch": 3.29, + "grad_norm": 0.59765625, + "learning_rate": 1.836144780969492e-05, + "loss": 0.3832, + "step": 24689 + }, + { + "epoch": 3.29, + "grad_norm": 0.45703125, + "learning_rate": 1.8354723281008945e-05, + "loss": 0.3303, + "step": 24690 + }, + { + "epoch": 3.29, + "grad_norm": 0.578125, + "learning_rate": 1.834799985948531e-05, + "loss": 0.1802, + "step": 24691 + }, + { + "epoch": 3.29, + "grad_norm": 0.640625, + "learning_rate": 1.8341277545215186e-05, + "loss": 0.4495, + "step": 24692 + }, + { + "epoch": 3.3, + "grad_norm": 0.51953125, + "learning_rate": 1.8334556338289755e-05, + "loss": 0.2049, + "step": 24693 + }, + { + "epoch": 3.3, + "grad_norm": 0.66015625, + "learning_rate": 1.8327836238800156e-05, + "loss": 0.4992, + "step": 24694 + }, + { + "epoch": 3.3, + "grad_norm": 0.494140625, + "learning_rate": 1.8321117246837537e-05, + "loss": 0.2141, + "step": 24695 + }, + { + "epoch": 3.3, + "grad_norm": 0.56640625, + "learning_rate": 1.8314399362492963e-05, + "loss": 0.2196, + "step": 24696 + }, + { + "epoch": 3.3, + "grad_norm": 0.55859375, + "learning_rate": 1.8307682585857556e-05, + "loss": 0.4871, + "step": 24697 + }, + { + "epoch": 3.3, + "grad_norm": 0.59375, + "learning_rate": 1.83009669170224e-05, + "loss": 0.33, + "step": 24698 + }, + { + "epoch": 3.3, + "grad_norm": 0.6484375, + "learning_rate": 1.82942523560786e-05, + "loss": 0.207, + "step": 24699 + }, + { + "epoch": 3.3, + "grad_norm": 0.60546875, + "learning_rate": 1.828753890311715e-05, + "loss": 0.4021, + "step": 24700 + }, + { + "epoch": 3.3, + "grad_norm": 0.53515625, + "learning_rate": 1.8280826558229136e-05, + "loss": 0.3847, + "step": 24701 + }, + { + "epoch": 3.3, + "grad_norm": 0.73046875, + "learning_rate": 1.8274115321505537e-05, + "loss": 0.2352, + "step": 24702 + }, + { + "epoch": 3.3, + "grad_norm": 0.5234375, + "learning_rate": 1.826740519303739e-05, + "loss": 0.2713, + "step": 24703 + }, + { + "epoch": 3.3, + "grad_norm": 0.5625, + "learning_rate": 1.8260696172915703e-05, + "loss": 0.1508, + "step": 24704 + }, + { + "epoch": 3.3, + "grad_norm": 0.6171875, + "learning_rate": 1.8253988261231414e-05, + "loss": 0.5245, + "step": 24705 + }, + { + "epoch": 3.3, + "grad_norm": 0.7578125, + "learning_rate": 1.8247281458075505e-05, + "loss": 0.2405, + "step": 24706 + }, + { + "epoch": 3.3, + "grad_norm": 0.5390625, + "learning_rate": 1.8240575763538935e-05, + "loss": 0.289, + "step": 24707 + }, + { + "epoch": 3.3, + "grad_norm": 0.5703125, + "learning_rate": 1.8233871177712648e-05, + "loss": 0.2057, + "step": 24708 + }, + { + "epoch": 3.3, + "grad_norm": 0.59375, + "learning_rate": 1.822716770068753e-05, + "loss": 0.2019, + "step": 24709 + }, + { + "epoch": 3.3, + "grad_norm": 0.5, + "learning_rate": 1.8220465332554493e-05, + "loss": 0.3438, + "step": 24710 + }, + { + "epoch": 3.3, + "grad_norm": 0.7578125, + "learning_rate": 1.8213764073404427e-05, + "loss": 0.4126, + "step": 24711 + }, + { + "epoch": 3.3, + "grad_norm": 0.60546875, + "learning_rate": 1.8207063923328237e-05, + "loss": 0.1838, + "step": 24712 + }, + { + "epoch": 3.3, + "grad_norm": 0.51953125, + "learning_rate": 1.8200364882416764e-05, + "loss": 0.2149, + "step": 24713 + }, + { + "epoch": 3.3, + "grad_norm": 0.80859375, + "learning_rate": 1.8193666950760814e-05, + "loss": 0.3339, + "step": 24714 + }, + { + "epoch": 3.3, + "grad_norm": 0.55078125, + "learning_rate": 1.818697012845124e-05, + "loss": 0.1858, + "step": 24715 + }, + { + "epoch": 3.3, + "grad_norm": 0.62890625, + "learning_rate": 1.818027441557886e-05, + "loss": 0.2528, + "step": 24716 + }, + { + "epoch": 3.3, + "grad_norm": 0.671875, + "learning_rate": 1.8173579812234487e-05, + "loss": 0.4293, + "step": 24717 + }, + { + "epoch": 3.3, + "grad_norm": 0.48828125, + "learning_rate": 1.816688631850887e-05, + "loss": 0.3589, + "step": 24718 + }, + { + "epoch": 3.3, + "grad_norm": 0.6796875, + "learning_rate": 1.8160193934492796e-05, + "loss": 0.5419, + "step": 24719 + }, + { + "epoch": 3.3, + "grad_norm": 0.70703125, + "learning_rate": 1.815350266027701e-05, + "loss": 0.5563, + "step": 24720 + }, + { + "epoch": 3.3, + "grad_norm": 0.64453125, + "learning_rate": 1.8146812495952258e-05, + "loss": 0.1689, + "step": 24721 + }, + { + "epoch": 3.3, + "grad_norm": 0.640625, + "learning_rate": 1.8140123441609293e-05, + "loss": 0.2136, + "step": 24722 + }, + { + "epoch": 3.3, + "grad_norm": 0.84765625, + "learning_rate": 1.813343549733877e-05, + "loss": 0.3677, + "step": 24723 + }, + { + "epoch": 3.3, + "grad_norm": 0.64453125, + "learning_rate": 1.8126748663231387e-05, + "loss": 0.4248, + "step": 24724 + }, + { + "epoch": 3.3, + "grad_norm": 0.640625, + "learning_rate": 1.812006293937788e-05, + "loss": 0.4404, + "step": 24725 + }, + { + "epoch": 3.3, + "grad_norm": 0.5859375, + "learning_rate": 1.8113378325868837e-05, + "loss": 0.3733, + "step": 24726 + }, + { + "epoch": 3.3, + "grad_norm": 0.953125, + "learning_rate": 1.8106694822794955e-05, + "loss": 0.3574, + "step": 24727 + }, + { + "epoch": 3.3, + "grad_norm": 0.58203125, + "learning_rate": 1.8100012430246837e-05, + "loss": 0.4467, + "step": 24728 + }, + { + "epoch": 3.3, + "grad_norm": 0.515625, + "learning_rate": 1.8093331148315106e-05, + "loss": 0.0964, + "step": 24729 + }, + { + "epoch": 3.3, + "grad_norm": 0.53515625, + "learning_rate": 1.808665097709038e-05, + "loss": 0.3315, + "step": 24730 + }, + { + "epoch": 3.3, + "grad_norm": 0.77734375, + "learning_rate": 1.8079971916663264e-05, + "loss": 0.3181, + "step": 24731 + }, + { + "epoch": 3.3, + "grad_norm": 0.6015625, + "learning_rate": 1.807329396712427e-05, + "loss": 0.4038, + "step": 24732 + }, + { + "epoch": 3.3, + "grad_norm": 0.5703125, + "learning_rate": 1.8066617128564e-05, + "loss": 0.3822, + "step": 24733 + }, + { + "epoch": 3.3, + "grad_norm": 0.43359375, + "learning_rate": 1.805994140107298e-05, + "loss": 0.1854, + "step": 24734 + }, + { + "epoch": 3.3, + "grad_norm": 0.578125, + "learning_rate": 1.8053266784741764e-05, + "loss": 0.1734, + "step": 24735 + }, + { + "epoch": 3.3, + "grad_norm": 0.51171875, + "learning_rate": 1.8046593279660828e-05, + "loss": 0.1582, + "step": 24736 + }, + { + "epoch": 3.3, + "grad_norm": 0.58984375, + "learning_rate": 1.8039920885920682e-05, + "loss": 0.3532, + "step": 24737 + }, + { + "epoch": 3.3, + "grad_norm": 0.65625, + "learning_rate": 1.8033249603611836e-05, + "loss": 0.4732, + "step": 24738 + }, + { + "epoch": 3.3, + "grad_norm": 0.490234375, + "learning_rate": 1.80265794328247e-05, + "loss": 0.3648, + "step": 24739 + }, + { + "epoch": 3.3, + "grad_norm": 0.50390625, + "learning_rate": 1.8019910373649785e-05, + "loss": 0.3001, + "step": 24740 + }, + { + "epoch": 3.3, + "grad_norm": 0.5703125, + "learning_rate": 1.801324242617748e-05, + "loss": 0.5178, + "step": 24741 + }, + { + "epoch": 3.3, + "grad_norm": 0.38671875, + "learning_rate": 1.800657559049822e-05, + "loss": 0.1993, + "step": 24742 + }, + { + "epoch": 3.3, + "grad_norm": 0.58984375, + "learning_rate": 1.7999909866702426e-05, + "loss": 0.3564, + "step": 24743 + }, + { + "epoch": 3.3, + "grad_norm": 0.466796875, + "learning_rate": 1.7993245254880476e-05, + "loss": 0.1492, + "step": 24744 + }, + { + "epoch": 3.3, + "grad_norm": 0.5546875, + "learning_rate": 1.7986581755122767e-05, + "loss": 0.1793, + "step": 24745 + }, + { + "epoch": 3.3, + "grad_norm": 0.57421875, + "learning_rate": 1.7979919367519636e-05, + "loss": 0.3675, + "step": 24746 + }, + { + "epoch": 3.3, + "grad_norm": 0.625, + "learning_rate": 1.7973258092161428e-05, + "loss": 0.3134, + "step": 24747 + }, + { + "epoch": 3.3, + "grad_norm": 0.58984375, + "learning_rate": 1.7966597929138497e-05, + "loss": 0.2403, + "step": 24748 + }, + { + "epoch": 3.3, + "grad_norm": 0.62890625, + "learning_rate": 1.795993887854116e-05, + "loss": 0.4676, + "step": 24749 + }, + { + "epoch": 3.3, + "grad_norm": 0.50390625, + "learning_rate": 1.7953280940459704e-05, + "loss": 0.2967, + "step": 24750 + }, + { + "epoch": 3.3, + "grad_norm": 0.5, + "learning_rate": 1.79466241149844e-05, + "loss": 0.4358, + "step": 24751 + }, + { + "epoch": 3.3, + "grad_norm": 0.6171875, + "learning_rate": 1.793996840220553e-05, + "loss": 0.5345, + "step": 24752 + }, + { + "epoch": 3.3, + "grad_norm": 0.69140625, + "learning_rate": 1.793331380221336e-05, + "loss": 0.259, + "step": 24753 + }, + { + "epoch": 3.3, + "grad_norm": 0.6015625, + "learning_rate": 1.7926660315098143e-05, + "loss": 0.2418, + "step": 24754 + }, + { + "epoch": 3.3, + "grad_norm": 0.4921875, + "learning_rate": 1.7920007940950057e-05, + "loss": 0.2131, + "step": 24755 + }, + { + "epoch": 3.3, + "grad_norm": 0.65234375, + "learning_rate": 1.791335667985935e-05, + "loss": 0.2061, + "step": 24756 + }, + { + "epoch": 3.3, + "grad_norm": 0.734375, + "learning_rate": 1.7906706531916205e-05, + "loss": 0.321, + "step": 24757 + }, + { + "epoch": 3.3, + "grad_norm": 0.57421875, + "learning_rate": 1.790005749721083e-05, + "loss": 0.3094, + "step": 24758 + }, + { + "epoch": 3.3, + "grad_norm": 0.69140625, + "learning_rate": 1.7893409575833352e-05, + "loss": 0.2543, + "step": 24759 + }, + { + "epoch": 3.3, + "grad_norm": 0.5546875, + "learning_rate": 1.7886762767873933e-05, + "loss": 0.1962, + "step": 24760 + }, + { + "epoch": 3.3, + "grad_norm": 0.68359375, + "learning_rate": 1.7880117073422698e-05, + "loss": 0.2738, + "step": 24761 + }, + { + "epoch": 3.3, + "grad_norm": 0.828125, + "learning_rate": 1.787347249256982e-05, + "loss": 0.5644, + "step": 24762 + }, + { + "epoch": 3.3, + "grad_norm": 0.515625, + "learning_rate": 1.7866829025405353e-05, + "loss": 0.3666, + "step": 24763 + }, + { + "epoch": 3.3, + "grad_norm": 0.65625, + "learning_rate": 1.7860186672019373e-05, + "loss": 0.5224, + "step": 24764 + }, + { + "epoch": 3.3, + "grad_norm": 0.59765625, + "learning_rate": 1.7853545432501983e-05, + "loss": 0.2775, + "step": 24765 + }, + { + "epoch": 3.3, + "grad_norm": 0.65234375, + "learning_rate": 1.784690530694324e-05, + "loss": 0.4711, + "step": 24766 + }, + { + "epoch": 3.3, + "grad_norm": 0.7265625, + "learning_rate": 1.7840266295433182e-05, + "loss": 0.2255, + "step": 24767 + }, + { + "epoch": 3.31, + "grad_norm": 0.60546875, + "learning_rate": 1.7833628398061875e-05, + "loss": 0.173, + "step": 24768 + }, + { + "epoch": 3.31, + "grad_norm": 0.703125, + "learning_rate": 1.7826991614919265e-05, + "loss": 0.3233, + "step": 24769 + }, + { + "epoch": 3.31, + "grad_norm": 0.5703125, + "learning_rate": 1.7820355946095392e-05, + "loss": 0.2619, + "step": 24770 + }, + { + "epoch": 3.31, + "grad_norm": 0.73828125, + "learning_rate": 1.7813721391680238e-05, + "loss": 0.5923, + "step": 24771 + }, + { + "epoch": 3.31, + "grad_norm": 0.4765625, + "learning_rate": 1.780708795176379e-05, + "loss": 0.1548, + "step": 24772 + }, + { + "epoch": 3.31, + "grad_norm": 0.640625, + "learning_rate": 1.7800455626435952e-05, + "loss": 0.2882, + "step": 24773 + }, + { + "epoch": 3.31, + "grad_norm": 0.6796875, + "learning_rate": 1.779382441578672e-05, + "loss": 0.5297, + "step": 24774 + }, + { + "epoch": 3.31, + "grad_norm": 0.765625, + "learning_rate": 1.7787194319905965e-05, + "loss": 0.4763, + "step": 24775 + }, + { + "epoch": 3.31, + "grad_norm": 0.546875, + "learning_rate": 1.7780565338883616e-05, + "loss": 0.2193, + "step": 24776 + }, + { + "epoch": 3.31, + "grad_norm": 0.68359375, + "learning_rate": 1.7773937472809597e-05, + "loss": 0.2296, + "step": 24777 + }, + { + "epoch": 3.31, + "grad_norm": 0.55859375, + "learning_rate": 1.7767310721773734e-05, + "loss": 0.2457, + "step": 24778 + }, + { + "epoch": 3.31, + "grad_norm": 0.609375, + "learning_rate": 1.7760685085865915e-05, + "loss": 0.2632, + "step": 24779 + }, + { + "epoch": 3.31, + "grad_norm": 0.5390625, + "learning_rate": 1.775406056517598e-05, + "loss": 0.2758, + "step": 24780 + }, + { + "epoch": 3.31, + "grad_norm": 0.65234375, + "learning_rate": 1.77474371597938e-05, + "loss": 0.3998, + "step": 24781 + }, + { + "epoch": 3.31, + "grad_norm": 0.39453125, + "learning_rate": 1.774081486980913e-05, + "loss": 0.1412, + "step": 24782 + }, + { + "epoch": 3.31, + "grad_norm": 0.57421875, + "learning_rate": 1.7734193695311807e-05, + "loss": 0.1855, + "step": 24783 + }, + { + "epoch": 3.31, + "grad_norm": 0.63671875, + "learning_rate": 1.7727573636391624e-05, + "loss": 0.2471, + "step": 24784 + }, + { + "epoch": 3.31, + "grad_norm": 0.6015625, + "learning_rate": 1.772095469313836e-05, + "loss": 0.3285, + "step": 24785 + }, + { + "epoch": 3.31, + "grad_norm": 0.5546875, + "learning_rate": 1.7714336865641756e-05, + "loss": 0.2993, + "step": 24786 + }, + { + "epoch": 3.31, + "grad_norm": 0.455078125, + "learning_rate": 1.7707720153991536e-05, + "loss": 0.282, + "step": 24787 + }, + { + "epoch": 3.31, + "grad_norm": 0.6953125, + "learning_rate": 1.7701104558277437e-05, + "loss": 0.286, + "step": 24788 + }, + { + "epoch": 3.31, + "grad_norm": 0.58984375, + "learning_rate": 1.7694490078589188e-05, + "loss": 0.4044, + "step": 24789 + }, + { + "epoch": 3.31, + "grad_norm": 0.91015625, + "learning_rate": 1.7687876715016504e-05, + "loss": 0.3969, + "step": 24790 + }, + { + "epoch": 3.31, + "grad_norm": 0.498046875, + "learning_rate": 1.7681264467649006e-05, + "loss": 0.1669, + "step": 24791 + }, + { + "epoch": 3.31, + "grad_norm": 0.5234375, + "learning_rate": 1.7674653336576395e-05, + "loss": 0.1612, + "step": 24792 + }, + { + "epoch": 3.31, + "grad_norm": 0.62109375, + "learning_rate": 1.7668043321888317e-05, + "loss": 0.3317, + "step": 24793 + }, + { + "epoch": 3.31, + "grad_norm": 0.482421875, + "learning_rate": 1.7661434423674418e-05, + "loss": 0.1119, + "step": 24794 + }, + { + "epoch": 3.31, + "grad_norm": 0.5, + "learning_rate": 1.7654826642024337e-05, + "loss": 0.2198, + "step": 24795 + }, + { + "epoch": 3.31, + "grad_norm": 0.63671875, + "learning_rate": 1.7648219977027636e-05, + "loss": 0.4489, + "step": 24796 + }, + { + "epoch": 3.31, + "grad_norm": 0.6328125, + "learning_rate": 1.764161442877391e-05, + "loss": 0.3626, + "step": 24797 + }, + { + "epoch": 3.31, + "grad_norm": 0.51953125, + "learning_rate": 1.7635009997352792e-05, + "loss": 0.2684, + "step": 24798 + }, + { + "epoch": 3.31, + "grad_norm": 0.671875, + "learning_rate": 1.762840668285376e-05, + "loss": 0.4672, + "step": 24799 + }, + { + "epoch": 3.31, + "grad_norm": 0.56640625, + "learning_rate": 1.7621804485366432e-05, + "loss": 0.3427, + "step": 24800 + }, + { + "epoch": 3.31, + "grad_norm": 0.578125, + "learning_rate": 1.7615203404980284e-05, + "loss": 0.1937, + "step": 24801 + }, + { + "epoch": 3.31, + "grad_norm": 0.56640625, + "learning_rate": 1.7608603441784844e-05, + "loss": 0.3842, + "step": 24802 + }, + { + "epoch": 3.31, + "grad_norm": 0.59375, + "learning_rate": 1.7602004595869626e-05, + "loss": 0.4536, + "step": 24803 + }, + { + "epoch": 3.31, + "grad_norm": 0.77734375, + "learning_rate": 1.7595406867324127e-05, + "loss": 0.4513, + "step": 24804 + }, + { + "epoch": 3.31, + "grad_norm": 0.7265625, + "learning_rate": 1.7588810256237774e-05, + "loss": 0.4767, + "step": 24805 + }, + { + "epoch": 3.31, + "grad_norm": 0.57421875, + "learning_rate": 1.7582214762700054e-05, + "loss": 0.2456, + "step": 24806 + }, + { + "epoch": 3.31, + "grad_norm": 0.8125, + "learning_rate": 1.7575620386800383e-05, + "loss": 0.5871, + "step": 24807 + }, + { + "epoch": 3.31, + "grad_norm": 0.55859375, + "learning_rate": 1.7569027128628236e-05, + "loss": 0.3621, + "step": 24808 + }, + { + "epoch": 3.31, + "grad_norm": 0.66796875, + "learning_rate": 1.7562434988272956e-05, + "loss": 0.4608, + "step": 24809 + }, + { + "epoch": 3.31, + "grad_norm": 0.42578125, + "learning_rate": 1.7555843965823992e-05, + "loss": 0.1434, + "step": 24810 + }, + { + "epoch": 3.31, + "grad_norm": 0.58203125, + "learning_rate": 1.7549254061370667e-05, + "loss": 0.4065, + "step": 24811 + }, + { + "epoch": 3.31, + "grad_norm": 0.5, + "learning_rate": 1.754266527500238e-05, + "loss": 0.4199, + "step": 24812 + }, + { + "epoch": 3.31, + "grad_norm": 0.66796875, + "learning_rate": 1.7536077606808488e-05, + "loss": 0.5146, + "step": 24813 + }, + { + "epoch": 3.31, + "grad_norm": 0.65234375, + "learning_rate": 1.752949105687829e-05, + "loss": 0.2812, + "step": 24814 + }, + { + "epoch": 3.31, + "grad_norm": 0.66015625, + "learning_rate": 1.7522905625301123e-05, + "loss": 0.4548, + "step": 24815 + }, + { + "epoch": 3.31, + "grad_norm": 0.60546875, + "learning_rate": 1.7516321312166282e-05, + "loss": 0.3592, + "step": 24816 + }, + { + "epoch": 3.31, + "grad_norm": 0.6015625, + "learning_rate": 1.7509738117563067e-05, + "loss": 0.4374, + "step": 24817 + }, + { + "epoch": 3.31, + "grad_norm": 0.5390625, + "learning_rate": 1.750315604158077e-05, + "loss": 0.3526, + "step": 24818 + }, + { + "epoch": 3.31, + "grad_norm": 0.796875, + "learning_rate": 1.7496575084308597e-05, + "loss": 0.2906, + "step": 24819 + }, + { + "epoch": 3.31, + "grad_norm": 0.6484375, + "learning_rate": 1.7489995245835822e-05, + "loss": 0.2667, + "step": 24820 + }, + { + "epoch": 3.31, + "grad_norm": 0.68359375, + "learning_rate": 1.7483416526251663e-05, + "loss": 0.2363, + "step": 24821 + }, + { + "epoch": 3.31, + "grad_norm": 0.54296875, + "learning_rate": 1.7476838925645356e-05, + "loss": 0.1881, + "step": 24822 + }, + { + "epoch": 3.31, + "grad_norm": 0.5703125, + "learning_rate": 1.7470262444106076e-05, + "loss": 0.3381, + "step": 24823 + }, + { + "epoch": 3.31, + "grad_norm": 0.765625, + "learning_rate": 1.746368708172299e-05, + "loss": 0.2825, + "step": 24824 + }, + { + "epoch": 3.31, + "grad_norm": 0.466796875, + "learning_rate": 1.7457112838585266e-05, + "loss": 0.1515, + "step": 24825 + }, + { + "epoch": 3.31, + "grad_norm": 0.578125, + "learning_rate": 1.7450539714782076e-05, + "loss": 0.2981, + "step": 24826 + }, + { + "epoch": 3.31, + "grad_norm": 0.66015625, + "learning_rate": 1.7443967710402566e-05, + "loss": 0.3188, + "step": 24827 + }, + { + "epoch": 3.31, + "grad_norm": 0.69921875, + "learning_rate": 1.7437396825535813e-05, + "loss": 0.4443, + "step": 24828 + }, + { + "epoch": 3.31, + "grad_norm": 0.6328125, + "learning_rate": 1.7430827060270948e-05, + "loss": 0.5555, + "step": 24829 + }, + { + "epoch": 3.31, + "grad_norm": 0.640625, + "learning_rate": 1.742425841469707e-05, + "loss": 0.6867, + "step": 24830 + }, + { + "epoch": 3.31, + "grad_norm": 0.609375, + "learning_rate": 1.741769088890326e-05, + "loss": 0.3568, + "step": 24831 + }, + { + "epoch": 3.31, + "grad_norm": 0.7578125, + "learning_rate": 1.7411124482978535e-05, + "loss": 0.3341, + "step": 24832 + }, + { + "epoch": 3.31, + "grad_norm": 0.640625, + "learning_rate": 1.7404559197011973e-05, + "loss": 0.2859, + "step": 24833 + }, + { + "epoch": 3.31, + "grad_norm": 0.61328125, + "learning_rate": 1.7397995031092596e-05, + "loss": 0.4836, + "step": 24834 + }, + { + "epoch": 3.31, + "grad_norm": 0.5078125, + "learning_rate": 1.739143198530945e-05, + "loss": 0.2544, + "step": 24835 + }, + { + "epoch": 3.31, + "grad_norm": 0.498046875, + "learning_rate": 1.73848700597515e-05, + "loss": 0.1957, + "step": 24836 + }, + { + "epoch": 3.31, + "grad_norm": 0.69921875, + "learning_rate": 1.737830925450771e-05, + "loss": 0.3986, + "step": 24837 + }, + { + "epoch": 3.31, + "grad_norm": 0.70703125, + "learning_rate": 1.737174956966707e-05, + "loss": 0.5163, + "step": 24838 + }, + { + "epoch": 3.31, + "grad_norm": 0.8046875, + "learning_rate": 1.7365191005318537e-05, + "loss": 0.6185, + "step": 24839 + }, + { + "epoch": 3.31, + "grad_norm": 0.62890625, + "learning_rate": 1.7358633561551054e-05, + "loss": 0.1851, + "step": 24840 + }, + { + "epoch": 3.31, + "grad_norm": 0.75, + "learning_rate": 1.7352077238453557e-05, + "loss": 0.8471, + "step": 24841 + }, + { + "epoch": 3.31, + "grad_norm": 0.44140625, + "learning_rate": 1.7345522036114902e-05, + "loss": 0.2418, + "step": 24842 + }, + { + "epoch": 3.32, + "grad_norm": 0.55078125, + "learning_rate": 1.733896795462403e-05, + "loss": 0.1994, + "step": 24843 + }, + { + "epoch": 3.32, + "grad_norm": 0.49609375, + "learning_rate": 1.73324149940698e-05, + "loss": 0.37, + "step": 24844 + }, + { + "epoch": 3.32, + "grad_norm": 0.65625, + "learning_rate": 1.7325863154541112e-05, + "loss": 0.3737, + "step": 24845 + }, + { + "epoch": 3.32, + "grad_norm": 0.58203125, + "learning_rate": 1.7319312436126746e-05, + "loss": 0.3497, + "step": 24846 + }, + { + "epoch": 3.32, + "grad_norm": 0.6015625, + "learning_rate": 1.7312762838915587e-05, + "loss": 0.1545, + "step": 24847 + }, + { + "epoch": 3.32, + "grad_norm": 0.6484375, + "learning_rate": 1.730621436299641e-05, + "loss": 0.3667, + "step": 24848 + }, + { + "epoch": 3.32, + "grad_norm": 0.51171875, + "learning_rate": 1.7299667008458044e-05, + "loss": 0.1141, + "step": 24849 + }, + { + "epoch": 3.32, + "grad_norm": 0.58984375, + "learning_rate": 1.7293120775389295e-05, + "loss": 0.2297, + "step": 24850 + }, + { + "epoch": 3.32, + "grad_norm": 0.6328125, + "learning_rate": 1.7286575663878877e-05, + "loss": 0.3967, + "step": 24851 + }, + { + "epoch": 3.32, + "grad_norm": 0.55078125, + "learning_rate": 1.7280031674015584e-05, + "loss": 0.2309, + "step": 24852 + }, + { + "epoch": 3.32, + "grad_norm": 0.5390625, + "learning_rate": 1.727348880588815e-05, + "loss": 0.3272, + "step": 24853 + }, + { + "epoch": 3.32, + "grad_norm": 0.51953125, + "learning_rate": 1.7266947059585326e-05, + "loss": 0.2352, + "step": 24854 + }, + { + "epoch": 3.32, + "grad_norm": 0.490234375, + "learning_rate": 1.7260406435195775e-05, + "loss": 0.2592, + "step": 24855 + }, + { + "epoch": 3.32, + "grad_norm": 0.59765625, + "learning_rate": 1.7253866932808214e-05, + "loss": 0.2318, + "step": 24856 + }, + { + "epoch": 3.32, + "grad_norm": 0.6328125, + "learning_rate": 1.724732855251132e-05, + "loss": 0.1854, + "step": 24857 + }, + { + "epoch": 3.32, + "grad_norm": 0.80078125, + "learning_rate": 1.7240791294393787e-05, + "loss": 0.3243, + "step": 24858 + }, + { + "epoch": 3.32, + "grad_norm": 0.62890625, + "learning_rate": 1.7234255158544245e-05, + "loss": 0.3123, + "step": 24859 + }, + { + "epoch": 3.32, + "grad_norm": 0.58984375, + "learning_rate": 1.7227720145051284e-05, + "loss": 0.2567, + "step": 24860 + }, + { + "epoch": 3.32, + "grad_norm": 0.59375, + "learning_rate": 1.7221186254003573e-05, + "loss": 0.2666, + "step": 24861 + }, + { + "epoch": 3.32, + "grad_norm": 0.65625, + "learning_rate": 1.7214653485489695e-05, + "loss": 0.3085, + "step": 24862 + }, + { + "epoch": 3.32, + "grad_norm": 0.5390625, + "learning_rate": 1.7208121839598278e-05, + "loss": 0.2889, + "step": 24863 + }, + { + "epoch": 3.32, + "grad_norm": 0.58203125, + "learning_rate": 1.720159131641783e-05, + "loss": 0.1736, + "step": 24864 + }, + { + "epoch": 3.32, + "grad_norm": 0.56640625, + "learning_rate": 1.7195061916036958e-05, + "loss": 0.3822, + "step": 24865 + }, + { + "epoch": 3.32, + "grad_norm": 0.57421875, + "learning_rate": 1.718853363854418e-05, + "loss": 0.2863, + "step": 24866 + }, + { + "epoch": 3.32, + "grad_norm": 0.62890625, + "learning_rate": 1.7182006484028035e-05, + "loss": 0.3259, + "step": 24867 + }, + { + "epoch": 3.32, + "grad_norm": 0.65625, + "learning_rate": 1.7175480452577063e-05, + "loss": 0.4304, + "step": 24868 + }, + { + "epoch": 3.32, + "grad_norm": 0.52734375, + "learning_rate": 1.7168955544279707e-05, + "loss": 0.2289, + "step": 24869 + }, + { + "epoch": 3.32, + "grad_norm": 0.6328125, + "learning_rate": 1.7162431759224472e-05, + "loss": 0.3093, + "step": 24870 + }, + { + "epoch": 3.32, + "grad_norm": 0.6875, + "learning_rate": 1.7155909097499856e-05, + "loss": 0.2336, + "step": 24871 + }, + { + "epoch": 3.32, + "grad_norm": 0.44921875, + "learning_rate": 1.7149387559194252e-05, + "loss": 0.1777, + "step": 24872 + }, + { + "epoch": 3.32, + "grad_norm": 0.5390625, + "learning_rate": 1.7142867144396147e-05, + "loss": 0.2233, + "step": 24873 + }, + { + "epoch": 3.32, + "grad_norm": 0.62109375, + "learning_rate": 1.7136347853193925e-05, + "loss": 0.3737, + "step": 24874 + }, + { + "epoch": 3.32, + "grad_norm": 0.6640625, + "learning_rate": 1.7129829685676002e-05, + "loss": 0.296, + "step": 24875 + }, + { + "epoch": 3.32, + "grad_norm": 0.56640625, + "learning_rate": 1.712331264193078e-05, + "loss": 0.4088, + "step": 24876 + }, + { + "epoch": 3.32, + "grad_norm": 0.61328125, + "learning_rate": 1.7116796722046647e-05, + "loss": 0.5441, + "step": 24877 + }, + { + "epoch": 3.32, + "grad_norm": 0.52734375, + "learning_rate": 1.7110281926111925e-05, + "loss": 0.2118, + "step": 24878 + }, + { + "epoch": 3.32, + "grad_norm": 0.54296875, + "learning_rate": 1.710376825421497e-05, + "loss": 0.2635, + "step": 24879 + }, + { + "epoch": 3.32, + "grad_norm": 0.5625, + "learning_rate": 1.7097255706444126e-05, + "loss": 0.308, + "step": 24880 + }, + { + "epoch": 3.32, + "grad_norm": 0.68359375, + "learning_rate": 1.7090744282887726e-05, + "loss": 0.6729, + "step": 24881 + }, + { + "epoch": 3.32, + "grad_norm": 0.61328125, + "learning_rate": 1.7084233983634024e-05, + "loss": 0.1941, + "step": 24882 + }, + { + "epoch": 3.32, + "grad_norm": 0.6328125, + "learning_rate": 1.7077724808771334e-05, + "loss": 0.4746, + "step": 24883 + }, + { + "epoch": 3.32, + "grad_norm": 0.61328125, + "learning_rate": 1.7071216758387907e-05, + "loss": 0.3196, + "step": 24884 + }, + { + "epoch": 3.32, + "grad_norm": 0.68359375, + "learning_rate": 1.706470983257199e-05, + "loss": 0.2379, + "step": 24885 + }, + { + "epoch": 3.32, + "grad_norm": 0.79296875, + "learning_rate": 1.7058204031411874e-05, + "loss": 0.2303, + "step": 24886 + }, + { + "epoch": 3.32, + "grad_norm": 0.84765625, + "learning_rate": 1.7051699354995708e-05, + "loss": 0.2738, + "step": 24887 + }, + { + "epoch": 3.32, + "grad_norm": 0.66796875, + "learning_rate": 1.704519580341174e-05, + "loss": 0.299, + "step": 24888 + }, + { + "epoch": 3.32, + "grad_norm": 0.64453125, + "learning_rate": 1.703869337674815e-05, + "loss": 0.4318, + "step": 24889 + }, + { + "epoch": 3.32, + "grad_norm": 0.88671875, + "learning_rate": 1.7032192075093134e-05, + "loss": 0.2401, + "step": 24890 + }, + { + "epoch": 3.32, + "grad_norm": 0.7578125, + "learning_rate": 1.702569189853486e-05, + "loss": 0.5025, + "step": 24891 + }, + { + "epoch": 3.32, + "grad_norm": 0.71484375, + "learning_rate": 1.7019192847161425e-05, + "loss": 0.2366, + "step": 24892 + }, + { + "epoch": 3.32, + "grad_norm": 0.7734375, + "learning_rate": 1.7012694921061002e-05, + "loss": 0.3663, + "step": 24893 + }, + { + "epoch": 3.32, + "grad_norm": 0.458984375, + "learning_rate": 1.7006198120321692e-05, + "loss": 0.2741, + "step": 24894 + }, + { + "epoch": 3.32, + "grad_norm": 0.765625, + "learning_rate": 1.6999702445031617e-05, + "loss": 0.1906, + "step": 24895 + }, + { + "epoch": 3.32, + "grad_norm": 0.484375, + "learning_rate": 1.699320789527886e-05, + "loss": 0.2041, + "step": 24896 + }, + { + "epoch": 3.32, + "grad_norm": 0.6640625, + "learning_rate": 1.6986714471151444e-05, + "loss": 0.5117, + "step": 24897 + }, + { + "epoch": 3.32, + "grad_norm": 0.6171875, + "learning_rate": 1.698022217273746e-05, + "loss": 0.2761, + "step": 24898 + }, + { + "epoch": 3.32, + "grad_norm": 0.6171875, + "learning_rate": 1.697373100012495e-05, + "loss": 0.2687, + "step": 24899 + }, + { + "epoch": 3.32, + "grad_norm": 0.734375, + "learning_rate": 1.6967240953401954e-05, + "loss": 0.6588, + "step": 24900 + }, + { + "epoch": 3.32, + "grad_norm": 0.59765625, + "learning_rate": 1.6960752032656434e-05, + "loss": 0.1299, + "step": 24901 + }, + { + "epoch": 3.32, + "grad_norm": 0.796875, + "learning_rate": 1.6954264237976413e-05, + "loss": 0.6915, + "step": 24902 + }, + { + "epoch": 3.32, + "grad_norm": 0.734375, + "learning_rate": 1.6947777569449863e-05, + "loss": 0.4468, + "step": 24903 + }, + { + "epoch": 3.32, + "grad_norm": 0.369140625, + "learning_rate": 1.694129202716479e-05, + "loss": 0.1376, + "step": 24904 + }, + { + "epoch": 3.32, + "grad_norm": 0.455078125, + "learning_rate": 1.693480761120907e-05, + "loss": 0.2899, + "step": 24905 + }, + { + "epoch": 3.32, + "grad_norm": 0.6015625, + "learning_rate": 1.6928324321670663e-05, + "loss": 0.1719, + "step": 24906 + }, + { + "epoch": 3.32, + "grad_norm": 0.859375, + "learning_rate": 1.6921842158637503e-05, + "loss": 0.5577, + "step": 24907 + }, + { + "epoch": 3.32, + "grad_norm": 0.734375, + "learning_rate": 1.6915361122197504e-05, + "loss": 0.5529, + "step": 24908 + }, + { + "epoch": 3.32, + "grad_norm": 0.515625, + "learning_rate": 1.690888121243853e-05, + "loss": 0.2775, + "step": 24909 + }, + { + "epoch": 3.32, + "grad_norm": 0.53515625, + "learning_rate": 1.6902402429448426e-05, + "loss": 0.4629, + "step": 24910 + }, + { + "epoch": 3.32, + "grad_norm": 0.72265625, + "learning_rate": 1.6895924773315085e-05, + "loss": 0.2831, + "step": 24911 + }, + { + "epoch": 3.32, + "grad_norm": 0.6015625, + "learning_rate": 1.6889448244126336e-05, + "loss": 0.3022, + "step": 24912 + }, + { + "epoch": 3.32, + "grad_norm": 0.765625, + "learning_rate": 1.6882972841970012e-05, + "loss": 0.5381, + "step": 24913 + }, + { + "epoch": 3.32, + "grad_norm": 0.60546875, + "learning_rate": 1.687649856693394e-05, + "loss": 0.3174, + "step": 24914 + }, + { + "epoch": 3.32, + "grad_norm": 0.83203125, + "learning_rate": 1.6870025419105873e-05, + "loss": 0.2364, + "step": 24915 + }, + { + "epoch": 3.32, + "grad_norm": 0.7265625, + "learning_rate": 1.6863553398573617e-05, + "loss": 0.2757, + "step": 24916 + }, + { + "epoch": 3.32, + "grad_norm": 0.703125, + "learning_rate": 1.6857082505424925e-05, + "loss": 0.3679, + "step": 24917 + }, + { + "epoch": 3.33, + "grad_norm": 0.56640625, + "learning_rate": 1.6850612739747585e-05, + "loss": 0.4849, + "step": 24918 + }, + { + "epoch": 3.33, + "grad_norm": 0.84375, + "learning_rate": 1.6844144101629277e-05, + "loss": 0.2698, + "step": 24919 + }, + { + "epoch": 3.33, + "grad_norm": 0.53125, + "learning_rate": 1.683767659115777e-05, + "loss": 0.3568, + "step": 24920 + }, + { + "epoch": 3.33, + "grad_norm": 0.6015625, + "learning_rate": 1.6831210208420723e-05, + "loss": 0.4677, + "step": 24921 + }, + { + "epoch": 3.33, + "grad_norm": 0.6171875, + "learning_rate": 1.6824744953505834e-05, + "loss": 0.4617, + "step": 24922 + }, + { + "epoch": 3.33, + "grad_norm": 0.6171875, + "learning_rate": 1.681828082650081e-05, + "loss": 0.2003, + "step": 24923 + }, + { + "epoch": 3.33, + "grad_norm": 0.60546875, + "learning_rate": 1.681181782749326e-05, + "loss": 0.4441, + "step": 24924 + }, + { + "epoch": 3.33, + "grad_norm": 0.5703125, + "learning_rate": 1.6805355956570845e-05, + "loss": 0.2551, + "step": 24925 + }, + { + "epoch": 3.33, + "grad_norm": 0.546875, + "learning_rate": 1.6798895213821208e-05, + "loss": 0.2193, + "step": 24926 + }, + { + "epoch": 3.33, + "grad_norm": 0.6015625, + "learning_rate": 1.679243559933197e-05, + "loss": 0.2657, + "step": 24927 + }, + { + "epoch": 3.33, + "grad_norm": 0.57421875, + "learning_rate": 1.6785977113190675e-05, + "loss": 0.3193, + "step": 24928 + }, + { + "epoch": 3.33, + "grad_norm": 0.62890625, + "learning_rate": 1.677951975548495e-05, + "loss": 0.4423, + "step": 24929 + }, + { + "epoch": 3.33, + "grad_norm": 0.63671875, + "learning_rate": 1.6773063526302335e-05, + "loss": 0.3166, + "step": 24930 + }, + { + "epoch": 3.33, + "grad_norm": 0.51953125, + "learning_rate": 1.6766608425730423e-05, + "loss": 0.3836, + "step": 24931 + }, + { + "epoch": 3.33, + "grad_norm": 0.73046875, + "learning_rate": 1.6760154453856723e-05, + "loss": 0.2352, + "step": 24932 + }, + { + "epoch": 3.33, + "grad_norm": 0.83203125, + "learning_rate": 1.6753701610768724e-05, + "loss": 0.7061, + "step": 24933 + }, + { + "epoch": 3.33, + "grad_norm": 0.5390625, + "learning_rate": 1.674724989655396e-05, + "loss": 0.4051, + "step": 24934 + }, + { + "epoch": 3.33, + "grad_norm": 0.56640625, + "learning_rate": 1.6740799311299914e-05, + "loss": 0.5349, + "step": 24935 + }, + { + "epoch": 3.33, + "grad_norm": 0.6015625, + "learning_rate": 1.67343498550941e-05, + "loss": 0.3034, + "step": 24936 + }, + { + "epoch": 3.33, + "grad_norm": 0.70703125, + "learning_rate": 1.6727901528023902e-05, + "loss": 0.4187, + "step": 24937 + }, + { + "epoch": 3.33, + "grad_norm": 0.45703125, + "learning_rate": 1.672145433017682e-05, + "loss": 0.12, + "step": 24938 + }, + { + "epoch": 3.33, + "grad_norm": 0.56640625, + "learning_rate": 1.671500826164025e-05, + "loss": 0.2804, + "step": 24939 + }, + { + "epoch": 3.33, + "grad_norm": 0.5546875, + "learning_rate": 1.6708563322501625e-05, + "loss": 0.409, + "step": 24940 + }, + { + "epoch": 3.33, + "grad_norm": 0.60546875, + "learning_rate": 1.670211951284837e-05, + "loss": 0.3642, + "step": 24941 + }, + { + "epoch": 3.33, + "grad_norm": 0.8046875, + "learning_rate": 1.6695676832767802e-05, + "loss": 0.3501, + "step": 24942 + }, + { + "epoch": 3.33, + "grad_norm": 0.8671875, + "learning_rate": 1.6689235282347315e-05, + "loss": 0.4313, + "step": 24943 + }, + { + "epoch": 3.33, + "grad_norm": 0.66015625, + "learning_rate": 1.66827948616743e-05, + "loss": 0.3704, + "step": 24944 + }, + { + "epoch": 3.33, + "grad_norm": 0.6796875, + "learning_rate": 1.6676355570836034e-05, + "loss": 0.3685, + "step": 24945 + }, + { + "epoch": 3.33, + "grad_norm": 0.474609375, + "learning_rate": 1.6669917409919888e-05, + "loss": 0.3641, + "step": 24946 + }, + { + "epoch": 3.33, + "grad_norm": 0.5, + "learning_rate": 1.6663480379013118e-05, + "loss": 0.3055, + "step": 24947 + }, + { + "epoch": 3.33, + "grad_norm": 0.68359375, + "learning_rate": 1.6657044478203032e-05, + "loss": 0.3822, + "step": 24948 + }, + { + "epoch": 3.33, + "grad_norm": 0.6953125, + "learning_rate": 1.6650609707576915e-05, + "loss": 0.4683, + "step": 24949 + }, + { + "epoch": 3.33, + "grad_norm": 0.6796875, + "learning_rate": 1.664417606722205e-05, + "loss": 0.2385, + "step": 24950 + }, + { + "epoch": 3.33, + "grad_norm": 0.51171875, + "learning_rate": 1.6637743557225638e-05, + "loss": 0.321, + "step": 24951 + }, + { + "epoch": 3.33, + "grad_norm": 0.671875, + "learning_rate": 1.6631312177674906e-05, + "loss": 0.4261, + "step": 24952 + }, + { + "epoch": 3.33, + "grad_norm": 0.65234375, + "learning_rate": 1.66248819286571e-05, + "loss": 0.4457, + "step": 24953 + }, + { + "epoch": 3.33, + "grad_norm": 0.53515625, + "learning_rate": 1.6618452810259434e-05, + "loss": 0.2083, + "step": 24954 + }, + { + "epoch": 3.33, + "grad_norm": 0.609375, + "learning_rate": 1.6612024822569028e-05, + "loss": 0.4918, + "step": 24955 + }, + { + "epoch": 3.33, + "grad_norm": 0.68359375, + "learning_rate": 1.6605597965673113e-05, + "loss": 0.4346, + "step": 24956 + }, + { + "epoch": 3.33, + "grad_norm": 0.58984375, + "learning_rate": 1.659917223965878e-05, + "loss": 0.32, + "step": 24957 + }, + { + "epoch": 3.33, + "grad_norm": 0.4921875, + "learning_rate": 1.6592747644613204e-05, + "loss": 0.3656, + "step": 24958 + }, + { + "epoch": 3.33, + "grad_norm": 0.49609375, + "learning_rate": 1.6586324180623525e-05, + "loss": 0.2546, + "step": 24959 + }, + { + "epoch": 3.33, + "grad_norm": 0.75390625, + "learning_rate": 1.65799018477768e-05, + "loss": 0.4366, + "step": 24960 + }, + { + "epoch": 3.33, + "grad_norm": 0.52734375, + "learning_rate": 1.6573480646160146e-05, + "loss": 0.2152, + "step": 24961 + }, + { + "epoch": 3.33, + "grad_norm": 0.61328125, + "learning_rate": 1.6567060575860628e-05, + "loss": 0.2895, + "step": 24962 + }, + { + "epoch": 3.33, + "grad_norm": 0.90234375, + "learning_rate": 1.6560641636965325e-05, + "loss": 0.3604, + "step": 24963 + }, + { + "epoch": 3.33, + "grad_norm": 0.53515625, + "learning_rate": 1.6554223829561288e-05, + "loss": 0.304, + "step": 24964 + }, + { + "epoch": 3.33, + "grad_norm": 0.5234375, + "learning_rate": 1.6547807153735518e-05, + "loss": 0.1936, + "step": 24965 + }, + { + "epoch": 3.33, + "grad_norm": 0.58203125, + "learning_rate": 1.654139160957503e-05, + "loss": 0.3908, + "step": 24966 + }, + { + "epoch": 3.33, + "grad_norm": 0.474609375, + "learning_rate": 1.6534977197166845e-05, + "loss": 0.1677, + "step": 24967 + }, + { + "epoch": 3.33, + "grad_norm": 0.52734375, + "learning_rate": 1.6528563916597962e-05, + "loss": 0.3029, + "step": 24968 + }, + { + "epoch": 3.33, + "grad_norm": 0.79296875, + "learning_rate": 1.652215176795532e-05, + "loss": 0.3831, + "step": 24969 + }, + { + "epoch": 3.33, + "grad_norm": 0.625, + "learning_rate": 1.651574075132585e-05, + "loss": 0.3375, + "step": 24970 + }, + { + "epoch": 3.33, + "grad_norm": 0.8828125, + "learning_rate": 1.6509330866796513e-05, + "loss": 0.5756, + "step": 24971 + }, + { + "epoch": 3.33, + "grad_norm": 0.65234375, + "learning_rate": 1.6502922114454234e-05, + "loss": 0.3072, + "step": 24972 + }, + { + "epoch": 3.33, + "grad_norm": 0.578125, + "learning_rate": 1.649651449438595e-05, + "loss": 0.297, + "step": 24973 + }, + { + "epoch": 3.33, + "grad_norm": 0.78515625, + "learning_rate": 1.6490108006678494e-05, + "loss": 0.3456, + "step": 24974 + }, + { + "epoch": 3.33, + "grad_norm": 0.62109375, + "learning_rate": 1.6483702651418764e-05, + "loss": 0.3271, + "step": 24975 + }, + { + "epoch": 3.33, + "grad_norm": 0.62109375, + "learning_rate": 1.6477298428693634e-05, + "loss": 0.2511, + "step": 24976 + }, + { + "epoch": 3.33, + "grad_norm": 0.7109375, + "learning_rate": 1.6470895338589963e-05, + "loss": 0.4883, + "step": 24977 + }, + { + "epoch": 3.33, + "grad_norm": 0.66796875, + "learning_rate": 1.6464493381194536e-05, + "loss": 0.2963, + "step": 24978 + }, + { + "epoch": 3.33, + "grad_norm": 0.56640625, + "learning_rate": 1.6458092556594195e-05, + "loss": 0.3762, + "step": 24979 + }, + { + "epoch": 3.33, + "grad_norm": 0.50390625, + "learning_rate": 1.6451692864875756e-05, + "loss": 0.3084, + "step": 24980 + }, + { + "epoch": 3.33, + "grad_norm": 0.474609375, + "learning_rate": 1.6445294306125957e-05, + "loss": 0.2718, + "step": 24981 + }, + { + "epoch": 3.33, + "grad_norm": 0.470703125, + "learning_rate": 1.643889688043162e-05, + "loss": 0.2707, + "step": 24982 + }, + { + "epoch": 3.33, + "grad_norm": 0.5390625, + "learning_rate": 1.6432500587879452e-05, + "loss": 0.2893, + "step": 24983 + }, + { + "epoch": 3.33, + "grad_norm": 0.66015625, + "learning_rate": 1.6426105428556193e-05, + "loss": 0.4859, + "step": 24984 + }, + { + "epoch": 3.33, + "grad_norm": 0.62109375, + "learning_rate": 1.6419711402548598e-05, + "loss": 0.352, + "step": 24985 + }, + { + "epoch": 3.33, + "grad_norm": 0.435546875, + "learning_rate": 1.6413318509943355e-05, + "loss": 0.3835, + "step": 24986 + }, + { + "epoch": 3.33, + "grad_norm": 0.58203125, + "learning_rate": 1.6406926750827167e-05, + "loss": 0.3096, + "step": 24987 + }, + { + "epoch": 3.33, + "grad_norm": 0.60546875, + "learning_rate": 1.640053612528669e-05, + "loss": 0.1987, + "step": 24988 + }, + { + "epoch": 3.33, + "grad_norm": 0.69921875, + "learning_rate": 1.6394146633408604e-05, + "loss": 0.4195, + "step": 24989 + }, + { + "epoch": 3.33, + "grad_norm": 0.51171875, + "learning_rate": 1.6387758275279542e-05, + "loss": 0.2076, + "step": 24990 + }, + { + "epoch": 3.33, + "grad_norm": 0.58203125, + "learning_rate": 1.6381371050986162e-05, + "loss": 0.2781, + "step": 24991 + }, + { + "epoch": 3.33, + "grad_norm": 0.609375, + "learning_rate": 1.637498496061505e-05, + "loss": 0.1981, + "step": 24992 + }, + { + "epoch": 3.34, + "grad_norm": 0.59765625, + "learning_rate": 1.6368600004252798e-05, + "loss": 0.2391, + "step": 24993 + }, + { + "epoch": 3.34, + "grad_norm": 0.515625, + "learning_rate": 1.6362216181986002e-05, + "loss": 0.3222, + "step": 24994 + }, + { + "epoch": 3.34, + "grad_norm": 0.61328125, + "learning_rate": 1.635583349390123e-05, + "loss": 0.2111, + "step": 24995 + }, + { + "epoch": 3.34, + "grad_norm": 0.6796875, + "learning_rate": 1.634945194008506e-05, + "loss": 0.3136, + "step": 24996 + }, + { + "epoch": 3.34, + "grad_norm": 0.64453125, + "learning_rate": 1.6343071520623987e-05, + "loss": 0.379, + "step": 24997 + }, + { + "epoch": 3.34, + "grad_norm": 0.85546875, + "learning_rate": 1.6336692235604556e-05, + "loss": 0.2968, + "step": 24998 + }, + { + "epoch": 3.34, + "grad_norm": 0.76171875, + "learning_rate": 1.6330314085113276e-05, + "loss": 0.6934, + "step": 24999 + }, + { + "epoch": 3.34, + "grad_norm": 0.7109375, + "learning_rate": 1.6323937069236662e-05, + "loss": 0.3513, + "step": 25000 + }, + { + "epoch": 3.34, + "grad_norm": 0.55078125, + "learning_rate": 1.6317561188061126e-05, + "loss": 0.2559, + "step": 25001 + }, + { + "epoch": 3.34, + "grad_norm": 0.76171875, + "learning_rate": 1.6311186441673176e-05, + "loss": 0.3924, + "step": 25002 + }, + { + "epoch": 3.34, + "grad_norm": 0.6171875, + "learning_rate": 1.6304812830159254e-05, + "loss": 0.419, + "step": 25003 + }, + { + "epoch": 3.34, + "grad_norm": 0.51171875, + "learning_rate": 1.6298440353605805e-05, + "loss": 0.2707, + "step": 25004 + }, + { + "epoch": 3.34, + "grad_norm": 0.7265625, + "learning_rate": 1.629206901209922e-05, + "loss": 0.3536, + "step": 25005 + }, + { + "epoch": 3.34, + "grad_norm": 0.6328125, + "learning_rate": 1.6285698805725878e-05, + "loss": 0.4467, + "step": 25006 + }, + { + "epoch": 3.34, + "grad_norm": 0.58203125, + "learning_rate": 1.62793297345722e-05, + "loss": 0.2997, + "step": 25007 + }, + { + "epoch": 3.34, + "grad_norm": 0.72265625, + "learning_rate": 1.627296179872453e-05, + "loss": 0.5883, + "step": 25008 + }, + { + "epoch": 3.34, + "grad_norm": 0.5390625, + "learning_rate": 1.626659499826927e-05, + "loss": 0.2367, + "step": 25009 + }, + { + "epoch": 3.34, + "grad_norm": 0.58984375, + "learning_rate": 1.6260229333292686e-05, + "loss": 0.4173, + "step": 25010 + }, + { + "epoch": 3.34, + "grad_norm": 0.625, + "learning_rate": 1.6253864803881137e-05, + "loss": 0.1714, + "step": 25011 + }, + { + "epoch": 3.34, + "grad_norm": 0.625, + "learning_rate": 1.6247501410120935e-05, + "loss": 0.1584, + "step": 25012 + }, + { + "epoch": 3.34, + "grad_norm": 0.74609375, + "learning_rate": 1.624113915209836e-05, + "loss": 0.4472, + "step": 25013 + }, + { + "epoch": 3.34, + "grad_norm": 0.8046875, + "learning_rate": 1.6234778029899723e-05, + "loss": 0.4981, + "step": 25014 + }, + { + "epoch": 3.34, + "grad_norm": 0.671875, + "learning_rate": 1.6228418043611227e-05, + "loss": 0.4682, + "step": 25015 + }, + { + "epoch": 3.34, + "grad_norm": 0.4921875, + "learning_rate": 1.622205919331915e-05, + "loss": 0.3159, + "step": 25016 + }, + { + "epoch": 3.34, + "grad_norm": 0.7421875, + "learning_rate": 1.6215701479109747e-05, + "loss": 0.3772, + "step": 25017 + }, + { + "epoch": 3.34, + "grad_norm": 0.486328125, + "learning_rate": 1.6209344901069168e-05, + "loss": 0.2281, + "step": 25018 + }, + { + "epoch": 3.34, + "grad_norm": 0.6015625, + "learning_rate": 1.620298945928368e-05, + "loss": 0.3625, + "step": 25019 + }, + { + "epoch": 3.34, + "grad_norm": 0.63671875, + "learning_rate": 1.6196635153839414e-05, + "loss": 0.3843, + "step": 25020 + }, + { + "epoch": 3.34, + "grad_norm": 0.46875, + "learning_rate": 1.6190281984822553e-05, + "loss": 0.2847, + "step": 25021 + }, + { + "epoch": 3.34, + "grad_norm": 0.71484375, + "learning_rate": 1.6183929952319267e-05, + "loss": 0.3724, + "step": 25022 + }, + { + "epoch": 3.34, + "grad_norm": 0.59375, + "learning_rate": 1.6177579056415703e-05, + "loss": 0.3982, + "step": 25023 + }, + { + "epoch": 3.34, + "grad_norm": 0.4765625, + "learning_rate": 1.6171229297197943e-05, + "loss": 0.2186, + "step": 25024 + }, + { + "epoch": 3.34, + "grad_norm": 0.6640625, + "learning_rate": 1.6164880674752104e-05, + "loss": 0.3524, + "step": 25025 + }, + { + "epoch": 3.34, + "grad_norm": 0.671875, + "learning_rate": 1.6158533189164292e-05, + "loss": 0.3736, + "step": 25026 + }, + { + "epoch": 3.34, + "grad_norm": 0.49609375, + "learning_rate": 1.615218684052061e-05, + "loss": 0.2026, + "step": 25027 + }, + { + "epoch": 3.34, + "grad_norm": 0.67578125, + "learning_rate": 1.6145841628907055e-05, + "loss": 0.414, + "step": 25028 + }, + { + "epoch": 3.34, + "grad_norm": 0.6875, + "learning_rate": 1.6139497554409734e-05, + "loss": 0.2846, + "step": 25029 + }, + { + "epoch": 3.34, + "grad_norm": 0.66015625, + "learning_rate": 1.613315461711462e-05, + "loss": 0.4292, + "step": 25030 + }, + { + "epoch": 3.34, + "grad_norm": 0.796875, + "learning_rate": 1.612681281710776e-05, + "loss": 0.3711, + "step": 25031 + }, + { + "epoch": 3.34, + "grad_norm": 0.6171875, + "learning_rate": 1.6120472154475173e-05, + "loss": 0.2615, + "step": 25032 + }, + { + "epoch": 3.34, + "grad_norm": 0.65234375, + "learning_rate": 1.6114132629302792e-05, + "loss": 0.5643, + "step": 25033 + }, + { + "epoch": 3.34, + "grad_norm": 0.5859375, + "learning_rate": 1.6107794241676598e-05, + "loss": 0.5386, + "step": 25034 + }, + { + "epoch": 3.34, + "grad_norm": 0.71484375, + "learning_rate": 1.6101456991682574e-05, + "loss": 0.3148, + "step": 25035 + }, + { + "epoch": 3.34, + "grad_norm": 0.6328125, + "learning_rate": 1.6095120879406624e-05, + "loss": 0.336, + "step": 25036 + }, + { + "epoch": 3.34, + "grad_norm": 0.5859375, + "learning_rate": 1.6088785904934723e-05, + "loss": 0.334, + "step": 25037 + }, + { + "epoch": 3.34, + "grad_norm": 0.71484375, + "learning_rate": 1.6082452068352705e-05, + "loss": 0.2169, + "step": 25038 + }, + { + "epoch": 3.34, + "grad_norm": 0.546875, + "learning_rate": 1.607611936974649e-05, + "loss": 0.3105, + "step": 25039 + }, + { + "epoch": 3.34, + "grad_norm": 0.671875, + "learning_rate": 1.6069787809201963e-05, + "loss": 0.2197, + "step": 25040 + }, + { + "epoch": 3.34, + "grad_norm": 0.478515625, + "learning_rate": 1.6063457386805004e-05, + "loss": 0.3398, + "step": 25041 + }, + { + "epoch": 3.34, + "grad_norm": 0.68359375, + "learning_rate": 1.605712810264144e-05, + "loss": 0.1447, + "step": 25042 + }, + { + "epoch": 3.34, + "grad_norm": 0.63671875, + "learning_rate": 1.6050799956797056e-05, + "loss": 0.3698, + "step": 25043 + }, + { + "epoch": 3.34, + "grad_norm": 0.59765625, + "learning_rate": 1.6044472949357714e-05, + "loss": 0.2508, + "step": 25044 + }, + { + "epoch": 3.34, + "grad_norm": 0.63671875, + "learning_rate": 1.6038147080409193e-05, + "loss": 0.5966, + "step": 25045 + }, + { + "epoch": 3.34, + "grad_norm": 0.5078125, + "learning_rate": 1.603182235003732e-05, + "loss": 0.2701, + "step": 25046 + }, + { + "epoch": 3.34, + "grad_norm": 0.5078125, + "learning_rate": 1.6025498758327793e-05, + "loss": 0.2777, + "step": 25047 + }, + { + "epoch": 3.34, + "grad_norm": 0.59375, + "learning_rate": 1.60191763053664e-05, + "loss": 0.2002, + "step": 25048 + }, + { + "epoch": 3.34, + "grad_norm": 0.52734375, + "learning_rate": 1.6012854991238868e-05, + "loss": 0.1956, + "step": 25049 + }, + { + "epoch": 3.34, + "grad_norm": 0.67578125, + "learning_rate": 1.600653481603096e-05, + "loss": 0.2024, + "step": 25050 + }, + { + "epoch": 3.34, + "grad_norm": 0.6640625, + "learning_rate": 1.600021577982831e-05, + "loss": 0.2527, + "step": 25051 + }, + { + "epoch": 3.34, + "grad_norm": 0.7109375, + "learning_rate": 1.5993897882716658e-05, + "loss": 0.2373, + "step": 25052 + }, + { + "epoch": 3.34, + "grad_norm": 0.5703125, + "learning_rate": 1.5987581124781682e-05, + "loss": 0.3073, + "step": 25053 + }, + { + "epoch": 3.34, + "grad_norm": 0.54296875, + "learning_rate": 1.5981265506109e-05, + "loss": 0.2126, + "step": 25054 + }, + { + "epoch": 3.34, + "grad_norm": 0.8046875, + "learning_rate": 1.597495102678431e-05, + "loss": 0.3581, + "step": 25055 + }, + { + "epoch": 3.34, + "grad_norm": 0.7578125, + "learning_rate": 1.5968637686893186e-05, + "loss": 0.3147, + "step": 25056 + }, + { + "epoch": 3.34, + "grad_norm": 0.69140625, + "learning_rate": 1.5962325486521257e-05, + "loss": 0.3212, + "step": 25057 + }, + { + "epoch": 3.34, + "grad_norm": 0.53515625, + "learning_rate": 1.595601442575414e-05, + "loss": 0.2017, + "step": 25058 + }, + { + "epoch": 3.34, + "grad_norm": 0.75390625, + "learning_rate": 1.5949704504677397e-05, + "loss": 0.4237, + "step": 25059 + }, + { + "epoch": 3.34, + "grad_norm": 0.470703125, + "learning_rate": 1.594339572337663e-05, + "loss": 0.116, + "step": 25060 + }, + { + "epoch": 3.34, + "grad_norm": 0.494140625, + "learning_rate": 1.5937088081937347e-05, + "loss": 0.3023, + "step": 25061 + }, + { + "epoch": 3.34, + "grad_norm": 0.6015625, + "learning_rate": 1.5930781580445097e-05, + "loss": 0.2218, + "step": 25062 + }, + { + "epoch": 3.34, + "grad_norm": 0.41015625, + "learning_rate": 1.5924476218985407e-05, + "loss": 0.1348, + "step": 25063 + }, + { + "epoch": 3.34, + "grad_norm": 0.52734375, + "learning_rate": 1.5918171997643805e-05, + "loss": 0.197, + "step": 25064 + }, + { + "epoch": 3.34, + "grad_norm": 0.84765625, + "learning_rate": 1.5911868916505744e-05, + "loss": 0.3268, + "step": 25065 + }, + { + "epoch": 3.34, + "grad_norm": 0.58984375, + "learning_rate": 1.5905566975656695e-05, + "loss": 0.2492, + "step": 25066 + }, + { + "epoch": 3.34, + "grad_norm": 0.53125, + "learning_rate": 1.5899266175182113e-05, + "loss": 0.5519, + "step": 25067 + }, + { + "epoch": 3.35, + "grad_norm": 0.84765625, + "learning_rate": 1.5892966515167474e-05, + "loss": 0.4696, + "step": 25068 + }, + { + "epoch": 3.35, + "grad_norm": 0.490234375, + "learning_rate": 1.58866679956982e-05, + "loss": 0.2267, + "step": 25069 + }, + { + "epoch": 3.35, + "grad_norm": 0.435546875, + "learning_rate": 1.5880370616859675e-05, + "loss": 0.1993, + "step": 25070 + }, + { + "epoch": 3.35, + "grad_norm": 0.515625, + "learning_rate": 1.5874074378737314e-05, + "loss": 0.4446, + "step": 25071 + }, + { + "epoch": 3.35, + "grad_norm": 0.72265625, + "learning_rate": 1.5867779281416495e-05, + "loss": 0.6105, + "step": 25072 + }, + { + "epoch": 3.35, + "grad_norm": 0.6953125, + "learning_rate": 1.5861485324982604e-05, + "loss": 0.2792, + "step": 25073 + }, + { + "epoch": 3.35, + "grad_norm": 0.4609375, + "learning_rate": 1.5855192509520945e-05, + "loss": 0.1911, + "step": 25074 + }, + { + "epoch": 3.35, + "grad_norm": 0.5703125, + "learning_rate": 1.5848900835116888e-05, + "loss": 0.3736, + "step": 25075 + }, + { + "epoch": 3.35, + "grad_norm": 0.69921875, + "learning_rate": 1.5842610301855743e-05, + "loss": 0.2892, + "step": 25076 + }, + { + "epoch": 3.35, + "grad_norm": 0.5546875, + "learning_rate": 1.5836320909822833e-05, + "loss": 0.3314, + "step": 25077 + }, + { + "epoch": 3.35, + "grad_norm": 0.71875, + "learning_rate": 1.5830032659103435e-05, + "loss": 0.2435, + "step": 25078 + }, + { + "epoch": 3.35, + "grad_norm": 0.6640625, + "learning_rate": 1.5823745549782776e-05, + "loss": 0.329, + "step": 25079 + }, + { + "epoch": 3.35, + "grad_norm": 0.6015625, + "learning_rate": 1.581745958194616e-05, + "loss": 0.4303, + "step": 25080 + }, + { + "epoch": 3.35, + "grad_norm": 0.6484375, + "learning_rate": 1.5811174755678825e-05, + "loss": 0.3993, + "step": 25081 + }, + { + "epoch": 3.35, + "grad_norm": 0.61328125, + "learning_rate": 1.5804891071066007e-05, + "loss": 0.342, + "step": 25082 + }, + { + "epoch": 3.35, + "grad_norm": 0.421875, + "learning_rate": 1.5798608528192883e-05, + "loss": 0.121, + "step": 25083 + }, + { + "epoch": 3.35, + "grad_norm": 0.52734375, + "learning_rate": 1.579232712714467e-05, + "loss": 0.2851, + "step": 25084 + }, + { + "epoch": 3.35, + "grad_norm": 0.625, + "learning_rate": 1.5786046868006544e-05, + "loss": 0.4594, + "step": 25085 + }, + { + "epoch": 3.35, + "grad_norm": 0.48046875, + "learning_rate": 1.577976775086366e-05, + "loss": 0.4096, + "step": 25086 + }, + { + "epoch": 3.35, + "grad_norm": 0.58984375, + "learning_rate": 1.577348977580121e-05, + "loss": 0.3683, + "step": 25087 + }, + { + "epoch": 3.35, + "grad_norm": 0.76953125, + "learning_rate": 1.5767212942904276e-05, + "loss": 0.4558, + "step": 25088 + }, + { + "epoch": 3.35, + "grad_norm": 0.71875, + "learning_rate": 1.576093725225799e-05, + "loss": 0.5373, + "step": 25089 + }, + { + "epoch": 3.35, + "grad_norm": 0.427734375, + "learning_rate": 1.5754662703947476e-05, + "loss": 0.213, + "step": 25090 + }, + { + "epoch": 3.35, + "grad_norm": 0.58984375, + "learning_rate": 1.5748389298057785e-05, + "loss": 0.2242, + "step": 25091 + }, + { + "epoch": 3.35, + "grad_norm": 0.63671875, + "learning_rate": 1.5742117034674042e-05, + "loss": 0.2323, + "step": 25092 + }, + { + "epoch": 3.35, + "grad_norm": 0.77734375, + "learning_rate": 1.573584591388123e-05, + "loss": 0.3422, + "step": 25093 + }, + { + "epoch": 3.35, + "grad_norm": 0.59765625, + "learning_rate": 1.572957593576444e-05, + "loss": 0.2293, + "step": 25094 + }, + { + "epoch": 3.35, + "grad_norm": 0.482421875, + "learning_rate": 1.572330710040868e-05, + "loss": 0.2358, + "step": 25095 + }, + { + "epoch": 3.35, + "grad_norm": 0.66015625, + "learning_rate": 1.571703940789898e-05, + "loss": 0.3743, + "step": 25096 + }, + { + "epoch": 3.35, + "grad_norm": 0.50390625, + "learning_rate": 1.57107728583203e-05, + "loss": 0.4676, + "step": 25097 + }, + { + "epoch": 3.35, + "grad_norm": 0.5078125, + "learning_rate": 1.5704507451757634e-05, + "loss": 0.2262, + "step": 25098 + }, + { + "epoch": 3.35, + "grad_norm": 0.671875, + "learning_rate": 1.569824318829596e-05, + "loss": 0.5807, + "step": 25099 + }, + { + "epoch": 3.35, + "grad_norm": 0.8828125, + "learning_rate": 1.569198006802022e-05, + "loss": 0.4094, + "step": 25100 + }, + { + "epoch": 3.35, + "grad_norm": 0.6015625, + "learning_rate": 1.5685718091015324e-05, + "loss": 0.6369, + "step": 25101 + }, + { + "epoch": 3.35, + "grad_norm": 0.8125, + "learning_rate": 1.5679457257366225e-05, + "loss": 0.4418, + "step": 25102 + }, + { + "epoch": 3.35, + "grad_norm": 0.703125, + "learning_rate": 1.5673197567157784e-05, + "loss": 0.619, + "step": 25103 + }, + { + "epoch": 3.35, + "grad_norm": 0.6328125, + "learning_rate": 1.566693902047489e-05, + "loss": 0.3992, + "step": 25104 + }, + { + "epoch": 3.35, + "grad_norm": 0.38671875, + "learning_rate": 1.5660681617402463e-05, + "loss": 0.0999, + "step": 25105 + }, + { + "epoch": 3.35, + "grad_norm": 0.578125, + "learning_rate": 1.5654425358025294e-05, + "loss": 0.3784, + "step": 25106 + }, + { + "epoch": 3.35, + "grad_norm": 0.671875, + "learning_rate": 1.564817024242825e-05, + "loss": 0.3141, + "step": 25107 + }, + { + "epoch": 3.35, + "grad_norm": 0.7109375, + "learning_rate": 1.5641916270696155e-05, + "loss": 0.2771, + "step": 25108 + }, + { + "epoch": 3.35, + "grad_norm": 0.58984375, + "learning_rate": 1.563566344291382e-05, + "loss": 0.4063, + "step": 25109 + }, + { + "epoch": 3.35, + "grad_norm": 0.4375, + "learning_rate": 1.5629411759166045e-05, + "loss": 0.3771, + "step": 25110 + }, + { + "epoch": 3.35, + "grad_norm": 0.9140625, + "learning_rate": 1.5623161219537587e-05, + "loss": 0.33, + "step": 25111 + }, + { + "epoch": 3.35, + "grad_norm": 0.6171875, + "learning_rate": 1.561691182411321e-05, + "loss": 0.1895, + "step": 25112 + }, + { + "epoch": 3.35, + "grad_norm": 0.51953125, + "learning_rate": 1.5610663572977658e-05, + "loss": 0.3337, + "step": 25113 + }, + { + "epoch": 3.35, + "grad_norm": 0.6953125, + "learning_rate": 1.56044164662157e-05, + "loss": 0.352, + "step": 25114 + }, + { + "epoch": 3.35, + "grad_norm": 0.640625, + "learning_rate": 1.5598170503912024e-05, + "loss": 0.294, + "step": 25115 + }, + { + "epoch": 3.35, + "grad_norm": 0.51171875, + "learning_rate": 1.5591925686151288e-05, + "loss": 0.1268, + "step": 25116 + }, + { + "epoch": 3.35, + "grad_norm": 0.515625, + "learning_rate": 1.5585682013018222e-05, + "loss": 0.1494, + "step": 25117 + }, + { + "epoch": 3.35, + "grad_norm": 0.6796875, + "learning_rate": 1.557943948459748e-05, + "loss": 0.4229, + "step": 25118 + }, + { + "epoch": 3.35, + "grad_norm": 0.43359375, + "learning_rate": 1.557319810097374e-05, + "loss": 0.1748, + "step": 25119 + }, + { + "epoch": 3.35, + "grad_norm": 0.75390625, + "learning_rate": 1.5566957862231603e-05, + "loss": 0.4994, + "step": 25120 + }, + { + "epoch": 3.35, + "grad_norm": 0.5390625, + "learning_rate": 1.5560718768455707e-05, + "loss": 0.2609, + "step": 25121 + }, + { + "epoch": 3.35, + "grad_norm": 0.6875, + "learning_rate": 1.5554480819730653e-05, + "loss": 0.4293, + "step": 25122 + }, + { + "epoch": 3.35, + "grad_norm": 0.5390625, + "learning_rate": 1.5548244016141055e-05, + "loss": 0.569, + "step": 25123 + }, + { + "epoch": 3.35, + "grad_norm": 0.6484375, + "learning_rate": 1.5542008357771453e-05, + "loss": 0.4528, + "step": 25124 + }, + { + "epoch": 3.35, + "grad_norm": 0.6328125, + "learning_rate": 1.5535773844706413e-05, + "loss": 0.3114, + "step": 25125 + }, + { + "epoch": 3.35, + "grad_norm": 0.6484375, + "learning_rate": 1.5529540477030525e-05, + "loss": 0.436, + "step": 25126 + }, + { + "epoch": 3.35, + "grad_norm": 0.6328125, + "learning_rate": 1.552330825482825e-05, + "loss": 0.197, + "step": 25127 + }, + { + "epoch": 3.35, + "grad_norm": 0.62109375, + "learning_rate": 1.551707717818416e-05, + "loss": 0.2346, + "step": 25128 + }, + { + "epoch": 3.35, + "grad_norm": 0.703125, + "learning_rate": 1.5510847247182692e-05, + "loss": 0.4242, + "step": 25129 + }, + { + "epoch": 3.35, + "grad_norm": 0.5859375, + "learning_rate": 1.550461846190836e-05, + "loss": 0.3727, + "step": 25130 + }, + { + "epoch": 3.35, + "grad_norm": 0.68359375, + "learning_rate": 1.549839082244564e-05, + "loss": 0.4529, + "step": 25131 + }, + { + "epoch": 3.35, + "grad_norm": 0.333984375, + "learning_rate": 1.5492164328878965e-05, + "loss": 0.1413, + "step": 25132 + }, + { + "epoch": 3.35, + "grad_norm": 0.60546875, + "learning_rate": 1.5485938981292803e-05, + "loss": 0.4011, + "step": 25133 + }, + { + "epoch": 3.35, + "grad_norm": 0.58203125, + "learning_rate": 1.5479714779771526e-05, + "loss": 0.2569, + "step": 25134 + }, + { + "epoch": 3.35, + "grad_norm": 0.6640625, + "learning_rate": 1.547349172439957e-05, + "loss": 0.457, + "step": 25135 + }, + { + "epoch": 3.35, + "grad_norm": 0.66015625, + "learning_rate": 1.546726981526131e-05, + "loss": 0.2197, + "step": 25136 + }, + { + "epoch": 3.35, + "grad_norm": 0.609375, + "learning_rate": 1.5461049052441145e-05, + "loss": 0.3455, + "step": 25137 + }, + { + "epoch": 3.35, + "grad_norm": 0.625, + "learning_rate": 1.545482943602341e-05, + "loss": 0.3827, + "step": 25138 + }, + { + "epoch": 3.35, + "grad_norm": 0.61328125, + "learning_rate": 1.5448610966092424e-05, + "loss": 0.2213, + "step": 25139 + }, + { + "epoch": 3.35, + "grad_norm": 0.84375, + "learning_rate": 1.5442393642732545e-05, + "loss": 0.3603, + "step": 25140 + }, + { + "epoch": 3.35, + "grad_norm": 0.734375, + "learning_rate": 1.5436177466028068e-05, + "loss": 0.4213, + "step": 25141 + }, + { + "epoch": 3.35, + "grad_norm": 0.443359375, + "learning_rate": 1.5429962436063328e-05, + "loss": 0.1725, + "step": 25142 + }, + { + "epoch": 3.36, + "grad_norm": 0.671875, + "learning_rate": 1.5423748552922556e-05, + "loss": 0.229, + "step": 25143 + }, + { + "epoch": 3.36, + "grad_norm": 0.515625, + "learning_rate": 1.541753581669002e-05, + "loss": 0.1862, + "step": 25144 + }, + { + "epoch": 3.36, + "grad_norm": 0.6328125, + "learning_rate": 1.5411324227449998e-05, + "loss": 0.3456, + "step": 25145 + }, + { + "epoch": 3.36, + "grad_norm": 0.65625, + "learning_rate": 1.5405113785286717e-05, + "loss": 0.594, + "step": 25146 + }, + { + "epoch": 3.36, + "grad_norm": 0.5390625, + "learning_rate": 1.539890449028437e-05, + "loss": 0.4054, + "step": 25147 + }, + { + "epoch": 3.36, + "grad_norm": 0.546875, + "learning_rate": 1.539269634252717e-05, + "loss": 0.3219, + "step": 25148 + }, + { + "epoch": 3.36, + "grad_norm": 0.6484375, + "learning_rate": 1.538648934209932e-05, + "loss": 0.1714, + "step": 25149 + }, + { + "epoch": 3.36, + "grad_norm": 0.59375, + "learning_rate": 1.538028348908499e-05, + "loss": 0.2845, + "step": 25150 + }, + { + "epoch": 3.36, + "grad_norm": 0.578125, + "learning_rate": 1.5374078783568325e-05, + "loss": 0.2218, + "step": 25151 + }, + { + "epoch": 3.36, + "grad_norm": 0.6640625, + "learning_rate": 1.5367875225633444e-05, + "loss": 0.2282, + "step": 25152 + }, + { + "epoch": 3.36, + "grad_norm": 0.62109375, + "learning_rate": 1.5361672815364492e-05, + "loss": 0.2807, + "step": 25153 + }, + { + "epoch": 3.36, + "grad_norm": 0.52734375, + "learning_rate": 1.535547155284558e-05, + "loss": 0.1707, + "step": 25154 + }, + { + "epoch": 3.36, + "grad_norm": 0.62109375, + "learning_rate": 1.5349271438160816e-05, + "loss": 0.3474, + "step": 25155 + }, + { + "epoch": 3.36, + "grad_norm": 0.7109375, + "learning_rate": 1.5343072471394237e-05, + "loss": 0.4703, + "step": 25156 + }, + { + "epoch": 3.36, + "grad_norm": 0.69140625, + "learning_rate": 1.533687465262994e-05, + "loss": 0.3433, + "step": 25157 + }, + { + "epoch": 3.36, + "grad_norm": 0.58984375, + "learning_rate": 1.5330677981951946e-05, + "loss": 0.1999, + "step": 25158 + }, + { + "epoch": 3.36, + "grad_norm": 0.5859375, + "learning_rate": 1.53244824594443e-05, + "loss": 0.263, + "step": 25159 + }, + { + "epoch": 3.36, + "grad_norm": 0.5703125, + "learning_rate": 1.5318288085191046e-05, + "loss": 0.3171, + "step": 25160 + }, + { + "epoch": 3.36, + "grad_norm": 0.59765625, + "learning_rate": 1.5312094859276126e-05, + "loss": 0.3808, + "step": 25161 + }, + { + "epoch": 3.36, + "grad_norm": 0.51171875, + "learning_rate": 1.5305902781783578e-05, + "loss": 0.1805, + "step": 25162 + }, + { + "epoch": 3.36, + "grad_norm": 0.578125, + "learning_rate": 1.5299711852797317e-05, + "loss": 0.4125, + "step": 25163 + }, + { + "epoch": 3.36, + "grad_norm": 0.55859375, + "learning_rate": 1.5293522072401322e-05, + "loss": 0.2109, + "step": 25164 + }, + { + "epoch": 3.36, + "grad_norm": 0.5078125, + "learning_rate": 1.5287333440679552e-05, + "loss": 0.3027, + "step": 25165 + }, + { + "epoch": 3.36, + "grad_norm": 0.6328125, + "learning_rate": 1.5281145957715893e-05, + "loss": 0.3403, + "step": 25166 + }, + { + "epoch": 3.36, + "grad_norm": 0.66015625, + "learning_rate": 1.5274959623594265e-05, + "loss": 0.3186, + "step": 25167 + }, + { + "epoch": 3.36, + "grad_norm": 0.62890625, + "learning_rate": 1.526877443839857e-05, + "loss": 0.5114, + "step": 25168 + }, + { + "epoch": 3.36, + "grad_norm": 0.59375, + "learning_rate": 1.5262590402212685e-05, + "loss": 0.3153, + "step": 25169 + }, + { + "epoch": 3.36, + "grad_norm": 0.765625, + "learning_rate": 1.5256407515120442e-05, + "loss": 0.3564, + "step": 25170 + }, + { + "epoch": 3.36, + "grad_norm": 0.546875, + "learning_rate": 1.5250225777205696e-05, + "loss": 0.2385, + "step": 25171 + }, + { + "epoch": 3.36, + "grad_norm": 0.60546875, + "learning_rate": 1.524404518855228e-05, + "loss": 0.2651, + "step": 25172 + }, + { + "epoch": 3.36, + "grad_norm": 0.412109375, + "learning_rate": 1.523786574924403e-05, + "loss": 0.1026, + "step": 25173 + }, + { + "epoch": 3.36, + "grad_norm": 0.75390625, + "learning_rate": 1.5231687459364707e-05, + "loss": 0.2217, + "step": 25174 + }, + { + "epoch": 3.36, + "grad_norm": 0.5234375, + "learning_rate": 1.5225510318998115e-05, + "loss": 0.1315, + "step": 25175 + }, + { + "epoch": 3.36, + "grad_norm": 0.5078125, + "learning_rate": 1.5219334328227996e-05, + "loss": 0.4862, + "step": 25176 + }, + { + "epoch": 3.36, + "grad_norm": 0.63671875, + "learning_rate": 1.5213159487138118e-05, + "loss": 0.2887, + "step": 25177 + }, + { + "epoch": 3.36, + "grad_norm": 0.61328125, + "learning_rate": 1.5206985795812234e-05, + "loss": 0.3211, + "step": 25178 + }, + { + "epoch": 3.36, + "grad_norm": 0.59765625, + "learning_rate": 1.5200813254334012e-05, + "loss": 0.4107, + "step": 25179 + }, + { + "epoch": 3.36, + "grad_norm": 0.69921875, + "learning_rate": 1.5194641862787195e-05, + "loss": 0.2013, + "step": 25180 + }, + { + "epoch": 3.36, + "grad_norm": 0.60546875, + "learning_rate": 1.5188471621255462e-05, + "loss": 0.3819, + "step": 25181 + }, + { + "epoch": 3.36, + "grad_norm": 0.6484375, + "learning_rate": 1.5182302529822479e-05, + "loss": 0.172, + "step": 25182 + }, + { + "epoch": 3.36, + "grad_norm": 0.51171875, + "learning_rate": 1.5176134588571945e-05, + "loss": 0.3261, + "step": 25183 + }, + { + "epoch": 3.36, + "grad_norm": 0.52734375, + "learning_rate": 1.5169967797587426e-05, + "loss": 0.1784, + "step": 25184 + }, + { + "epoch": 3.36, + "grad_norm": 0.72265625, + "learning_rate": 1.51638021569526e-05, + "loss": 0.3236, + "step": 25185 + }, + { + "epoch": 3.36, + "grad_norm": 0.62109375, + "learning_rate": 1.5157637666751056e-05, + "loss": 0.3031, + "step": 25186 + }, + { + "epoch": 3.36, + "grad_norm": 0.50390625, + "learning_rate": 1.5151474327066417e-05, + "loss": 0.4491, + "step": 25187 + }, + { + "epoch": 3.36, + "grad_norm": 0.671875, + "learning_rate": 1.5145312137982248e-05, + "loss": 0.2904, + "step": 25188 + }, + { + "epoch": 3.36, + "grad_norm": 0.474609375, + "learning_rate": 1.5139151099582082e-05, + "loss": 0.2125, + "step": 25189 + }, + { + "epoch": 3.36, + "grad_norm": 0.5546875, + "learning_rate": 1.51329912119495e-05, + "loss": 0.6086, + "step": 25190 + }, + { + "epoch": 3.36, + "grad_norm": 0.52734375, + "learning_rate": 1.5126832475168007e-05, + "loss": 0.1773, + "step": 25191 + }, + { + "epoch": 3.36, + "grad_norm": 0.515625, + "learning_rate": 1.5120674889321173e-05, + "loss": 0.3087, + "step": 25192 + }, + { + "epoch": 3.36, + "grad_norm": 0.58984375, + "learning_rate": 1.5114518454492433e-05, + "loss": 0.3784, + "step": 25193 + }, + { + "epoch": 3.36, + "grad_norm": 0.578125, + "learning_rate": 1.5108363170765305e-05, + "loss": 0.2919, + "step": 25194 + }, + { + "epoch": 3.36, + "grad_norm": 0.71484375, + "learning_rate": 1.5102209038223247e-05, + "loss": 0.5116, + "step": 25195 + }, + { + "epoch": 3.36, + "grad_norm": 0.62890625, + "learning_rate": 1.509605605694976e-05, + "loss": 0.3773, + "step": 25196 + }, + { + "epoch": 3.36, + "grad_norm": 0.80859375, + "learning_rate": 1.5089904227028207e-05, + "loss": 0.3952, + "step": 25197 + }, + { + "epoch": 3.36, + "grad_norm": 0.7578125, + "learning_rate": 1.5083753548542056e-05, + "loss": 0.5421, + "step": 25198 + }, + { + "epoch": 3.36, + "grad_norm": 0.76171875, + "learning_rate": 1.507760402157472e-05, + "loss": 0.3754, + "step": 25199 + }, + { + "epoch": 3.36, + "grad_norm": 0.7109375, + "learning_rate": 1.5071455646209565e-05, + "loss": 0.3714, + "step": 25200 + }, + { + "epoch": 3.36, + "grad_norm": 0.7265625, + "learning_rate": 1.5065308422529988e-05, + "loss": 0.5422, + "step": 25201 + }, + { + "epoch": 3.36, + "grad_norm": 0.6640625, + "learning_rate": 1.5059162350619327e-05, + "loss": 0.5471, + "step": 25202 + }, + { + "epoch": 3.36, + "grad_norm": 0.6875, + "learning_rate": 1.5053017430560934e-05, + "loss": 0.2067, + "step": 25203 + }, + { + "epoch": 3.36, + "grad_norm": 0.62890625, + "learning_rate": 1.5046873662438143e-05, + "loss": 0.246, + "step": 25204 + }, + { + "epoch": 3.36, + "grad_norm": 0.953125, + "learning_rate": 1.5040731046334267e-05, + "loss": 0.3953, + "step": 25205 + }, + { + "epoch": 3.36, + "grad_norm": 0.70703125, + "learning_rate": 1.5034589582332637e-05, + "loss": 0.2649, + "step": 25206 + }, + { + "epoch": 3.36, + "grad_norm": 0.78515625, + "learning_rate": 1.5028449270516476e-05, + "loss": 0.1977, + "step": 25207 + }, + { + "epoch": 3.36, + "grad_norm": 0.5546875, + "learning_rate": 1.5022310110969074e-05, + "loss": 0.1594, + "step": 25208 + }, + { + "epoch": 3.36, + "grad_norm": 0.58203125, + "learning_rate": 1.5016172103773685e-05, + "loss": 0.1407, + "step": 25209 + }, + { + "epoch": 3.36, + "grad_norm": 0.53515625, + "learning_rate": 1.5010035249013566e-05, + "loss": 0.2188, + "step": 25210 + }, + { + "epoch": 3.36, + "grad_norm": 0.490234375, + "learning_rate": 1.5003899546771915e-05, + "loss": 0.1336, + "step": 25211 + }, + { + "epoch": 3.36, + "grad_norm": 0.74609375, + "learning_rate": 1.4997764997131914e-05, + "loss": 0.3224, + "step": 25212 + }, + { + "epoch": 3.36, + "grad_norm": 0.578125, + "learning_rate": 1.4991631600176769e-05, + "loss": 0.3108, + "step": 25213 + }, + { + "epoch": 3.36, + "grad_norm": 0.640625, + "learning_rate": 1.4985499355989674e-05, + "loss": 0.4421, + "step": 25214 + }, + { + "epoch": 3.36, + "grad_norm": 0.875, + "learning_rate": 1.4979368264653781e-05, + "loss": 0.5149, + "step": 25215 + }, + { + "epoch": 3.36, + "grad_norm": 0.60546875, + "learning_rate": 1.4973238326252192e-05, + "loss": 0.3609, + "step": 25216 + }, + { + "epoch": 3.36, + "grad_norm": 0.56640625, + "learning_rate": 1.4967109540868075e-05, + "loss": 0.2214, + "step": 25217 + }, + { + "epoch": 3.37, + "grad_norm": 0.70703125, + "learning_rate": 1.4960981908584515e-05, + "loss": 0.1968, + "step": 25218 + }, + { + "epoch": 3.37, + "grad_norm": 0.56640625, + "learning_rate": 1.4954855429484649e-05, + "loss": 0.272, + "step": 25219 + }, + { + "epoch": 3.37, + "grad_norm": 0.65234375, + "learning_rate": 1.4948730103651498e-05, + "loss": 0.2714, + "step": 25220 + }, + { + "epoch": 3.37, + "grad_norm": 0.625, + "learning_rate": 1.4942605931168152e-05, + "loss": 0.4919, + "step": 25221 + }, + { + "epoch": 3.37, + "grad_norm": 0.5078125, + "learning_rate": 1.4936482912117655e-05, + "loss": 0.2962, + "step": 25222 + }, + { + "epoch": 3.37, + "grad_norm": 0.58984375, + "learning_rate": 1.4930361046583075e-05, + "loss": 0.208, + "step": 25223 + }, + { + "epoch": 3.37, + "grad_norm": 0.63671875, + "learning_rate": 1.4924240334647387e-05, + "loss": 0.5789, + "step": 25224 + }, + { + "epoch": 3.37, + "grad_norm": 0.490234375, + "learning_rate": 1.4918120776393584e-05, + "loss": 0.2193, + "step": 25225 + }, + { + "epoch": 3.37, + "grad_norm": 0.44140625, + "learning_rate": 1.4912002371904665e-05, + "loss": 0.2227, + "step": 25226 + }, + { + "epoch": 3.37, + "grad_norm": 0.6953125, + "learning_rate": 1.4905885121263608e-05, + "loss": 0.4135, + "step": 25227 + }, + { + "epoch": 3.37, + "grad_norm": 0.75, + "learning_rate": 1.4899769024553379e-05, + "loss": 0.4814, + "step": 25228 + }, + { + "epoch": 3.37, + "grad_norm": 0.6875, + "learning_rate": 1.4893654081856878e-05, + "loss": 0.5384, + "step": 25229 + }, + { + "epoch": 3.37, + "grad_norm": 0.66796875, + "learning_rate": 1.4887540293257052e-05, + "loss": 0.2655, + "step": 25230 + }, + { + "epoch": 3.37, + "grad_norm": 0.734375, + "learning_rate": 1.4881427658836799e-05, + "loss": 0.3254, + "step": 25231 + }, + { + "epoch": 3.37, + "grad_norm": 0.5625, + "learning_rate": 1.4875316178679022e-05, + "loss": 0.328, + "step": 25232 + }, + { + "epoch": 3.37, + "grad_norm": 0.69140625, + "learning_rate": 1.4869205852866607e-05, + "loss": 0.2177, + "step": 25233 + }, + { + "epoch": 3.37, + "grad_norm": 0.65234375, + "learning_rate": 1.486309668148238e-05, + "loss": 0.2439, + "step": 25234 + }, + { + "epoch": 3.37, + "grad_norm": 0.546875, + "learning_rate": 1.4856988664609229e-05, + "loss": 0.239, + "step": 25235 + }, + { + "epoch": 3.37, + "grad_norm": 0.546875, + "learning_rate": 1.485088180232992e-05, + "loss": 0.1664, + "step": 25236 + }, + { + "epoch": 3.37, + "grad_norm": 0.69140625, + "learning_rate": 1.4844776094727308e-05, + "loss": 0.3286, + "step": 25237 + }, + { + "epoch": 3.37, + "grad_norm": 0.61328125, + "learning_rate": 1.4838671541884208e-05, + "loss": 0.1724, + "step": 25238 + }, + { + "epoch": 3.37, + "grad_norm": 0.55078125, + "learning_rate": 1.4832568143883362e-05, + "loss": 0.2752, + "step": 25239 + }, + { + "epoch": 3.37, + "grad_norm": 0.8125, + "learning_rate": 1.482646590080754e-05, + "loss": 0.309, + "step": 25240 + }, + { + "epoch": 3.37, + "grad_norm": 0.60546875, + "learning_rate": 1.4820364812739518e-05, + "loss": 0.3705, + "step": 25241 + }, + { + "epoch": 3.37, + "grad_norm": 0.494140625, + "learning_rate": 1.481426487976203e-05, + "loss": 0.3257, + "step": 25242 + }, + { + "epoch": 3.37, + "grad_norm": 0.498046875, + "learning_rate": 1.4808166101957754e-05, + "loss": 0.3162, + "step": 25243 + }, + { + "epoch": 3.37, + "grad_norm": 0.66015625, + "learning_rate": 1.4802068479409436e-05, + "loss": 0.3246, + "step": 25244 + }, + { + "epoch": 3.37, + "grad_norm": 0.73828125, + "learning_rate": 1.4795972012199743e-05, + "loss": 0.3288, + "step": 25245 + }, + { + "epoch": 3.37, + "grad_norm": 0.5625, + "learning_rate": 1.4789876700411376e-05, + "loss": 0.371, + "step": 25246 + }, + { + "epoch": 3.37, + "grad_norm": 0.51171875, + "learning_rate": 1.4783782544126967e-05, + "loss": 0.191, + "step": 25247 + }, + { + "epoch": 3.37, + "grad_norm": 0.61328125, + "learning_rate": 1.477768954342913e-05, + "loss": 0.4054, + "step": 25248 + }, + { + "epoch": 3.37, + "grad_norm": 0.431640625, + "learning_rate": 1.477159769840053e-05, + "loss": 0.2068, + "step": 25249 + }, + { + "epoch": 3.37, + "grad_norm": 0.52734375, + "learning_rate": 1.4765507009123746e-05, + "loss": 0.2391, + "step": 25250 + }, + { + "epoch": 3.37, + "grad_norm": 0.98046875, + "learning_rate": 1.4759417475681436e-05, + "loss": 0.5003, + "step": 25251 + }, + { + "epoch": 3.37, + "grad_norm": 0.70703125, + "learning_rate": 1.4753329098156088e-05, + "loss": 0.5443, + "step": 25252 + }, + { + "epoch": 3.37, + "grad_norm": 0.4609375, + "learning_rate": 1.4747241876630324e-05, + "loss": 0.2016, + "step": 25253 + }, + { + "epoch": 3.37, + "grad_norm": 0.58984375, + "learning_rate": 1.474115581118667e-05, + "loss": 0.4214, + "step": 25254 + }, + { + "epoch": 3.37, + "grad_norm": 0.6875, + "learning_rate": 1.4735070901907655e-05, + "loss": 0.2911, + "step": 25255 + }, + { + "epoch": 3.37, + "grad_norm": 0.703125, + "learning_rate": 1.472898714887584e-05, + "loss": 0.2937, + "step": 25256 + }, + { + "epoch": 3.37, + "grad_norm": 0.57421875, + "learning_rate": 1.4722904552173645e-05, + "loss": 0.3743, + "step": 25257 + }, + { + "epoch": 3.37, + "grad_norm": 0.4375, + "learning_rate": 1.4716823111883604e-05, + "loss": 0.2453, + "step": 25258 + }, + { + "epoch": 3.37, + "grad_norm": 0.6953125, + "learning_rate": 1.4710742828088176e-05, + "loss": 0.4267, + "step": 25259 + }, + { + "epoch": 3.37, + "grad_norm": 0.4921875, + "learning_rate": 1.4704663700869837e-05, + "loss": 0.2938, + "step": 25260 + }, + { + "epoch": 3.37, + "grad_norm": 0.50390625, + "learning_rate": 1.4698585730310998e-05, + "loss": 0.3262, + "step": 25261 + }, + { + "epoch": 3.37, + "grad_norm": 0.69921875, + "learning_rate": 1.4692508916494063e-05, + "loss": 0.2526, + "step": 25262 + }, + { + "epoch": 3.37, + "grad_norm": 0.7578125, + "learning_rate": 1.4686433259501453e-05, + "loss": 0.3471, + "step": 25263 + }, + { + "epoch": 3.37, + "grad_norm": 0.8203125, + "learning_rate": 1.4680358759415569e-05, + "loss": 0.2725, + "step": 25264 + }, + { + "epoch": 3.37, + "grad_norm": 0.5703125, + "learning_rate": 1.467428541631879e-05, + "loss": 0.4066, + "step": 25265 + }, + { + "epoch": 3.37, + "grad_norm": 0.66796875, + "learning_rate": 1.4668213230293449e-05, + "loss": 0.3074, + "step": 25266 + }, + { + "epoch": 3.37, + "grad_norm": 0.69921875, + "learning_rate": 1.4662142201421892e-05, + "loss": 0.5146, + "step": 25267 + }, + { + "epoch": 3.37, + "grad_norm": 0.53515625, + "learning_rate": 1.4656072329786463e-05, + "loss": 0.3925, + "step": 25268 + }, + { + "epoch": 3.37, + "grad_norm": 0.6171875, + "learning_rate": 1.4650003615469488e-05, + "loss": 0.2398, + "step": 25269 + }, + { + "epoch": 3.37, + "grad_norm": 0.478515625, + "learning_rate": 1.464393605855322e-05, + "loss": 0.1848, + "step": 25270 + }, + { + "epoch": 3.37, + "grad_norm": 0.61328125, + "learning_rate": 1.4637869659119963e-05, + "loss": 0.28, + "step": 25271 + }, + { + "epoch": 3.37, + "grad_norm": 0.44921875, + "learning_rate": 1.4631804417251993e-05, + "loss": 0.2223, + "step": 25272 + }, + { + "epoch": 3.37, + "grad_norm": 0.50390625, + "learning_rate": 1.4625740333031534e-05, + "loss": 0.2339, + "step": 25273 + }, + { + "epoch": 3.37, + "grad_norm": 0.63671875, + "learning_rate": 1.4619677406540844e-05, + "loss": 0.2414, + "step": 25274 + }, + { + "epoch": 3.37, + "grad_norm": 0.5703125, + "learning_rate": 1.4613615637862099e-05, + "loss": 0.218, + "step": 25275 + }, + { + "epoch": 3.37, + "grad_norm": 0.70703125, + "learning_rate": 1.4607555027077525e-05, + "loss": 0.3923, + "step": 25276 + }, + { + "epoch": 3.37, + "grad_norm": 0.70703125, + "learning_rate": 1.460149557426932e-05, + "loss": 0.3773, + "step": 25277 + }, + { + "epoch": 3.37, + "grad_norm": 0.55859375, + "learning_rate": 1.459543727951963e-05, + "loss": 0.2699, + "step": 25278 + }, + { + "epoch": 3.37, + "grad_norm": 0.498046875, + "learning_rate": 1.4589380142910658e-05, + "loss": 0.2057, + "step": 25279 + }, + { + "epoch": 3.37, + "grad_norm": 0.53515625, + "learning_rate": 1.4583324164524482e-05, + "loss": 0.1789, + "step": 25280 + }, + { + "epoch": 3.37, + "grad_norm": 0.71484375, + "learning_rate": 1.4577269344443245e-05, + "loss": 0.484, + "step": 25281 + }, + { + "epoch": 3.37, + "grad_norm": 0.51171875, + "learning_rate": 1.4571215682749062e-05, + "loss": 0.1981, + "step": 25282 + }, + { + "epoch": 3.37, + "grad_norm": 0.75, + "learning_rate": 1.4565163179524043e-05, + "loss": 0.5721, + "step": 25283 + }, + { + "epoch": 3.37, + "grad_norm": 0.62890625, + "learning_rate": 1.4559111834850248e-05, + "loss": 0.234, + "step": 25284 + }, + { + "epoch": 3.37, + "grad_norm": 0.57421875, + "learning_rate": 1.4553061648809707e-05, + "loss": 0.3167, + "step": 25285 + }, + { + "epoch": 3.37, + "grad_norm": 0.6171875, + "learning_rate": 1.454701262148449e-05, + "loss": 0.2309, + "step": 25286 + }, + { + "epoch": 3.37, + "grad_norm": 0.53515625, + "learning_rate": 1.4540964752956621e-05, + "loss": 0.3577, + "step": 25287 + }, + { + "epoch": 3.37, + "grad_norm": 0.60546875, + "learning_rate": 1.4534918043308144e-05, + "loss": 0.3185, + "step": 25288 + }, + { + "epoch": 3.37, + "grad_norm": 0.63671875, + "learning_rate": 1.4528872492621004e-05, + "loss": 0.3644, + "step": 25289 + }, + { + "epoch": 3.37, + "grad_norm": 0.66796875, + "learning_rate": 1.4522828100977214e-05, + "loss": 0.4535, + "step": 25290 + }, + { + "epoch": 3.37, + "grad_norm": 0.484375, + "learning_rate": 1.4516784868458733e-05, + "loss": 0.2008, + "step": 25291 + }, + { + "epoch": 3.37, + "grad_norm": 0.46875, + "learning_rate": 1.4510742795147525e-05, + "loss": 0.1764, + "step": 25292 + }, + { + "epoch": 3.38, + "grad_norm": 0.470703125, + "learning_rate": 1.4504701881125493e-05, + "loss": 0.1095, + "step": 25293 + }, + { + "epoch": 3.38, + "grad_norm": 0.78125, + "learning_rate": 1.4498662126474582e-05, + "loss": 0.3267, + "step": 25294 + }, + { + "epoch": 3.38, + "grad_norm": 0.51171875, + "learning_rate": 1.4492623531276683e-05, + "loss": 0.2166, + "step": 25295 + }, + { + "epoch": 3.38, + "grad_norm": 0.67578125, + "learning_rate": 1.4486586095613707e-05, + "loss": 0.3554, + "step": 25296 + }, + { + "epoch": 3.38, + "grad_norm": 0.75, + "learning_rate": 1.4480549819567501e-05, + "loss": 0.4641, + "step": 25297 + }, + { + "epoch": 3.38, + "grad_norm": 0.61328125, + "learning_rate": 1.447451470321991e-05, + "loss": 0.2267, + "step": 25298 + }, + { + "epoch": 3.38, + "grad_norm": 0.703125, + "learning_rate": 1.4468480746652791e-05, + "loss": 0.3568, + "step": 25299 + }, + { + "epoch": 3.38, + "grad_norm": 0.546875, + "learning_rate": 1.4462447949947966e-05, + "loss": 0.3434, + "step": 25300 + }, + { + "epoch": 3.38, + "grad_norm": 0.7421875, + "learning_rate": 1.4456416313187259e-05, + "loss": 0.3846, + "step": 25301 + }, + { + "epoch": 3.38, + "grad_norm": 0.7890625, + "learning_rate": 1.4450385836452429e-05, + "loss": 0.2415, + "step": 25302 + }, + { + "epoch": 3.38, + "grad_norm": 0.59765625, + "learning_rate": 1.4444356519825275e-05, + "loss": 0.3348, + "step": 25303 + }, + { + "epoch": 3.38, + "grad_norm": 0.5859375, + "learning_rate": 1.4438328363387565e-05, + "loss": 0.2226, + "step": 25304 + }, + { + "epoch": 3.38, + "grad_norm": 0.65625, + "learning_rate": 1.4432301367221024e-05, + "loss": 0.4458, + "step": 25305 + }, + { + "epoch": 3.38, + "grad_norm": 0.6875, + "learning_rate": 1.4426275531407418e-05, + "loss": 0.2204, + "step": 25306 + }, + { + "epoch": 3.38, + "grad_norm": 0.54296875, + "learning_rate": 1.4420250856028417e-05, + "loss": 0.1927, + "step": 25307 + }, + { + "epoch": 3.38, + "grad_norm": 0.86328125, + "learning_rate": 1.4414227341165754e-05, + "loss": 0.491, + "step": 25308 + }, + { + "epoch": 3.38, + "grad_norm": 0.58984375, + "learning_rate": 1.4408204986901075e-05, + "loss": 0.438, + "step": 25309 + }, + { + "epoch": 3.38, + "grad_norm": 0.70703125, + "learning_rate": 1.4402183793316082e-05, + "loss": 0.2325, + "step": 25310 + }, + { + "epoch": 3.38, + "grad_norm": 0.6953125, + "learning_rate": 1.4396163760492421e-05, + "loss": 0.3264, + "step": 25311 + }, + { + "epoch": 3.38, + "grad_norm": 0.59765625, + "learning_rate": 1.4390144888511703e-05, + "loss": 0.2991, + "step": 25312 + }, + { + "epoch": 3.38, + "grad_norm": 0.5703125, + "learning_rate": 1.4384127177455565e-05, + "loss": 0.4014, + "step": 25313 + }, + { + "epoch": 3.38, + "grad_norm": 0.59375, + "learning_rate": 1.4378110627405606e-05, + "loss": 0.4285, + "step": 25314 + }, + { + "epoch": 3.38, + "grad_norm": 0.5703125, + "learning_rate": 1.437209523844345e-05, + "loss": 0.224, + "step": 25315 + }, + { + "epoch": 3.38, + "grad_norm": 0.734375, + "learning_rate": 1.4366081010650612e-05, + "loss": 0.2511, + "step": 25316 + }, + { + "epoch": 3.38, + "grad_norm": 0.458984375, + "learning_rate": 1.4360067944108669e-05, + "loss": 0.1919, + "step": 25317 + }, + { + "epoch": 3.38, + "grad_norm": 0.671875, + "learning_rate": 1.4354056038899177e-05, + "loss": 0.3515, + "step": 25318 + }, + { + "epoch": 3.38, + "grad_norm": 0.7421875, + "learning_rate": 1.4348045295103662e-05, + "loss": 0.4355, + "step": 25319 + }, + { + "epoch": 3.38, + "grad_norm": 0.482421875, + "learning_rate": 1.4342035712803636e-05, + "loss": 0.1566, + "step": 25320 + }, + { + "epoch": 3.38, + "grad_norm": 0.609375, + "learning_rate": 1.4336027292080555e-05, + "loss": 0.2337, + "step": 25321 + }, + { + "epoch": 3.38, + "grad_norm": 0.69921875, + "learning_rate": 1.4330020033015923e-05, + "loss": 0.2647, + "step": 25322 + }, + { + "epoch": 3.38, + "grad_norm": 0.6640625, + "learning_rate": 1.4324013935691206e-05, + "loss": 0.3311, + "step": 25323 + }, + { + "epoch": 3.38, + "grad_norm": 0.58203125, + "learning_rate": 1.431800900018786e-05, + "loss": 0.298, + "step": 25324 + }, + { + "epoch": 3.38, + "grad_norm": 0.4765625, + "learning_rate": 1.4312005226587278e-05, + "loss": 0.1461, + "step": 25325 + }, + { + "epoch": 3.38, + "grad_norm": 0.69140625, + "learning_rate": 1.4306002614970904e-05, + "loss": 0.3168, + "step": 25326 + }, + { + "epoch": 3.38, + "grad_norm": 0.59765625, + "learning_rate": 1.430000116542014e-05, + "loss": 0.3102, + "step": 25327 + }, + { + "epoch": 3.38, + "grad_norm": 0.5703125, + "learning_rate": 1.4294000878016356e-05, + "loss": 0.2375, + "step": 25328 + }, + { + "epoch": 3.38, + "grad_norm": 0.6953125, + "learning_rate": 1.4288001752840941e-05, + "loss": 0.3273, + "step": 25329 + }, + { + "epoch": 3.38, + "grad_norm": 0.53125, + "learning_rate": 1.4282003789975219e-05, + "loss": 0.4997, + "step": 25330 + }, + { + "epoch": 3.38, + "grad_norm": 0.5390625, + "learning_rate": 1.4276006989500535e-05, + "loss": 0.3932, + "step": 25331 + }, + { + "epoch": 3.38, + "grad_norm": 0.59765625, + "learning_rate": 1.4270011351498237e-05, + "loss": 0.5883, + "step": 25332 + }, + { + "epoch": 3.38, + "grad_norm": 0.44921875, + "learning_rate": 1.426401687604959e-05, + "loss": 0.257, + "step": 25333 + }, + { + "epoch": 3.38, + "grad_norm": 0.578125, + "learning_rate": 1.4258023563235911e-05, + "loss": 0.311, + "step": 25334 + }, + { + "epoch": 3.38, + "grad_norm": 0.63671875, + "learning_rate": 1.4252031413138445e-05, + "loss": 0.2428, + "step": 25335 + }, + { + "epoch": 3.38, + "grad_norm": 0.546875, + "learning_rate": 1.4246040425838469e-05, + "loss": 0.1797, + "step": 25336 + }, + { + "epoch": 3.38, + "grad_norm": 0.8125, + "learning_rate": 1.424005060141721e-05, + "loss": 0.4562, + "step": 25337 + }, + { + "epoch": 3.38, + "grad_norm": 0.6953125, + "learning_rate": 1.4234061939955934e-05, + "loss": 0.4421, + "step": 25338 + }, + { + "epoch": 3.38, + "grad_norm": 0.5625, + "learning_rate": 1.42280744415358e-05, + "loss": 0.177, + "step": 25339 + }, + { + "epoch": 3.38, + "grad_norm": 0.5859375, + "learning_rate": 1.422208810623803e-05, + "loss": 0.3986, + "step": 25340 + }, + { + "epoch": 3.38, + "grad_norm": 0.6875, + "learning_rate": 1.4216102934143794e-05, + "loss": 0.5321, + "step": 25341 + }, + { + "epoch": 3.38, + "grad_norm": 0.494140625, + "learning_rate": 1.4210118925334271e-05, + "loss": 0.3097, + "step": 25342 + }, + { + "epoch": 3.38, + "grad_norm": 0.7265625, + "learning_rate": 1.4204136079890584e-05, + "loss": 0.2708, + "step": 25343 + }, + { + "epoch": 3.38, + "grad_norm": 0.6796875, + "learning_rate": 1.419815439789387e-05, + "loss": 0.4394, + "step": 25344 + }, + { + "epoch": 3.38, + "grad_norm": 0.59765625, + "learning_rate": 1.4192173879425275e-05, + "loss": 0.341, + "step": 25345 + }, + { + "epoch": 3.38, + "grad_norm": 0.66015625, + "learning_rate": 1.4186194524565844e-05, + "loss": 0.2655, + "step": 25346 + }, + { + "epoch": 3.38, + "grad_norm": 0.6640625, + "learning_rate": 1.4180216333396722e-05, + "loss": 0.2861, + "step": 25347 + }, + { + "epoch": 3.38, + "grad_norm": 0.62890625, + "learning_rate": 1.4174239305998915e-05, + "loss": 0.3329, + "step": 25348 + }, + { + "epoch": 3.38, + "grad_norm": 0.51171875, + "learning_rate": 1.4168263442453511e-05, + "loss": 0.1904, + "step": 25349 + }, + { + "epoch": 3.38, + "grad_norm": 0.62890625, + "learning_rate": 1.4162288742841534e-05, + "loss": 0.4023, + "step": 25350 + }, + { + "epoch": 3.38, + "grad_norm": 0.58203125, + "learning_rate": 1.415631520724402e-05, + "loss": 0.2953, + "step": 25351 + }, + { + "epoch": 3.38, + "grad_norm": 0.671875, + "learning_rate": 1.4150342835741982e-05, + "loss": 0.2993, + "step": 25352 + }, + { + "epoch": 3.38, + "grad_norm": 0.74609375, + "learning_rate": 1.4144371628416375e-05, + "loss": 0.2288, + "step": 25353 + }, + { + "epoch": 3.38, + "grad_norm": 0.55078125, + "learning_rate": 1.4138401585348193e-05, + "loss": 0.2639, + "step": 25354 + }, + { + "epoch": 3.38, + "grad_norm": 0.5078125, + "learning_rate": 1.4132432706618381e-05, + "loss": 0.2841, + "step": 25355 + }, + { + "epoch": 3.38, + "grad_norm": 0.62890625, + "learning_rate": 1.4126464992307931e-05, + "loss": 0.455, + "step": 25356 + }, + { + "epoch": 3.38, + "grad_norm": 0.7578125, + "learning_rate": 1.4120498442497709e-05, + "loss": 0.5241, + "step": 25357 + }, + { + "epoch": 3.38, + "grad_norm": 0.734375, + "learning_rate": 1.4114533057268632e-05, + "loss": 0.3629, + "step": 25358 + }, + { + "epoch": 3.38, + "grad_norm": 0.55859375, + "learning_rate": 1.4108568836701608e-05, + "loss": 0.3393, + "step": 25359 + }, + { + "epoch": 3.38, + "grad_norm": 0.5703125, + "learning_rate": 1.410260578087752e-05, + "loss": 0.2577, + "step": 25360 + }, + { + "epoch": 3.38, + "grad_norm": 0.486328125, + "learning_rate": 1.4096643889877237e-05, + "loss": 0.1676, + "step": 25361 + }, + { + "epoch": 3.38, + "grad_norm": 0.5625, + "learning_rate": 1.4090683163781581e-05, + "loss": 0.3594, + "step": 25362 + }, + { + "epoch": 3.38, + "grad_norm": 0.63671875, + "learning_rate": 1.4084723602671401e-05, + "loss": 0.4194, + "step": 25363 + }, + { + "epoch": 3.38, + "grad_norm": 0.44140625, + "learning_rate": 1.4078765206627498e-05, + "loss": 0.1223, + "step": 25364 + }, + { + "epoch": 3.38, + "grad_norm": 0.6328125, + "learning_rate": 1.4072807975730718e-05, + "loss": 0.3102, + "step": 25365 + }, + { + "epoch": 3.38, + "grad_norm": 0.54296875, + "learning_rate": 1.4066851910061773e-05, + "loss": 0.2941, + "step": 25366 + }, + { + "epoch": 3.38, + "grad_norm": 0.50390625, + "learning_rate": 1.406089700970148e-05, + "loss": 0.2977, + "step": 25367 + }, + { + "epoch": 3.39, + "grad_norm": 0.515625, + "learning_rate": 1.405494327473057e-05, + "loss": 0.2889, + "step": 25368 + }, + { + "epoch": 3.39, + "grad_norm": 0.5625, + "learning_rate": 1.4048990705229815e-05, + "loss": 0.287, + "step": 25369 + }, + { + "epoch": 3.39, + "grad_norm": 0.72265625, + "learning_rate": 1.4043039301279903e-05, + "loss": 0.251, + "step": 25370 + }, + { + "epoch": 3.39, + "grad_norm": 0.59765625, + "learning_rate": 1.4037089062961527e-05, + "loss": 0.2181, + "step": 25371 + }, + { + "epoch": 3.39, + "grad_norm": 0.6484375, + "learning_rate": 1.4031139990355401e-05, + "loss": 0.386, + "step": 25372 + }, + { + "epoch": 3.39, + "grad_norm": 0.78515625, + "learning_rate": 1.4025192083542183e-05, + "loss": 0.3827, + "step": 25373 + }, + { + "epoch": 3.39, + "grad_norm": 0.6640625, + "learning_rate": 1.401924534260256e-05, + "loss": 0.4815, + "step": 25374 + }, + { + "epoch": 3.39, + "grad_norm": 0.66015625, + "learning_rate": 1.4013299767617138e-05, + "loss": 0.4203, + "step": 25375 + }, + { + "epoch": 3.39, + "grad_norm": 0.478515625, + "learning_rate": 1.4007355358666551e-05, + "loss": 0.1718, + "step": 25376 + }, + { + "epoch": 3.39, + "grad_norm": 0.65625, + "learning_rate": 1.4001412115831414e-05, + "loss": 0.5353, + "step": 25377 + }, + { + "epoch": 3.39, + "grad_norm": 0.490234375, + "learning_rate": 1.3995470039192326e-05, + "loss": 0.3249, + "step": 25378 + }, + { + "epoch": 3.39, + "grad_norm": 0.45703125, + "learning_rate": 1.398952912882988e-05, + "loss": 0.2643, + "step": 25379 + }, + { + "epoch": 3.39, + "grad_norm": 0.74609375, + "learning_rate": 1.3983589384824603e-05, + "loss": 0.2622, + "step": 25380 + }, + { + "epoch": 3.39, + "grad_norm": 0.70703125, + "learning_rate": 1.397765080725707e-05, + "loss": 0.557, + "step": 25381 + }, + { + "epoch": 3.39, + "grad_norm": 0.37890625, + "learning_rate": 1.3971713396207787e-05, + "loss": 0.144, + "step": 25382 + }, + { + "epoch": 3.39, + "grad_norm": 0.484375, + "learning_rate": 1.3965777151757287e-05, + "loss": 0.2923, + "step": 25383 + }, + { + "epoch": 3.39, + "grad_norm": 0.58203125, + "learning_rate": 1.3959842073986085e-05, + "loss": 0.3656, + "step": 25384 + }, + { + "epoch": 3.39, + "grad_norm": 0.46875, + "learning_rate": 1.3953908162974617e-05, + "loss": 0.1481, + "step": 25385 + }, + { + "epoch": 3.39, + "grad_norm": 0.478515625, + "learning_rate": 1.3947975418803383e-05, + "loss": 0.13, + "step": 25386 + }, + { + "epoch": 3.39, + "grad_norm": 0.62890625, + "learning_rate": 1.3942043841552821e-05, + "loss": 0.395, + "step": 25387 + }, + { + "epoch": 3.39, + "grad_norm": 0.52734375, + "learning_rate": 1.3936113431303411e-05, + "loss": 0.1273, + "step": 25388 + }, + { + "epoch": 3.39, + "grad_norm": 0.56640625, + "learning_rate": 1.3930184188135508e-05, + "loss": 0.1854, + "step": 25389 + }, + { + "epoch": 3.39, + "grad_norm": 0.9765625, + "learning_rate": 1.3924256112129552e-05, + "loss": 0.5578, + "step": 25390 + }, + { + "epoch": 3.39, + "grad_norm": 0.62109375, + "learning_rate": 1.391832920336592e-05, + "loss": 0.2416, + "step": 25391 + }, + { + "epoch": 3.39, + "grad_norm": 0.65625, + "learning_rate": 1.3912403461925017e-05, + "loss": 0.3758, + "step": 25392 + }, + { + "epoch": 3.39, + "grad_norm": 0.69140625, + "learning_rate": 1.3906478887887175e-05, + "loss": 0.4272, + "step": 25393 + }, + { + "epoch": 3.39, + "grad_norm": 0.65234375, + "learning_rate": 1.390055548133271e-05, + "loss": 0.4095, + "step": 25394 + }, + { + "epoch": 3.39, + "grad_norm": 0.6484375, + "learning_rate": 1.389463324234197e-05, + "loss": 0.3112, + "step": 25395 + }, + { + "epoch": 3.39, + "grad_norm": 0.90625, + "learning_rate": 1.3888712170995267e-05, + "loss": 0.3695, + "step": 25396 + }, + { + "epoch": 3.39, + "grad_norm": 0.6015625, + "learning_rate": 1.3882792267372924e-05, + "loss": 0.1834, + "step": 25397 + }, + { + "epoch": 3.39, + "grad_norm": 0.6171875, + "learning_rate": 1.3876873531555145e-05, + "loss": 0.2084, + "step": 25398 + }, + { + "epoch": 3.39, + "grad_norm": 0.61328125, + "learning_rate": 1.3870955963622245e-05, + "loss": 0.6029, + "step": 25399 + }, + { + "epoch": 3.39, + "grad_norm": 0.6875, + "learning_rate": 1.386503956365447e-05, + "loss": 0.2428, + "step": 25400 + }, + { + "epoch": 3.39, + "grad_norm": 0.69921875, + "learning_rate": 1.385912433173202e-05, + "loss": 0.5936, + "step": 25401 + }, + { + "epoch": 3.39, + "grad_norm": 0.6875, + "learning_rate": 1.3853210267935168e-05, + "loss": 0.4546, + "step": 25402 + }, + { + "epoch": 3.39, + "grad_norm": 0.5703125, + "learning_rate": 1.3847297372344037e-05, + "loss": 0.1868, + "step": 25403 + }, + { + "epoch": 3.39, + "grad_norm": 0.546875, + "learning_rate": 1.384138564503885e-05, + "loss": 0.4366, + "step": 25404 + }, + { + "epoch": 3.39, + "grad_norm": 0.578125, + "learning_rate": 1.3835475086099803e-05, + "loss": 0.321, + "step": 25405 + }, + { + "epoch": 3.39, + "grad_norm": 0.5546875, + "learning_rate": 1.3829565695606983e-05, + "loss": 0.4683, + "step": 25406 + }, + { + "epoch": 3.39, + "grad_norm": 0.443359375, + "learning_rate": 1.3823657473640572e-05, + "loss": 0.2372, + "step": 25407 + }, + { + "epoch": 3.39, + "grad_norm": 0.61328125, + "learning_rate": 1.3817750420280662e-05, + "loss": 0.5685, + "step": 25408 + }, + { + "epoch": 3.39, + "grad_norm": 0.498046875, + "learning_rate": 1.3811844535607365e-05, + "loss": 0.2666, + "step": 25409 + }, + { + "epoch": 3.39, + "grad_norm": 0.5390625, + "learning_rate": 1.3805939819700764e-05, + "loss": 0.3667, + "step": 25410 + }, + { + "epoch": 3.39, + "grad_norm": 0.6171875, + "learning_rate": 1.380003627264097e-05, + "loss": 0.651, + "step": 25411 + }, + { + "epoch": 3.39, + "grad_norm": 0.66015625, + "learning_rate": 1.3794133894507976e-05, + "loss": 0.3119, + "step": 25412 + }, + { + "epoch": 3.39, + "grad_norm": 0.69921875, + "learning_rate": 1.3788232685381863e-05, + "loss": 0.4652, + "step": 25413 + }, + { + "epoch": 3.39, + "grad_norm": 0.72265625, + "learning_rate": 1.3782332645342633e-05, + "loss": 0.2081, + "step": 25414 + }, + { + "epoch": 3.39, + "grad_norm": 0.54296875, + "learning_rate": 1.3776433774470333e-05, + "loss": 0.2811, + "step": 25415 + }, + { + "epoch": 3.39, + "grad_norm": 0.75390625, + "learning_rate": 1.3770536072844908e-05, + "loss": 0.3336, + "step": 25416 + }, + { + "epoch": 3.39, + "grad_norm": 0.7109375, + "learning_rate": 1.3764639540546386e-05, + "loss": 0.2205, + "step": 25417 + }, + { + "epoch": 3.39, + "grad_norm": 0.609375, + "learning_rate": 1.3758744177654659e-05, + "loss": 0.417, + "step": 25418 + }, + { + "epoch": 3.39, + "grad_norm": 0.6875, + "learning_rate": 1.3752849984249727e-05, + "loss": 0.2202, + "step": 25419 + }, + { + "epoch": 3.39, + "grad_norm": 0.62890625, + "learning_rate": 1.3746956960411516e-05, + "loss": 0.2938, + "step": 25420 + }, + { + "epoch": 3.39, + "grad_norm": 0.6953125, + "learning_rate": 1.3741065106219897e-05, + "loss": 0.3317, + "step": 25421 + }, + { + "epoch": 3.39, + "grad_norm": 0.61328125, + "learning_rate": 1.3735174421754803e-05, + "loss": 0.3304, + "step": 25422 + }, + { + "epoch": 3.39, + "grad_norm": 0.61328125, + "learning_rate": 1.3729284907096106e-05, + "loss": 0.2812, + "step": 25423 + }, + { + "epoch": 3.39, + "grad_norm": 0.55078125, + "learning_rate": 1.3723396562323676e-05, + "loss": 0.1628, + "step": 25424 + }, + { + "epoch": 3.39, + "grad_norm": 0.6328125, + "learning_rate": 1.371750938751739e-05, + "loss": 0.4568, + "step": 25425 + }, + { + "epoch": 3.39, + "grad_norm": 0.546875, + "learning_rate": 1.3711623382757022e-05, + "loss": 0.2893, + "step": 25426 + }, + { + "epoch": 3.39, + "grad_norm": 0.58984375, + "learning_rate": 1.3705738548122415e-05, + "loss": 0.2382, + "step": 25427 + }, + { + "epoch": 3.39, + "grad_norm": 0.6640625, + "learning_rate": 1.3699854883693385e-05, + "loss": 0.3325, + "step": 25428 + }, + { + "epoch": 3.39, + "grad_norm": 0.50390625, + "learning_rate": 1.3693972389549725e-05, + "loss": 0.1691, + "step": 25429 + }, + { + "epoch": 3.39, + "grad_norm": 0.58203125, + "learning_rate": 1.3688091065771203e-05, + "loss": 0.3465, + "step": 25430 + }, + { + "epoch": 3.39, + "grad_norm": 0.703125, + "learning_rate": 1.3682210912437521e-05, + "loss": 0.4513, + "step": 25431 + }, + { + "epoch": 3.39, + "grad_norm": 0.5078125, + "learning_rate": 1.3676331929628471e-05, + "loss": 0.1578, + "step": 25432 + }, + { + "epoch": 3.39, + "grad_norm": 0.84765625, + "learning_rate": 1.3670454117423759e-05, + "loss": 0.3465, + "step": 25433 + }, + { + "epoch": 3.39, + "grad_norm": 0.6015625, + "learning_rate": 1.3664577475903117e-05, + "loss": 0.2855, + "step": 25434 + }, + { + "epoch": 3.39, + "grad_norm": 0.68359375, + "learning_rate": 1.3658702005146185e-05, + "loss": 0.4487, + "step": 25435 + }, + { + "epoch": 3.39, + "grad_norm": 0.56640625, + "learning_rate": 1.3652827705232674e-05, + "loss": 0.3162, + "step": 25436 + }, + { + "epoch": 3.39, + "grad_norm": 0.55078125, + "learning_rate": 1.3646954576242244e-05, + "loss": 0.1815, + "step": 25437 + }, + { + "epoch": 3.39, + "grad_norm": 0.46875, + "learning_rate": 1.3641082618254542e-05, + "loss": 0.2064, + "step": 25438 + }, + { + "epoch": 3.39, + "grad_norm": 0.6484375, + "learning_rate": 1.363521183134916e-05, + "loss": 0.3587, + "step": 25439 + }, + { + "epoch": 3.39, + "grad_norm": 0.6171875, + "learning_rate": 1.3629342215605745e-05, + "loss": 0.3869, + "step": 25440 + }, + { + "epoch": 3.39, + "grad_norm": 0.52734375, + "learning_rate": 1.362347377110388e-05, + "loss": 0.3086, + "step": 25441 + }, + { + "epoch": 3.39, + "grad_norm": 0.609375, + "learning_rate": 1.3617606497923163e-05, + "loss": 0.1742, + "step": 25442 + }, + { + "epoch": 3.4, + "grad_norm": 0.482421875, + "learning_rate": 1.3611740396143135e-05, + "loss": 0.3245, + "step": 25443 + }, + { + "epoch": 3.4, + "grad_norm": 0.6171875, + "learning_rate": 1.3605875465843343e-05, + "loss": 0.3388, + "step": 25444 + }, + { + "epoch": 3.4, + "grad_norm": 0.640625, + "learning_rate": 1.3600011707103322e-05, + "loss": 0.1689, + "step": 25445 + }, + { + "epoch": 3.4, + "grad_norm": 0.50390625, + "learning_rate": 1.3594149120002586e-05, + "loss": 0.3164, + "step": 25446 + }, + { + "epoch": 3.4, + "grad_norm": 0.55859375, + "learning_rate": 1.3588287704620672e-05, + "loss": 0.2601, + "step": 25447 + }, + { + "epoch": 3.4, + "grad_norm": 0.55859375, + "learning_rate": 1.3582427461037017e-05, + "loss": 0.2791, + "step": 25448 + }, + { + "epoch": 3.4, + "grad_norm": 0.6875, + "learning_rate": 1.3576568389331102e-05, + "loss": 0.2547, + "step": 25449 + }, + { + "epoch": 3.4, + "grad_norm": 0.49609375, + "learning_rate": 1.3570710489582395e-05, + "loss": 0.3444, + "step": 25450 + }, + { + "epoch": 3.4, + "grad_norm": 0.66796875, + "learning_rate": 1.3564853761870311e-05, + "loss": 0.1481, + "step": 25451 + }, + { + "epoch": 3.4, + "grad_norm": 0.494140625, + "learning_rate": 1.355899820627431e-05, + "loss": 0.1368, + "step": 25452 + }, + { + "epoch": 3.4, + "grad_norm": 0.94140625, + "learning_rate": 1.355314382287376e-05, + "loss": 0.594, + "step": 25453 + }, + { + "epoch": 3.4, + "grad_norm": 0.58203125, + "learning_rate": 1.3547290611748065e-05, + "loss": 0.4355, + "step": 25454 + }, + { + "epoch": 3.4, + "grad_norm": 0.5859375, + "learning_rate": 1.3541438572976583e-05, + "loss": 0.27, + "step": 25455 + }, + { + "epoch": 3.4, + "grad_norm": 0.58984375, + "learning_rate": 1.3535587706638685e-05, + "loss": 0.269, + "step": 25456 + }, + { + "epoch": 3.4, + "grad_norm": 0.609375, + "learning_rate": 1.352973801281372e-05, + "loss": 0.296, + "step": 25457 + }, + { + "epoch": 3.4, + "grad_norm": 0.69140625, + "learning_rate": 1.3523889491580988e-05, + "loss": 0.2742, + "step": 25458 + }, + { + "epoch": 3.4, + "grad_norm": 0.69140625, + "learning_rate": 1.3518042143019816e-05, + "loss": 0.4762, + "step": 25459 + }, + { + "epoch": 3.4, + "grad_norm": 0.640625, + "learning_rate": 1.3512195967209496e-05, + "loss": 0.2298, + "step": 25460 + }, + { + "epoch": 3.4, + "grad_norm": 0.6171875, + "learning_rate": 1.350635096422933e-05, + "loss": 0.2765, + "step": 25461 + }, + { + "epoch": 3.4, + "grad_norm": 0.6328125, + "learning_rate": 1.3500507134158536e-05, + "loss": 0.2529, + "step": 25462 + }, + { + "epoch": 3.4, + "grad_norm": 0.59375, + "learning_rate": 1.3494664477076379e-05, + "loss": 0.2196, + "step": 25463 + }, + { + "epoch": 3.4, + "grad_norm": 0.482421875, + "learning_rate": 1.3488822993062089e-05, + "loss": 0.3905, + "step": 25464 + }, + { + "epoch": 3.4, + "grad_norm": 0.6796875, + "learning_rate": 1.34829826821949e-05, + "loss": 0.4981, + "step": 25465 + }, + { + "epoch": 3.4, + "grad_norm": 0.81640625, + "learning_rate": 1.3477143544553995e-05, + "loss": 0.5797, + "step": 25466 + }, + { + "epoch": 3.4, + "grad_norm": 0.48828125, + "learning_rate": 1.3471305580218529e-05, + "loss": 0.2266, + "step": 25467 + }, + { + "epoch": 3.4, + "grad_norm": 0.57421875, + "learning_rate": 1.3465468789267698e-05, + "loss": 0.3508, + "step": 25468 + }, + { + "epoch": 3.4, + "grad_norm": 0.640625, + "learning_rate": 1.3459633171780639e-05, + "loss": 0.4366, + "step": 25469 + }, + { + "epoch": 3.4, + "grad_norm": 0.68359375, + "learning_rate": 1.345379872783653e-05, + "loss": 0.3353, + "step": 25470 + }, + { + "epoch": 3.4, + "grad_norm": 0.498046875, + "learning_rate": 1.3447965457514423e-05, + "loss": 0.3872, + "step": 25471 + }, + { + "epoch": 3.4, + "grad_norm": 0.72265625, + "learning_rate": 1.344213336089346e-05, + "loss": 0.3741, + "step": 25472 + }, + { + "epoch": 3.4, + "grad_norm": 0.474609375, + "learning_rate": 1.3436302438052717e-05, + "loss": 0.2494, + "step": 25473 + }, + { + "epoch": 3.4, + "grad_norm": 0.796875, + "learning_rate": 1.3430472689071272e-05, + "loss": 0.3585, + "step": 25474 + }, + { + "epoch": 3.4, + "grad_norm": 0.796875, + "learning_rate": 1.3424644114028196e-05, + "loss": 0.4926, + "step": 25475 + }, + { + "epoch": 3.4, + "grad_norm": 0.51953125, + "learning_rate": 1.341881671300249e-05, + "loss": 0.2518, + "step": 25476 + }, + { + "epoch": 3.4, + "grad_norm": 0.6015625, + "learning_rate": 1.3412990486073195e-05, + "loss": 0.5675, + "step": 25477 + }, + { + "epoch": 3.4, + "grad_norm": 0.6015625, + "learning_rate": 1.3407165433319347e-05, + "loss": 0.2702, + "step": 25478 + }, + { + "epoch": 3.4, + "grad_norm": 0.482421875, + "learning_rate": 1.340134155481988e-05, + "loss": 0.2432, + "step": 25479 + }, + { + "epoch": 3.4, + "grad_norm": 0.859375, + "learning_rate": 1.3395518850653822e-05, + "loss": 0.4123, + "step": 25480 + }, + { + "epoch": 3.4, + "grad_norm": 0.72265625, + "learning_rate": 1.3389697320900096e-05, + "loss": 0.5409, + "step": 25481 + }, + { + "epoch": 3.4, + "grad_norm": 0.63671875, + "learning_rate": 1.3383876965637653e-05, + "loss": 0.3193, + "step": 25482 + }, + { + "epoch": 3.4, + "grad_norm": 0.67578125, + "learning_rate": 1.3378057784945441e-05, + "loss": 0.4132, + "step": 25483 + }, + { + "epoch": 3.4, + "grad_norm": 0.58984375, + "learning_rate": 1.3372239778902373e-05, + "loss": 0.2776, + "step": 25484 + }, + { + "epoch": 3.4, + "grad_norm": 0.765625, + "learning_rate": 1.336642294758731e-05, + "loss": 0.2379, + "step": 25485 + }, + { + "epoch": 3.4, + "grad_norm": 0.5234375, + "learning_rate": 1.3360607291079152e-05, + "loss": 0.4108, + "step": 25486 + }, + { + "epoch": 3.4, + "grad_norm": 0.640625, + "learning_rate": 1.3354792809456763e-05, + "loss": 0.3337, + "step": 25487 + }, + { + "epoch": 3.4, + "grad_norm": 0.66015625, + "learning_rate": 1.3348979502799008e-05, + "loss": 0.4097, + "step": 25488 + }, + { + "epoch": 3.4, + "grad_norm": 0.5546875, + "learning_rate": 1.3343167371184695e-05, + "loss": 0.2415, + "step": 25489 + }, + { + "epoch": 3.4, + "grad_norm": 0.76953125, + "learning_rate": 1.3337356414692659e-05, + "loss": 0.6312, + "step": 25490 + }, + { + "epoch": 3.4, + "grad_norm": 0.5625, + "learning_rate": 1.333154663340167e-05, + "loss": 0.4521, + "step": 25491 + }, + { + "epoch": 3.4, + "grad_norm": 0.76953125, + "learning_rate": 1.3325738027390544e-05, + "loss": 0.4358, + "step": 25492 + }, + { + "epoch": 3.4, + "grad_norm": 0.59765625, + "learning_rate": 1.331993059673805e-05, + "loss": 0.2679, + "step": 25493 + }, + { + "epoch": 3.4, + "grad_norm": 0.71484375, + "learning_rate": 1.3314124341522904e-05, + "loss": 0.3849, + "step": 25494 + }, + { + "epoch": 3.4, + "grad_norm": 0.8359375, + "learning_rate": 1.3308319261823865e-05, + "loss": 0.5391, + "step": 25495 + }, + { + "epoch": 3.4, + "grad_norm": 0.63671875, + "learning_rate": 1.3302515357719658e-05, + "loss": 0.3864, + "step": 25496 + }, + { + "epoch": 3.4, + "grad_norm": 0.55859375, + "learning_rate": 1.3296712629288988e-05, + "loss": 0.3753, + "step": 25497 + }, + { + "epoch": 3.4, + "grad_norm": 0.58203125, + "learning_rate": 1.3290911076610557e-05, + "loss": 0.1558, + "step": 25498 + }, + { + "epoch": 3.4, + "grad_norm": 0.5859375, + "learning_rate": 1.3285110699763003e-05, + "loss": 0.3474, + "step": 25499 + }, + { + "epoch": 3.4, + "grad_norm": 0.470703125, + "learning_rate": 1.3279311498824997e-05, + "loss": 0.3842, + "step": 25500 + }, + { + "epoch": 3.4, + "grad_norm": 0.7578125, + "learning_rate": 1.3273513473875188e-05, + "loss": 0.3472, + "step": 25501 + }, + { + "epoch": 3.4, + "grad_norm": 0.53515625, + "learning_rate": 1.326771662499222e-05, + "loss": 0.1419, + "step": 25502 + }, + { + "epoch": 3.4, + "grad_norm": 0.69140625, + "learning_rate": 1.326192095225468e-05, + "loss": 0.4302, + "step": 25503 + }, + { + "epoch": 3.4, + "grad_norm": 0.72265625, + "learning_rate": 1.3256126455741135e-05, + "loss": 0.3492, + "step": 25504 + }, + { + "epoch": 3.4, + "grad_norm": 0.62109375, + "learning_rate": 1.325033313553019e-05, + "loss": 0.3358, + "step": 25505 + }, + { + "epoch": 3.4, + "grad_norm": 0.5234375, + "learning_rate": 1.3244540991700394e-05, + "loss": 0.2294, + "step": 25506 + }, + { + "epoch": 3.4, + "grad_norm": 0.8359375, + "learning_rate": 1.3238750024330338e-05, + "loss": 0.306, + "step": 25507 + }, + { + "epoch": 3.4, + "grad_norm": 0.5546875, + "learning_rate": 1.3232960233498482e-05, + "loss": 0.277, + "step": 25508 + }, + { + "epoch": 3.4, + "grad_norm": 0.75390625, + "learning_rate": 1.3227171619283374e-05, + "loss": 0.1646, + "step": 25509 + }, + { + "epoch": 3.4, + "grad_norm": 0.71875, + "learning_rate": 1.3221384181763507e-05, + "loss": 0.2823, + "step": 25510 + }, + { + "epoch": 3.4, + "grad_norm": 0.63671875, + "learning_rate": 1.3215597921017387e-05, + "loss": 0.3133, + "step": 25511 + }, + { + "epoch": 3.4, + "grad_norm": 0.57421875, + "learning_rate": 1.3209812837123447e-05, + "loss": 0.3027, + "step": 25512 + }, + { + "epoch": 3.4, + "grad_norm": 0.734375, + "learning_rate": 1.3204028930160128e-05, + "loss": 0.4243, + "step": 25513 + }, + { + "epoch": 3.4, + "grad_norm": 0.5625, + "learning_rate": 1.31982462002059e-05, + "loss": 0.2013, + "step": 25514 + }, + { + "epoch": 3.4, + "grad_norm": 0.51953125, + "learning_rate": 1.3192464647339176e-05, + "loss": 0.3229, + "step": 25515 + }, + { + "epoch": 3.4, + "grad_norm": 0.7265625, + "learning_rate": 1.3186684271638338e-05, + "loss": 0.3847, + "step": 25516 + }, + { + "epoch": 3.4, + "grad_norm": 0.6875, + "learning_rate": 1.318090507318176e-05, + "loss": 0.2821, + "step": 25517 + }, + { + "epoch": 3.41, + "grad_norm": 0.69921875, + "learning_rate": 1.317512705204783e-05, + "loss": 0.5997, + "step": 25518 + }, + { + "epoch": 3.41, + "grad_norm": 0.478515625, + "learning_rate": 1.3169350208314902e-05, + "loss": 0.4475, + "step": 25519 + }, + { + "epoch": 3.41, + "grad_norm": 0.5859375, + "learning_rate": 1.3163574542061341e-05, + "loss": 0.2178, + "step": 25520 + }, + { + "epoch": 3.41, + "grad_norm": 0.58984375, + "learning_rate": 1.315780005336541e-05, + "loss": 0.2942, + "step": 25521 + }, + { + "epoch": 3.41, + "grad_norm": 0.640625, + "learning_rate": 1.3152026742305446e-05, + "loss": 0.1504, + "step": 25522 + }, + { + "epoch": 3.41, + "grad_norm": 0.53515625, + "learning_rate": 1.314625460895974e-05, + "loss": 0.2628, + "step": 25523 + }, + { + "epoch": 3.41, + "grad_norm": 0.55078125, + "learning_rate": 1.3140483653406566e-05, + "loss": 0.4117, + "step": 25524 + }, + { + "epoch": 3.41, + "grad_norm": 0.51953125, + "learning_rate": 1.3134713875724191e-05, + "loss": 0.2554, + "step": 25525 + }, + { + "epoch": 3.41, + "grad_norm": 0.578125, + "learning_rate": 1.3128945275990834e-05, + "loss": 0.2679, + "step": 25526 + }, + { + "epoch": 3.41, + "grad_norm": 0.494140625, + "learning_rate": 1.3123177854284752e-05, + "loss": 0.154, + "step": 25527 + }, + { + "epoch": 3.41, + "grad_norm": 0.71484375, + "learning_rate": 1.3117411610684117e-05, + "loss": 0.473, + "step": 25528 + }, + { + "epoch": 3.41, + "grad_norm": 0.78125, + "learning_rate": 1.3111646545267132e-05, + "loss": 0.3037, + "step": 25529 + }, + { + "epoch": 3.41, + "grad_norm": 0.6328125, + "learning_rate": 1.3105882658112011e-05, + "loss": 0.2294, + "step": 25530 + }, + { + "epoch": 3.41, + "grad_norm": 0.625, + "learning_rate": 1.3100119949296874e-05, + "loss": 0.3743, + "step": 25531 + }, + { + "epoch": 3.41, + "grad_norm": 0.578125, + "learning_rate": 1.3094358418899878e-05, + "loss": 0.165, + "step": 25532 + }, + { + "epoch": 3.41, + "grad_norm": 0.5234375, + "learning_rate": 1.3088598066999157e-05, + "loss": 0.1816, + "step": 25533 + }, + { + "epoch": 3.41, + "grad_norm": 0.67578125, + "learning_rate": 1.3082838893672845e-05, + "loss": 0.4206, + "step": 25534 + }, + { + "epoch": 3.41, + "grad_norm": 0.66015625, + "learning_rate": 1.3077080898999006e-05, + "loss": 0.3221, + "step": 25535 + }, + { + "epoch": 3.41, + "grad_norm": 0.470703125, + "learning_rate": 1.3071324083055725e-05, + "loss": 0.1976, + "step": 25536 + }, + { + "epoch": 3.41, + "grad_norm": 0.5234375, + "learning_rate": 1.3065568445921084e-05, + "loss": 0.1843, + "step": 25537 + }, + { + "epoch": 3.41, + "grad_norm": 0.703125, + "learning_rate": 1.3059813987673153e-05, + "loss": 0.311, + "step": 25538 + }, + { + "epoch": 3.41, + "grad_norm": 0.671875, + "learning_rate": 1.3054060708389947e-05, + "loss": 0.2964, + "step": 25539 + }, + { + "epoch": 3.41, + "grad_norm": 0.431640625, + "learning_rate": 1.3048308608149451e-05, + "loss": 0.2062, + "step": 25540 + }, + { + "epoch": 3.41, + "grad_norm": 0.6640625, + "learning_rate": 1.30425576870297e-05, + "loss": 0.2424, + "step": 25541 + }, + { + "epoch": 3.41, + "grad_norm": 0.703125, + "learning_rate": 1.3036807945108676e-05, + "loss": 0.2922, + "step": 25542 + }, + { + "epoch": 3.41, + "grad_norm": 0.78125, + "learning_rate": 1.3031059382464362e-05, + "loss": 0.3093, + "step": 25543 + }, + { + "epoch": 3.41, + "grad_norm": 0.6015625, + "learning_rate": 1.3025311999174683e-05, + "loss": 0.3393, + "step": 25544 + }, + { + "epoch": 3.41, + "grad_norm": 0.74609375, + "learning_rate": 1.30195657953176e-05, + "loss": 0.2155, + "step": 25545 + }, + { + "epoch": 3.41, + "grad_norm": 0.5, + "learning_rate": 1.3013820770971019e-05, + "loss": 0.3108, + "step": 25546 + }, + { + "epoch": 3.41, + "grad_norm": 0.6640625, + "learning_rate": 1.3008076926212853e-05, + "loss": 0.421, + "step": 25547 + }, + { + "epoch": 3.41, + "grad_norm": 0.53515625, + "learning_rate": 1.300233426112103e-05, + "loss": 0.4165, + "step": 25548 + }, + { + "epoch": 3.41, + "grad_norm": 0.4609375, + "learning_rate": 1.2996592775773353e-05, + "loss": 0.2944, + "step": 25549 + }, + { + "epoch": 3.41, + "grad_norm": 0.64453125, + "learning_rate": 1.2990852470247727e-05, + "loss": 0.5093, + "step": 25550 + }, + { + "epoch": 3.41, + "grad_norm": 0.46875, + "learning_rate": 1.2985113344622001e-05, + "loss": 0.1407, + "step": 25551 + }, + { + "epoch": 3.41, + "grad_norm": 0.61328125, + "learning_rate": 1.2979375398973958e-05, + "loss": 0.2095, + "step": 25552 + }, + { + "epoch": 3.41, + "grad_norm": 0.58203125, + "learning_rate": 1.297363863338147e-05, + "loss": 0.211, + "step": 25553 + }, + { + "epoch": 3.41, + "grad_norm": 0.5, + "learning_rate": 1.296790304792226e-05, + "loss": 0.2897, + "step": 25554 + }, + { + "epoch": 3.41, + "grad_norm": 0.59375, + "learning_rate": 1.2962168642674145e-05, + "loss": 0.2734, + "step": 25555 + }, + { + "epoch": 3.41, + "grad_norm": 0.5390625, + "learning_rate": 1.2956435417714885e-05, + "loss": 0.3537, + "step": 25556 + }, + { + "epoch": 3.41, + "grad_norm": 0.66796875, + "learning_rate": 1.2950703373122241e-05, + "loss": 0.3967, + "step": 25557 + }, + { + "epoch": 3.41, + "grad_norm": 0.54296875, + "learning_rate": 1.2944972508973908e-05, + "loss": 0.1982, + "step": 25558 + }, + { + "epoch": 3.41, + "grad_norm": 0.54296875, + "learning_rate": 1.293924282534762e-05, + "loss": 0.2374, + "step": 25559 + }, + { + "epoch": 3.41, + "grad_norm": 0.546875, + "learning_rate": 1.2933514322321082e-05, + "loss": 0.3213, + "step": 25560 + }, + { + "epoch": 3.41, + "grad_norm": 0.64453125, + "learning_rate": 1.292778699997198e-05, + "loss": 0.45, + "step": 25561 + }, + { + "epoch": 3.41, + "grad_norm": 0.546875, + "learning_rate": 1.2922060858377949e-05, + "loss": 0.3048, + "step": 25562 + }, + { + "epoch": 3.41, + "grad_norm": 0.59375, + "learning_rate": 1.2916335897616694e-05, + "loss": 0.4237, + "step": 25563 + }, + { + "epoch": 3.41, + "grad_norm": 0.64453125, + "learning_rate": 1.2910612117765775e-05, + "loss": 0.2005, + "step": 25564 + }, + { + "epoch": 3.41, + "grad_norm": 0.59375, + "learning_rate": 1.2904889518902852e-05, + "loss": 0.2366, + "step": 25565 + }, + { + "epoch": 3.41, + "grad_norm": 0.6015625, + "learning_rate": 1.2899168101105552e-05, + "loss": 0.2825, + "step": 25566 + }, + { + "epoch": 3.41, + "grad_norm": 0.734375, + "learning_rate": 1.2893447864451414e-05, + "loss": 0.2916, + "step": 25567 + }, + { + "epoch": 3.41, + "grad_norm": 0.41796875, + "learning_rate": 1.2887728809018017e-05, + "loss": 0.2272, + "step": 25568 + }, + { + "epoch": 3.41, + "grad_norm": 0.68359375, + "learning_rate": 1.2882010934882938e-05, + "loss": 0.4632, + "step": 25569 + }, + { + "epoch": 3.41, + "grad_norm": 0.66015625, + "learning_rate": 1.2876294242123698e-05, + "loss": 0.2305, + "step": 25570 + }, + { + "epoch": 3.41, + "grad_norm": 0.55078125, + "learning_rate": 1.287057873081784e-05, + "loss": 0.4323, + "step": 25571 + }, + { + "epoch": 3.41, + "grad_norm": 0.671875, + "learning_rate": 1.286486440104283e-05, + "loss": 0.2639, + "step": 25572 + }, + { + "epoch": 3.41, + "grad_norm": 0.546875, + "learning_rate": 1.2859151252876189e-05, + "loss": 0.3528, + "step": 25573 + }, + { + "epoch": 3.41, + "grad_norm": 0.73046875, + "learning_rate": 1.2853439286395385e-05, + "loss": 0.2846, + "step": 25574 + }, + { + "epoch": 3.41, + "grad_norm": 0.55078125, + "learning_rate": 1.2847728501677891e-05, + "loss": 0.3513, + "step": 25575 + }, + { + "epoch": 3.41, + "grad_norm": 0.53125, + "learning_rate": 1.2842018898801146e-05, + "loss": 0.23, + "step": 25576 + }, + { + "epoch": 3.41, + "grad_norm": 0.65234375, + "learning_rate": 1.2836310477842528e-05, + "loss": 0.2215, + "step": 25577 + }, + { + "epoch": 3.41, + "grad_norm": 0.67578125, + "learning_rate": 1.283060323887948e-05, + "loss": 0.3215, + "step": 25578 + }, + { + "epoch": 3.41, + "grad_norm": 0.71875, + "learning_rate": 1.2824897181989393e-05, + "loss": 0.2806, + "step": 25579 + }, + { + "epoch": 3.41, + "grad_norm": 0.58984375, + "learning_rate": 1.2819192307249683e-05, + "loss": 0.3039, + "step": 25580 + }, + { + "epoch": 3.41, + "grad_norm": 0.6953125, + "learning_rate": 1.2813488614737645e-05, + "loss": 0.4596, + "step": 25581 + }, + { + "epoch": 3.41, + "grad_norm": 0.671875, + "learning_rate": 1.2807786104530672e-05, + "loss": 0.3945, + "step": 25582 + }, + { + "epoch": 3.41, + "grad_norm": 0.51171875, + "learning_rate": 1.2802084776706068e-05, + "loss": 0.3543, + "step": 25583 + }, + { + "epoch": 3.41, + "grad_norm": 0.69140625, + "learning_rate": 1.2796384631341174e-05, + "loss": 0.2442, + "step": 25584 + }, + { + "epoch": 3.41, + "grad_norm": 0.73828125, + "learning_rate": 1.2790685668513257e-05, + "loss": 0.4373, + "step": 25585 + }, + { + "epoch": 3.41, + "grad_norm": 0.6171875, + "learning_rate": 1.2784987888299615e-05, + "loss": 0.5286, + "step": 25586 + }, + { + "epoch": 3.41, + "grad_norm": 0.80859375, + "learning_rate": 1.2779291290777529e-05, + "loss": 0.292, + "step": 25587 + }, + { + "epoch": 3.41, + "grad_norm": 0.53125, + "learning_rate": 1.2773595876024214e-05, + "loss": 0.2617, + "step": 25588 + }, + { + "epoch": 3.41, + "grad_norm": 0.72265625, + "learning_rate": 1.2767901644116941e-05, + "loss": 0.2657, + "step": 25589 + }, + { + "epoch": 3.41, + "grad_norm": 0.63671875, + "learning_rate": 1.2762208595132885e-05, + "loss": 0.3658, + "step": 25590 + }, + { + "epoch": 3.41, + "grad_norm": 0.64453125, + "learning_rate": 1.2756516729149259e-05, + "loss": 0.1869, + "step": 25591 + }, + { + "epoch": 3.41, + "grad_norm": 0.6171875, + "learning_rate": 1.275082604624327e-05, + "loss": 0.6322, + "step": 25592 + }, + { + "epoch": 3.42, + "grad_norm": 0.66796875, + "learning_rate": 1.2745136546492098e-05, + "loss": 0.2965, + "step": 25593 + }, + { + "epoch": 3.42, + "grad_norm": 0.578125, + "learning_rate": 1.273944822997285e-05, + "loss": 0.2252, + "step": 25594 + }, + { + "epoch": 3.42, + "grad_norm": 0.609375, + "learning_rate": 1.2733761096762686e-05, + "loss": 0.259, + "step": 25595 + }, + { + "epoch": 3.42, + "grad_norm": 0.5625, + "learning_rate": 1.2728075146938723e-05, + "loss": 0.2767, + "step": 25596 + }, + { + "epoch": 3.42, + "grad_norm": 0.61328125, + "learning_rate": 1.2722390380578086e-05, + "loss": 0.3387, + "step": 25597 + }, + { + "epoch": 3.42, + "grad_norm": 0.71875, + "learning_rate": 1.271670679775786e-05, + "loss": 0.4592, + "step": 25598 + }, + { + "epoch": 3.42, + "grad_norm": 0.6484375, + "learning_rate": 1.2711024398555093e-05, + "loss": 0.3017, + "step": 25599 + }, + { + "epoch": 3.42, + "grad_norm": 0.57421875, + "learning_rate": 1.2705343183046869e-05, + "loss": 0.2281, + "step": 25600 + }, + { + "epoch": 3.42, + "grad_norm": 0.640625, + "learning_rate": 1.2699663151310203e-05, + "loss": 0.3365, + "step": 25601 + }, + { + "epoch": 3.42, + "grad_norm": 0.515625, + "learning_rate": 1.2693984303422135e-05, + "loss": 0.1918, + "step": 25602 + }, + { + "epoch": 3.42, + "grad_norm": 0.6484375, + "learning_rate": 1.268830663945969e-05, + "loss": 0.3099, + "step": 25603 + }, + { + "epoch": 3.42, + "grad_norm": 0.55859375, + "learning_rate": 1.2682630159499819e-05, + "loss": 0.5859, + "step": 25604 + }, + { + "epoch": 3.42, + "grad_norm": 0.59375, + "learning_rate": 1.2676954863619528e-05, + "loss": 0.3612, + "step": 25605 + }, + { + "epoch": 3.42, + "grad_norm": 0.51953125, + "learning_rate": 1.2671280751895776e-05, + "loss": 0.2887, + "step": 25606 + }, + { + "epoch": 3.42, + "grad_norm": 0.486328125, + "learning_rate": 1.2665607824405513e-05, + "loss": 0.4494, + "step": 25607 + }, + { + "epoch": 3.42, + "grad_norm": 0.6953125, + "learning_rate": 1.2659936081225654e-05, + "loss": 0.2263, + "step": 25608 + }, + { + "epoch": 3.42, + "grad_norm": 0.439453125, + "learning_rate": 1.2654265522433106e-05, + "loss": 0.2248, + "step": 25609 + }, + { + "epoch": 3.42, + "grad_norm": 0.47265625, + "learning_rate": 1.2648596148104775e-05, + "loss": 0.1139, + "step": 25610 + }, + { + "epoch": 3.42, + "grad_norm": 0.578125, + "learning_rate": 1.2642927958317563e-05, + "loss": 0.3743, + "step": 25611 + }, + { + "epoch": 3.42, + "grad_norm": 0.59375, + "learning_rate": 1.2637260953148322e-05, + "loss": 0.4032, + "step": 25612 + }, + { + "epoch": 3.42, + "grad_norm": 0.68359375, + "learning_rate": 1.2631595132673856e-05, + "loss": 0.265, + "step": 25613 + }, + { + "epoch": 3.42, + "grad_norm": 0.625, + "learning_rate": 1.262593049697104e-05, + "loss": 0.4149, + "step": 25614 + }, + { + "epoch": 3.42, + "grad_norm": 0.6796875, + "learning_rate": 1.2620267046116685e-05, + "loss": 0.2597, + "step": 25615 + }, + { + "epoch": 3.42, + "grad_norm": 0.83984375, + "learning_rate": 1.2614604780187611e-05, + "loss": 0.4384, + "step": 25616 + }, + { + "epoch": 3.42, + "grad_norm": 0.5703125, + "learning_rate": 1.2608943699260557e-05, + "loss": 0.2852, + "step": 25617 + }, + { + "epoch": 3.42, + "grad_norm": 0.796875, + "learning_rate": 1.2603283803412314e-05, + "loss": 0.5127, + "step": 25618 + }, + { + "epoch": 3.42, + "grad_norm": 0.703125, + "learning_rate": 1.2597625092719634e-05, + "loss": 0.3636, + "step": 25619 + }, + { + "epoch": 3.42, + "grad_norm": 0.5546875, + "learning_rate": 1.2591967567259244e-05, + "loss": 0.2997, + "step": 25620 + }, + { + "epoch": 3.42, + "grad_norm": 0.5859375, + "learning_rate": 1.2586311227107905e-05, + "loss": 0.1379, + "step": 25621 + }, + { + "epoch": 3.42, + "grad_norm": 0.408203125, + "learning_rate": 1.2580656072342256e-05, + "loss": 0.2258, + "step": 25622 + }, + { + "epoch": 3.42, + "grad_norm": 0.71875, + "learning_rate": 1.2575002103039025e-05, + "loss": 0.3528, + "step": 25623 + }, + { + "epoch": 3.42, + "grad_norm": 0.62890625, + "learning_rate": 1.2569349319274893e-05, + "loss": 0.3339, + "step": 25624 + }, + { + "epoch": 3.42, + "grad_norm": 0.609375, + "learning_rate": 1.2563697721126466e-05, + "loss": 0.5554, + "step": 25625 + }, + { + "epoch": 3.42, + "grad_norm": 0.5078125, + "learning_rate": 1.255804730867044e-05, + "loss": 0.2322, + "step": 25626 + }, + { + "epoch": 3.42, + "grad_norm": 0.65234375, + "learning_rate": 1.2552398081983397e-05, + "loss": 0.378, + "step": 25627 + }, + { + "epoch": 3.42, + "grad_norm": 0.78125, + "learning_rate": 1.2546750041141964e-05, + "loss": 0.2439, + "step": 25628 + }, + { + "epoch": 3.42, + "grad_norm": 0.61328125, + "learning_rate": 1.2541103186222713e-05, + "loss": 0.3158, + "step": 25629 + }, + { + "epoch": 3.42, + "grad_norm": 0.76171875, + "learning_rate": 1.2535457517302263e-05, + "loss": 0.3179, + "step": 25630 + }, + { + "epoch": 3.42, + "grad_norm": 0.62109375, + "learning_rate": 1.2529813034457128e-05, + "loss": 0.4403, + "step": 25631 + }, + { + "epoch": 3.42, + "grad_norm": 0.82421875, + "learning_rate": 1.2524169737763859e-05, + "loss": 0.3684, + "step": 25632 + }, + { + "epoch": 3.42, + "grad_norm": 0.60546875, + "learning_rate": 1.2518527627298981e-05, + "loss": 0.5036, + "step": 25633 + }, + { + "epoch": 3.42, + "grad_norm": 0.4609375, + "learning_rate": 1.2512886703139048e-05, + "loss": 0.2799, + "step": 25634 + }, + { + "epoch": 3.42, + "grad_norm": 0.625, + "learning_rate": 1.2507246965360498e-05, + "loss": 0.3125, + "step": 25635 + }, + { + "epoch": 3.42, + "grad_norm": 0.6640625, + "learning_rate": 1.2501608414039845e-05, + "loss": 0.4028, + "step": 25636 + }, + { + "epoch": 3.42, + "grad_norm": 0.60546875, + "learning_rate": 1.249597104925353e-05, + "loss": 0.3786, + "step": 25637 + }, + { + "epoch": 3.42, + "grad_norm": 0.75, + "learning_rate": 1.2490334871077991e-05, + "loss": 0.7039, + "step": 25638 + }, + { + "epoch": 3.42, + "grad_norm": 0.70703125, + "learning_rate": 1.2484699879589712e-05, + "loss": 0.2101, + "step": 25639 + }, + { + "epoch": 3.42, + "grad_norm": 0.72265625, + "learning_rate": 1.2479066074865042e-05, + "loss": 0.4357, + "step": 25640 + }, + { + "epoch": 3.42, + "grad_norm": 0.51171875, + "learning_rate": 1.24734334569804e-05, + "loss": 0.2179, + "step": 25641 + }, + { + "epoch": 3.42, + "grad_norm": 0.66796875, + "learning_rate": 1.2467802026012177e-05, + "loss": 0.3133, + "step": 25642 + }, + { + "epoch": 3.42, + "grad_norm": 0.62890625, + "learning_rate": 1.2462171782036746e-05, + "loss": 0.3437, + "step": 25643 + }, + { + "epoch": 3.42, + "grad_norm": 0.58203125, + "learning_rate": 1.245654272513046e-05, + "loss": 0.2099, + "step": 25644 + }, + { + "epoch": 3.42, + "grad_norm": 0.58203125, + "learning_rate": 1.2450914855369611e-05, + "loss": 0.3794, + "step": 25645 + }, + { + "epoch": 3.42, + "grad_norm": 0.7265625, + "learning_rate": 1.244528817283056e-05, + "loss": 0.2485, + "step": 25646 + }, + { + "epoch": 3.42, + "grad_norm": 0.54296875, + "learning_rate": 1.2439662677589581e-05, + "loss": 0.1973, + "step": 25647 + }, + { + "epoch": 3.42, + "grad_norm": 0.365234375, + "learning_rate": 1.2434038369723e-05, + "loss": 0.1, + "step": 25648 + }, + { + "epoch": 3.42, + "grad_norm": 0.5859375, + "learning_rate": 1.2428415249307057e-05, + "loss": 0.3818, + "step": 25649 + }, + { + "epoch": 3.42, + "grad_norm": 0.66796875, + "learning_rate": 1.242279331641799e-05, + "loss": 0.1799, + "step": 25650 + }, + { + "epoch": 3.42, + "grad_norm": 0.59765625, + "learning_rate": 1.2417172571132052e-05, + "loss": 0.4754, + "step": 25651 + }, + { + "epoch": 3.42, + "grad_norm": 0.66796875, + "learning_rate": 1.2411553013525457e-05, + "loss": 0.3338, + "step": 25652 + }, + { + "epoch": 3.42, + "grad_norm": 0.439453125, + "learning_rate": 1.2405934643674444e-05, + "loss": 0.1572, + "step": 25653 + }, + { + "epoch": 3.42, + "grad_norm": 0.95703125, + "learning_rate": 1.2400317461655142e-05, + "loss": 0.4911, + "step": 25654 + }, + { + "epoch": 3.42, + "grad_norm": 0.53515625, + "learning_rate": 1.2394701467543768e-05, + "loss": 0.2887, + "step": 25655 + }, + { + "epoch": 3.42, + "grad_norm": 0.64453125, + "learning_rate": 1.2389086661416471e-05, + "loss": 0.1982, + "step": 25656 + }, + { + "epoch": 3.42, + "grad_norm": 0.7265625, + "learning_rate": 1.2383473043349392e-05, + "loss": 0.2682, + "step": 25657 + }, + { + "epoch": 3.42, + "grad_norm": 0.640625, + "learning_rate": 1.2377860613418646e-05, + "loss": 0.3763, + "step": 25658 + }, + { + "epoch": 3.42, + "grad_norm": 0.6875, + "learning_rate": 1.237224937170034e-05, + "loss": 0.4534, + "step": 25659 + }, + { + "epoch": 3.42, + "grad_norm": 0.421875, + "learning_rate": 1.236663931827059e-05, + "loss": 0.1756, + "step": 25660 + }, + { + "epoch": 3.42, + "grad_norm": 0.55859375, + "learning_rate": 1.2361030453205435e-05, + "loss": 0.2316, + "step": 25661 + }, + { + "epoch": 3.42, + "grad_norm": 0.640625, + "learning_rate": 1.2355422776580972e-05, + "loss": 0.546, + "step": 25662 + }, + { + "epoch": 3.42, + "grad_norm": 0.74609375, + "learning_rate": 1.2349816288473203e-05, + "loss": 0.1666, + "step": 25663 + }, + { + "epoch": 3.42, + "grad_norm": 0.72265625, + "learning_rate": 1.2344210988958183e-05, + "loss": 0.3008, + "step": 25664 + }, + { + "epoch": 3.42, + "grad_norm": 0.6640625, + "learning_rate": 1.2338606878111913e-05, + "loss": 0.3166, + "step": 25665 + }, + { + "epoch": 3.42, + "grad_norm": 0.75, + "learning_rate": 1.2333003956010424e-05, + "loss": 0.3829, + "step": 25666 + }, + { + "epoch": 3.43, + "grad_norm": 0.62109375, + "learning_rate": 1.2327402222729633e-05, + "loss": 0.2365, + "step": 25667 + }, + { + "epoch": 3.43, + "grad_norm": 0.796875, + "learning_rate": 1.2321801678345545e-05, + "loss": 0.3647, + "step": 25668 + }, + { + "epoch": 3.43, + "grad_norm": 0.55078125, + "learning_rate": 1.23162023229341e-05, + "loss": 0.1965, + "step": 25669 + }, + { + "epoch": 3.43, + "grad_norm": 0.72265625, + "learning_rate": 1.2310604156571226e-05, + "loss": 0.2815, + "step": 25670 + }, + { + "epoch": 3.43, + "grad_norm": 0.435546875, + "learning_rate": 1.230500717933285e-05, + "loss": 0.1179, + "step": 25671 + }, + { + "epoch": 3.43, + "grad_norm": 0.5546875, + "learning_rate": 1.2299411391294869e-05, + "loss": 0.327, + "step": 25672 + }, + { + "epoch": 3.43, + "grad_norm": 0.48828125, + "learning_rate": 1.2293816792533131e-05, + "loss": 0.3387, + "step": 25673 + }, + { + "epoch": 3.43, + "grad_norm": 0.55859375, + "learning_rate": 1.228822338312352e-05, + "loss": 0.1521, + "step": 25674 + }, + { + "epoch": 3.43, + "grad_norm": 0.4453125, + "learning_rate": 1.22826311631419e-05, + "loss": 0.2107, + "step": 25675 + }, + { + "epoch": 3.43, + "grad_norm": 0.625, + "learning_rate": 1.2277040132664119e-05, + "loss": 0.1849, + "step": 25676 + }, + { + "epoch": 3.43, + "grad_norm": 0.5078125, + "learning_rate": 1.2271450291765951e-05, + "loss": 0.3366, + "step": 25677 + }, + { + "epoch": 3.43, + "grad_norm": 0.64453125, + "learning_rate": 1.2265861640523224e-05, + "loss": 0.3552, + "step": 25678 + }, + { + "epoch": 3.43, + "grad_norm": 0.85546875, + "learning_rate": 1.2260274179011722e-05, + "loss": 0.3171, + "step": 25679 + }, + { + "epoch": 3.43, + "grad_norm": 0.6484375, + "learning_rate": 1.2254687907307228e-05, + "loss": 0.2207, + "step": 25680 + }, + { + "epoch": 3.43, + "grad_norm": 0.625, + "learning_rate": 1.2249102825485459e-05, + "loss": 0.6012, + "step": 25681 + }, + { + "epoch": 3.43, + "grad_norm": 0.7109375, + "learning_rate": 1.2243518933622178e-05, + "loss": 0.585, + "step": 25682 + }, + { + "epoch": 3.43, + "grad_norm": 0.6875, + "learning_rate": 1.22379362317931e-05, + "loss": 0.2813, + "step": 25683 + }, + { + "epoch": 3.43, + "grad_norm": 0.62890625, + "learning_rate": 1.2232354720073958e-05, + "loss": 0.4433, + "step": 25684 + }, + { + "epoch": 3.43, + "grad_norm": 0.703125, + "learning_rate": 1.2226774398540408e-05, + "loss": 0.3791, + "step": 25685 + }, + { + "epoch": 3.43, + "grad_norm": 0.5703125, + "learning_rate": 1.2221195267268115e-05, + "loss": 0.196, + "step": 25686 + }, + { + "epoch": 3.43, + "grad_norm": 0.7109375, + "learning_rate": 1.2215617326332752e-05, + "loss": 0.2935, + "step": 25687 + }, + { + "epoch": 3.43, + "grad_norm": 0.5, + "learning_rate": 1.2210040575809944e-05, + "loss": 0.2232, + "step": 25688 + }, + { + "epoch": 3.43, + "grad_norm": 0.546875, + "learning_rate": 1.2204465015775357e-05, + "loss": 0.1589, + "step": 25689 + }, + { + "epoch": 3.43, + "grad_norm": 0.53125, + "learning_rate": 1.219889064630455e-05, + "loss": 0.3216, + "step": 25690 + }, + { + "epoch": 3.43, + "grad_norm": 0.7109375, + "learning_rate": 1.2193317467473142e-05, + "loss": 0.538, + "step": 25691 + }, + { + "epoch": 3.43, + "grad_norm": 0.482421875, + "learning_rate": 1.2187745479356704e-05, + "loss": 0.2978, + "step": 25692 + }, + { + "epoch": 3.43, + "grad_norm": 0.66796875, + "learning_rate": 1.2182174682030789e-05, + "loss": 0.3344, + "step": 25693 + }, + { + "epoch": 3.43, + "grad_norm": 0.61328125, + "learning_rate": 1.2176605075570968e-05, + "loss": 0.4083, + "step": 25694 + }, + { + "epoch": 3.43, + "grad_norm": 0.6484375, + "learning_rate": 1.2171036660052726e-05, + "loss": 0.4796, + "step": 25695 + }, + { + "epoch": 3.43, + "grad_norm": 0.421875, + "learning_rate": 1.2165469435551591e-05, + "loss": 0.271, + "step": 25696 + }, + { + "epoch": 3.43, + "grad_norm": 0.482421875, + "learning_rate": 1.2159903402143091e-05, + "loss": 0.2474, + "step": 25697 + }, + { + "epoch": 3.43, + "grad_norm": 0.65625, + "learning_rate": 1.2154338559902645e-05, + "loss": 0.454, + "step": 25698 + }, + { + "epoch": 3.43, + "grad_norm": 0.6484375, + "learning_rate": 1.214877490890578e-05, + "loss": 0.2743, + "step": 25699 + }, + { + "epoch": 3.43, + "grad_norm": 0.51953125, + "learning_rate": 1.2143212449227881e-05, + "loss": 0.0965, + "step": 25700 + }, + { + "epoch": 3.43, + "grad_norm": 0.62890625, + "learning_rate": 1.213765118094441e-05, + "loss": 0.4667, + "step": 25701 + }, + { + "epoch": 3.43, + "grad_norm": 0.5, + "learning_rate": 1.2132091104130782e-05, + "loss": 0.3997, + "step": 25702 + }, + { + "epoch": 3.43, + "grad_norm": 0.609375, + "learning_rate": 1.2126532218862408e-05, + "loss": 0.3431, + "step": 25703 + }, + { + "epoch": 3.43, + "grad_norm": 0.6015625, + "learning_rate": 1.2120974525214634e-05, + "loss": 0.4221, + "step": 25704 + }, + { + "epoch": 3.43, + "grad_norm": 0.59375, + "learning_rate": 1.2115418023262849e-05, + "loss": 0.201, + "step": 25705 + }, + { + "epoch": 3.43, + "grad_norm": 0.46875, + "learning_rate": 1.2109862713082398e-05, + "loss": 0.335, + "step": 25706 + }, + { + "epoch": 3.43, + "grad_norm": 0.498046875, + "learning_rate": 1.2104308594748626e-05, + "loss": 0.238, + "step": 25707 + }, + { + "epoch": 3.43, + "grad_norm": 0.7265625, + "learning_rate": 1.2098755668336836e-05, + "loss": 0.6162, + "step": 25708 + }, + { + "epoch": 3.43, + "grad_norm": 0.54296875, + "learning_rate": 1.2093203933922347e-05, + "loss": 0.2763, + "step": 25709 + }, + { + "epoch": 3.43, + "grad_norm": 0.66796875, + "learning_rate": 1.2087653391580412e-05, + "loss": 0.2978, + "step": 25710 + }, + { + "epoch": 3.43, + "grad_norm": 0.63671875, + "learning_rate": 1.2082104041386311e-05, + "loss": 0.2604, + "step": 25711 + }, + { + "epoch": 3.43, + "grad_norm": 0.482421875, + "learning_rate": 1.207655588341534e-05, + "loss": 0.3748, + "step": 25712 + }, + { + "epoch": 3.43, + "grad_norm": 0.51953125, + "learning_rate": 1.2071008917742665e-05, + "loss": 0.2224, + "step": 25713 + }, + { + "epoch": 3.43, + "grad_norm": 0.59375, + "learning_rate": 1.2065463144443556e-05, + "loss": 0.2663, + "step": 25714 + }, + { + "epoch": 3.43, + "grad_norm": 0.6171875, + "learning_rate": 1.2059918563593187e-05, + "loss": 0.4301, + "step": 25715 + }, + { + "epoch": 3.43, + "grad_norm": 0.427734375, + "learning_rate": 1.2054375175266774e-05, + "loss": 0.1161, + "step": 25716 + }, + { + "epoch": 3.43, + "grad_norm": 0.62109375, + "learning_rate": 1.2048832979539493e-05, + "loss": 0.3555, + "step": 25717 + }, + { + "epoch": 3.43, + "grad_norm": 0.55078125, + "learning_rate": 1.2043291976486459e-05, + "loss": 0.4323, + "step": 25718 + }, + { + "epoch": 3.43, + "grad_norm": 0.609375, + "learning_rate": 1.2037752166182847e-05, + "loss": 0.4037, + "step": 25719 + }, + { + "epoch": 3.43, + "grad_norm": 0.4375, + "learning_rate": 1.2032213548703752e-05, + "loss": 0.1523, + "step": 25720 + }, + { + "epoch": 3.43, + "grad_norm": 0.640625, + "learning_rate": 1.2026676124124336e-05, + "loss": 0.3083, + "step": 25721 + }, + { + "epoch": 3.43, + "grad_norm": 0.6171875, + "learning_rate": 1.202113989251965e-05, + "loss": 0.4576, + "step": 25722 + }, + { + "epoch": 3.43, + "grad_norm": 0.6484375, + "learning_rate": 1.2015604853964746e-05, + "loss": 0.5314, + "step": 25723 + }, + { + "epoch": 3.43, + "grad_norm": 0.69140625, + "learning_rate": 1.2010071008534707e-05, + "loss": 0.2088, + "step": 25724 + }, + { + "epoch": 3.43, + "grad_norm": 0.69140625, + "learning_rate": 1.2004538356304573e-05, + "loss": 0.5677, + "step": 25725 + }, + { + "epoch": 3.43, + "grad_norm": 0.59765625, + "learning_rate": 1.1999006897349408e-05, + "loss": 0.3983, + "step": 25726 + }, + { + "epoch": 3.43, + "grad_norm": 0.53125, + "learning_rate": 1.1993476631744161e-05, + "loss": 0.4531, + "step": 25727 + }, + { + "epoch": 3.43, + "grad_norm": 0.66015625, + "learning_rate": 1.198794755956385e-05, + "loss": 0.4142, + "step": 25728 + }, + { + "epoch": 3.43, + "grad_norm": 0.77734375, + "learning_rate": 1.1982419680883461e-05, + "loss": 0.5527, + "step": 25729 + }, + { + "epoch": 3.43, + "grad_norm": 0.72265625, + "learning_rate": 1.1976892995777966e-05, + "loss": 0.4708, + "step": 25730 + }, + { + "epoch": 3.43, + "grad_norm": 0.5859375, + "learning_rate": 1.1971367504322273e-05, + "loss": 0.4159, + "step": 25731 + }, + { + "epoch": 3.43, + "grad_norm": 0.5859375, + "learning_rate": 1.1965843206591332e-05, + "loss": 0.5476, + "step": 25732 + }, + { + "epoch": 3.43, + "grad_norm": 0.53125, + "learning_rate": 1.1960320102660083e-05, + "loss": 0.2166, + "step": 25733 + }, + { + "epoch": 3.43, + "grad_norm": 0.5390625, + "learning_rate": 1.1954798192603379e-05, + "loss": 0.331, + "step": 25734 + }, + { + "epoch": 3.43, + "grad_norm": 0.4765625, + "learning_rate": 1.1949277476496134e-05, + "loss": 0.2547, + "step": 25735 + }, + { + "epoch": 3.43, + "grad_norm": 0.78515625, + "learning_rate": 1.1943757954413182e-05, + "loss": 0.4453, + "step": 25736 + }, + { + "epoch": 3.43, + "grad_norm": 0.62890625, + "learning_rate": 1.1938239626429381e-05, + "loss": 0.3518, + "step": 25737 + }, + { + "epoch": 3.43, + "grad_norm": 0.6875, + "learning_rate": 1.1932722492619563e-05, + "loss": 0.2913, + "step": 25738 + }, + { + "epoch": 3.43, + "grad_norm": 0.5859375, + "learning_rate": 1.1927206553058579e-05, + "loss": 0.5007, + "step": 25739 + }, + { + "epoch": 3.43, + "grad_norm": 0.6015625, + "learning_rate": 1.1921691807821178e-05, + "loss": 0.3857, + "step": 25740 + }, + { + "epoch": 3.43, + "grad_norm": 0.57421875, + "learning_rate": 1.1916178256982159e-05, + "loss": 0.3234, + "step": 25741 + }, + { + "epoch": 3.44, + "grad_norm": 0.703125, + "learning_rate": 1.1910665900616303e-05, + "loss": 0.7507, + "step": 25742 + }, + { + "epoch": 3.44, + "grad_norm": 0.54296875, + "learning_rate": 1.190515473879834e-05, + "loss": 0.2149, + "step": 25743 + }, + { + "epoch": 3.44, + "grad_norm": 0.4296875, + "learning_rate": 1.1899644771603046e-05, + "loss": 0.2183, + "step": 25744 + }, + { + "epoch": 3.44, + "grad_norm": 0.52734375, + "learning_rate": 1.1894135999105117e-05, + "loss": 0.2209, + "step": 25745 + }, + { + "epoch": 3.44, + "grad_norm": 0.671875, + "learning_rate": 1.1888628421379221e-05, + "loss": 0.4227, + "step": 25746 + }, + { + "epoch": 3.44, + "grad_norm": 0.640625, + "learning_rate": 1.1883122038500072e-05, + "loss": 0.3943, + "step": 25747 + }, + { + "epoch": 3.44, + "grad_norm": 0.76953125, + "learning_rate": 1.1877616850542339e-05, + "loss": 0.1694, + "step": 25748 + }, + { + "epoch": 3.44, + "grad_norm": 0.392578125, + "learning_rate": 1.1872112857580709e-05, + "loss": 0.1649, + "step": 25749 + }, + { + "epoch": 3.44, + "grad_norm": 0.6484375, + "learning_rate": 1.1866610059689765e-05, + "loss": 0.2802, + "step": 25750 + }, + { + "epoch": 3.44, + "grad_norm": 0.515625, + "learning_rate": 1.1861108456944147e-05, + "loss": 0.3522, + "step": 25751 + }, + { + "epoch": 3.44, + "grad_norm": 0.40625, + "learning_rate": 1.1855608049418466e-05, + "loss": 0.1607, + "step": 25752 + }, + { + "epoch": 3.44, + "grad_norm": 0.478515625, + "learning_rate": 1.1850108837187335e-05, + "loss": 0.2473, + "step": 25753 + }, + { + "epoch": 3.44, + "grad_norm": 0.5859375, + "learning_rate": 1.1844610820325286e-05, + "loss": 0.4368, + "step": 25754 + }, + { + "epoch": 3.44, + "grad_norm": 0.57421875, + "learning_rate": 1.1839113998906882e-05, + "loss": 0.1646, + "step": 25755 + }, + { + "epoch": 3.44, + "grad_norm": 0.60546875, + "learning_rate": 1.1833618373006683e-05, + "loss": 0.3283, + "step": 25756 + }, + { + "epoch": 3.44, + "grad_norm": 0.4921875, + "learning_rate": 1.182812394269922e-05, + "loss": 0.1376, + "step": 25757 + }, + { + "epoch": 3.44, + "grad_norm": 0.58984375, + "learning_rate": 1.1822630708058979e-05, + "loss": 0.2031, + "step": 25758 + }, + { + "epoch": 3.44, + "grad_norm": 0.69140625, + "learning_rate": 1.1817138669160443e-05, + "loss": 0.4005, + "step": 25759 + }, + { + "epoch": 3.44, + "grad_norm": 0.51171875, + "learning_rate": 1.1811647826078098e-05, + "loss": 0.1865, + "step": 25760 + }, + { + "epoch": 3.44, + "grad_norm": 0.64453125, + "learning_rate": 1.1806158178886417e-05, + "loss": 0.5234, + "step": 25761 + }, + { + "epoch": 3.44, + "grad_norm": 0.70703125, + "learning_rate": 1.180066972765984e-05, + "loss": 0.2543, + "step": 25762 + }, + { + "epoch": 3.44, + "grad_norm": 0.64453125, + "learning_rate": 1.1795182472472777e-05, + "loss": 0.3894, + "step": 25763 + }, + { + "epoch": 3.44, + "grad_norm": 0.5625, + "learning_rate": 1.1789696413399642e-05, + "loss": 0.4436, + "step": 25764 + }, + { + "epoch": 3.44, + "grad_norm": 0.63671875, + "learning_rate": 1.1784211550514846e-05, + "loss": 0.1479, + "step": 25765 + }, + { + "epoch": 3.44, + "grad_norm": 0.435546875, + "learning_rate": 1.1778727883892748e-05, + "loss": 0.2727, + "step": 25766 + }, + { + "epoch": 3.44, + "grad_norm": 0.65625, + "learning_rate": 1.1773245413607747e-05, + "loss": 0.2682, + "step": 25767 + }, + { + "epoch": 3.44, + "grad_norm": 0.65234375, + "learning_rate": 1.1767764139734139e-05, + "loss": 0.3057, + "step": 25768 + }, + { + "epoch": 3.44, + "grad_norm": 0.79296875, + "learning_rate": 1.1762284062346286e-05, + "loss": 0.3493, + "step": 25769 + }, + { + "epoch": 3.44, + "grad_norm": 0.546875, + "learning_rate": 1.1756805181518492e-05, + "loss": 0.2076, + "step": 25770 + }, + { + "epoch": 3.44, + "grad_norm": 0.74609375, + "learning_rate": 1.1751327497325049e-05, + "loss": 0.2398, + "step": 25771 + }, + { + "epoch": 3.44, + "grad_norm": 0.640625, + "learning_rate": 1.174585100984026e-05, + "loss": 0.5201, + "step": 25772 + }, + { + "epoch": 3.44, + "grad_norm": 0.69921875, + "learning_rate": 1.1740375719138352e-05, + "loss": 0.2697, + "step": 25773 + }, + { + "epoch": 3.44, + "grad_norm": 0.578125, + "learning_rate": 1.1734901625293592e-05, + "loss": 0.2833, + "step": 25774 + }, + { + "epoch": 3.44, + "grad_norm": 0.51171875, + "learning_rate": 1.172942872838021e-05, + "loss": 0.2522, + "step": 25775 + }, + { + "epoch": 3.44, + "grad_norm": 0.478515625, + "learning_rate": 1.1723957028472454e-05, + "loss": 0.313, + "step": 25776 + }, + { + "epoch": 3.44, + "grad_norm": 0.80078125, + "learning_rate": 1.171848652564448e-05, + "loss": 0.4447, + "step": 25777 + }, + { + "epoch": 3.44, + "grad_norm": 0.50390625, + "learning_rate": 1.171301721997048e-05, + "loss": 0.2171, + "step": 25778 + }, + { + "epoch": 3.44, + "grad_norm": 0.6171875, + "learning_rate": 1.1707549111524618e-05, + "loss": 0.2407, + "step": 25779 + }, + { + "epoch": 3.44, + "grad_norm": 0.64453125, + "learning_rate": 1.1702082200381092e-05, + "loss": 0.464, + "step": 25780 + }, + { + "epoch": 3.44, + "grad_norm": 0.53125, + "learning_rate": 1.1696616486613977e-05, + "loss": 0.3478, + "step": 25781 + }, + { + "epoch": 3.44, + "grad_norm": 0.73046875, + "learning_rate": 1.1691151970297432e-05, + "loss": 0.3598, + "step": 25782 + }, + { + "epoch": 3.44, + "grad_norm": 0.62890625, + "learning_rate": 1.1685688651505521e-05, + "loss": 0.3494, + "step": 25783 + }, + { + "epoch": 3.44, + "grad_norm": 0.6328125, + "learning_rate": 1.1680226530312354e-05, + "loss": 0.2552, + "step": 25784 + }, + { + "epoch": 3.44, + "grad_norm": 0.71484375, + "learning_rate": 1.1674765606792026e-05, + "loss": 0.3727, + "step": 25785 + }, + { + "epoch": 3.44, + "grad_norm": 0.5234375, + "learning_rate": 1.1669305881018534e-05, + "loss": 0.3193, + "step": 25786 + }, + { + "epoch": 3.44, + "grad_norm": 0.77734375, + "learning_rate": 1.1663847353065948e-05, + "loss": 0.2935, + "step": 25787 + }, + { + "epoch": 3.44, + "grad_norm": 0.5703125, + "learning_rate": 1.165839002300828e-05, + "loss": 0.2495, + "step": 25788 + }, + { + "epoch": 3.44, + "grad_norm": 0.490234375, + "learning_rate": 1.1652933890919549e-05, + "loss": 0.2104, + "step": 25789 + }, + { + "epoch": 3.44, + "grad_norm": 0.474609375, + "learning_rate": 1.1647478956873747e-05, + "loss": 0.1875, + "step": 25790 + }, + { + "epoch": 3.44, + "grad_norm": 0.64453125, + "learning_rate": 1.1642025220944819e-05, + "loss": 0.1211, + "step": 25791 + }, + { + "epoch": 3.44, + "grad_norm": 0.57421875, + "learning_rate": 1.1636572683206736e-05, + "loss": 0.3306, + "step": 25792 + }, + { + "epoch": 3.44, + "grad_norm": 0.59375, + "learning_rate": 1.1631121343733443e-05, + "loss": 0.1348, + "step": 25793 + }, + { + "epoch": 3.44, + "grad_norm": 0.76953125, + "learning_rate": 1.1625671202598875e-05, + "loss": 0.1601, + "step": 25794 + }, + { + "epoch": 3.44, + "grad_norm": 0.71484375, + "learning_rate": 1.1620222259876923e-05, + "loss": 0.5087, + "step": 25795 + }, + { + "epoch": 3.44, + "grad_norm": 0.63671875, + "learning_rate": 1.1614774515641457e-05, + "loss": 0.23, + "step": 25796 + }, + { + "epoch": 3.44, + "grad_norm": 0.470703125, + "learning_rate": 1.1609327969966377e-05, + "loss": 0.2171, + "step": 25797 + }, + { + "epoch": 3.44, + "grad_norm": 0.5703125, + "learning_rate": 1.1603882622925544e-05, + "loss": 0.286, + "step": 25798 + }, + { + "epoch": 3.44, + "grad_norm": 0.734375, + "learning_rate": 1.159843847459281e-05, + "loss": 0.4928, + "step": 25799 + }, + { + "epoch": 3.44, + "grad_norm": 0.703125, + "learning_rate": 1.1592995525041962e-05, + "loss": 0.4218, + "step": 25800 + }, + { + "epoch": 3.44, + "grad_norm": 0.4765625, + "learning_rate": 1.1587553774346838e-05, + "loss": 0.2006, + "step": 25801 + }, + { + "epoch": 3.44, + "grad_norm": 0.50390625, + "learning_rate": 1.1582113222581225e-05, + "loss": 0.1271, + "step": 25802 + }, + { + "epoch": 3.44, + "grad_norm": 0.578125, + "learning_rate": 1.1576673869818921e-05, + "loss": 0.3983, + "step": 25803 + }, + { + "epoch": 3.44, + "grad_norm": 0.9921875, + "learning_rate": 1.1571235716133656e-05, + "loss": 0.3831, + "step": 25804 + }, + { + "epoch": 3.44, + "grad_norm": 0.53125, + "learning_rate": 1.156579876159919e-05, + "loss": 0.3816, + "step": 25805 + }, + { + "epoch": 3.44, + "grad_norm": 0.4765625, + "learning_rate": 1.1560363006289266e-05, + "loss": 0.3493, + "step": 25806 + }, + { + "epoch": 3.44, + "grad_norm": 0.6796875, + "learning_rate": 1.1554928450277546e-05, + "loss": 0.3855, + "step": 25807 + }, + { + "epoch": 3.44, + "grad_norm": 0.51171875, + "learning_rate": 1.1549495093637796e-05, + "loss": 0.2295, + "step": 25808 + }, + { + "epoch": 3.44, + "grad_norm": 0.57421875, + "learning_rate": 1.1544062936443634e-05, + "loss": 0.1977, + "step": 25809 + }, + { + "epoch": 3.44, + "grad_norm": 0.73828125, + "learning_rate": 1.1538631978768754e-05, + "loss": 0.3373, + "step": 25810 + }, + { + "epoch": 3.44, + "grad_norm": 0.5703125, + "learning_rate": 1.153320222068679e-05, + "loss": 0.1847, + "step": 25811 + }, + { + "epoch": 3.44, + "grad_norm": 0.47265625, + "learning_rate": 1.1527773662271413e-05, + "loss": 0.2817, + "step": 25812 + }, + { + "epoch": 3.44, + "grad_norm": 0.55078125, + "learning_rate": 1.1522346303596177e-05, + "loss": 0.173, + "step": 25813 + }, + { + "epoch": 3.44, + "grad_norm": 0.490234375, + "learning_rate": 1.1516920144734711e-05, + "loss": 0.1814, + "step": 25814 + }, + { + "epoch": 3.44, + "grad_norm": 0.78515625, + "learning_rate": 1.1511495185760601e-05, + "loss": 0.3022, + "step": 25815 + }, + { + "epoch": 3.44, + "grad_norm": 0.80859375, + "learning_rate": 1.15060714267474e-05, + "loss": 0.381, + "step": 25816 + }, + { + "epoch": 3.45, + "grad_norm": 0.6484375, + "learning_rate": 1.1500648867768693e-05, + "loss": 0.3915, + "step": 25817 + }, + { + "epoch": 3.45, + "grad_norm": 0.49609375, + "learning_rate": 1.1495227508897988e-05, + "loss": 0.2536, + "step": 25818 + }, + { + "epoch": 3.45, + "grad_norm": 0.6328125, + "learning_rate": 1.148980735020877e-05, + "loss": 0.2766, + "step": 25819 + }, + { + "epoch": 3.45, + "grad_norm": 0.5625, + "learning_rate": 1.148438839177457e-05, + "loss": 0.2984, + "step": 25820 + }, + { + "epoch": 3.45, + "grad_norm": 0.953125, + "learning_rate": 1.1478970633668884e-05, + "loss": 0.3164, + "step": 25821 + }, + { + "epoch": 3.45, + "grad_norm": 0.66796875, + "learning_rate": 1.1473554075965176e-05, + "loss": 0.4258, + "step": 25822 + }, + { + "epoch": 3.45, + "grad_norm": 0.88671875, + "learning_rate": 1.1468138718736877e-05, + "loss": 0.4279, + "step": 25823 + }, + { + "epoch": 3.45, + "grad_norm": 0.55078125, + "learning_rate": 1.1462724562057437e-05, + "loss": 0.1524, + "step": 25824 + }, + { + "epoch": 3.45, + "grad_norm": 0.44140625, + "learning_rate": 1.1457311606000276e-05, + "loss": 0.1152, + "step": 25825 + }, + { + "epoch": 3.45, + "grad_norm": 0.51953125, + "learning_rate": 1.1451899850638814e-05, + "loss": 0.2557, + "step": 25826 + }, + { + "epoch": 3.45, + "grad_norm": 0.5390625, + "learning_rate": 1.1446489296046392e-05, + "loss": 0.2742, + "step": 25827 + }, + { + "epoch": 3.45, + "grad_norm": 0.6171875, + "learning_rate": 1.1441079942296417e-05, + "loss": 0.1695, + "step": 25828 + }, + { + "epoch": 3.45, + "grad_norm": 0.75, + "learning_rate": 1.1435671789462232e-05, + "loss": 0.5325, + "step": 25829 + }, + { + "epoch": 3.45, + "grad_norm": 0.5546875, + "learning_rate": 1.14302648376172e-05, + "loss": 0.2211, + "step": 25830 + }, + { + "epoch": 3.45, + "grad_norm": 0.515625, + "learning_rate": 1.1424859086834617e-05, + "loss": 0.2139, + "step": 25831 + }, + { + "epoch": 3.45, + "grad_norm": 0.6796875, + "learning_rate": 1.1419454537187779e-05, + "loss": 0.2687, + "step": 25832 + }, + { + "epoch": 3.45, + "grad_norm": 0.68359375, + "learning_rate": 1.1414051188749985e-05, + "loss": 0.4386, + "step": 25833 + }, + { + "epoch": 3.45, + "grad_norm": 0.73046875, + "learning_rate": 1.140864904159451e-05, + "loss": 0.3373, + "step": 25834 + }, + { + "epoch": 3.45, + "grad_norm": 0.455078125, + "learning_rate": 1.1403248095794628e-05, + "loss": 0.1247, + "step": 25835 + }, + { + "epoch": 3.45, + "grad_norm": 0.6484375, + "learning_rate": 1.1397848351423545e-05, + "loss": 0.2246, + "step": 25836 + }, + { + "epoch": 3.45, + "grad_norm": 0.49609375, + "learning_rate": 1.1392449808554506e-05, + "loss": 0.1963, + "step": 25837 + }, + { + "epoch": 3.45, + "grad_norm": 0.72265625, + "learning_rate": 1.1387052467260706e-05, + "loss": 0.2495, + "step": 25838 + }, + { + "epoch": 3.45, + "grad_norm": 0.54296875, + "learning_rate": 1.1381656327615354e-05, + "loss": 0.2398, + "step": 25839 + }, + { + "epoch": 3.45, + "grad_norm": 0.75, + "learning_rate": 1.1376261389691634e-05, + "loss": 0.3736, + "step": 25840 + }, + { + "epoch": 3.45, + "grad_norm": 0.52734375, + "learning_rate": 1.1370867653562678e-05, + "loss": 0.2643, + "step": 25841 + }, + { + "epoch": 3.45, + "grad_norm": 0.69921875, + "learning_rate": 1.136547511930165e-05, + "loss": 0.5008, + "step": 25842 + }, + { + "epoch": 3.45, + "grad_norm": 0.7578125, + "learning_rate": 1.1360083786981645e-05, + "loss": 0.3916, + "step": 25843 + }, + { + "epoch": 3.45, + "grad_norm": 0.5859375, + "learning_rate": 1.1354693656675786e-05, + "loss": 0.4089, + "step": 25844 + }, + { + "epoch": 3.45, + "grad_norm": 0.77734375, + "learning_rate": 1.1349304728457199e-05, + "loss": 0.3149, + "step": 25845 + }, + { + "epoch": 3.45, + "grad_norm": 0.5, + "learning_rate": 1.1343917002398919e-05, + "loss": 0.1801, + "step": 25846 + }, + { + "epoch": 3.45, + "grad_norm": 0.51953125, + "learning_rate": 1.1338530478574017e-05, + "loss": 0.2589, + "step": 25847 + }, + { + "epoch": 3.45, + "grad_norm": 0.67578125, + "learning_rate": 1.1333145157055546e-05, + "loss": 0.3741, + "step": 25848 + }, + { + "epoch": 3.45, + "grad_norm": 0.765625, + "learning_rate": 1.1327761037916563e-05, + "loss": 0.5598, + "step": 25849 + }, + { + "epoch": 3.45, + "grad_norm": 0.64453125, + "learning_rate": 1.1322378121230015e-05, + "loss": 0.3654, + "step": 25850 + }, + { + "epoch": 3.45, + "grad_norm": 0.73828125, + "learning_rate": 1.1316996407068946e-05, + "loss": 0.2537, + "step": 25851 + }, + { + "epoch": 3.45, + "grad_norm": 0.53125, + "learning_rate": 1.1311615895506311e-05, + "loss": 0.3726, + "step": 25852 + }, + { + "epoch": 3.45, + "grad_norm": 0.6484375, + "learning_rate": 1.1306236586615104e-05, + "loss": 0.2682, + "step": 25853 + }, + { + "epoch": 3.45, + "grad_norm": 0.6953125, + "learning_rate": 1.1300858480468246e-05, + "loss": 0.4386, + "step": 25854 + }, + { + "epoch": 3.45, + "grad_norm": 0.6328125, + "learning_rate": 1.1295481577138677e-05, + "loss": 0.4156, + "step": 25855 + }, + { + "epoch": 3.45, + "grad_norm": 0.8515625, + "learning_rate": 1.1290105876699298e-05, + "loss": 0.4878, + "step": 25856 + }, + { + "epoch": 3.45, + "grad_norm": 0.6484375, + "learning_rate": 1.1284731379223012e-05, + "loss": 0.221, + "step": 25857 + }, + { + "epoch": 3.45, + "grad_norm": 0.77734375, + "learning_rate": 1.1279358084782732e-05, + "loss": 0.529, + "step": 25858 + }, + { + "epoch": 3.45, + "grad_norm": 0.76953125, + "learning_rate": 1.1273985993451274e-05, + "loss": 0.2471, + "step": 25859 + }, + { + "epoch": 3.45, + "grad_norm": 0.54296875, + "learning_rate": 1.1268615105301505e-05, + "loss": 0.1621, + "step": 25860 + }, + { + "epoch": 3.45, + "grad_norm": 0.5625, + "learning_rate": 1.1263245420406266e-05, + "loss": 0.1896, + "step": 25861 + }, + { + "epoch": 3.45, + "grad_norm": 0.494140625, + "learning_rate": 1.1257876938838375e-05, + "loss": 0.4644, + "step": 25862 + }, + { + "epoch": 3.45, + "grad_norm": 0.4921875, + "learning_rate": 1.1252509660670641e-05, + "loss": 0.2005, + "step": 25863 + }, + { + "epoch": 3.45, + "grad_norm": 0.396484375, + "learning_rate": 1.1247143585975806e-05, + "loss": 0.1079, + "step": 25864 + }, + { + "epoch": 3.45, + "grad_norm": 0.54296875, + "learning_rate": 1.1241778714826679e-05, + "loss": 0.1885, + "step": 25865 + }, + { + "epoch": 3.45, + "grad_norm": 0.5546875, + "learning_rate": 1.123641504729599e-05, + "loss": 0.3217, + "step": 25866 + }, + { + "epoch": 3.45, + "grad_norm": 0.6328125, + "learning_rate": 1.1231052583456503e-05, + "loss": 0.3254, + "step": 25867 + }, + { + "epoch": 3.45, + "grad_norm": 0.640625, + "learning_rate": 1.1225691323380917e-05, + "loss": 0.5895, + "step": 25868 + }, + { + "epoch": 3.45, + "grad_norm": 0.72265625, + "learning_rate": 1.1220331267141914e-05, + "loss": 0.604, + "step": 25869 + }, + { + "epoch": 3.45, + "grad_norm": 0.5703125, + "learning_rate": 1.1214972414812185e-05, + "loss": 0.298, + "step": 25870 + }, + { + "epoch": 3.45, + "grad_norm": 0.76953125, + "learning_rate": 1.1209614766464427e-05, + "loss": 0.3897, + "step": 25871 + }, + { + "epoch": 3.45, + "grad_norm": 0.59375, + "learning_rate": 1.1204258322171291e-05, + "loss": 0.3875, + "step": 25872 + }, + { + "epoch": 3.45, + "grad_norm": 0.52734375, + "learning_rate": 1.1198903082005375e-05, + "loss": 0.2458, + "step": 25873 + }, + { + "epoch": 3.45, + "grad_norm": 0.57421875, + "learning_rate": 1.1193549046039332e-05, + "loss": 0.2382, + "step": 25874 + }, + { + "epoch": 3.45, + "grad_norm": 0.70703125, + "learning_rate": 1.1188196214345758e-05, + "loss": 0.2898, + "step": 25875 + }, + { + "epoch": 3.45, + "grad_norm": 0.66015625, + "learning_rate": 1.1182844586997266e-05, + "loss": 0.5617, + "step": 25876 + }, + { + "epoch": 3.45, + "grad_norm": 0.78515625, + "learning_rate": 1.1177494164066382e-05, + "loss": 0.3021, + "step": 25877 + }, + { + "epoch": 3.45, + "grad_norm": 0.59375, + "learning_rate": 1.1172144945625685e-05, + "loss": 0.5339, + "step": 25878 + }, + { + "epoch": 3.45, + "grad_norm": 0.59765625, + "learning_rate": 1.1166796931747725e-05, + "loss": 0.1837, + "step": 25879 + }, + { + "epoch": 3.45, + "grad_norm": 0.56640625, + "learning_rate": 1.116145012250499e-05, + "loss": 0.177, + "step": 25880 + }, + { + "epoch": 3.45, + "grad_norm": 0.59765625, + "learning_rate": 1.1156104517970035e-05, + "loss": 0.1977, + "step": 25881 + }, + { + "epoch": 3.45, + "grad_norm": 0.5546875, + "learning_rate": 1.1150760118215298e-05, + "loss": 0.522, + "step": 25882 + }, + { + "epoch": 3.45, + "grad_norm": 0.5546875, + "learning_rate": 1.1145416923313279e-05, + "loss": 0.2198, + "step": 25883 + }, + { + "epoch": 3.45, + "grad_norm": 0.515625, + "learning_rate": 1.114007493333643e-05, + "loss": 0.1458, + "step": 25884 + }, + { + "epoch": 3.45, + "grad_norm": 0.546875, + "learning_rate": 1.1134734148357218e-05, + "loss": 0.2829, + "step": 25885 + }, + { + "epoch": 3.45, + "grad_norm": 0.77734375, + "learning_rate": 1.1129394568448027e-05, + "loss": 0.3372, + "step": 25886 + }, + { + "epoch": 3.45, + "grad_norm": 0.6796875, + "learning_rate": 1.1124056193681275e-05, + "loss": 0.3219, + "step": 25887 + }, + { + "epoch": 3.45, + "grad_norm": 0.54296875, + "learning_rate": 1.1118719024129376e-05, + "loss": 0.264, + "step": 25888 + }, + { + "epoch": 3.45, + "grad_norm": 0.59765625, + "learning_rate": 1.111338305986468e-05, + "loss": 0.1911, + "step": 25889 + }, + { + "epoch": 3.45, + "grad_norm": 0.6484375, + "learning_rate": 1.1108048300959572e-05, + "loss": 0.1198, + "step": 25890 + }, + { + "epoch": 3.45, + "grad_norm": 0.5390625, + "learning_rate": 1.1102714747486387e-05, + "loss": 0.2717, + "step": 25891 + }, + { + "epoch": 3.46, + "grad_norm": 0.58203125, + "learning_rate": 1.109738239951742e-05, + "loss": 0.2065, + "step": 25892 + }, + { + "epoch": 3.46, + "grad_norm": 0.69921875, + "learning_rate": 1.1092051257125002e-05, + "loss": 0.3867, + "step": 25893 + }, + { + "epoch": 3.46, + "grad_norm": 0.5078125, + "learning_rate": 1.1086721320381443e-05, + "loss": 0.2869, + "step": 25894 + }, + { + "epoch": 3.46, + "grad_norm": 0.765625, + "learning_rate": 1.1081392589359018e-05, + "loss": 0.3186, + "step": 25895 + }, + { + "epoch": 3.46, + "grad_norm": 0.54296875, + "learning_rate": 1.1076065064129948e-05, + "loss": 0.1548, + "step": 25896 + }, + { + "epoch": 3.46, + "grad_norm": 0.6015625, + "learning_rate": 1.1070738744766517e-05, + "loss": 0.2754, + "step": 25897 + }, + { + "epoch": 3.46, + "grad_norm": 0.5234375, + "learning_rate": 1.1065413631340949e-05, + "loss": 0.1771, + "step": 25898 + }, + { + "epoch": 3.46, + "grad_norm": 0.55859375, + "learning_rate": 1.106008972392546e-05, + "loss": 0.2543, + "step": 25899 + }, + { + "epoch": 3.46, + "grad_norm": 0.6796875, + "learning_rate": 1.1054767022592216e-05, + "loss": 0.3482, + "step": 25900 + }, + { + "epoch": 3.46, + "grad_norm": 0.69140625, + "learning_rate": 1.1049445527413416e-05, + "loss": 0.6392, + "step": 25901 + }, + { + "epoch": 3.46, + "grad_norm": 0.515625, + "learning_rate": 1.1044125238461233e-05, + "loss": 0.2521, + "step": 25902 + }, + { + "epoch": 3.46, + "grad_norm": 0.6640625, + "learning_rate": 1.1038806155807812e-05, + "loss": 0.3826, + "step": 25903 + }, + { + "epoch": 3.46, + "grad_norm": 0.484375, + "learning_rate": 1.1033488279525273e-05, + "loss": 0.1148, + "step": 25904 + }, + { + "epoch": 3.46, + "grad_norm": 0.69140625, + "learning_rate": 1.1028171609685723e-05, + "loss": 0.2992, + "step": 25905 + }, + { + "epoch": 3.46, + "grad_norm": 0.51953125, + "learning_rate": 1.102285614636126e-05, + "loss": 0.4091, + "step": 25906 + }, + { + "epoch": 3.46, + "grad_norm": 0.60546875, + "learning_rate": 1.1017541889623973e-05, + "loss": 0.2006, + "step": 25907 + }, + { + "epoch": 3.46, + "grad_norm": 0.6015625, + "learning_rate": 1.1012228839545946e-05, + "loss": 0.4636, + "step": 25908 + }, + { + "epoch": 3.46, + "grad_norm": 0.51953125, + "learning_rate": 1.100691699619919e-05, + "loss": 0.2341, + "step": 25909 + }, + { + "epoch": 3.46, + "grad_norm": 0.70703125, + "learning_rate": 1.1001606359655759e-05, + "loss": 0.3652, + "step": 25910 + }, + { + "epoch": 3.46, + "grad_norm": 0.6484375, + "learning_rate": 1.0996296929987648e-05, + "loss": 0.5664, + "step": 25911 + }, + { + "epoch": 3.46, + "grad_norm": 0.55859375, + "learning_rate": 1.0990988707266891e-05, + "loss": 0.2984, + "step": 25912 + }, + { + "epoch": 3.46, + "grad_norm": 0.423828125, + "learning_rate": 1.0985681691565464e-05, + "loss": 0.2611, + "step": 25913 + }, + { + "epoch": 3.46, + "grad_norm": 0.443359375, + "learning_rate": 1.0980375882955307e-05, + "loss": 0.1109, + "step": 25914 + }, + { + "epoch": 3.46, + "grad_norm": 0.80078125, + "learning_rate": 1.0975071281508398e-05, + "loss": 0.4527, + "step": 25915 + }, + { + "epoch": 3.46, + "grad_norm": 0.609375, + "learning_rate": 1.0969767887296644e-05, + "loss": 0.2232, + "step": 25916 + }, + { + "epoch": 3.46, + "grad_norm": 0.482421875, + "learning_rate": 1.096446570039198e-05, + "loss": 0.1547, + "step": 25917 + }, + { + "epoch": 3.46, + "grad_norm": 0.52734375, + "learning_rate": 1.0959164720866322e-05, + "loss": 0.2507, + "step": 25918 + }, + { + "epoch": 3.46, + "grad_norm": 0.69140625, + "learning_rate": 1.0953864948791503e-05, + "loss": 0.2967, + "step": 25919 + }, + { + "epoch": 3.46, + "grad_norm": 0.7578125, + "learning_rate": 1.0948566384239445e-05, + "loss": 0.3875, + "step": 25920 + }, + { + "epoch": 3.46, + "grad_norm": 0.65625, + "learning_rate": 1.0943269027281967e-05, + "loss": 0.4209, + "step": 25921 + }, + { + "epoch": 3.46, + "grad_norm": 0.462890625, + "learning_rate": 1.0937972877990944e-05, + "loss": 0.2663, + "step": 25922 + }, + { + "epoch": 3.46, + "grad_norm": 0.55078125, + "learning_rate": 1.093267793643814e-05, + "loss": 0.3059, + "step": 25923 + }, + { + "epoch": 3.46, + "grad_norm": 0.86328125, + "learning_rate": 1.092738420269539e-05, + "loss": 0.432, + "step": 25924 + }, + { + "epoch": 3.46, + "grad_norm": 0.5625, + "learning_rate": 1.0922091676834479e-05, + "loss": 0.2344, + "step": 25925 + }, + { + "epoch": 3.46, + "grad_norm": 0.59375, + "learning_rate": 1.0916800358927193e-05, + "loss": 0.2751, + "step": 25926 + }, + { + "epoch": 3.46, + "grad_norm": 0.62890625, + "learning_rate": 1.0911510249045265e-05, + "loss": 0.3164, + "step": 25927 + }, + { + "epoch": 3.46, + "grad_norm": 0.55078125, + "learning_rate": 1.0906221347260415e-05, + "loss": 0.278, + "step": 25928 + }, + { + "epoch": 3.46, + "grad_norm": 0.55078125, + "learning_rate": 1.0900933653644385e-05, + "loss": 0.4899, + "step": 25929 + }, + { + "epoch": 3.46, + "grad_norm": 0.5234375, + "learning_rate": 1.0895647168268885e-05, + "loss": 0.2834, + "step": 25930 + }, + { + "epoch": 3.46, + "grad_norm": 0.74609375, + "learning_rate": 1.0890361891205614e-05, + "loss": 0.5216, + "step": 25931 + }, + { + "epoch": 3.46, + "grad_norm": 0.458984375, + "learning_rate": 1.0885077822526201e-05, + "loss": 0.1642, + "step": 25932 + }, + { + "epoch": 3.46, + "grad_norm": 0.6484375, + "learning_rate": 1.0879794962302326e-05, + "loss": 0.3262, + "step": 25933 + }, + { + "epoch": 3.46, + "grad_norm": 0.6015625, + "learning_rate": 1.0874513310605628e-05, + "loss": 0.2946, + "step": 25934 + }, + { + "epoch": 3.46, + "grad_norm": 0.5625, + "learning_rate": 1.0869232867507728e-05, + "loss": 0.3945, + "step": 25935 + }, + { + "epoch": 3.46, + "grad_norm": 0.73046875, + "learning_rate": 1.086395363308026e-05, + "loss": 0.2155, + "step": 25936 + }, + { + "epoch": 3.46, + "grad_norm": 0.6015625, + "learning_rate": 1.0858675607394763e-05, + "loss": 0.2611, + "step": 25937 + }, + { + "epoch": 3.46, + "grad_norm": 0.5546875, + "learning_rate": 1.0853398790522839e-05, + "loss": 0.3245, + "step": 25938 + }, + { + "epoch": 3.46, + "grad_norm": 0.76953125, + "learning_rate": 1.084812318253604e-05, + "loss": 0.3252, + "step": 25939 + }, + { + "epoch": 3.46, + "grad_norm": 0.6171875, + "learning_rate": 1.0842848783505932e-05, + "loss": 0.3233, + "step": 25940 + }, + { + "epoch": 3.46, + "grad_norm": 0.64453125, + "learning_rate": 1.0837575593504002e-05, + "loss": 0.2852, + "step": 25941 + }, + { + "epoch": 3.46, + "grad_norm": 0.6484375, + "learning_rate": 1.0832303612601769e-05, + "loss": 0.2938, + "step": 25942 + }, + { + "epoch": 3.46, + "grad_norm": 0.5625, + "learning_rate": 1.082703284087071e-05, + "loss": 0.31, + "step": 25943 + }, + { + "epoch": 3.46, + "grad_norm": 0.65234375, + "learning_rate": 1.0821763278382324e-05, + "loss": 0.7076, + "step": 25944 + }, + { + "epoch": 3.46, + "grad_norm": 0.61328125, + "learning_rate": 1.0816494925208087e-05, + "loss": 0.356, + "step": 25945 + }, + { + "epoch": 3.46, + "grad_norm": 0.68359375, + "learning_rate": 1.0811227781419386e-05, + "loss": 0.4075, + "step": 25946 + }, + { + "epoch": 3.46, + "grad_norm": 0.7109375, + "learning_rate": 1.0805961847087687e-05, + "loss": 0.2558, + "step": 25947 + }, + { + "epoch": 3.46, + "grad_norm": 0.7109375, + "learning_rate": 1.0800697122284386e-05, + "loss": 0.2938, + "step": 25948 + }, + { + "epoch": 3.46, + "grad_norm": 0.6640625, + "learning_rate": 1.0795433607080896e-05, + "loss": 0.3154, + "step": 25949 + }, + { + "epoch": 3.46, + "grad_norm": 0.7734375, + "learning_rate": 1.0790171301548557e-05, + "loss": 0.3473, + "step": 25950 + }, + { + "epoch": 3.46, + "grad_norm": 0.49609375, + "learning_rate": 1.0784910205758758e-05, + "loss": 0.4056, + "step": 25951 + }, + { + "epoch": 3.46, + "grad_norm": 0.70703125, + "learning_rate": 1.0779650319782852e-05, + "loss": 0.2075, + "step": 25952 + }, + { + "epoch": 3.46, + "grad_norm": 0.60546875, + "learning_rate": 1.0774391643692128e-05, + "loss": 0.2502, + "step": 25953 + }, + { + "epoch": 3.46, + "grad_norm": 0.451171875, + "learning_rate": 1.0769134177557938e-05, + "loss": 0.2391, + "step": 25954 + }, + { + "epoch": 3.46, + "grad_norm": 0.6484375, + "learning_rate": 1.0763877921451526e-05, + "loss": 0.4329, + "step": 25955 + }, + { + "epoch": 3.46, + "grad_norm": 0.60546875, + "learning_rate": 1.0758622875444214e-05, + "loss": 0.2567, + "step": 25956 + }, + { + "epoch": 3.46, + "grad_norm": 0.76171875, + "learning_rate": 1.0753369039607242e-05, + "loss": 0.4477, + "step": 25957 + }, + { + "epoch": 3.46, + "grad_norm": 0.5078125, + "learning_rate": 1.0748116414011888e-05, + "loss": 0.2618, + "step": 25958 + }, + { + "epoch": 3.46, + "grad_norm": 0.4296875, + "learning_rate": 1.074286499872934e-05, + "loss": 0.2352, + "step": 25959 + }, + { + "epoch": 3.46, + "grad_norm": 0.53515625, + "learning_rate": 1.073761479383083e-05, + "loss": 0.2504, + "step": 25960 + }, + { + "epoch": 3.46, + "grad_norm": 0.5703125, + "learning_rate": 1.0732365799387546e-05, + "loss": 0.3626, + "step": 25961 + }, + { + "epoch": 3.46, + "grad_norm": 0.453125, + "learning_rate": 1.0727118015470672e-05, + "loss": 0.2772, + "step": 25962 + }, + { + "epoch": 3.46, + "grad_norm": 0.6953125, + "learning_rate": 1.0721871442151387e-05, + "loss": 0.4063, + "step": 25963 + }, + { + "epoch": 3.46, + "grad_norm": 0.478515625, + "learning_rate": 1.0716626079500835e-05, + "loss": 0.1684, + "step": 25964 + }, + { + "epoch": 3.46, + "grad_norm": 0.51171875, + "learning_rate": 1.0711381927590114e-05, + "loss": 0.2159, + "step": 25965 + }, + { + "epoch": 3.46, + "grad_norm": 0.76953125, + "learning_rate": 1.0706138986490355e-05, + "loss": 0.4877, + "step": 25966 + }, + { + "epoch": 3.47, + "grad_norm": 0.59765625, + "learning_rate": 1.0700897256272657e-05, + "loss": 0.3651, + "step": 25967 + }, + { + "epoch": 3.47, + "grad_norm": 0.46875, + "learning_rate": 1.069565673700813e-05, + "loss": 0.1379, + "step": 25968 + }, + { + "epoch": 3.47, + "grad_norm": 0.53125, + "learning_rate": 1.0690417428767785e-05, + "loss": 0.4361, + "step": 25969 + }, + { + "epoch": 3.47, + "grad_norm": 0.486328125, + "learning_rate": 1.0685179331622708e-05, + "loss": 0.3758, + "step": 25970 + }, + { + "epoch": 3.47, + "grad_norm": 0.79296875, + "learning_rate": 1.0679942445643921e-05, + "loss": 0.1859, + "step": 25971 + }, + { + "epoch": 3.47, + "grad_norm": 0.50390625, + "learning_rate": 1.0674706770902454e-05, + "loss": 0.2955, + "step": 25972 + }, + { + "epoch": 3.47, + "grad_norm": 0.8046875, + "learning_rate": 1.0669472307469275e-05, + "loss": 0.2674, + "step": 25973 + }, + { + "epoch": 3.47, + "grad_norm": 0.5078125, + "learning_rate": 1.0664239055415381e-05, + "loss": 0.2042, + "step": 25974 + }, + { + "epoch": 3.47, + "grad_norm": 0.412109375, + "learning_rate": 1.065900701481174e-05, + "loss": 0.2322, + "step": 25975 + }, + { + "epoch": 3.47, + "grad_norm": 0.6953125, + "learning_rate": 1.0653776185729325e-05, + "loss": 0.3717, + "step": 25976 + }, + { + "epoch": 3.47, + "grad_norm": 0.578125, + "learning_rate": 1.0648546568239038e-05, + "loss": 0.2995, + "step": 25977 + }, + { + "epoch": 3.47, + "grad_norm": 0.66796875, + "learning_rate": 1.0643318162411798e-05, + "loss": 0.5138, + "step": 25978 + }, + { + "epoch": 3.47, + "grad_norm": 0.71484375, + "learning_rate": 1.0638090968318504e-05, + "loss": 0.1904, + "step": 25979 + }, + { + "epoch": 3.47, + "grad_norm": 0.48046875, + "learning_rate": 1.0632864986030055e-05, + "loss": 0.3086, + "step": 25980 + }, + { + "epoch": 3.47, + "grad_norm": 0.4765625, + "learning_rate": 1.062764021561733e-05, + "loss": 0.2366, + "step": 25981 + }, + { + "epoch": 3.47, + "grad_norm": 0.416015625, + "learning_rate": 1.0622416657151136e-05, + "loss": 0.237, + "step": 25982 + }, + { + "epoch": 3.47, + "grad_norm": 0.49609375, + "learning_rate": 1.0617194310702338e-05, + "loss": 0.1942, + "step": 25983 + }, + { + "epoch": 3.47, + "grad_norm": 0.76171875, + "learning_rate": 1.061197317634176e-05, + "loss": 0.2302, + "step": 25984 + }, + { + "epoch": 3.47, + "grad_norm": 0.65625, + "learning_rate": 1.0606753254140179e-05, + "loss": 0.3967, + "step": 25985 + }, + { + "epoch": 3.47, + "grad_norm": 0.67578125, + "learning_rate": 1.0601534544168424e-05, + "loss": 0.1516, + "step": 25986 + }, + { + "epoch": 3.47, + "grad_norm": 0.5703125, + "learning_rate": 1.0596317046497217e-05, + "loss": 0.2469, + "step": 25987 + }, + { + "epoch": 3.47, + "grad_norm": 0.5, + "learning_rate": 1.0591100761197348e-05, + "loss": 0.1545, + "step": 25988 + }, + { + "epoch": 3.47, + "grad_norm": 0.6953125, + "learning_rate": 1.0585885688339514e-05, + "loss": 0.4972, + "step": 25989 + }, + { + "epoch": 3.47, + "grad_norm": 0.703125, + "learning_rate": 1.058067182799446e-05, + "loss": 0.2665, + "step": 25990 + }, + { + "epoch": 3.47, + "grad_norm": 0.546875, + "learning_rate": 1.0575459180232906e-05, + "loss": 0.3301, + "step": 25991 + }, + { + "epoch": 3.47, + "grad_norm": 0.5, + "learning_rate": 1.0570247745125495e-05, + "loss": 0.2004, + "step": 25992 + }, + { + "epoch": 3.47, + "grad_norm": 0.5546875, + "learning_rate": 1.0565037522742915e-05, + "loss": 0.3061, + "step": 25993 + }, + { + "epoch": 3.47, + "grad_norm": 0.64453125, + "learning_rate": 1.0559828513155834e-05, + "loss": 0.2279, + "step": 25994 + }, + { + "epoch": 3.47, + "grad_norm": 0.7890625, + "learning_rate": 1.0554620716434894e-05, + "loss": 0.3266, + "step": 25995 + }, + { + "epoch": 3.47, + "grad_norm": 0.58984375, + "learning_rate": 1.0549414132650682e-05, + "loss": 0.3116, + "step": 25996 + }, + { + "epoch": 3.47, + "grad_norm": 0.71875, + "learning_rate": 1.054420876187383e-05, + "loss": 0.2495, + "step": 25997 + }, + { + "epoch": 3.47, + "grad_norm": 0.52734375, + "learning_rate": 1.0539004604174918e-05, + "loss": 0.3086, + "step": 25998 + }, + { + "epoch": 3.47, + "grad_norm": 0.6015625, + "learning_rate": 1.0533801659624531e-05, + "loss": 0.3998, + "step": 25999 + }, + { + "epoch": 3.47, + "grad_norm": 0.5234375, + "learning_rate": 1.0528599928293226e-05, + "loss": 0.1772, + "step": 26000 + }, + { + "epoch": 3.47, + "grad_norm": 0.59765625, + "learning_rate": 1.05233994102515e-05, + "loss": 0.2348, + "step": 26001 + }, + { + "epoch": 3.47, + "grad_norm": 0.609375, + "learning_rate": 1.0518200105569898e-05, + "loss": 0.2519, + "step": 26002 + }, + { + "epoch": 3.47, + "grad_norm": 0.69140625, + "learning_rate": 1.051300201431894e-05, + "loss": 0.4035, + "step": 26003 + }, + { + "epoch": 3.47, + "grad_norm": 0.6640625, + "learning_rate": 1.0507805136569126e-05, + "loss": 0.352, + "step": 26004 + }, + { + "epoch": 3.47, + "grad_norm": 0.51171875, + "learning_rate": 1.050260947239089e-05, + "loss": 0.2566, + "step": 26005 + }, + { + "epoch": 3.47, + "grad_norm": 0.55859375, + "learning_rate": 1.0497415021854707e-05, + "loss": 0.4914, + "step": 26006 + }, + { + "epoch": 3.47, + "grad_norm": 0.5625, + "learning_rate": 1.049222178503102e-05, + "loss": 0.4288, + "step": 26007 + }, + { + "epoch": 3.47, + "grad_norm": 0.56640625, + "learning_rate": 1.0487029761990252e-05, + "loss": 0.1545, + "step": 26008 + }, + { + "epoch": 3.47, + "grad_norm": 0.58984375, + "learning_rate": 1.0481838952802825e-05, + "loss": 0.2592, + "step": 26009 + }, + { + "epoch": 3.47, + "grad_norm": 0.70703125, + "learning_rate": 1.0476649357539103e-05, + "loss": 0.3604, + "step": 26010 + }, + { + "epoch": 3.47, + "grad_norm": 0.5703125, + "learning_rate": 1.0471460976269465e-05, + "loss": 0.2666, + "step": 26011 + }, + { + "epoch": 3.47, + "grad_norm": 0.609375, + "learning_rate": 1.0466273809064297e-05, + "loss": 0.2612, + "step": 26012 + }, + { + "epoch": 3.47, + "grad_norm": 0.5390625, + "learning_rate": 1.04610878559939e-05, + "loss": 0.2152, + "step": 26013 + }, + { + "epoch": 3.47, + "grad_norm": 0.42578125, + "learning_rate": 1.045590311712863e-05, + "loss": 0.2025, + "step": 26014 + }, + { + "epoch": 3.47, + "grad_norm": 0.734375, + "learning_rate": 1.0450719592538771e-05, + "loss": 0.3608, + "step": 26015 + }, + { + "epoch": 3.47, + "grad_norm": 0.78515625, + "learning_rate": 1.0445537282294616e-05, + "loss": 0.3523, + "step": 26016 + }, + { + "epoch": 3.47, + "grad_norm": 0.55078125, + "learning_rate": 1.044035618646646e-05, + "loss": 0.3901, + "step": 26017 + }, + { + "epoch": 3.47, + "grad_norm": 0.703125, + "learning_rate": 1.0435176305124572e-05, + "loss": 0.5483, + "step": 26018 + }, + { + "epoch": 3.47, + "grad_norm": 0.8203125, + "learning_rate": 1.0429997638339162e-05, + "loss": 0.5411, + "step": 26019 + }, + { + "epoch": 3.47, + "grad_norm": 0.66796875, + "learning_rate": 1.0424820186180461e-05, + "loss": 0.429, + "step": 26020 + }, + { + "epoch": 3.47, + "grad_norm": 0.53515625, + "learning_rate": 1.0419643948718682e-05, + "loss": 0.1526, + "step": 26021 + }, + { + "epoch": 3.47, + "grad_norm": 0.73828125, + "learning_rate": 1.0414468926024058e-05, + "loss": 0.5706, + "step": 26022 + }, + { + "epoch": 3.47, + "grad_norm": 0.73828125, + "learning_rate": 1.0409295118166695e-05, + "loss": 0.4141, + "step": 26023 + }, + { + "epoch": 3.47, + "grad_norm": 0.66796875, + "learning_rate": 1.0404122525216809e-05, + "loss": 0.2174, + "step": 26024 + }, + { + "epoch": 3.47, + "grad_norm": 0.7265625, + "learning_rate": 1.0398951147244529e-05, + "loss": 0.2701, + "step": 26025 + }, + { + "epoch": 3.47, + "grad_norm": 0.66015625, + "learning_rate": 1.0393780984319956e-05, + "loss": 0.4325, + "step": 26026 + }, + { + "epoch": 3.47, + "grad_norm": 0.6640625, + "learning_rate": 1.0388612036513246e-05, + "loss": 0.1802, + "step": 26027 + }, + { + "epoch": 3.47, + "grad_norm": 0.58984375, + "learning_rate": 1.0383444303894452e-05, + "loss": 0.368, + "step": 26028 + }, + { + "epoch": 3.47, + "grad_norm": 0.68359375, + "learning_rate": 1.0378277786533664e-05, + "loss": 0.1948, + "step": 26029 + }, + { + "epoch": 3.47, + "grad_norm": 0.625, + "learning_rate": 1.0373112484500947e-05, + "loss": 0.2582, + "step": 26030 + }, + { + "epoch": 3.47, + "grad_norm": 0.75390625, + "learning_rate": 1.0367948397866379e-05, + "loss": 0.4579, + "step": 26031 + }, + { + "epoch": 3.47, + "grad_norm": 0.640625, + "learning_rate": 1.0362785526699926e-05, + "loss": 0.1891, + "step": 26032 + }, + { + "epoch": 3.47, + "grad_norm": 0.498046875, + "learning_rate": 1.035762387107163e-05, + "loss": 0.2604, + "step": 26033 + }, + { + "epoch": 3.47, + "grad_norm": 0.55078125, + "learning_rate": 1.0352463431051484e-05, + "loss": 0.443, + "step": 26034 + }, + { + "epoch": 3.47, + "grad_norm": 0.64453125, + "learning_rate": 1.0347304206709474e-05, + "loss": 0.4448, + "step": 26035 + }, + { + "epoch": 3.47, + "grad_norm": 0.71875, + "learning_rate": 1.0342146198115577e-05, + "loss": 0.487, + "step": 26036 + }, + { + "epoch": 3.47, + "grad_norm": 0.703125, + "learning_rate": 1.0336989405339713e-05, + "loss": 0.6461, + "step": 26037 + }, + { + "epoch": 3.47, + "grad_norm": 0.462890625, + "learning_rate": 1.033183382845181e-05, + "loss": 0.1371, + "step": 26038 + }, + { + "epoch": 3.47, + "grad_norm": 0.6328125, + "learning_rate": 1.0326679467521783e-05, + "loss": 0.4443, + "step": 26039 + }, + { + "epoch": 3.47, + "grad_norm": 0.388671875, + "learning_rate": 1.0321526322619534e-05, + "loss": 0.1272, + "step": 26040 + }, + { + "epoch": 3.47, + "grad_norm": 0.71484375, + "learning_rate": 1.0316374393814964e-05, + "loss": 0.219, + "step": 26041 + }, + { + "epoch": 3.48, + "grad_norm": 0.65625, + "learning_rate": 1.0311223681177896e-05, + "loss": 0.3226, + "step": 26042 + }, + { + "epoch": 3.48, + "grad_norm": 0.478515625, + "learning_rate": 1.0306074184778191e-05, + "loss": 0.2479, + "step": 26043 + }, + { + "epoch": 3.48, + "grad_norm": 0.5078125, + "learning_rate": 1.0300925904685687e-05, + "loss": 0.1776, + "step": 26044 + }, + { + "epoch": 3.48, + "grad_norm": 0.67578125, + "learning_rate": 1.0295778840970226e-05, + "loss": 0.2647, + "step": 26045 + }, + { + "epoch": 3.48, + "grad_norm": 0.5859375, + "learning_rate": 1.0290632993701544e-05, + "loss": 0.459, + "step": 26046 + }, + { + "epoch": 3.48, + "grad_norm": 0.65625, + "learning_rate": 1.0285488362949458e-05, + "loss": 0.3373, + "step": 26047 + }, + { + "epoch": 3.48, + "grad_norm": 0.55078125, + "learning_rate": 1.0280344948783738e-05, + "loss": 0.291, + "step": 26048 + }, + { + "epoch": 3.48, + "grad_norm": 0.56640625, + "learning_rate": 1.0275202751274126e-05, + "loss": 0.2619, + "step": 26049 + }, + { + "epoch": 3.48, + "grad_norm": 0.515625, + "learning_rate": 1.0270061770490358e-05, + "loss": 0.2886, + "step": 26050 + }, + { + "epoch": 3.48, + "grad_norm": 0.7265625, + "learning_rate": 1.026492200650212e-05, + "loss": 0.4117, + "step": 26051 + }, + { + "epoch": 3.48, + "grad_norm": 0.455078125, + "learning_rate": 1.0259783459379135e-05, + "loss": 0.2165, + "step": 26052 + }, + { + "epoch": 3.48, + "grad_norm": 0.6171875, + "learning_rate": 1.025464612919107e-05, + "loss": 0.3097, + "step": 26053 + }, + { + "epoch": 3.48, + "grad_norm": 0.5625, + "learning_rate": 1.0249510016007635e-05, + "loss": 0.3137, + "step": 26054 + }, + { + "epoch": 3.48, + "grad_norm": 0.62109375, + "learning_rate": 1.024437511989842e-05, + "loss": 0.423, + "step": 26055 + }, + { + "epoch": 3.48, + "grad_norm": 0.5859375, + "learning_rate": 1.0239241440933078e-05, + "loss": 0.3102, + "step": 26056 + }, + { + "epoch": 3.48, + "grad_norm": 0.5078125, + "learning_rate": 1.0234108979181233e-05, + "loss": 0.1131, + "step": 26057 + }, + { + "epoch": 3.48, + "grad_norm": 0.52734375, + "learning_rate": 1.0228977734712486e-05, + "loss": 0.3432, + "step": 26058 + }, + { + "epoch": 3.48, + "grad_norm": 0.7265625, + "learning_rate": 1.0223847707596434e-05, + "loss": 0.4669, + "step": 26059 + }, + { + "epoch": 3.48, + "grad_norm": 0.365234375, + "learning_rate": 1.0218718897902602e-05, + "loss": 0.0816, + "step": 26060 + }, + { + "epoch": 3.48, + "grad_norm": 0.74609375, + "learning_rate": 1.0213591305700588e-05, + "loss": 0.4326, + "step": 26061 + }, + { + "epoch": 3.48, + "grad_norm": 0.5390625, + "learning_rate": 1.020846493105987e-05, + "loss": 0.3701, + "step": 26062 + }, + { + "epoch": 3.48, + "grad_norm": 0.6015625, + "learning_rate": 1.0203339774050003e-05, + "loss": 0.2278, + "step": 26063 + }, + { + "epoch": 3.48, + "grad_norm": 0.5078125, + "learning_rate": 1.01982158347405e-05, + "loss": 0.3121, + "step": 26064 + }, + { + "epoch": 3.48, + "grad_norm": 0.58203125, + "learning_rate": 1.0193093113200802e-05, + "loss": 0.4447, + "step": 26065 + }, + { + "epoch": 3.48, + "grad_norm": 0.61328125, + "learning_rate": 1.0187971609500401e-05, + "loss": 0.2543, + "step": 26066 + }, + { + "epoch": 3.48, + "grad_norm": 0.609375, + "learning_rate": 1.0182851323708753e-05, + "loss": 0.1907, + "step": 26067 + }, + { + "epoch": 3.48, + "grad_norm": 0.546875, + "learning_rate": 1.0177732255895289e-05, + "loss": 0.1394, + "step": 26068 + }, + { + "epoch": 3.48, + "grad_norm": 0.60546875, + "learning_rate": 1.0172614406129421e-05, + "loss": 0.278, + "step": 26069 + }, + { + "epoch": 3.48, + "grad_norm": 0.5859375, + "learning_rate": 1.0167497774480539e-05, + "loss": 0.1708, + "step": 26070 + }, + { + "epoch": 3.48, + "grad_norm": 0.60546875, + "learning_rate": 1.0162382361018053e-05, + "loss": 0.1881, + "step": 26071 + }, + { + "epoch": 3.48, + "grad_norm": 0.6953125, + "learning_rate": 1.0157268165811329e-05, + "loss": 0.2165, + "step": 26072 + }, + { + "epoch": 3.48, + "grad_norm": 0.8046875, + "learning_rate": 1.0152155188929712e-05, + "loss": 0.3542, + "step": 26073 + }, + { + "epoch": 3.48, + "grad_norm": 0.625, + "learning_rate": 1.0147043430442515e-05, + "loss": 0.1667, + "step": 26074 + }, + { + "epoch": 3.48, + "grad_norm": 0.6171875, + "learning_rate": 1.0141932890419082e-05, + "loss": 0.1777, + "step": 26075 + }, + { + "epoch": 3.48, + "grad_norm": 0.796875, + "learning_rate": 1.0136823568928699e-05, + "loss": 0.3246, + "step": 26076 + }, + { + "epoch": 3.48, + "grad_norm": 0.79296875, + "learning_rate": 1.0131715466040691e-05, + "loss": 0.3694, + "step": 26077 + }, + { + "epoch": 3.48, + "grad_norm": 0.4296875, + "learning_rate": 1.0126608581824271e-05, + "loss": 0.3206, + "step": 26078 + }, + { + "epoch": 3.48, + "grad_norm": 0.51171875, + "learning_rate": 1.0121502916348725e-05, + "loss": 0.1689, + "step": 26079 + }, + { + "epoch": 3.48, + "grad_norm": 0.6015625, + "learning_rate": 1.0116398469683286e-05, + "loss": 0.1873, + "step": 26080 + }, + { + "epoch": 3.48, + "grad_norm": 0.5078125, + "learning_rate": 1.0111295241897157e-05, + "loss": 0.3024, + "step": 26081 + }, + { + "epoch": 3.48, + "grad_norm": 0.59375, + "learning_rate": 1.0106193233059592e-05, + "loss": 0.4264, + "step": 26082 + }, + { + "epoch": 3.48, + "grad_norm": 0.56640625, + "learning_rate": 1.0101092443239712e-05, + "loss": 0.2025, + "step": 26083 + }, + { + "epoch": 3.48, + "grad_norm": 0.73828125, + "learning_rate": 1.009599287250671e-05, + "loss": 0.3634, + "step": 26084 + }, + { + "epoch": 3.48, + "grad_norm": 0.73046875, + "learning_rate": 1.0090894520929773e-05, + "loss": 0.4108, + "step": 26085 + }, + { + "epoch": 3.48, + "grad_norm": 0.7734375, + "learning_rate": 1.0085797388577978e-05, + "loss": 0.3954, + "step": 26086 + }, + { + "epoch": 3.48, + "grad_norm": 0.62890625, + "learning_rate": 1.0080701475520505e-05, + "loss": 0.4697, + "step": 26087 + }, + { + "epoch": 3.48, + "grad_norm": 0.78125, + "learning_rate": 1.0075606781826418e-05, + "loss": 0.3174, + "step": 26088 + }, + { + "epoch": 3.48, + "grad_norm": 0.546875, + "learning_rate": 1.0070513307564799e-05, + "loss": 0.275, + "step": 26089 + }, + { + "epoch": 3.48, + "grad_norm": 0.62109375, + "learning_rate": 1.0065421052804746e-05, + "loss": 0.2852, + "step": 26090 + }, + { + "epoch": 3.48, + "grad_norm": 0.6171875, + "learning_rate": 1.0060330017615327e-05, + "loss": 0.4421, + "step": 26091 + }, + { + "epoch": 3.48, + "grad_norm": 0.53125, + "learning_rate": 1.005524020206553e-05, + "loss": 0.259, + "step": 26092 + }, + { + "epoch": 3.48, + "grad_norm": 0.7109375, + "learning_rate": 1.0050151606224401e-05, + "loss": 0.3015, + "step": 26093 + }, + { + "epoch": 3.48, + "grad_norm": 0.458984375, + "learning_rate": 1.004506423016095e-05, + "loss": 0.2086, + "step": 26094 + }, + { + "epoch": 3.48, + "grad_norm": 0.609375, + "learning_rate": 1.0039978073944179e-05, + "loss": 0.4091, + "step": 26095 + }, + { + "epoch": 3.48, + "grad_norm": 0.5859375, + "learning_rate": 1.003489313764302e-05, + "loss": 0.3037, + "step": 26096 + }, + { + "epoch": 3.48, + "grad_norm": 0.640625, + "learning_rate": 1.0029809421326475e-05, + "loss": 0.1924, + "step": 26097 + }, + { + "epoch": 3.48, + "grad_norm": 0.44140625, + "learning_rate": 1.0024726925063422e-05, + "loss": 0.2573, + "step": 26098 + }, + { + "epoch": 3.48, + "grad_norm": 0.62890625, + "learning_rate": 1.0019645648922837e-05, + "loss": 0.2748, + "step": 26099 + }, + { + "epoch": 3.48, + "grad_norm": 0.61328125, + "learning_rate": 1.0014565592973612e-05, + "loss": 0.41, + "step": 26100 + }, + { + "epoch": 3.48, + "grad_norm": 0.5703125, + "learning_rate": 1.0009486757284614e-05, + "loss": 0.2245, + "step": 26101 + }, + { + "epoch": 3.48, + "grad_norm": 0.640625, + "learning_rate": 1.000440914192473e-05, + "loss": 0.2621, + "step": 26102 + }, + { + "epoch": 3.48, + "grad_norm": 0.5390625, + "learning_rate": 9.99933274696282e-06, + "loss": 0.2526, + "step": 26103 + }, + { + "epoch": 3.48, + "grad_norm": 0.349609375, + "learning_rate": 9.994257572467735e-06, + "loss": 0.1347, + "step": 26104 + }, + { + "epoch": 3.48, + "grad_norm": 0.6640625, + "learning_rate": 9.989183618508269e-06, + "loss": 0.2697, + "step": 26105 + }, + { + "epoch": 3.48, + "grad_norm": 0.4140625, + "learning_rate": 9.98411088515323e-06, + "loss": 0.1431, + "step": 26106 + }, + { + "epoch": 3.48, + "grad_norm": 0.76171875, + "learning_rate": 9.979039372471433e-06, + "loss": 0.4455, + "step": 26107 + }, + { + "epoch": 3.48, + "grad_norm": 0.59375, + "learning_rate": 9.97396908053163e-06, + "loss": 0.2734, + "step": 26108 + }, + { + "epoch": 3.48, + "grad_norm": 0.72265625, + "learning_rate": 9.968900009402605e-06, + "loss": 0.4515, + "step": 26109 + }, + { + "epoch": 3.48, + "grad_norm": 0.69921875, + "learning_rate": 9.963832159153086e-06, + "loss": 0.5514, + "step": 26110 + }, + { + "epoch": 3.48, + "grad_norm": 0.69921875, + "learning_rate": 9.958765529851754e-06, + "loss": 0.1499, + "step": 26111 + }, + { + "epoch": 3.48, + "grad_norm": 0.4296875, + "learning_rate": 9.953700121567355e-06, + "loss": 0.1844, + "step": 26112 + }, + { + "epoch": 3.48, + "grad_norm": 0.66015625, + "learning_rate": 9.948635934368577e-06, + "loss": 0.3659, + "step": 26113 + }, + { + "epoch": 3.48, + "grad_norm": 0.8984375, + "learning_rate": 9.94357296832411e-06, + "loss": 0.1654, + "step": 26114 + }, + { + "epoch": 3.48, + "grad_norm": 0.69140625, + "learning_rate": 9.938511223502567e-06, + "loss": 0.2695, + "step": 26115 + }, + { + "epoch": 3.48, + "grad_norm": 0.54296875, + "learning_rate": 9.933450699972612e-06, + "loss": 0.3483, + "step": 26116 + }, + { + "epoch": 3.49, + "grad_norm": 0.9453125, + "learning_rate": 9.928391397802873e-06, + "loss": 0.5107, + "step": 26117 + }, + { + "epoch": 3.49, + "grad_norm": 0.57421875, + "learning_rate": 9.923333317061978e-06, + "loss": 0.3114, + "step": 26118 + }, + { + "epoch": 3.49, + "grad_norm": 0.69140625, + "learning_rate": 9.918276457818464e-06, + "loss": 0.2865, + "step": 26119 + }, + { + "epoch": 3.49, + "grad_norm": 0.73828125, + "learning_rate": 9.913220820140945e-06, + "loss": 0.3139, + "step": 26120 + }, + { + "epoch": 3.49, + "grad_norm": 0.52734375, + "learning_rate": 9.908166404097974e-06, + "loss": 0.417, + "step": 26121 + }, + { + "epoch": 3.49, + "grad_norm": 0.380859375, + "learning_rate": 9.903113209758096e-06, + "loss": 0.1269, + "step": 26122 + }, + { + "epoch": 3.49, + "grad_norm": 0.52734375, + "learning_rate": 9.898061237189826e-06, + "loss": 0.2169, + "step": 26123 + }, + { + "epoch": 3.49, + "grad_norm": 0.640625, + "learning_rate": 9.893010486461651e-06, + "loss": 0.213, + "step": 26124 + }, + { + "epoch": 3.49, + "grad_norm": 0.4296875, + "learning_rate": 9.887960957642096e-06, + "loss": 0.2208, + "step": 26125 + }, + { + "epoch": 3.49, + "grad_norm": 0.6015625, + "learning_rate": 9.882912650799614e-06, + "loss": 0.2177, + "step": 26126 + }, + { + "epoch": 3.49, + "grad_norm": 0.87890625, + "learning_rate": 9.877865566002698e-06, + "loss": 0.2679, + "step": 26127 + }, + { + "epoch": 3.49, + "grad_norm": 0.5625, + "learning_rate": 9.872819703319747e-06, + "loss": 0.4176, + "step": 26128 + }, + { + "epoch": 3.49, + "grad_norm": 0.60546875, + "learning_rate": 9.867775062819196e-06, + "loss": 0.2299, + "step": 26129 + }, + { + "epoch": 3.49, + "grad_norm": 0.62890625, + "learning_rate": 9.862731644569467e-06, + "loss": 0.3026, + "step": 26130 + }, + { + "epoch": 3.49, + "grad_norm": 0.75390625, + "learning_rate": 9.857689448638941e-06, + "loss": 0.4377, + "step": 26131 + }, + { + "epoch": 3.49, + "grad_norm": 0.5546875, + "learning_rate": 9.852648475096027e-06, + "loss": 0.1762, + "step": 26132 + }, + { + "epoch": 3.49, + "grad_norm": 0.796875, + "learning_rate": 9.847608724009027e-06, + "loss": 0.3261, + "step": 26133 + }, + { + "epoch": 3.49, + "grad_norm": 0.53125, + "learning_rate": 9.842570195446343e-06, + "loss": 0.4076, + "step": 26134 + }, + { + "epoch": 3.49, + "grad_norm": 0.51171875, + "learning_rate": 9.837532889476241e-06, + "loss": 0.2565, + "step": 26135 + }, + { + "epoch": 3.49, + "grad_norm": 0.71875, + "learning_rate": 9.832496806167057e-06, + "loss": 0.2246, + "step": 26136 + }, + { + "epoch": 3.49, + "grad_norm": 0.5546875, + "learning_rate": 9.827461945587113e-06, + "loss": 0.4146, + "step": 26137 + }, + { + "epoch": 3.49, + "grad_norm": 0.5546875, + "learning_rate": 9.822428307804644e-06, + "loss": 0.207, + "step": 26138 + }, + { + "epoch": 3.49, + "grad_norm": 0.62890625, + "learning_rate": 9.817395892887904e-06, + "loss": 0.3147, + "step": 26139 + }, + { + "epoch": 3.49, + "grad_norm": 0.70703125, + "learning_rate": 9.812364700905164e-06, + "loss": 0.3464, + "step": 26140 + }, + { + "epoch": 3.49, + "grad_norm": 0.51171875, + "learning_rate": 9.807334731924666e-06, + "loss": 0.3625, + "step": 26141 + }, + { + "epoch": 3.49, + "grad_norm": 0.6484375, + "learning_rate": 9.80230598601457e-06, + "loss": 0.33, + "step": 26142 + }, + { + "epoch": 3.49, + "grad_norm": 0.78125, + "learning_rate": 9.797278463243086e-06, + "loss": 0.3569, + "step": 26143 + }, + { + "epoch": 3.49, + "grad_norm": 0.484375, + "learning_rate": 9.792252163678405e-06, + "loss": 0.2539, + "step": 26144 + }, + { + "epoch": 3.49, + "grad_norm": 0.70703125, + "learning_rate": 9.787227087388707e-06, + "loss": 0.6086, + "step": 26145 + }, + { + "epoch": 3.49, + "grad_norm": 0.5703125, + "learning_rate": 9.782203234442088e-06, + "loss": 0.3591, + "step": 26146 + }, + { + "epoch": 3.49, + "grad_norm": 0.4921875, + "learning_rate": 9.777180604906687e-06, + "loss": 0.1703, + "step": 26147 + }, + { + "epoch": 3.49, + "grad_norm": 0.55078125, + "learning_rate": 9.772159198850628e-06, + "loss": 0.143, + "step": 26148 + }, + { + "epoch": 3.49, + "grad_norm": 0.57421875, + "learning_rate": 9.767139016341986e-06, + "loss": 0.3344, + "step": 26149 + }, + { + "epoch": 3.49, + "grad_norm": 0.60546875, + "learning_rate": 9.762120057448875e-06, + "loss": 0.2704, + "step": 26150 + }, + { + "epoch": 3.49, + "grad_norm": 0.58984375, + "learning_rate": 9.75710232223931e-06, + "loss": 0.2783, + "step": 26151 + }, + { + "epoch": 3.49, + "grad_norm": 0.5703125, + "learning_rate": 9.752085810781353e-06, + "loss": 0.4651, + "step": 26152 + }, + { + "epoch": 3.49, + "grad_norm": 0.6484375, + "learning_rate": 9.747070523143042e-06, + "loss": 0.1867, + "step": 26153 + }, + { + "epoch": 3.49, + "grad_norm": 0.482421875, + "learning_rate": 9.742056459392369e-06, + "loss": 0.1352, + "step": 26154 + }, + { + "epoch": 3.49, + "grad_norm": 0.8125, + "learning_rate": 9.737043619597363e-06, + "loss": 0.5728, + "step": 26155 + }, + { + "epoch": 3.49, + "grad_norm": 0.66015625, + "learning_rate": 9.732032003825953e-06, + "loss": 0.469, + "step": 26156 + }, + { + "epoch": 3.49, + "grad_norm": 0.546875, + "learning_rate": 9.727021612146126e-06, + "loss": 0.2872, + "step": 26157 + }, + { + "epoch": 3.49, + "grad_norm": 0.63671875, + "learning_rate": 9.722012444625838e-06, + "loss": 0.3135, + "step": 26158 + }, + { + "epoch": 3.49, + "grad_norm": 0.494140625, + "learning_rate": 9.717004501332982e-06, + "loss": 0.2244, + "step": 26159 + }, + { + "epoch": 3.49, + "grad_norm": 0.462890625, + "learning_rate": 9.711997782335503e-06, + "loss": 0.3035, + "step": 26160 + }, + { + "epoch": 3.49, + "grad_norm": 0.466796875, + "learning_rate": 9.706992287701266e-06, + "loss": 0.164, + "step": 26161 + }, + { + "epoch": 3.49, + "grad_norm": 0.703125, + "learning_rate": 9.701988017498164e-06, + "loss": 0.1623, + "step": 26162 + }, + { + "epoch": 3.49, + "grad_norm": 0.74609375, + "learning_rate": 9.696984971794065e-06, + "loss": 0.5587, + "step": 26163 + }, + { + "epoch": 3.49, + "grad_norm": 0.61328125, + "learning_rate": 9.691983150656814e-06, + "loss": 0.224, + "step": 26164 + }, + { + "epoch": 3.49, + "grad_norm": 0.53515625, + "learning_rate": 9.68698255415421e-06, + "loss": 0.1217, + "step": 26165 + }, + { + "epoch": 3.49, + "grad_norm": 0.7890625, + "learning_rate": 9.681983182354093e-06, + "loss": 0.4693, + "step": 26166 + }, + { + "epoch": 3.49, + "grad_norm": 0.65625, + "learning_rate": 9.676985035324248e-06, + "loss": 0.4667, + "step": 26167 + }, + { + "epoch": 3.49, + "grad_norm": 0.5859375, + "learning_rate": 9.67198811313248e-06, + "loss": 0.176, + "step": 26168 + }, + { + "epoch": 3.49, + "grad_norm": 0.68359375, + "learning_rate": 9.6669924158465e-06, + "loss": 0.3489, + "step": 26169 + }, + { + "epoch": 3.49, + "grad_norm": 0.3984375, + "learning_rate": 9.661997943534085e-06, + "loss": 0.1261, + "step": 26170 + }, + { + "epoch": 3.49, + "grad_norm": 0.447265625, + "learning_rate": 9.65700469626295e-06, + "loss": 0.2186, + "step": 26171 + }, + { + "epoch": 3.49, + "grad_norm": 0.515625, + "learning_rate": 9.652012674100818e-06, + "loss": 0.1533, + "step": 26172 + }, + { + "epoch": 3.49, + "grad_norm": 0.62109375, + "learning_rate": 9.647021877115392e-06, + "loss": 0.4602, + "step": 26173 + }, + { + "epoch": 3.49, + "grad_norm": 0.734375, + "learning_rate": 9.642032305374315e-06, + "loss": 0.5002, + "step": 26174 + }, + { + "epoch": 3.49, + "grad_norm": 0.4921875, + "learning_rate": 9.63704395894528e-06, + "loss": 0.2766, + "step": 26175 + }, + { + "epoch": 3.49, + "grad_norm": 0.68359375, + "learning_rate": 9.632056837895908e-06, + "loss": 0.3258, + "step": 26176 + }, + { + "epoch": 3.49, + "grad_norm": 0.70703125, + "learning_rate": 9.62707094229388e-06, + "loss": 0.3876, + "step": 26177 + }, + { + "epoch": 3.49, + "grad_norm": 0.546875, + "learning_rate": 9.622086272206742e-06, + "loss": 0.1915, + "step": 26178 + }, + { + "epoch": 3.49, + "grad_norm": 0.83203125, + "learning_rate": 9.617102827702117e-06, + "loss": 0.4608, + "step": 26179 + }, + { + "epoch": 3.49, + "grad_norm": 0.8671875, + "learning_rate": 9.612120608847596e-06, + "loss": 0.5397, + "step": 26180 + }, + { + "epoch": 3.49, + "grad_norm": 0.484375, + "learning_rate": 9.607139615710714e-06, + "loss": 0.2191, + "step": 26181 + }, + { + "epoch": 3.49, + "grad_norm": 0.734375, + "learning_rate": 9.602159848359061e-06, + "loss": 0.2275, + "step": 26182 + }, + { + "epoch": 3.49, + "grad_norm": 0.86328125, + "learning_rate": 9.597181306860136e-06, + "loss": 0.4873, + "step": 26183 + }, + { + "epoch": 3.49, + "grad_norm": 0.546875, + "learning_rate": 9.592203991281434e-06, + "loss": 0.2242, + "step": 26184 + }, + { + "epoch": 3.49, + "grad_norm": 0.82421875, + "learning_rate": 9.587227901690476e-06, + "loss": 0.2028, + "step": 26185 + }, + { + "epoch": 3.49, + "grad_norm": 0.7109375, + "learning_rate": 9.58225303815472e-06, + "loss": 0.3697, + "step": 26186 + }, + { + "epoch": 3.49, + "grad_norm": 0.80859375, + "learning_rate": 9.577279400741679e-06, + "loss": 0.5726, + "step": 26187 + }, + { + "epoch": 3.49, + "grad_norm": 0.51171875, + "learning_rate": 9.572306989518732e-06, + "loss": 0.14, + "step": 26188 + }, + { + "epoch": 3.49, + "grad_norm": 0.67578125, + "learning_rate": 9.567335804553335e-06, + "loss": 0.2741, + "step": 26189 + }, + { + "epoch": 3.49, + "grad_norm": 0.470703125, + "learning_rate": 9.562365845912913e-06, + "loss": 0.291, + "step": 26190 + }, + { + "epoch": 3.49, + "grad_norm": 0.50390625, + "learning_rate": 9.557397113664867e-06, + "loss": 0.2068, + "step": 26191 + }, + { + "epoch": 3.5, + "grad_norm": 0.62109375, + "learning_rate": 9.552429607876535e-06, + "loss": 0.2532, + "step": 26192 + }, + { + "epoch": 3.5, + "grad_norm": 0.62109375, + "learning_rate": 9.547463328615314e-06, + "loss": 0.5357, + "step": 26193 + }, + { + "epoch": 3.5, + "grad_norm": 0.4609375, + "learning_rate": 9.542498275948564e-06, + "loss": 0.3556, + "step": 26194 + }, + { + "epoch": 3.5, + "grad_norm": 0.373046875, + "learning_rate": 9.537534449943574e-06, + "loss": 0.1207, + "step": 26195 + }, + { + "epoch": 3.5, + "grad_norm": 0.5234375, + "learning_rate": 9.532571850667693e-06, + "loss": 0.2696, + "step": 26196 + }, + { + "epoch": 3.5, + "grad_norm": 0.703125, + "learning_rate": 9.527610478188175e-06, + "loss": 0.3463, + "step": 26197 + }, + { + "epoch": 3.5, + "grad_norm": 0.609375, + "learning_rate": 9.522650332572324e-06, + "loss": 0.3388, + "step": 26198 + }, + { + "epoch": 3.5, + "grad_norm": 0.57421875, + "learning_rate": 9.517691413887409e-06, + "loss": 0.2881, + "step": 26199 + }, + { + "epoch": 3.5, + "grad_norm": 0.671875, + "learning_rate": 9.512733722200684e-06, + "loss": 0.3101, + "step": 26200 + }, + { + "epoch": 3.5, + "grad_norm": 0.65625, + "learning_rate": 9.507777257579353e-06, + "loss": 0.3937, + "step": 26201 + }, + { + "epoch": 3.5, + "grad_norm": 0.58203125, + "learning_rate": 9.502822020090629e-06, + "loss": 0.389, + "step": 26202 + }, + { + "epoch": 3.5, + "grad_norm": 0.69921875, + "learning_rate": 9.497868009801725e-06, + "loss": 0.3584, + "step": 26203 + }, + { + "epoch": 3.5, + "grad_norm": 0.7265625, + "learning_rate": 9.492915226779808e-06, + "loss": 0.2338, + "step": 26204 + }, + { + "epoch": 3.5, + "grad_norm": 0.55859375, + "learning_rate": 9.487963671092081e-06, + "loss": 0.3242, + "step": 26205 + }, + { + "epoch": 3.5, + "grad_norm": 0.498046875, + "learning_rate": 9.483013342805624e-06, + "loss": 0.1988, + "step": 26206 + }, + { + "epoch": 3.5, + "grad_norm": 0.54296875, + "learning_rate": 9.478064241987628e-06, + "loss": 0.3255, + "step": 26207 + }, + { + "epoch": 3.5, + "grad_norm": 0.6015625, + "learning_rate": 9.473116368705149e-06, + "loss": 0.4514, + "step": 26208 + }, + { + "epoch": 3.5, + "grad_norm": 0.71484375, + "learning_rate": 9.46816972302531e-06, + "loss": 0.5467, + "step": 26209 + }, + { + "epoch": 3.5, + "grad_norm": 0.82421875, + "learning_rate": 9.463224305015228e-06, + "loss": 0.2517, + "step": 26210 + }, + { + "epoch": 3.5, + "grad_norm": 0.57421875, + "learning_rate": 9.458280114741903e-06, + "loss": 0.1763, + "step": 26211 + }, + { + "epoch": 3.5, + "grad_norm": 0.7734375, + "learning_rate": 9.453337152272401e-06, + "loss": 0.3669, + "step": 26212 + }, + { + "epoch": 3.5, + "grad_norm": 0.546875, + "learning_rate": 9.448395417673771e-06, + "loss": 0.2006, + "step": 26213 + }, + { + "epoch": 3.5, + "grad_norm": 0.71484375, + "learning_rate": 9.443454911013028e-06, + "loss": 0.3389, + "step": 26214 + }, + { + "epoch": 3.5, + "grad_norm": 0.6484375, + "learning_rate": 9.438515632357136e-06, + "loss": 0.2431, + "step": 26215 + }, + { + "epoch": 3.5, + "grad_norm": 0.54296875, + "learning_rate": 9.43357758177309e-06, + "loss": 0.3263, + "step": 26216 + }, + { + "epoch": 3.5, + "grad_norm": 0.46875, + "learning_rate": 9.428640759327867e-06, + "loss": 0.252, + "step": 26217 + }, + { + "epoch": 3.5, + "grad_norm": 0.75, + "learning_rate": 9.423705165088415e-06, + "loss": 0.2235, + "step": 26218 + }, + { + "epoch": 3.5, + "grad_norm": 0.6953125, + "learning_rate": 9.418770799121657e-06, + "loss": 0.373, + "step": 26219 + }, + { + "epoch": 3.5, + "grad_norm": 0.6875, + "learning_rate": 9.413837661494474e-06, + "loss": 0.2803, + "step": 26220 + }, + { + "epoch": 3.5, + "grad_norm": 0.6640625, + "learning_rate": 9.4089057522738e-06, + "loss": 0.2276, + "step": 26221 + }, + { + "epoch": 3.5, + "grad_norm": 0.50390625, + "learning_rate": 9.403975071526505e-06, + "loss": 0.1505, + "step": 26222 + }, + { + "epoch": 3.5, + "grad_norm": 0.578125, + "learning_rate": 9.399045619319468e-06, + "loss": 0.4064, + "step": 26223 + }, + { + "epoch": 3.5, + "grad_norm": 0.443359375, + "learning_rate": 9.394117395719504e-06, + "loss": 0.1571, + "step": 26224 + }, + { + "epoch": 3.5, + "grad_norm": 0.6953125, + "learning_rate": 9.389190400793447e-06, + "loss": 0.2735, + "step": 26225 + }, + { + "epoch": 3.5, + "grad_norm": 0.6171875, + "learning_rate": 9.38426463460813e-06, + "loss": 0.3672, + "step": 26226 + }, + { + "epoch": 3.5, + "grad_norm": 0.62109375, + "learning_rate": 9.379340097230349e-06, + "loss": 0.3399, + "step": 26227 + }, + { + "epoch": 3.5, + "grad_norm": 0.5703125, + "learning_rate": 9.37441678872688e-06, + "loss": 0.3177, + "step": 26228 + }, + { + "epoch": 3.5, + "grad_norm": 0.625, + "learning_rate": 9.369494709164472e-06, + "loss": 0.3665, + "step": 26229 + }, + { + "epoch": 3.5, + "grad_norm": 0.72265625, + "learning_rate": 9.364573858609881e-06, + "loss": 0.2734, + "step": 26230 + }, + { + "epoch": 3.5, + "grad_norm": 0.7265625, + "learning_rate": 9.359654237129855e-06, + "loss": 0.3739, + "step": 26231 + }, + { + "epoch": 3.5, + "grad_norm": 0.63671875, + "learning_rate": 9.354735844791074e-06, + "loss": 0.297, + "step": 26232 + }, + { + "epoch": 3.5, + "grad_norm": 0.68359375, + "learning_rate": 9.349818681660272e-06, + "loss": 0.1551, + "step": 26233 + }, + { + "epoch": 3.5, + "grad_norm": 0.8046875, + "learning_rate": 9.344902747804086e-06, + "loss": 0.2102, + "step": 26234 + }, + { + "epoch": 3.5, + "grad_norm": 0.439453125, + "learning_rate": 9.339988043289205e-06, + "loss": 0.2483, + "step": 26235 + }, + { + "epoch": 3.5, + "grad_norm": 0.64453125, + "learning_rate": 9.335074568182256e-06, + "loss": 0.2437, + "step": 26236 + }, + { + "epoch": 3.5, + "grad_norm": 0.68359375, + "learning_rate": 9.330162322549918e-06, + "loss": 0.3691, + "step": 26237 + }, + { + "epoch": 3.5, + "grad_norm": 0.625, + "learning_rate": 9.32525130645875e-06, + "loss": 0.251, + "step": 26238 + }, + { + "epoch": 3.5, + "grad_norm": 0.76171875, + "learning_rate": 9.320341519975363e-06, + "loss": 0.3083, + "step": 26239 + }, + { + "epoch": 3.5, + "grad_norm": 0.66796875, + "learning_rate": 9.315432963166348e-06, + "loss": 0.3057, + "step": 26240 + }, + { + "epoch": 3.5, + "grad_norm": 0.8125, + "learning_rate": 9.310525636098277e-06, + "loss": 0.5122, + "step": 26241 + }, + { + "epoch": 3.5, + "grad_norm": 0.5625, + "learning_rate": 9.305619538837673e-06, + "loss": 0.2093, + "step": 26242 + }, + { + "epoch": 3.5, + "grad_norm": 0.4453125, + "learning_rate": 9.300714671451083e-06, + "loss": 0.3461, + "step": 26243 + }, + { + "epoch": 3.5, + "grad_norm": 0.65234375, + "learning_rate": 9.295811034004998e-06, + "loss": 0.2878, + "step": 26244 + }, + { + "epoch": 3.5, + "grad_norm": 0.7421875, + "learning_rate": 9.29090862656593e-06, + "loss": 0.3095, + "step": 26245 + }, + { + "epoch": 3.5, + "grad_norm": 0.578125, + "learning_rate": 9.286007449200374e-06, + "loss": 0.303, + "step": 26246 + }, + { + "epoch": 3.5, + "grad_norm": 0.78515625, + "learning_rate": 9.281107501974761e-06, + "loss": 0.3764, + "step": 26247 + }, + { + "epoch": 3.5, + "grad_norm": 0.73046875, + "learning_rate": 9.276208784955554e-06, + "loss": 0.4802, + "step": 26248 + }, + { + "epoch": 3.5, + "grad_norm": 0.6953125, + "learning_rate": 9.271311298209173e-06, + "loss": 0.5741, + "step": 26249 + }, + { + "epoch": 3.5, + "grad_norm": 0.6328125, + "learning_rate": 9.266415041802068e-06, + "loss": 0.4909, + "step": 26250 + }, + { + "epoch": 3.5, + "grad_norm": 0.625, + "learning_rate": 9.261520015800584e-06, + "loss": 0.5101, + "step": 26251 + }, + { + "epoch": 3.5, + "grad_norm": 0.66015625, + "learning_rate": 9.256626220271136e-06, + "loss": 0.6332, + "step": 26252 + }, + { + "epoch": 3.5, + "grad_norm": 0.51171875, + "learning_rate": 9.25173365528006e-06, + "loss": 0.314, + "step": 26253 + }, + { + "epoch": 3.5, + "grad_norm": 0.57421875, + "learning_rate": 9.246842320893723e-06, + "loss": 0.4189, + "step": 26254 + }, + { + "epoch": 3.5, + "grad_norm": 0.5625, + "learning_rate": 9.241952217178473e-06, + "loss": 0.2299, + "step": 26255 + }, + { + "epoch": 3.5, + "grad_norm": 0.69140625, + "learning_rate": 9.237063344200592e-06, + "loss": 0.474, + "step": 26256 + }, + { + "epoch": 3.5, + "grad_norm": 0.61328125, + "learning_rate": 9.23217570202637e-06, + "loss": 0.3622, + "step": 26257 + }, + { + "epoch": 3.5, + "grad_norm": 0.65234375, + "learning_rate": 9.227289290722096e-06, + "loss": 0.3603, + "step": 26258 + }, + { + "epoch": 3.5, + "grad_norm": 0.78125, + "learning_rate": 9.222404110354033e-06, + "loss": 0.2252, + "step": 26259 + }, + { + "epoch": 3.5, + "grad_norm": 0.640625, + "learning_rate": 9.217520160988458e-06, + "loss": 0.3055, + "step": 26260 + }, + { + "epoch": 3.5, + "grad_norm": 0.64453125, + "learning_rate": 9.212637442691551e-06, + "loss": 0.322, + "step": 26261 + }, + { + "epoch": 3.5, + "grad_norm": 0.73046875, + "learning_rate": 9.20775595552954e-06, + "loss": 0.2446, + "step": 26262 + }, + { + "epoch": 3.5, + "grad_norm": 0.69140625, + "learning_rate": 9.202875699568636e-06, + "loss": 0.5016, + "step": 26263 + }, + { + "epoch": 3.5, + "grad_norm": 0.703125, + "learning_rate": 9.197996674875032e-06, + "loss": 0.5984, + "step": 26264 + }, + { + "epoch": 3.5, + "grad_norm": 0.60546875, + "learning_rate": 9.19311888151485e-06, + "loss": 0.2281, + "step": 26265 + }, + { + "epoch": 3.5, + "grad_norm": 0.49609375, + "learning_rate": 9.188242319554253e-06, + "loss": 0.1957, + "step": 26266 + }, + { + "epoch": 3.51, + "grad_norm": 0.51171875, + "learning_rate": 9.183366989059383e-06, + "loss": 0.185, + "step": 26267 + }, + { + "epoch": 3.51, + "grad_norm": 0.60546875, + "learning_rate": 9.178492890096334e-06, + "loss": 0.419, + "step": 26268 + }, + { + "epoch": 3.51, + "grad_norm": 0.55859375, + "learning_rate": 9.173620022731233e-06, + "loss": 0.3369, + "step": 26269 + }, + { + "epoch": 3.51, + "grad_norm": 0.53125, + "learning_rate": 9.168748387030113e-06, + "loss": 0.2027, + "step": 26270 + }, + { + "epoch": 3.51, + "grad_norm": 0.80078125, + "learning_rate": 9.163877983059055e-06, + "loss": 0.2867, + "step": 26271 + }, + { + "epoch": 3.51, + "grad_norm": 0.69140625, + "learning_rate": 9.159008810884107e-06, + "loss": 0.2935, + "step": 26272 + }, + { + "epoch": 3.51, + "grad_norm": 0.78515625, + "learning_rate": 9.154140870571326e-06, + "loss": 0.2525, + "step": 26273 + }, + { + "epoch": 3.51, + "grad_norm": 0.55859375, + "learning_rate": 9.149274162186683e-06, + "loss": 0.2357, + "step": 26274 + }, + { + "epoch": 3.51, + "grad_norm": 0.56640625, + "learning_rate": 9.14440868579619e-06, + "loss": 0.1426, + "step": 26275 + }, + { + "epoch": 3.51, + "grad_norm": 0.80078125, + "learning_rate": 9.13954444146582e-06, + "loss": 0.5067, + "step": 26276 + }, + { + "epoch": 3.51, + "grad_norm": 0.58203125, + "learning_rate": 9.13468142926155e-06, + "loss": 0.1866, + "step": 26277 + }, + { + "epoch": 3.51, + "grad_norm": 0.80859375, + "learning_rate": 9.129819649249327e-06, + "loss": 0.2108, + "step": 26278 + }, + { + "epoch": 3.51, + "grad_norm": 0.625, + "learning_rate": 9.124959101495079e-06, + "loss": 0.3308, + "step": 26279 + }, + { + "epoch": 3.51, + "grad_norm": 0.73828125, + "learning_rate": 9.120099786064673e-06, + "loss": 0.2908, + "step": 26280 + }, + { + "epoch": 3.51, + "grad_norm": 0.6328125, + "learning_rate": 9.115241703024057e-06, + "loss": 0.309, + "step": 26281 + }, + { + "epoch": 3.51, + "grad_norm": 0.62890625, + "learning_rate": 9.11038485243909e-06, + "loss": 0.3072, + "step": 26282 + }, + { + "epoch": 3.51, + "grad_norm": 0.83984375, + "learning_rate": 9.105529234375653e-06, + "loss": 0.4437, + "step": 26283 + }, + { + "epoch": 3.51, + "grad_norm": 0.5390625, + "learning_rate": 9.100674848899548e-06, + "loss": 0.1181, + "step": 26284 + }, + { + "epoch": 3.51, + "grad_norm": 0.451171875, + "learning_rate": 9.095821696076646e-06, + "loss": 0.1908, + "step": 26285 + }, + { + "epoch": 3.51, + "grad_norm": 0.4453125, + "learning_rate": 9.090969775972736e-06, + "loss": 0.1119, + "step": 26286 + }, + { + "epoch": 3.51, + "grad_norm": 0.87890625, + "learning_rate": 9.086119088653644e-06, + "loss": 0.4056, + "step": 26287 + }, + { + "epoch": 3.51, + "grad_norm": 0.62109375, + "learning_rate": 9.081269634185097e-06, + "loss": 0.5264, + "step": 26288 + }, + { + "epoch": 3.51, + "grad_norm": 0.5, + "learning_rate": 9.076421412632896e-06, + "loss": 0.2226, + "step": 26289 + }, + { + "epoch": 3.51, + "grad_norm": 0.53515625, + "learning_rate": 9.071574424062768e-06, + "loss": 0.2103, + "step": 26290 + }, + { + "epoch": 3.51, + "grad_norm": 0.56640625, + "learning_rate": 9.06672866854047e-06, + "loss": 0.2026, + "step": 26291 + }, + { + "epoch": 3.51, + "grad_norm": 0.83203125, + "learning_rate": 9.061884146131683e-06, + "loss": 0.4304, + "step": 26292 + }, + { + "epoch": 3.51, + "grad_norm": 0.7421875, + "learning_rate": 9.0570408569021e-06, + "loss": 0.3997, + "step": 26293 + }, + { + "epoch": 3.51, + "grad_norm": 0.73046875, + "learning_rate": 9.052198800917411e-06, + "loss": 0.2593, + "step": 26294 + }, + { + "epoch": 3.51, + "grad_norm": 0.470703125, + "learning_rate": 9.047357978243275e-06, + "loss": 0.233, + "step": 26295 + }, + { + "epoch": 3.51, + "grad_norm": 0.6875, + "learning_rate": 9.042518388945354e-06, + "loss": 0.4427, + "step": 26296 + }, + { + "epoch": 3.51, + "grad_norm": 0.54296875, + "learning_rate": 9.037680033089246e-06, + "loss": 0.1514, + "step": 26297 + }, + { + "epoch": 3.51, + "grad_norm": 0.578125, + "learning_rate": 9.03284291074058e-06, + "loss": 0.2817, + "step": 26298 + }, + { + "epoch": 3.51, + "grad_norm": 0.53515625, + "learning_rate": 9.028007021964935e-06, + "loss": 0.3637, + "step": 26299 + }, + { + "epoch": 3.51, + "grad_norm": 0.53515625, + "learning_rate": 9.023172366827903e-06, + "loss": 0.2554, + "step": 26300 + }, + { + "epoch": 3.51, + "grad_norm": 0.61328125, + "learning_rate": 9.018338945395067e-06, + "loss": 0.2921, + "step": 26301 + }, + { + "epoch": 3.51, + "grad_norm": 0.56640625, + "learning_rate": 9.013506757731926e-06, + "loss": 0.2938, + "step": 26302 + }, + { + "epoch": 3.51, + "grad_norm": 0.76953125, + "learning_rate": 9.00867580390402e-06, + "loss": 0.2576, + "step": 26303 + }, + { + "epoch": 3.51, + "grad_norm": 0.609375, + "learning_rate": 9.003846083976897e-06, + "loss": 0.2903, + "step": 26304 + }, + { + "epoch": 3.51, + "grad_norm": 0.62890625, + "learning_rate": 8.99901759801599e-06, + "loss": 0.3604, + "step": 26305 + }, + { + "epoch": 3.51, + "grad_norm": 0.453125, + "learning_rate": 8.99419034608684e-06, + "loss": 0.3405, + "step": 26306 + }, + { + "epoch": 3.51, + "grad_norm": 0.53125, + "learning_rate": 8.989364328254856e-06, + "loss": 0.3023, + "step": 26307 + }, + { + "epoch": 3.51, + "grad_norm": 0.66796875, + "learning_rate": 8.984539544585502e-06, + "loss": 0.4042, + "step": 26308 + }, + { + "epoch": 3.51, + "grad_norm": 0.51953125, + "learning_rate": 8.979715995144201e-06, + "loss": 0.2656, + "step": 26309 + }, + { + "epoch": 3.51, + "grad_norm": 0.8984375, + "learning_rate": 8.974893679996388e-06, + "loss": 0.4904, + "step": 26310 + }, + { + "epoch": 3.51, + "grad_norm": 0.7265625, + "learning_rate": 8.970072599207413e-06, + "loss": 0.4317, + "step": 26311 + }, + { + "epoch": 3.51, + "grad_norm": 0.671875, + "learning_rate": 8.96525275284268e-06, + "loss": 0.4643, + "step": 26312 + }, + { + "epoch": 3.51, + "grad_norm": 0.578125, + "learning_rate": 8.960434140967544e-06, + "loss": 0.2182, + "step": 26313 + }, + { + "epoch": 3.51, + "grad_norm": 0.498046875, + "learning_rate": 8.955616763647368e-06, + "loss": 0.1525, + "step": 26314 + }, + { + "epoch": 3.51, + "grad_norm": 0.5625, + "learning_rate": 8.950800620947452e-06, + "loss": 0.3138, + "step": 26315 + }, + { + "epoch": 3.51, + "grad_norm": 0.671875, + "learning_rate": 8.945985712933125e-06, + "loss": 0.4161, + "step": 26316 + }, + { + "epoch": 3.51, + "grad_norm": 0.451171875, + "learning_rate": 8.941172039669643e-06, + "loss": 0.1464, + "step": 26317 + }, + { + "epoch": 3.51, + "grad_norm": 0.72265625, + "learning_rate": 8.936359601222322e-06, + "loss": 0.4194, + "step": 26318 + }, + { + "epoch": 3.51, + "grad_norm": 0.490234375, + "learning_rate": 8.93154839765642e-06, + "loss": 0.1505, + "step": 26319 + }, + { + "epoch": 3.51, + "grad_norm": 0.3828125, + "learning_rate": 8.926738429037163e-06, + "loss": 0.0943, + "step": 26320 + }, + { + "epoch": 3.51, + "grad_norm": 0.61328125, + "learning_rate": 8.921929695429765e-06, + "loss": 0.5705, + "step": 26321 + }, + { + "epoch": 3.51, + "grad_norm": 0.5078125, + "learning_rate": 8.917122196899463e-06, + "loss": 0.222, + "step": 26322 + }, + { + "epoch": 3.51, + "grad_norm": 0.78515625, + "learning_rate": 8.91231593351146e-06, + "loss": 0.4403, + "step": 26323 + }, + { + "epoch": 3.51, + "grad_norm": 0.66015625, + "learning_rate": 8.907510905330896e-06, + "loss": 0.2957, + "step": 26324 + }, + { + "epoch": 3.51, + "grad_norm": 0.58203125, + "learning_rate": 8.902707112422947e-06, + "loss": 0.3229, + "step": 26325 + }, + { + "epoch": 3.51, + "grad_norm": 0.5, + "learning_rate": 8.897904554852753e-06, + "loss": 0.2362, + "step": 26326 + }, + { + "epoch": 3.51, + "grad_norm": 0.9375, + "learning_rate": 8.89310323268544e-06, + "loss": 0.3503, + "step": 26327 + }, + { + "epoch": 3.51, + "grad_norm": 0.486328125, + "learning_rate": 8.888303145986143e-06, + "loss": 0.1823, + "step": 26328 + }, + { + "epoch": 3.51, + "grad_norm": 0.7578125, + "learning_rate": 8.883504294819922e-06, + "loss": 0.5904, + "step": 26329 + }, + { + "epoch": 3.51, + "grad_norm": 0.60546875, + "learning_rate": 8.878706679251846e-06, + "loss": 0.4111, + "step": 26330 + }, + { + "epoch": 3.51, + "grad_norm": 0.54296875, + "learning_rate": 8.873910299346989e-06, + "loss": 0.2772, + "step": 26331 + }, + { + "epoch": 3.51, + "grad_norm": 0.67578125, + "learning_rate": 8.869115155170394e-06, + "loss": 0.3368, + "step": 26332 + }, + { + "epoch": 3.51, + "grad_norm": 0.609375, + "learning_rate": 8.86432124678711e-06, + "loss": 0.196, + "step": 26333 + }, + { + "epoch": 3.51, + "grad_norm": 0.58203125, + "learning_rate": 8.859528574262087e-06, + "loss": 0.4288, + "step": 26334 + }, + { + "epoch": 3.51, + "grad_norm": 0.56640625, + "learning_rate": 8.854737137660363e-06, + "loss": 0.3091, + "step": 26335 + }, + { + "epoch": 3.51, + "grad_norm": 0.6328125, + "learning_rate": 8.849946937046894e-06, + "loss": 0.269, + "step": 26336 + }, + { + "epoch": 3.51, + "grad_norm": 0.62890625, + "learning_rate": 8.845157972486662e-06, + "loss": 0.2098, + "step": 26337 + }, + { + "epoch": 3.51, + "grad_norm": 0.5703125, + "learning_rate": 8.840370244044572e-06, + "loss": 0.3268, + "step": 26338 + }, + { + "epoch": 3.51, + "grad_norm": 0.62890625, + "learning_rate": 8.83558375178557e-06, + "loss": 0.2733, + "step": 26339 + }, + { + "epoch": 3.51, + "grad_norm": 0.65234375, + "learning_rate": 8.830798495774584e-06, + "loss": 0.2799, + "step": 26340 + }, + { + "epoch": 3.51, + "grad_norm": 0.68359375, + "learning_rate": 8.82601447607646e-06, + "loss": 0.2943, + "step": 26341 + }, + { + "epoch": 3.52, + "grad_norm": 0.60546875, + "learning_rate": 8.821231692756105e-06, + "loss": 0.407, + "step": 26342 + }, + { + "epoch": 3.52, + "grad_norm": 0.609375, + "learning_rate": 8.816450145878363e-06, + "loss": 0.2923, + "step": 26343 + }, + { + "epoch": 3.52, + "grad_norm": 0.625, + "learning_rate": 8.811669835508064e-06, + "loss": 0.4301, + "step": 26344 + }, + { + "epoch": 3.52, + "grad_norm": 0.703125, + "learning_rate": 8.806890761710051e-06, + "loss": 0.3756, + "step": 26345 + }, + { + "epoch": 3.52, + "grad_norm": 0.66796875, + "learning_rate": 8.802112924549155e-06, + "loss": 0.2971, + "step": 26346 + }, + { + "epoch": 3.52, + "grad_norm": 0.796875, + "learning_rate": 8.797336324090111e-06, + "loss": 0.5797, + "step": 26347 + }, + { + "epoch": 3.52, + "grad_norm": 0.66015625, + "learning_rate": 8.792560960397711e-06, + "loss": 0.2595, + "step": 26348 + }, + { + "epoch": 3.52, + "grad_norm": 0.6015625, + "learning_rate": 8.787786833536737e-06, + "loss": 0.2268, + "step": 26349 + }, + { + "epoch": 3.52, + "grad_norm": 0.6796875, + "learning_rate": 8.783013943571906e-06, + "loss": 0.2048, + "step": 26350 + }, + { + "epoch": 3.52, + "grad_norm": 0.796875, + "learning_rate": 8.778242290567961e-06, + "loss": 0.3178, + "step": 26351 + }, + { + "epoch": 3.52, + "grad_norm": 0.77734375, + "learning_rate": 8.773471874589611e-06, + "loss": 0.4181, + "step": 26352 + }, + { + "epoch": 3.52, + "grad_norm": 0.64453125, + "learning_rate": 8.768702695701502e-06, + "loss": 0.3139, + "step": 26353 + }, + { + "epoch": 3.52, + "grad_norm": 0.439453125, + "learning_rate": 8.763934753968338e-06, + "loss": 0.2511, + "step": 26354 + }, + { + "epoch": 3.52, + "grad_norm": 0.470703125, + "learning_rate": 8.759168049454769e-06, + "loss": 0.195, + "step": 26355 + }, + { + "epoch": 3.52, + "grad_norm": 0.64453125, + "learning_rate": 8.754402582225463e-06, + "loss": 0.1612, + "step": 26356 + }, + { + "epoch": 3.52, + "grad_norm": 0.71484375, + "learning_rate": 8.749638352345002e-06, + "loss": 0.194, + "step": 26357 + }, + { + "epoch": 3.52, + "grad_norm": 0.56640625, + "learning_rate": 8.744875359878012e-06, + "loss": 0.4499, + "step": 26358 + }, + { + "epoch": 3.52, + "grad_norm": 0.5703125, + "learning_rate": 8.740113604889077e-06, + "loss": 0.242, + "step": 26359 + }, + { + "epoch": 3.52, + "grad_norm": 0.51953125, + "learning_rate": 8.735353087442788e-06, + "loss": 0.356, + "step": 26360 + }, + { + "epoch": 3.52, + "grad_norm": 0.671875, + "learning_rate": 8.73059380760366e-06, + "loss": 0.1617, + "step": 26361 + }, + { + "epoch": 3.52, + "grad_norm": 0.482421875, + "learning_rate": 8.725835765436264e-06, + "loss": 0.3291, + "step": 26362 + }, + { + "epoch": 3.52, + "grad_norm": 0.75, + "learning_rate": 8.721078961005113e-06, + "loss": 0.5515, + "step": 26363 + }, + { + "epoch": 3.52, + "grad_norm": 0.7421875, + "learning_rate": 8.716323394374737e-06, + "loss": 0.291, + "step": 26364 + }, + { + "epoch": 3.52, + "grad_norm": 0.443359375, + "learning_rate": 8.711569065609593e-06, + "loss": 0.0969, + "step": 26365 + }, + { + "epoch": 3.52, + "grad_norm": 0.60546875, + "learning_rate": 8.70681597477414e-06, + "loss": 0.2913, + "step": 26366 + }, + { + "epoch": 3.52, + "grad_norm": 0.78125, + "learning_rate": 8.702064121932862e-06, + "loss": 0.2994, + "step": 26367 + }, + { + "epoch": 3.52, + "grad_norm": 0.68359375, + "learning_rate": 8.697313507150184e-06, + "loss": 0.2956, + "step": 26368 + }, + { + "epoch": 3.52, + "grad_norm": 0.62109375, + "learning_rate": 8.692564130490555e-06, + "loss": 0.4522, + "step": 26369 + }, + { + "epoch": 3.52, + "grad_norm": 0.625, + "learning_rate": 8.687815992018345e-06, + "loss": 0.1507, + "step": 26370 + }, + { + "epoch": 3.52, + "grad_norm": 0.60546875, + "learning_rate": 8.683069091797946e-06, + "loss": 0.2759, + "step": 26371 + }, + { + "epoch": 3.52, + "grad_norm": 0.72265625, + "learning_rate": 8.678323429893731e-06, + "loss": 0.3644, + "step": 26372 + }, + { + "epoch": 3.52, + "grad_norm": 0.7109375, + "learning_rate": 8.67357900637007e-06, + "loss": 0.5795, + "step": 26373 + }, + { + "epoch": 3.52, + "grad_norm": 0.57421875, + "learning_rate": 8.668835821291299e-06, + "loss": 0.4429, + "step": 26374 + }, + { + "epoch": 3.52, + "grad_norm": 0.5390625, + "learning_rate": 8.664093874721724e-06, + "loss": 0.3499, + "step": 26375 + }, + { + "epoch": 3.52, + "grad_norm": 0.6015625, + "learning_rate": 8.659353166725647e-06, + "loss": 0.3623, + "step": 26376 + }, + { + "epoch": 3.52, + "grad_norm": 0.45703125, + "learning_rate": 8.654613697367386e-06, + "loss": 0.2269, + "step": 26377 + }, + { + "epoch": 3.52, + "grad_norm": 0.51171875, + "learning_rate": 8.649875466711166e-06, + "loss": 0.2959, + "step": 26378 + }, + { + "epoch": 3.52, + "grad_norm": 0.73046875, + "learning_rate": 8.64513847482128e-06, + "loss": 0.2468, + "step": 26379 + }, + { + "epoch": 3.52, + "grad_norm": 0.7421875, + "learning_rate": 8.640402721761931e-06, + "loss": 0.1787, + "step": 26380 + }, + { + "epoch": 3.52, + "grad_norm": 0.63671875, + "learning_rate": 8.635668207597359e-06, + "loss": 0.2203, + "step": 26381 + }, + { + "epoch": 3.52, + "grad_norm": 0.5234375, + "learning_rate": 8.630934932391766e-06, + "loss": 0.2124, + "step": 26382 + }, + { + "epoch": 3.52, + "grad_norm": 0.68359375, + "learning_rate": 8.626202896209345e-06, + "loss": 0.1525, + "step": 26383 + }, + { + "epoch": 3.52, + "grad_norm": 0.51171875, + "learning_rate": 8.621472099114236e-06, + "loss": 0.216, + "step": 26384 + }, + { + "epoch": 3.52, + "grad_norm": 0.62890625, + "learning_rate": 8.616742541170619e-06, + "loss": 0.1653, + "step": 26385 + }, + { + "epoch": 3.52, + "grad_norm": 0.58203125, + "learning_rate": 8.612014222442622e-06, + "loss": 0.3529, + "step": 26386 + }, + { + "epoch": 3.52, + "grad_norm": 0.6953125, + "learning_rate": 8.60728714299438e-06, + "loss": 0.3337, + "step": 26387 + }, + { + "epoch": 3.52, + "grad_norm": 0.578125, + "learning_rate": 8.602561302889966e-06, + "loss": 0.5045, + "step": 26388 + }, + { + "epoch": 3.52, + "grad_norm": 0.68359375, + "learning_rate": 8.597836702193496e-06, + "loss": 0.1945, + "step": 26389 + }, + { + "epoch": 3.52, + "grad_norm": 0.84375, + "learning_rate": 8.593113340968994e-06, + "loss": 0.5067, + "step": 26390 + }, + { + "epoch": 3.52, + "grad_norm": 0.71484375, + "learning_rate": 8.588391219280544e-06, + "loss": 0.2705, + "step": 26391 + }, + { + "epoch": 3.52, + "grad_norm": 0.6171875, + "learning_rate": 8.583670337192207e-06, + "loss": 0.3336, + "step": 26392 + }, + { + "epoch": 3.52, + "grad_norm": 0.79296875, + "learning_rate": 8.57895069476794e-06, + "loss": 0.3695, + "step": 26393 + }, + { + "epoch": 3.52, + "grad_norm": 0.671875, + "learning_rate": 8.574232292071782e-06, + "loss": 0.4281, + "step": 26394 + }, + { + "epoch": 3.52, + "grad_norm": 0.6796875, + "learning_rate": 8.569515129167704e-06, + "loss": 0.266, + "step": 26395 + }, + { + "epoch": 3.52, + "grad_norm": 0.380859375, + "learning_rate": 8.5647992061197e-06, + "loss": 0.1139, + "step": 26396 + }, + { + "epoch": 3.52, + "grad_norm": 1.2578125, + "learning_rate": 8.560084522991685e-06, + "loss": 0.7046, + "step": 26397 + }, + { + "epoch": 3.52, + "grad_norm": 0.58203125, + "learning_rate": 8.555371079847617e-06, + "loss": 0.4372, + "step": 26398 + }, + { + "epoch": 3.52, + "grad_norm": 0.6484375, + "learning_rate": 8.550658876751394e-06, + "loss": 0.5892, + "step": 26399 + }, + { + "epoch": 3.52, + "grad_norm": 0.58984375, + "learning_rate": 8.545947913766939e-06, + "loss": 0.4535, + "step": 26400 + }, + { + "epoch": 3.52, + "grad_norm": 0.53515625, + "learning_rate": 8.541238190958134e-06, + "loss": 0.2516, + "step": 26401 + }, + { + "epoch": 3.52, + "grad_norm": 0.6328125, + "learning_rate": 8.53652970838884e-06, + "loss": 0.3688, + "step": 26402 + }, + { + "epoch": 3.52, + "grad_norm": 0.478515625, + "learning_rate": 8.531822466122885e-06, + "loss": 0.1753, + "step": 26403 + }, + { + "epoch": 3.52, + "grad_norm": 0.6875, + "learning_rate": 8.527116464224127e-06, + "loss": 0.3966, + "step": 26404 + }, + { + "epoch": 3.52, + "grad_norm": 0.5703125, + "learning_rate": 8.522411702756372e-06, + "loss": 0.3991, + "step": 26405 + }, + { + "epoch": 3.52, + "grad_norm": 0.55859375, + "learning_rate": 8.517708181783445e-06, + "loss": 0.2017, + "step": 26406 + }, + { + "epoch": 3.52, + "grad_norm": 0.59375, + "learning_rate": 8.513005901369086e-06, + "loss": 0.293, + "step": 26407 + }, + { + "epoch": 3.52, + "grad_norm": 0.4921875, + "learning_rate": 8.508304861577088e-06, + "loss": 0.2274, + "step": 26408 + }, + { + "epoch": 3.52, + "grad_norm": 0.58984375, + "learning_rate": 8.503605062471187e-06, + "loss": 0.3759, + "step": 26409 + }, + { + "epoch": 3.52, + "grad_norm": 0.443359375, + "learning_rate": 8.498906504115146e-06, + "loss": 0.1692, + "step": 26410 + }, + { + "epoch": 3.52, + "grad_norm": 0.8046875, + "learning_rate": 8.494209186572644e-06, + "loss": 0.3586, + "step": 26411 + }, + { + "epoch": 3.52, + "grad_norm": 1.0078125, + "learning_rate": 8.489513109907388e-06, + "loss": 0.4947, + "step": 26412 + }, + { + "epoch": 3.52, + "grad_norm": 0.4296875, + "learning_rate": 8.484818274183082e-06, + "loss": 0.1014, + "step": 26413 + }, + { + "epoch": 3.52, + "grad_norm": 0.79296875, + "learning_rate": 8.480124679463363e-06, + "loss": 0.4801, + "step": 26414 + }, + { + "epoch": 3.52, + "grad_norm": 0.55859375, + "learning_rate": 8.475432325811905e-06, + "loss": 0.2075, + "step": 26415 + }, + { + "epoch": 3.52, + "grad_norm": 0.6875, + "learning_rate": 8.47074121329231e-06, + "loss": 0.4755, + "step": 26416 + }, + { + "epoch": 3.53, + "grad_norm": 0.7265625, + "learning_rate": 8.466051341968195e-06, + "loss": 0.429, + "step": 26417 + }, + { + "epoch": 3.53, + "grad_norm": 0.54296875, + "learning_rate": 8.461362711903187e-06, + "loss": 0.1597, + "step": 26418 + }, + { + "epoch": 3.53, + "grad_norm": 0.46875, + "learning_rate": 8.456675323160857e-06, + "loss": 0.247, + "step": 26419 + }, + { + "epoch": 3.53, + "grad_norm": 0.57421875, + "learning_rate": 8.451989175804742e-06, + "loss": 0.1986, + "step": 26420 + }, + { + "epoch": 3.53, + "grad_norm": 0.52734375, + "learning_rate": 8.447304269898415e-06, + "loss": 0.3219, + "step": 26421 + }, + { + "epoch": 3.53, + "grad_norm": 0.61328125, + "learning_rate": 8.442620605505403e-06, + "loss": 0.3267, + "step": 26422 + }, + { + "epoch": 3.53, + "grad_norm": 0.55078125, + "learning_rate": 8.43793818268921e-06, + "loss": 0.2916, + "step": 26423 + }, + { + "epoch": 3.53, + "grad_norm": 0.703125, + "learning_rate": 8.43325700151335e-06, + "loss": 0.4994, + "step": 26424 + }, + { + "epoch": 3.53, + "grad_norm": 0.53125, + "learning_rate": 8.428577062041309e-06, + "loss": 0.1545, + "step": 26425 + }, + { + "epoch": 3.53, + "grad_norm": 0.451171875, + "learning_rate": 8.423898364336503e-06, + "loss": 0.1966, + "step": 26426 + }, + { + "epoch": 3.53, + "grad_norm": 0.72265625, + "learning_rate": 8.419220908462411e-06, + "loss": 0.1719, + "step": 26427 + }, + { + "epoch": 3.53, + "grad_norm": 0.67578125, + "learning_rate": 8.414544694482462e-06, + "loss": 0.2325, + "step": 26428 + }, + { + "epoch": 3.53, + "grad_norm": 0.75, + "learning_rate": 8.409869722460084e-06, + "loss": 0.3767, + "step": 26429 + }, + { + "epoch": 3.53, + "grad_norm": 0.5234375, + "learning_rate": 8.405195992458636e-06, + "loss": 0.263, + "step": 26430 + }, + { + "epoch": 3.53, + "grad_norm": 0.57421875, + "learning_rate": 8.400523504541512e-06, + "loss": 0.2812, + "step": 26431 + }, + { + "epoch": 3.53, + "grad_norm": 0.7890625, + "learning_rate": 8.395852258772086e-06, + "loss": 0.3471, + "step": 26432 + }, + { + "epoch": 3.53, + "grad_norm": 0.71484375, + "learning_rate": 8.391182255213703e-06, + "loss": 0.5363, + "step": 26433 + }, + { + "epoch": 3.53, + "grad_norm": 0.546875, + "learning_rate": 8.386513493929671e-06, + "loss": 0.2762, + "step": 26434 + }, + { + "epoch": 3.53, + "grad_norm": 0.75, + "learning_rate": 8.381845974983316e-06, + "loss": 0.2691, + "step": 26435 + }, + { + "epoch": 3.53, + "grad_norm": 0.72265625, + "learning_rate": 8.377179698437921e-06, + "loss": 0.4269, + "step": 26436 + }, + { + "epoch": 3.53, + "grad_norm": 0.5625, + "learning_rate": 8.3725146643568e-06, + "loss": 0.2561, + "step": 26437 + }, + { + "epoch": 3.53, + "grad_norm": 0.65234375, + "learning_rate": 8.367850872803174e-06, + "loss": 0.2217, + "step": 26438 + }, + { + "epoch": 3.53, + "grad_norm": 0.6484375, + "learning_rate": 8.363188323840288e-06, + "loss": 0.593, + "step": 26439 + }, + { + "epoch": 3.53, + "grad_norm": 0.703125, + "learning_rate": 8.358527017531382e-06, + "loss": 0.4032, + "step": 26440 + }, + { + "epoch": 3.53, + "grad_norm": 0.56640625, + "learning_rate": 8.353866953939671e-06, + "loss": 0.4424, + "step": 26441 + }, + { + "epoch": 3.53, + "grad_norm": 0.484375, + "learning_rate": 8.34920813312835e-06, + "loss": 0.2771, + "step": 26442 + }, + { + "epoch": 3.53, + "grad_norm": 0.7421875, + "learning_rate": 8.344550555160569e-06, + "loss": 0.2298, + "step": 26443 + }, + { + "epoch": 3.53, + "grad_norm": 0.5859375, + "learning_rate": 8.339894220099509e-06, + "loss": 0.566, + "step": 26444 + }, + { + "epoch": 3.53, + "grad_norm": 0.546875, + "learning_rate": 8.33523912800831e-06, + "loss": 0.445, + "step": 26445 + }, + { + "epoch": 3.53, + "grad_norm": 0.6328125, + "learning_rate": 8.33058527895011e-06, + "loss": 0.2276, + "step": 26446 + }, + { + "epoch": 3.53, + "grad_norm": 0.51171875, + "learning_rate": 8.325932672988012e-06, + "loss": 0.1929, + "step": 26447 + }, + { + "epoch": 3.53, + "grad_norm": 0.7421875, + "learning_rate": 8.32128131018508e-06, + "loss": 0.3084, + "step": 26448 + }, + { + "epoch": 3.53, + "grad_norm": 0.69140625, + "learning_rate": 8.31663119060444e-06, + "loss": 0.3945, + "step": 26449 + }, + { + "epoch": 3.53, + "grad_norm": 0.5546875, + "learning_rate": 8.311982314309109e-06, + "loss": 0.3274, + "step": 26450 + }, + { + "epoch": 3.53, + "grad_norm": 0.53125, + "learning_rate": 8.307334681362133e-06, + "loss": 0.2696, + "step": 26451 + }, + { + "epoch": 3.53, + "grad_norm": 0.5625, + "learning_rate": 8.302688291826565e-06, + "loss": 0.1883, + "step": 26452 + }, + { + "epoch": 3.53, + "grad_norm": 0.53125, + "learning_rate": 8.298043145765377e-06, + "loss": 0.311, + "step": 26453 + }, + { + "epoch": 3.53, + "grad_norm": 0.5546875, + "learning_rate": 8.293399243241584e-06, + "loss": 0.2797, + "step": 26454 + }, + { + "epoch": 3.53, + "grad_norm": 0.59375, + "learning_rate": 8.288756584318158e-06, + "loss": 0.325, + "step": 26455 + }, + { + "epoch": 3.53, + "grad_norm": 0.5859375, + "learning_rate": 8.28411516905806e-06, + "loss": 0.3237, + "step": 26456 + }, + { + "epoch": 3.53, + "grad_norm": 0.5625, + "learning_rate": 8.279474997524207e-06, + "loss": 0.1435, + "step": 26457 + }, + { + "epoch": 3.53, + "grad_norm": 1.0390625, + "learning_rate": 8.274836069779546e-06, + "loss": 0.4327, + "step": 26458 + }, + { + "epoch": 3.53, + "grad_norm": 0.56640625, + "learning_rate": 8.270198385886963e-06, + "loss": 0.216, + "step": 26459 + }, + { + "epoch": 3.53, + "grad_norm": 0.58984375, + "learning_rate": 8.265561945909395e-06, + "loss": 0.3989, + "step": 26460 + }, + { + "epoch": 3.53, + "grad_norm": 0.65234375, + "learning_rate": 8.26092674990966e-06, + "loss": 0.4357, + "step": 26461 + }, + { + "epoch": 3.53, + "grad_norm": 0.8125, + "learning_rate": 8.256292797950648e-06, + "loss": 0.3922, + "step": 26462 + }, + { + "epoch": 3.53, + "grad_norm": 0.66015625, + "learning_rate": 8.251660090095171e-06, + "loss": 0.2862, + "step": 26463 + }, + { + "epoch": 3.53, + "grad_norm": 0.69140625, + "learning_rate": 8.247028626406061e-06, + "loss": 0.2727, + "step": 26464 + }, + { + "epoch": 3.53, + "grad_norm": 0.65625, + "learning_rate": 8.242398406946162e-06, + "loss": 0.2689, + "step": 26465 + }, + { + "epoch": 3.53, + "grad_norm": 0.828125, + "learning_rate": 8.237769431778197e-06, + "loss": 0.4607, + "step": 26466 + }, + { + "epoch": 3.53, + "grad_norm": 0.734375, + "learning_rate": 8.233141700964975e-06, + "loss": 0.2959, + "step": 26467 + }, + { + "epoch": 3.53, + "grad_norm": 0.69921875, + "learning_rate": 8.228515214569255e-06, + "loss": 0.2961, + "step": 26468 + }, + { + "epoch": 3.53, + "grad_norm": 0.6796875, + "learning_rate": 8.223889972653775e-06, + "loss": 0.3898, + "step": 26469 + }, + { + "epoch": 3.53, + "grad_norm": 0.765625, + "learning_rate": 8.219265975281242e-06, + "loss": 0.2132, + "step": 26470 + }, + { + "epoch": 3.53, + "grad_norm": 0.75390625, + "learning_rate": 8.214643222514363e-06, + "loss": 0.2576, + "step": 26471 + }, + { + "epoch": 3.53, + "grad_norm": 0.421875, + "learning_rate": 8.210021714415828e-06, + "loss": 0.1954, + "step": 26472 + }, + { + "epoch": 3.53, + "grad_norm": 0.546875, + "learning_rate": 8.205401451048301e-06, + "loss": 0.252, + "step": 26473 + }, + { + "epoch": 3.53, + "grad_norm": 0.62109375, + "learning_rate": 8.200782432474474e-06, + "loss": 0.3394, + "step": 26474 + }, + { + "epoch": 3.53, + "grad_norm": 0.8203125, + "learning_rate": 8.196164658756943e-06, + "loss": 0.3162, + "step": 26475 + }, + { + "epoch": 3.53, + "grad_norm": 0.52734375, + "learning_rate": 8.191548129958326e-06, + "loss": 0.3465, + "step": 26476 + }, + { + "epoch": 3.53, + "grad_norm": 0.6875, + "learning_rate": 8.186932846141237e-06, + "loss": 0.2099, + "step": 26477 + }, + { + "epoch": 3.53, + "grad_norm": 0.71484375, + "learning_rate": 8.182318807368272e-06, + "loss": 0.6402, + "step": 26478 + }, + { + "epoch": 3.53, + "grad_norm": 0.5078125, + "learning_rate": 8.177706013702002e-06, + "loss": 0.2953, + "step": 26479 + }, + { + "epoch": 3.53, + "grad_norm": 0.6640625, + "learning_rate": 8.173094465204955e-06, + "loss": 0.1756, + "step": 26480 + }, + { + "epoch": 3.53, + "grad_norm": 0.67578125, + "learning_rate": 8.168484161939672e-06, + "loss": 0.3457, + "step": 26481 + }, + { + "epoch": 3.53, + "grad_norm": 0.6875, + "learning_rate": 8.163875103968688e-06, + "loss": 0.3713, + "step": 26482 + }, + { + "epoch": 3.53, + "grad_norm": 0.55859375, + "learning_rate": 8.159267291354522e-06, + "loss": 0.3382, + "step": 26483 + }, + { + "epoch": 3.53, + "grad_norm": 0.5703125, + "learning_rate": 8.154660724159602e-06, + "loss": 0.4054, + "step": 26484 + }, + { + "epoch": 3.53, + "grad_norm": 0.61328125, + "learning_rate": 8.150055402446443e-06, + "loss": 0.3024, + "step": 26485 + }, + { + "epoch": 3.53, + "grad_norm": 0.44921875, + "learning_rate": 8.145451326277487e-06, + "loss": 0.1339, + "step": 26486 + }, + { + "epoch": 3.53, + "grad_norm": 0.79296875, + "learning_rate": 8.14084849571516e-06, + "loss": 0.4998, + "step": 26487 + }, + { + "epoch": 3.53, + "grad_norm": 0.466796875, + "learning_rate": 8.136246910821888e-06, + "loss": 0.2249, + "step": 26488 + }, + { + "epoch": 3.53, + "grad_norm": 0.7578125, + "learning_rate": 8.131646571660045e-06, + "loss": 0.2164, + "step": 26489 + }, + { + "epoch": 3.53, + "grad_norm": 0.498046875, + "learning_rate": 8.127047478292048e-06, + "loss": 0.3667, + "step": 26490 + }, + { + "epoch": 3.53, + "grad_norm": 0.7734375, + "learning_rate": 8.122449630780238e-06, + "loss": 0.306, + "step": 26491 + }, + { + "epoch": 3.54, + "grad_norm": 0.7265625, + "learning_rate": 8.117853029187006e-06, + "loss": 0.3069, + "step": 26492 + }, + { + "epoch": 3.54, + "grad_norm": 0.431640625, + "learning_rate": 8.113257673574636e-06, + "loss": 0.1928, + "step": 26493 + }, + { + "epoch": 3.54, + "grad_norm": 0.578125, + "learning_rate": 8.10866356400546e-06, + "loss": 0.3887, + "step": 26494 + }, + { + "epoch": 3.54, + "grad_norm": 0.79296875, + "learning_rate": 8.104070700541788e-06, + "loss": 0.3793, + "step": 26495 + }, + { + "epoch": 3.54, + "grad_norm": 0.62109375, + "learning_rate": 8.099479083245887e-06, + "loss": 0.439, + "step": 26496 + }, + { + "epoch": 3.54, + "grad_norm": 0.71875, + "learning_rate": 8.09488871218006e-06, + "loss": 0.2111, + "step": 26497 + }, + { + "epoch": 3.54, + "grad_norm": 0.625, + "learning_rate": 8.090299587406514e-06, + "loss": 0.1948, + "step": 26498 + }, + { + "epoch": 3.54, + "grad_norm": 0.60546875, + "learning_rate": 8.085711708987486e-06, + "loss": 0.292, + "step": 26499 + }, + { + "epoch": 3.54, + "grad_norm": 0.546875, + "learning_rate": 8.081125076985186e-06, + "loss": 0.1924, + "step": 26500 + }, + { + "epoch": 3.54, + "grad_norm": 0.59765625, + "learning_rate": 8.076539691461826e-06, + "loss": 0.3182, + "step": 26501 + }, + { + "epoch": 3.54, + "grad_norm": 0.6171875, + "learning_rate": 8.071955552479615e-06, + "loss": 0.2656, + "step": 26502 + }, + { + "epoch": 3.54, + "grad_norm": 0.69921875, + "learning_rate": 8.067372660100658e-06, + "loss": 0.4529, + "step": 26503 + }, + { + "epoch": 3.54, + "grad_norm": 0.53515625, + "learning_rate": 8.062791014387128e-06, + "loss": 0.1812, + "step": 26504 + }, + { + "epoch": 3.54, + "grad_norm": 0.6484375, + "learning_rate": 8.058210615401152e-06, + "loss": 0.2071, + "step": 26505 + }, + { + "epoch": 3.54, + "grad_norm": 0.48828125, + "learning_rate": 8.05363146320488e-06, + "loss": 0.2064, + "step": 26506 + }, + { + "epoch": 3.54, + "grad_norm": 0.58203125, + "learning_rate": 8.049053557860343e-06, + "loss": 0.3041, + "step": 26507 + }, + { + "epoch": 3.54, + "grad_norm": 0.66015625, + "learning_rate": 8.044476899429665e-06, + "loss": 0.203, + "step": 26508 + }, + { + "epoch": 3.54, + "grad_norm": 0.69921875, + "learning_rate": 8.039901487974898e-06, + "loss": 0.4116, + "step": 26509 + }, + { + "epoch": 3.54, + "grad_norm": 0.734375, + "learning_rate": 8.035327323558095e-06, + "loss": 0.2976, + "step": 26510 + }, + { + "epoch": 3.54, + "grad_norm": 0.703125, + "learning_rate": 8.030754406241281e-06, + "loss": 0.3803, + "step": 26511 + }, + { + "epoch": 3.54, + "grad_norm": 0.8046875, + "learning_rate": 8.026182736086441e-06, + "loss": 0.2448, + "step": 26512 + }, + { + "epoch": 3.54, + "grad_norm": 0.421875, + "learning_rate": 8.021612313155601e-06, + "loss": 0.1372, + "step": 26513 + }, + { + "epoch": 3.54, + "grad_norm": 0.66015625, + "learning_rate": 8.017043137510716e-06, + "loss": 0.4102, + "step": 26514 + }, + { + "epoch": 3.54, + "grad_norm": 0.6953125, + "learning_rate": 8.012475209213787e-06, + "loss": 0.3168, + "step": 26515 + }, + { + "epoch": 3.54, + "grad_norm": 0.5078125, + "learning_rate": 8.007908528326713e-06, + "loss": 0.2569, + "step": 26516 + }, + { + "epoch": 3.54, + "grad_norm": 0.703125, + "learning_rate": 8.003343094911442e-06, + "loss": 0.5307, + "step": 26517 + }, + { + "epoch": 3.54, + "grad_norm": 0.58984375, + "learning_rate": 7.998778909029869e-06, + "loss": 0.2561, + "step": 26518 + }, + { + "epoch": 3.54, + "grad_norm": 0.6328125, + "learning_rate": 7.994215970743913e-06, + "loss": 0.289, + "step": 26519 + }, + { + "epoch": 3.54, + "grad_norm": 1.109375, + "learning_rate": 7.989654280115456e-06, + "loss": 0.4726, + "step": 26520 + }, + { + "epoch": 3.54, + "grad_norm": 0.703125, + "learning_rate": 7.985093837206315e-06, + "loss": 0.3578, + "step": 26521 + }, + { + "epoch": 3.54, + "grad_norm": 0.75390625, + "learning_rate": 7.980534642078374e-06, + "loss": 0.3449, + "step": 26522 + }, + { + "epoch": 3.54, + "grad_norm": 0.703125, + "learning_rate": 7.975976694793419e-06, + "loss": 0.3526, + "step": 26523 + }, + { + "epoch": 3.54, + "grad_norm": 0.65234375, + "learning_rate": 7.971419995413287e-06, + "loss": 0.5194, + "step": 26524 + }, + { + "epoch": 3.54, + "grad_norm": 0.73828125, + "learning_rate": 7.966864543999787e-06, + "loss": 0.4942, + "step": 26525 + }, + { + "epoch": 3.54, + "grad_norm": 0.60546875, + "learning_rate": 7.962310340614642e-06, + "loss": 0.6389, + "step": 26526 + }, + { + "epoch": 3.54, + "grad_norm": 0.55078125, + "learning_rate": 7.957757385319642e-06, + "loss": 0.2104, + "step": 26527 + }, + { + "epoch": 3.54, + "grad_norm": 0.6953125, + "learning_rate": 7.953205678176523e-06, + "loss": 0.3885, + "step": 26528 + }, + { + "epoch": 3.54, + "grad_norm": 0.61328125, + "learning_rate": 7.948655219247025e-06, + "loss": 0.6099, + "step": 26529 + }, + { + "epoch": 3.54, + "grad_norm": 0.7734375, + "learning_rate": 7.944106008592833e-06, + "loss": 0.2489, + "step": 26530 + }, + { + "epoch": 3.54, + "grad_norm": 0.51953125, + "learning_rate": 7.93955804627563e-06, + "loss": 0.2184, + "step": 26531 + }, + { + "epoch": 3.54, + "grad_norm": 0.69140625, + "learning_rate": 7.935011332357112e-06, + "loss": 0.4348, + "step": 26532 + }, + { + "epoch": 3.54, + "grad_norm": 0.74609375, + "learning_rate": 7.93046586689895e-06, + "loss": 0.273, + "step": 26533 + }, + { + "epoch": 3.54, + "grad_norm": 0.60546875, + "learning_rate": 7.925921649962754e-06, + "loss": 0.5202, + "step": 26534 + }, + { + "epoch": 3.54, + "grad_norm": 0.6015625, + "learning_rate": 7.921378681610137e-06, + "loss": 0.408, + "step": 26535 + }, + { + "epoch": 3.54, + "grad_norm": 0.640625, + "learning_rate": 7.916836961902718e-06, + "loss": 0.5803, + "step": 26536 + }, + { + "epoch": 3.54, + "grad_norm": 0.65234375, + "learning_rate": 7.912296490902093e-06, + "loss": 0.5264, + "step": 26537 + }, + { + "epoch": 3.54, + "grad_norm": 0.74609375, + "learning_rate": 7.907757268669847e-06, + "loss": 0.8295, + "step": 26538 + }, + { + "epoch": 3.54, + "grad_norm": 0.59765625, + "learning_rate": 7.903219295267494e-06, + "loss": 0.3697, + "step": 26539 + }, + { + "epoch": 3.54, + "grad_norm": 0.54296875, + "learning_rate": 7.898682570756599e-06, + "loss": 0.2178, + "step": 26540 + }, + { + "epoch": 3.54, + "grad_norm": 0.625, + "learning_rate": 7.894147095198678e-06, + "loss": 0.4472, + "step": 26541 + }, + { + "epoch": 3.54, + "grad_norm": 0.66015625, + "learning_rate": 7.88961286865525e-06, + "loss": 0.3324, + "step": 26542 + }, + { + "epoch": 3.54, + "grad_norm": 0.65234375, + "learning_rate": 7.885079891187775e-06, + "loss": 0.3182, + "step": 26543 + }, + { + "epoch": 3.54, + "grad_norm": 0.6015625, + "learning_rate": 7.880548162857737e-06, + "loss": 0.2656, + "step": 26544 + }, + { + "epoch": 3.54, + "grad_norm": 0.7109375, + "learning_rate": 7.876017683726578e-06, + "loss": 0.3005, + "step": 26545 + }, + { + "epoch": 3.54, + "grad_norm": 0.59375, + "learning_rate": 7.871488453855757e-06, + "loss": 0.247, + "step": 26546 + }, + { + "epoch": 3.54, + "grad_norm": 0.68359375, + "learning_rate": 7.866960473306683e-06, + "loss": 0.2096, + "step": 26547 + }, + { + "epoch": 3.54, + "grad_norm": 0.625, + "learning_rate": 7.862433742140763e-06, + "loss": 0.2033, + "step": 26548 + }, + { + "epoch": 3.54, + "grad_norm": 0.58984375, + "learning_rate": 7.857908260419367e-06, + "loss": 0.3827, + "step": 26549 + }, + { + "epoch": 3.54, + "grad_norm": 0.74609375, + "learning_rate": 7.853384028203859e-06, + "loss": 0.2084, + "step": 26550 + }, + { + "epoch": 3.54, + "grad_norm": 0.9453125, + "learning_rate": 7.8488610455556e-06, + "loss": 0.7035, + "step": 26551 + }, + { + "epoch": 3.54, + "grad_norm": 0.625, + "learning_rate": 7.844339312535954e-06, + "loss": 0.3729, + "step": 26552 + }, + { + "epoch": 3.54, + "grad_norm": 0.6640625, + "learning_rate": 7.839818829206192e-06, + "loss": 0.4992, + "step": 26553 + }, + { + "epoch": 3.54, + "grad_norm": 0.48046875, + "learning_rate": 7.835299595627632e-06, + "loss": 0.184, + "step": 26554 + }, + { + "epoch": 3.54, + "grad_norm": 0.6015625, + "learning_rate": 7.83078161186157e-06, + "loss": 0.3804, + "step": 26555 + }, + { + "epoch": 3.54, + "grad_norm": 0.65625, + "learning_rate": 7.826264877969269e-06, + "loss": 0.2539, + "step": 26556 + }, + { + "epoch": 3.54, + "grad_norm": 0.5703125, + "learning_rate": 7.821749394011957e-06, + "loss": 0.2886, + "step": 26557 + }, + { + "epoch": 3.54, + "grad_norm": 0.640625, + "learning_rate": 7.817235160050885e-06, + "loss": 0.3492, + "step": 26558 + }, + { + "epoch": 3.54, + "grad_norm": 0.55078125, + "learning_rate": 7.812722176147292e-06, + "loss": 0.1174, + "step": 26559 + }, + { + "epoch": 3.54, + "grad_norm": 0.55859375, + "learning_rate": 7.808210442362318e-06, + "loss": 0.3864, + "step": 26560 + }, + { + "epoch": 3.54, + "grad_norm": 0.5859375, + "learning_rate": 7.803699958757205e-06, + "loss": 0.3052, + "step": 26561 + }, + { + "epoch": 3.54, + "grad_norm": 0.61328125, + "learning_rate": 7.799190725393079e-06, + "loss": 0.2813, + "step": 26562 + }, + { + "epoch": 3.54, + "grad_norm": 0.5390625, + "learning_rate": 7.794682742331094e-06, + "loss": 0.1802, + "step": 26563 + }, + { + "epoch": 3.54, + "grad_norm": 0.73046875, + "learning_rate": 7.790176009632389e-06, + "loss": 0.412, + "step": 26564 + }, + { + "epoch": 3.54, + "grad_norm": 0.578125, + "learning_rate": 7.785670527358102e-06, + "loss": 0.2755, + "step": 26565 + }, + { + "epoch": 3.54, + "grad_norm": 0.671875, + "learning_rate": 7.781166295569275e-06, + "loss": 0.2222, + "step": 26566 + }, + { + "epoch": 3.55, + "grad_norm": 0.6796875, + "learning_rate": 7.776663314327027e-06, + "loss": 0.2469, + "step": 26567 + }, + { + "epoch": 3.55, + "grad_norm": 0.65234375, + "learning_rate": 7.772161583692416e-06, + "loss": 0.5576, + "step": 26568 + }, + { + "epoch": 3.55, + "grad_norm": 0.6171875, + "learning_rate": 7.767661103726476e-06, + "loss": 0.2343, + "step": 26569 + }, + { + "epoch": 3.55, + "grad_norm": 0.625, + "learning_rate": 7.763161874490266e-06, + "loss": 0.3233, + "step": 26570 + }, + { + "epoch": 3.55, + "grad_norm": 0.52734375, + "learning_rate": 7.758663896044782e-06, + "loss": 0.2128, + "step": 26571 + }, + { + "epoch": 3.55, + "grad_norm": 0.56640625, + "learning_rate": 7.754167168450998e-06, + "loss": 0.384, + "step": 26572 + }, + { + "epoch": 3.55, + "grad_norm": 0.76953125, + "learning_rate": 7.749671691769911e-06, + "loss": 0.245, + "step": 26573 + }, + { + "epoch": 3.55, + "grad_norm": 0.6484375, + "learning_rate": 7.745177466062482e-06, + "loss": 0.6022, + "step": 26574 + }, + { + "epoch": 3.55, + "grad_norm": 0.58984375, + "learning_rate": 7.74068449138966e-06, + "loss": 0.1683, + "step": 26575 + }, + { + "epoch": 3.55, + "grad_norm": 0.625, + "learning_rate": 7.736192767812367e-06, + "loss": 0.2283, + "step": 26576 + }, + { + "epoch": 3.55, + "grad_norm": 0.69921875, + "learning_rate": 7.731702295391507e-06, + "loss": 0.4102, + "step": 26577 + }, + { + "epoch": 3.55, + "grad_norm": 0.640625, + "learning_rate": 7.727213074187977e-06, + "loss": 0.147, + "step": 26578 + }, + { + "epoch": 3.55, + "grad_norm": 0.67578125, + "learning_rate": 7.722725104262685e-06, + "loss": 0.3771, + "step": 26579 + }, + { + "epoch": 3.55, + "grad_norm": 0.5, + "learning_rate": 7.718238385676434e-06, + "loss": 0.3342, + "step": 26580 + }, + { + "epoch": 3.55, + "grad_norm": 0.73828125, + "learning_rate": 7.713752918490113e-06, + "loss": 0.1852, + "step": 26581 + }, + { + "epoch": 3.55, + "grad_norm": 0.7109375, + "learning_rate": 7.709268702764517e-06, + "loss": 0.1344, + "step": 26582 + }, + { + "epoch": 3.55, + "grad_norm": 0.6015625, + "learning_rate": 7.704785738560494e-06, + "loss": 0.4669, + "step": 26583 + }, + { + "epoch": 3.55, + "grad_norm": 0.66015625, + "learning_rate": 7.70030402593881e-06, + "loss": 0.4251, + "step": 26584 + }, + { + "epoch": 3.55, + "grad_norm": 0.51953125, + "learning_rate": 7.695823564960225e-06, + "loss": 0.3968, + "step": 26585 + }, + { + "epoch": 3.55, + "grad_norm": 0.52734375, + "learning_rate": 7.691344355685515e-06, + "loss": 0.2281, + "step": 26586 + }, + { + "epoch": 3.55, + "grad_norm": 0.75, + "learning_rate": 7.686866398175407e-06, + "loss": 0.4142, + "step": 26587 + }, + { + "epoch": 3.55, + "grad_norm": 0.64453125, + "learning_rate": 7.682389692490666e-06, + "loss": 0.4498, + "step": 26588 + }, + { + "epoch": 3.55, + "grad_norm": 0.6328125, + "learning_rate": 7.677914238691953e-06, + "loss": 0.1818, + "step": 26589 + }, + { + "epoch": 3.55, + "grad_norm": 0.66796875, + "learning_rate": 7.673440036839963e-06, + "loss": 0.2324, + "step": 26590 + }, + { + "epoch": 3.55, + "grad_norm": 0.6640625, + "learning_rate": 7.668967086995393e-06, + "loss": 0.2633, + "step": 26591 + }, + { + "epoch": 3.55, + "grad_norm": 0.53515625, + "learning_rate": 7.664495389218884e-06, + "loss": 0.1362, + "step": 26592 + }, + { + "epoch": 3.55, + "grad_norm": 0.69140625, + "learning_rate": 7.660024943571086e-06, + "loss": 0.2602, + "step": 26593 + }, + { + "epoch": 3.55, + "grad_norm": 0.74609375, + "learning_rate": 7.655555750112609e-06, + "loss": 0.4043, + "step": 26594 + }, + { + "epoch": 3.55, + "grad_norm": 0.4375, + "learning_rate": 7.651087808904067e-06, + "loss": 0.1244, + "step": 26595 + }, + { + "epoch": 3.55, + "grad_norm": 0.58203125, + "learning_rate": 7.646621120006037e-06, + "loss": 0.2861, + "step": 26596 + }, + { + "epoch": 3.55, + "grad_norm": 0.63671875, + "learning_rate": 7.642155683479091e-06, + "loss": 0.3689, + "step": 26597 + }, + { + "epoch": 3.55, + "grad_norm": 0.64453125, + "learning_rate": 7.637691499383814e-06, + "loss": 0.4243, + "step": 26598 + }, + { + "epoch": 3.55, + "grad_norm": 0.5859375, + "learning_rate": 7.633228567780692e-06, + "loss": 0.3168, + "step": 26599 + }, + { + "epoch": 3.55, + "grad_norm": 0.7109375, + "learning_rate": 7.628766888730277e-06, + "loss": 0.2609, + "step": 26600 + }, + { + "epoch": 3.55, + "grad_norm": 0.66796875, + "learning_rate": 7.624306462293062e-06, + "loss": 0.3004, + "step": 26601 + }, + { + "epoch": 3.55, + "grad_norm": 0.66015625, + "learning_rate": 7.619847288529547e-06, + "loss": 0.2078, + "step": 26602 + }, + { + "epoch": 3.55, + "grad_norm": 0.66015625, + "learning_rate": 7.615389367500192e-06, + "loss": 0.3884, + "step": 26603 + }, + { + "epoch": 3.55, + "grad_norm": 0.58984375, + "learning_rate": 7.6109326992654385e-06, + "loss": 0.2357, + "step": 26604 + }, + { + "epoch": 3.55, + "grad_norm": 0.69140625, + "learning_rate": 7.606477283885738e-06, + "loss": 0.3426, + "step": 26605 + }, + { + "epoch": 3.55, + "grad_norm": 0.67578125, + "learning_rate": 7.602023121421509e-06, + "loss": 0.2837, + "step": 26606 + }, + { + "epoch": 3.55, + "grad_norm": 0.6484375, + "learning_rate": 7.597570211933158e-06, + "loss": 0.3423, + "step": 26607 + }, + { + "epoch": 3.55, + "grad_norm": 0.5625, + "learning_rate": 7.593118555481038e-06, + "loss": 0.4197, + "step": 26608 + }, + { + "epoch": 3.55, + "grad_norm": 0.6796875, + "learning_rate": 7.588668152125544e-06, + "loss": 0.3737, + "step": 26609 + }, + { + "epoch": 3.55, + "grad_norm": 0.6328125, + "learning_rate": 7.584219001927006e-06, + "loss": 0.358, + "step": 26610 + }, + { + "epoch": 3.55, + "grad_norm": 0.462890625, + "learning_rate": 7.579771104945799e-06, + "loss": 0.2188, + "step": 26611 + }, + { + "epoch": 3.55, + "grad_norm": 0.58984375, + "learning_rate": 7.5753244612421835e-06, + "loss": 0.3084, + "step": 26612 + }, + { + "epoch": 3.55, + "grad_norm": 0.578125, + "learning_rate": 7.5708790708764905e-06, + "loss": 0.2981, + "step": 26613 + }, + { + "epoch": 3.55, + "grad_norm": 0.78515625, + "learning_rate": 7.566434933909006e-06, + "loss": 0.2876, + "step": 26614 + }, + { + "epoch": 3.55, + "grad_norm": 0.63671875, + "learning_rate": 7.5619920503999905e-06, + "loss": 0.2768, + "step": 26615 + }, + { + "epoch": 3.55, + "grad_norm": 0.53125, + "learning_rate": 7.5575504204096755e-06, + "loss": 0.2907, + "step": 26616 + }, + { + "epoch": 3.55, + "grad_norm": 0.5546875, + "learning_rate": 7.553110043998313e-06, + "loss": 0.2398, + "step": 26617 + }, + { + "epoch": 3.55, + "grad_norm": 0.69140625, + "learning_rate": 7.548670921226109e-06, + "loss": 0.4077, + "step": 26618 + }, + { + "epoch": 3.55, + "grad_norm": 0.60546875, + "learning_rate": 7.544233052153271e-06, + "loss": 0.4344, + "step": 26619 + }, + { + "epoch": 3.55, + "grad_norm": 0.54296875, + "learning_rate": 7.539796436839963e-06, + "loss": 0.2591, + "step": 26620 + }, + { + "epoch": 3.55, + "grad_norm": 0.6484375, + "learning_rate": 7.53536107534637e-06, + "loss": 0.2763, + "step": 26621 + }, + { + "epoch": 3.55, + "grad_norm": 0.74609375, + "learning_rate": 7.53092696773261e-06, + "loss": 0.372, + "step": 26622 + }, + { + "epoch": 3.55, + "grad_norm": 0.6953125, + "learning_rate": 7.526494114058813e-06, + "loss": 0.3324, + "step": 26623 + }, + { + "epoch": 3.55, + "grad_norm": 0.703125, + "learning_rate": 7.52206251438512e-06, + "loss": 0.2488, + "step": 26624 + }, + { + "epoch": 3.55, + "grad_norm": 0.640625, + "learning_rate": 7.517632168771626e-06, + "loss": 0.4402, + "step": 26625 + }, + { + "epoch": 3.55, + "grad_norm": 0.515625, + "learning_rate": 7.513203077278374e-06, + "loss": 0.2092, + "step": 26626 + }, + { + "epoch": 3.55, + "grad_norm": 0.466796875, + "learning_rate": 7.508775239965438e-06, + "loss": 0.1716, + "step": 26627 + }, + { + "epoch": 3.55, + "grad_norm": 0.71484375, + "learning_rate": 7.504348656892879e-06, + "loss": 0.345, + "step": 26628 + }, + { + "epoch": 3.55, + "grad_norm": 0.66796875, + "learning_rate": 7.499923328120739e-06, + "loss": 0.1586, + "step": 26629 + }, + { + "epoch": 3.55, + "grad_norm": 0.578125, + "learning_rate": 7.495499253708982e-06, + "loss": 0.3518, + "step": 26630 + }, + { + "epoch": 3.55, + "grad_norm": 0.5, + "learning_rate": 7.4910764337176145e-06, + "loss": 0.2702, + "step": 26631 + }, + { + "epoch": 3.55, + "grad_norm": 0.66015625, + "learning_rate": 7.486654868206655e-06, + "loss": 0.3893, + "step": 26632 + }, + { + "epoch": 3.55, + "grad_norm": 0.5625, + "learning_rate": 7.482234557236001e-06, + "loss": 0.124, + "step": 26633 + }, + { + "epoch": 3.55, + "grad_norm": 0.6953125, + "learning_rate": 7.477815500865648e-06, + "loss": 0.3027, + "step": 26634 + }, + { + "epoch": 3.55, + "grad_norm": 0.58203125, + "learning_rate": 7.473397699155482e-06, + "loss": 0.3168, + "step": 26635 + }, + { + "epoch": 3.55, + "grad_norm": 0.431640625, + "learning_rate": 7.4689811521654215e-06, + "loss": 0.1848, + "step": 26636 + }, + { + "epoch": 3.55, + "grad_norm": 0.734375, + "learning_rate": 7.464565859955364e-06, + "loss": 0.3461, + "step": 26637 + }, + { + "epoch": 3.55, + "grad_norm": 0.42578125, + "learning_rate": 7.460151822585193e-06, + "loss": 0.1354, + "step": 26638 + }, + { + "epoch": 3.55, + "grad_norm": 0.57421875, + "learning_rate": 7.455739040114751e-06, + "loss": 0.195, + "step": 26639 + }, + { + "epoch": 3.55, + "grad_norm": 0.51171875, + "learning_rate": 7.4513275126038676e-06, + "loss": 0.2201, + "step": 26640 + }, + { + "epoch": 3.55, + "grad_norm": 0.6015625, + "learning_rate": 7.446917240112394e-06, + "loss": 0.3881, + "step": 26641 + }, + { + "epoch": 3.56, + "grad_norm": 0.60546875, + "learning_rate": 7.4425082227001175e-06, + "loss": 0.1949, + "step": 26642 + }, + { + "epoch": 3.56, + "grad_norm": 0.52734375, + "learning_rate": 7.438100460426845e-06, + "loss": 0.1319, + "step": 26643 + }, + { + "epoch": 3.56, + "grad_norm": 0.59765625, + "learning_rate": 7.433693953352339e-06, + "loss": 0.3238, + "step": 26644 + }, + { + "epoch": 3.56, + "grad_norm": 0.4921875, + "learning_rate": 7.42928870153633e-06, + "loss": 0.2603, + "step": 26645 + }, + { + "epoch": 3.56, + "grad_norm": 0.68359375, + "learning_rate": 7.424884705038593e-06, + "loss": 0.4004, + "step": 26646 + }, + { + "epoch": 3.56, + "grad_norm": 0.98046875, + "learning_rate": 7.4204819639188235e-06, + "loss": 0.5185, + "step": 26647 + }, + { + "epoch": 3.56, + "grad_norm": 0.5234375, + "learning_rate": 7.416080478236742e-06, + "loss": 0.3496, + "step": 26648 + }, + { + "epoch": 3.56, + "grad_norm": 0.7421875, + "learning_rate": 7.411680248052033e-06, + "loss": 0.3978, + "step": 26649 + }, + { + "epoch": 3.56, + "grad_norm": 0.6015625, + "learning_rate": 7.4072812734243495e-06, + "loss": 0.4728, + "step": 26650 + }, + { + "epoch": 3.56, + "grad_norm": 0.494140625, + "learning_rate": 7.402883554413365e-06, + "loss": 0.2743, + "step": 26651 + }, + { + "epoch": 3.56, + "grad_norm": 0.66015625, + "learning_rate": 7.39848709107871e-06, + "loss": 0.24, + "step": 26652 + }, + { + "epoch": 3.56, + "grad_norm": 0.61328125, + "learning_rate": 7.394091883480003e-06, + "loss": 0.294, + "step": 26653 + }, + { + "epoch": 3.56, + "grad_norm": 0.578125, + "learning_rate": 7.389697931676831e-06, + "loss": 0.3158, + "step": 26654 + }, + { + "epoch": 3.56, + "grad_norm": 0.56640625, + "learning_rate": 7.385305235728801e-06, + "loss": 0.3048, + "step": 26655 + }, + { + "epoch": 3.56, + "grad_norm": 0.6484375, + "learning_rate": 7.380913795695488e-06, + "loss": 0.3418, + "step": 26656 + }, + { + "epoch": 3.56, + "grad_norm": 0.7890625, + "learning_rate": 7.376523611636421e-06, + "loss": 0.46, + "step": 26657 + }, + { + "epoch": 3.56, + "grad_norm": 0.6953125, + "learning_rate": 7.3721346836111205e-06, + "loss": 0.2744, + "step": 26658 + }, + { + "epoch": 3.56, + "grad_norm": 0.51953125, + "learning_rate": 7.367747011679127e-06, + "loss": 0.4048, + "step": 26659 + }, + { + "epoch": 3.56, + "grad_norm": 0.6796875, + "learning_rate": 7.363360595899938e-06, + "loss": 0.2526, + "step": 26660 + }, + { + "epoch": 3.56, + "grad_norm": 0.546875, + "learning_rate": 7.358975436333038e-06, + "loss": 0.1348, + "step": 26661 + }, + { + "epoch": 3.56, + "grad_norm": 0.7109375, + "learning_rate": 7.35459153303788e-06, + "loss": 0.472, + "step": 26662 + }, + { + "epoch": 3.56, + "grad_norm": 0.70703125, + "learning_rate": 7.350208886073917e-06, + "loss": 0.4548, + "step": 26663 + }, + { + "epoch": 3.56, + "grad_norm": 0.66015625, + "learning_rate": 7.34582749550059e-06, + "loss": 0.3654, + "step": 26664 + }, + { + "epoch": 3.56, + "grad_norm": 0.458984375, + "learning_rate": 7.341447361377296e-06, + "loss": 0.1917, + "step": 26665 + }, + { + "epoch": 3.56, + "grad_norm": 0.515625, + "learning_rate": 7.337068483763476e-06, + "loss": 0.198, + "step": 26666 + }, + { + "epoch": 3.56, + "grad_norm": 0.75390625, + "learning_rate": 7.33269086271845e-06, + "loss": 0.371, + "step": 26667 + }, + { + "epoch": 3.56, + "grad_norm": 0.66015625, + "learning_rate": 7.328314498301625e-06, + "loss": 0.4949, + "step": 26668 + }, + { + "epoch": 3.56, + "grad_norm": 0.58203125, + "learning_rate": 7.323939390572321e-06, + "loss": 0.3169, + "step": 26669 + }, + { + "epoch": 3.56, + "grad_norm": 0.40625, + "learning_rate": 7.319565539589879e-06, + "loss": 0.1088, + "step": 26670 + }, + { + "epoch": 3.56, + "grad_norm": 0.640625, + "learning_rate": 7.315192945413618e-06, + "loss": 0.5183, + "step": 26671 + }, + { + "epoch": 3.56, + "grad_norm": 0.60546875, + "learning_rate": 7.310821608102814e-06, + "loss": 0.3804, + "step": 26672 + }, + { + "epoch": 3.56, + "grad_norm": 0.482421875, + "learning_rate": 7.306451527716762e-06, + "loss": 0.2473, + "step": 26673 + }, + { + "epoch": 3.56, + "grad_norm": 0.87890625, + "learning_rate": 7.302082704314706e-06, + "loss": 0.4443, + "step": 26674 + }, + { + "epoch": 3.56, + "grad_norm": 0.59375, + "learning_rate": 7.29771513795593e-06, + "loss": 0.3785, + "step": 26675 + }, + { + "epoch": 3.56, + "grad_norm": 0.5390625, + "learning_rate": 7.293348828699609e-06, + "loss": 0.1412, + "step": 26676 + }, + { + "epoch": 3.56, + "grad_norm": 0.57421875, + "learning_rate": 7.288983776604974e-06, + "loss": 0.1789, + "step": 26677 + }, + { + "epoch": 3.56, + "grad_norm": 0.54296875, + "learning_rate": 7.284619981731222e-06, + "loss": 0.4022, + "step": 26678 + }, + { + "epoch": 3.56, + "grad_norm": 0.56640625, + "learning_rate": 7.28025744413755e-06, + "loss": 0.1483, + "step": 26679 + }, + { + "epoch": 3.56, + "grad_norm": 0.60546875, + "learning_rate": 7.275896163883089e-06, + "loss": 0.1893, + "step": 26680 + }, + { + "epoch": 3.56, + "grad_norm": 0.5703125, + "learning_rate": 7.271536141026969e-06, + "loss": 0.19, + "step": 26681 + }, + { + "epoch": 3.56, + "grad_norm": 0.71484375, + "learning_rate": 7.267177375628342e-06, + "loss": 0.128, + "step": 26682 + }, + { + "epoch": 3.56, + "grad_norm": 0.57421875, + "learning_rate": 7.262819867746296e-06, + "loss": 0.4468, + "step": 26683 + }, + { + "epoch": 3.56, + "grad_norm": 0.67578125, + "learning_rate": 7.258463617439948e-06, + "loss": 0.2675, + "step": 26684 + }, + { + "epoch": 3.56, + "grad_norm": 0.5859375, + "learning_rate": 7.254108624768341e-06, + "loss": 0.2858, + "step": 26685 + }, + { + "epoch": 3.56, + "grad_norm": 0.59765625, + "learning_rate": 7.249754889790539e-06, + "loss": 0.4041, + "step": 26686 + }, + { + "epoch": 3.56, + "grad_norm": 0.578125, + "learning_rate": 7.2454024125655935e-06, + "loss": 0.1859, + "step": 26687 + }, + { + "epoch": 3.56, + "grad_norm": 0.51171875, + "learning_rate": 7.241051193152526e-06, + "loss": 0.2434, + "step": 26688 + }, + { + "epoch": 3.56, + "grad_norm": 0.6328125, + "learning_rate": 7.2367012316103326e-06, + "loss": 0.5412, + "step": 26689 + }, + { + "epoch": 3.56, + "grad_norm": 0.6640625, + "learning_rate": 7.232352527997999e-06, + "loss": 0.2634, + "step": 26690 + }, + { + "epoch": 3.56, + "grad_norm": 0.54296875, + "learning_rate": 7.228005082374512e-06, + "loss": 0.3054, + "step": 26691 + }, + { + "epoch": 3.56, + "grad_norm": 0.66796875, + "learning_rate": 7.223658894798823e-06, + "loss": 0.38, + "step": 26692 + }, + { + "epoch": 3.56, + "grad_norm": 0.62109375, + "learning_rate": 7.219313965329854e-06, + "loss": 0.2808, + "step": 26693 + }, + { + "epoch": 3.56, + "grad_norm": 0.57421875, + "learning_rate": 7.214970294026546e-06, + "loss": 0.2214, + "step": 26694 + }, + { + "epoch": 3.56, + "grad_norm": 0.447265625, + "learning_rate": 7.2106278809477735e-06, + "loss": 0.2646, + "step": 26695 + }, + { + "epoch": 3.56, + "grad_norm": 0.53125, + "learning_rate": 7.206286726152434e-06, + "loss": 0.2561, + "step": 26696 + }, + { + "epoch": 3.56, + "grad_norm": 0.6171875, + "learning_rate": 7.201946829699413e-06, + "loss": 0.4786, + "step": 26697 + }, + { + "epoch": 3.56, + "grad_norm": 0.6640625, + "learning_rate": 7.197608191647553e-06, + "loss": 0.3536, + "step": 26698 + }, + { + "epoch": 3.56, + "grad_norm": 0.9140625, + "learning_rate": 7.1932708120556744e-06, + "loss": 0.4802, + "step": 26699 + }, + { + "epoch": 3.56, + "grad_norm": 0.70703125, + "learning_rate": 7.1889346909826185e-06, + "loss": 0.2796, + "step": 26700 + }, + { + "epoch": 3.56, + "grad_norm": 0.5625, + "learning_rate": 7.1845998284871596e-06, + "loss": 0.2601, + "step": 26701 + }, + { + "epoch": 3.56, + "grad_norm": 0.470703125, + "learning_rate": 7.180266224628118e-06, + "loss": 0.1557, + "step": 26702 + }, + { + "epoch": 3.56, + "grad_norm": 0.61328125, + "learning_rate": 7.175933879464214e-06, + "loss": 0.404, + "step": 26703 + }, + { + "epoch": 3.56, + "grad_norm": 0.81640625, + "learning_rate": 7.171602793054244e-06, + "loss": 0.2732, + "step": 26704 + }, + { + "epoch": 3.56, + "grad_norm": 0.65625, + "learning_rate": 7.167272965456906e-06, + "loss": 0.3438, + "step": 26705 + }, + { + "epoch": 3.56, + "grad_norm": 0.6796875, + "learning_rate": 7.162944396730919e-06, + "loss": 0.3201, + "step": 26706 + }, + { + "epoch": 3.56, + "grad_norm": 0.70703125, + "learning_rate": 7.158617086935005e-06, + "loss": 0.4059, + "step": 26707 + }, + { + "epoch": 3.56, + "grad_norm": 0.69921875, + "learning_rate": 7.154291036127813e-06, + "loss": 0.3697, + "step": 26708 + }, + { + "epoch": 3.56, + "grad_norm": 0.61328125, + "learning_rate": 7.149966244368022e-06, + "loss": 0.3639, + "step": 26709 + }, + { + "epoch": 3.56, + "grad_norm": 0.7421875, + "learning_rate": 7.1456427117142845e-06, + "loss": 0.5098, + "step": 26710 + }, + { + "epoch": 3.56, + "grad_norm": 0.56640625, + "learning_rate": 7.14132043822523e-06, + "loss": 0.2432, + "step": 26711 + }, + { + "epoch": 3.56, + "grad_norm": 0.59375, + "learning_rate": 7.136999423959456e-06, + "loss": 0.2057, + "step": 26712 + }, + { + "epoch": 3.56, + "grad_norm": 0.8046875, + "learning_rate": 7.132679668975573e-06, + "loss": 0.5805, + "step": 26713 + }, + { + "epoch": 3.56, + "grad_norm": 0.7109375, + "learning_rate": 7.1283611733321545e-06, + "loss": 0.4298, + "step": 26714 + }, + { + "epoch": 3.56, + "grad_norm": 0.53125, + "learning_rate": 7.124043937087766e-06, + "loss": 0.2081, + "step": 26715 + }, + { + "epoch": 3.56, + "grad_norm": 0.7421875, + "learning_rate": 7.119727960300959e-06, + "loss": 0.3587, + "step": 26716 + }, + { + "epoch": 3.57, + "grad_norm": 0.53125, + "learning_rate": 7.1154132430302445e-06, + "loss": 0.2221, + "step": 26717 + }, + { + "epoch": 3.57, + "grad_norm": 0.47265625, + "learning_rate": 7.111099785334141e-06, + "loss": 0.2225, + "step": 26718 + }, + { + "epoch": 3.57, + "grad_norm": 0.65625, + "learning_rate": 7.1067875872711245e-06, + "loss": 0.2383, + "step": 26719 + }, + { + "epoch": 3.57, + "grad_norm": 0.71484375, + "learning_rate": 7.1024766488996915e-06, + "loss": 0.2865, + "step": 26720 + }, + { + "epoch": 3.57, + "grad_norm": 0.48828125, + "learning_rate": 7.098166970278319e-06, + "loss": 0.2566, + "step": 26721 + }, + { + "epoch": 3.57, + "grad_norm": 0.65234375, + "learning_rate": 7.093858551465404e-06, + "loss": 0.487, + "step": 26722 + }, + { + "epoch": 3.57, + "grad_norm": 0.63671875, + "learning_rate": 7.089551392519389e-06, + "loss": 0.2821, + "step": 26723 + }, + { + "epoch": 3.57, + "grad_norm": 0.57421875, + "learning_rate": 7.085245493498693e-06, + "loss": 0.1781, + "step": 26724 + }, + { + "epoch": 3.57, + "grad_norm": 0.77734375, + "learning_rate": 7.080940854461715e-06, + "loss": 0.4467, + "step": 26725 + }, + { + "epoch": 3.57, + "grad_norm": 0.61328125, + "learning_rate": 7.076637475466807e-06, + "loss": 0.3709, + "step": 26726 + }, + { + "epoch": 3.57, + "grad_norm": 0.53515625, + "learning_rate": 7.072335356572324e-06, + "loss": 0.3185, + "step": 26727 + }, + { + "epoch": 3.57, + "grad_norm": 0.478515625, + "learning_rate": 7.068034497836607e-06, + "loss": 0.2182, + "step": 26728 + }, + { + "epoch": 3.57, + "grad_norm": 0.60546875, + "learning_rate": 7.063734899318009e-06, + "loss": 0.3397, + "step": 26729 + }, + { + "epoch": 3.57, + "grad_norm": 0.490234375, + "learning_rate": 7.059436561074817e-06, + "loss": 0.3745, + "step": 26730 + }, + { + "epoch": 3.57, + "grad_norm": 0.75390625, + "learning_rate": 7.0551394831652966e-06, + "loss": 0.3839, + "step": 26731 + }, + { + "epoch": 3.57, + "grad_norm": 0.62109375, + "learning_rate": 7.050843665647733e-06, + "loss": 0.395, + "step": 26732 + }, + { + "epoch": 3.57, + "grad_norm": 0.58984375, + "learning_rate": 7.04654910858038e-06, + "loss": 0.1997, + "step": 26733 + }, + { + "epoch": 3.57, + "grad_norm": 0.7734375, + "learning_rate": 7.042255812021503e-06, + "loss": 0.4905, + "step": 26734 + }, + { + "epoch": 3.57, + "grad_norm": 0.68359375, + "learning_rate": 7.037963776029266e-06, + "loss": 0.2428, + "step": 26735 + }, + { + "epoch": 3.57, + "grad_norm": 0.56640625, + "learning_rate": 7.033673000661922e-06, + "loss": 0.3487, + "step": 26736 + }, + { + "epoch": 3.57, + "grad_norm": 0.6796875, + "learning_rate": 7.029383485977625e-06, + "loss": 0.4658, + "step": 26737 + }, + { + "epoch": 3.57, + "grad_norm": 0.671875, + "learning_rate": 7.025095232034562e-06, + "loss": 0.3459, + "step": 26738 + }, + { + "epoch": 3.57, + "grad_norm": 0.6015625, + "learning_rate": 7.020808238890886e-06, + "loss": 0.2854, + "step": 26739 + }, + { + "epoch": 3.57, + "grad_norm": 0.7890625, + "learning_rate": 7.016522506604717e-06, + "loss": 0.4749, + "step": 26740 + }, + { + "epoch": 3.57, + "grad_norm": 0.51953125, + "learning_rate": 7.012238035234186e-06, + "loss": 0.3199, + "step": 26741 + }, + { + "epoch": 3.57, + "grad_norm": 0.640625, + "learning_rate": 7.007954824837382e-06, + "loss": 0.3403, + "step": 26742 + }, + { + "epoch": 3.57, + "grad_norm": 0.7265625, + "learning_rate": 7.003672875472389e-06, + "loss": 0.4468, + "step": 26743 + }, + { + "epoch": 3.57, + "grad_norm": 0.94140625, + "learning_rate": 6.999392187197284e-06, + "loss": 0.4577, + "step": 26744 + }, + { + "epoch": 3.57, + "grad_norm": 0.71484375, + "learning_rate": 6.9951127600700995e-06, + "loss": 0.3211, + "step": 26745 + }, + { + "epoch": 3.57, + "grad_norm": 0.7421875, + "learning_rate": 6.990834594148876e-06, + "loss": 0.4481, + "step": 26746 + }, + { + "epoch": 3.57, + "grad_norm": 0.494140625, + "learning_rate": 6.986557689491624e-06, + "loss": 0.1557, + "step": 26747 + }, + { + "epoch": 3.57, + "grad_norm": 0.64453125, + "learning_rate": 6.9822820461563745e-06, + "loss": 0.1903, + "step": 26748 + }, + { + "epoch": 3.57, + "grad_norm": 0.609375, + "learning_rate": 6.978007664201047e-06, + "loss": 0.3924, + "step": 26749 + }, + { + "epoch": 3.57, + "grad_norm": 0.6171875, + "learning_rate": 6.973734543683641e-06, + "loss": 0.4599, + "step": 26750 + }, + { + "epoch": 3.57, + "grad_norm": 0.62890625, + "learning_rate": 6.969462684662109e-06, + "loss": 0.2912, + "step": 26751 + }, + { + "epoch": 3.57, + "grad_norm": 0.734375, + "learning_rate": 6.9651920871943725e-06, + "loss": 0.2842, + "step": 26752 + }, + { + "epoch": 3.57, + "grad_norm": 0.43359375, + "learning_rate": 6.9609227513383505e-06, + "loss": 0.1944, + "step": 26753 + }, + { + "epoch": 3.57, + "grad_norm": 0.56640625, + "learning_rate": 6.956654677151897e-06, + "loss": 0.4133, + "step": 26754 + }, + { + "epoch": 3.57, + "grad_norm": 0.58203125, + "learning_rate": 6.952387864692933e-06, + "loss": 0.2451, + "step": 26755 + }, + { + "epoch": 3.57, + "grad_norm": 0.56640625, + "learning_rate": 6.948122314019301e-06, + "loss": 0.287, + "step": 26756 + }, + { + "epoch": 3.57, + "grad_norm": 0.56640625, + "learning_rate": 6.943858025188877e-06, + "loss": 0.235, + "step": 26757 + }, + { + "epoch": 3.57, + "grad_norm": 0.66015625, + "learning_rate": 6.9395949982594355e-06, + "loss": 0.263, + "step": 26758 + }, + { + "epoch": 3.57, + "grad_norm": 0.5390625, + "learning_rate": 6.935333233288821e-06, + "loss": 0.2586, + "step": 26759 + }, + { + "epoch": 3.57, + "grad_norm": 0.69921875, + "learning_rate": 6.931072730334809e-06, + "loss": 0.3277, + "step": 26760 + }, + { + "epoch": 3.57, + "grad_norm": 0.609375, + "learning_rate": 6.926813489455198e-06, + "loss": 0.4412, + "step": 26761 + }, + { + "epoch": 3.57, + "grad_norm": 0.66796875, + "learning_rate": 6.922555510707717e-06, + "loss": 0.3146, + "step": 26762 + }, + { + "epoch": 3.57, + "grad_norm": 0.63671875, + "learning_rate": 6.918298794150113e-06, + "loss": 0.5011, + "step": 26763 + }, + { + "epoch": 3.57, + "grad_norm": 0.51953125, + "learning_rate": 6.914043339840126e-06, + "loss": 0.3999, + "step": 26764 + }, + { + "epoch": 3.57, + "grad_norm": 0.57421875, + "learning_rate": 6.909789147835466e-06, + "loss": 0.3154, + "step": 26765 + }, + { + "epoch": 3.57, + "grad_norm": 0.59765625, + "learning_rate": 6.905536218193787e-06, + "loss": 0.2535, + "step": 26766 + }, + { + "epoch": 3.57, + "grad_norm": 0.5859375, + "learning_rate": 6.901284550972808e-06, + "loss": 0.3444, + "step": 26767 + }, + { + "epoch": 3.57, + "grad_norm": 0.41796875, + "learning_rate": 6.897034146230142e-06, + "loss": 0.2112, + "step": 26768 + }, + { + "epoch": 3.57, + "grad_norm": 0.65234375, + "learning_rate": 6.89278500402345e-06, + "loss": 0.2502, + "step": 26769 + }, + { + "epoch": 3.57, + "grad_norm": 0.53125, + "learning_rate": 6.888537124410343e-06, + "loss": 0.1235, + "step": 26770 + }, + { + "epoch": 3.57, + "grad_norm": 0.52734375, + "learning_rate": 6.884290507448455e-06, + "loss": 0.2922, + "step": 26771 + }, + { + "epoch": 3.57, + "grad_norm": 0.7265625, + "learning_rate": 6.880045153195325e-06, + "loss": 0.397, + "step": 26772 + }, + { + "epoch": 3.57, + "grad_norm": 0.6328125, + "learning_rate": 6.875801061708553e-06, + "loss": 0.4482, + "step": 26773 + }, + { + "epoch": 3.57, + "grad_norm": 0.74609375, + "learning_rate": 6.871558233045683e-06, + "loss": 0.3172, + "step": 26774 + }, + { + "epoch": 3.57, + "grad_norm": 0.7421875, + "learning_rate": 6.867316667264267e-06, + "loss": 0.3291, + "step": 26775 + }, + { + "epoch": 3.57, + "grad_norm": 0.453125, + "learning_rate": 6.863076364421794e-06, + "loss": 0.1416, + "step": 26776 + }, + { + "epoch": 3.57, + "grad_norm": 0.734375, + "learning_rate": 6.858837324575807e-06, + "loss": 0.2931, + "step": 26777 + }, + { + "epoch": 3.57, + "grad_norm": 0.51953125, + "learning_rate": 6.854599547783736e-06, + "loss": 0.2055, + "step": 26778 + }, + { + "epoch": 3.57, + "grad_norm": 0.39453125, + "learning_rate": 6.850363034103069e-06, + "loss": 0.1591, + "step": 26779 + }, + { + "epoch": 3.57, + "grad_norm": 0.765625, + "learning_rate": 6.846127783591294e-06, + "loss": 0.2255, + "step": 26780 + }, + { + "epoch": 3.57, + "grad_norm": 0.65625, + "learning_rate": 6.841893796305787e-06, + "loss": 0.4937, + "step": 26781 + }, + { + "epoch": 3.57, + "grad_norm": 0.671875, + "learning_rate": 6.837661072303992e-06, + "loss": 0.551, + "step": 26782 + }, + { + "epoch": 3.57, + "grad_norm": 0.58203125, + "learning_rate": 6.833429611643294e-06, + "loss": 0.2906, + "step": 26783 + }, + { + "epoch": 3.57, + "grad_norm": 0.671875, + "learning_rate": 6.829199414381115e-06, + "loss": 0.3267, + "step": 26784 + }, + { + "epoch": 3.57, + "grad_norm": 0.7265625, + "learning_rate": 6.8249704805747664e-06, + "loss": 0.3657, + "step": 26785 + }, + { + "epoch": 3.57, + "grad_norm": 0.640625, + "learning_rate": 6.82074281028161e-06, + "loss": 0.2947, + "step": 26786 + }, + { + "epoch": 3.57, + "grad_norm": 0.462890625, + "learning_rate": 6.816516403558992e-06, + "loss": 0.2883, + "step": 26787 + }, + { + "epoch": 3.57, + "grad_norm": 0.703125, + "learning_rate": 6.812291260464221e-06, + "loss": 0.3004, + "step": 26788 + }, + { + "epoch": 3.57, + "grad_norm": 0.8828125, + "learning_rate": 6.808067381054595e-06, + "loss": 0.3545, + "step": 26789 + }, + { + "epoch": 3.57, + "grad_norm": 0.70703125, + "learning_rate": 6.80384476538739e-06, + "loss": 0.5714, + "step": 26790 + }, + { + "epoch": 3.57, + "grad_norm": 0.54296875, + "learning_rate": 6.79962341351984e-06, + "loss": 0.2451, + "step": 26791 + }, + { + "epoch": 3.58, + "grad_norm": 0.65234375, + "learning_rate": 6.7954033255092196e-06, + "loss": 0.3038, + "step": 26792 + }, + { + "epoch": 3.58, + "grad_norm": 0.52734375, + "learning_rate": 6.7911845014127506e-06, + "loss": 0.1912, + "step": 26793 + }, + { + "epoch": 3.58, + "grad_norm": 0.62109375, + "learning_rate": 6.786966941287642e-06, + "loss": 0.2139, + "step": 26794 + }, + { + "epoch": 3.58, + "grad_norm": 0.62109375, + "learning_rate": 6.782750645191083e-06, + "loss": 0.2266, + "step": 26795 + }, + { + "epoch": 3.58, + "grad_norm": 0.48828125, + "learning_rate": 6.778535613180248e-06, + "loss": 0.1685, + "step": 26796 + }, + { + "epoch": 3.58, + "grad_norm": 0.58984375, + "learning_rate": 6.774321845312304e-06, + "loss": 0.2107, + "step": 26797 + }, + { + "epoch": 3.58, + "grad_norm": 0.6328125, + "learning_rate": 6.770109341644393e-06, + "loss": 0.3523, + "step": 26798 + }, + { + "epoch": 3.58, + "grad_norm": 0.546875, + "learning_rate": 6.765898102233625e-06, + "loss": 0.2999, + "step": 26799 + }, + { + "epoch": 3.58, + "grad_norm": 0.5390625, + "learning_rate": 6.761688127137123e-06, + "loss": 0.2595, + "step": 26800 + }, + { + "epoch": 3.58, + "grad_norm": 0.6953125, + "learning_rate": 6.75747941641196e-06, + "loss": 0.3295, + "step": 26801 + }, + { + "epoch": 3.58, + "grad_norm": 0.76953125, + "learning_rate": 6.753271970115227e-06, + "loss": 0.5768, + "step": 26802 + }, + { + "epoch": 3.58, + "grad_norm": 0.859375, + "learning_rate": 6.749065788303988e-06, + "loss": 0.4522, + "step": 26803 + }, + { + "epoch": 3.58, + "grad_norm": 0.55859375, + "learning_rate": 6.744860871035241e-06, + "loss": 0.3142, + "step": 26804 + }, + { + "epoch": 3.58, + "grad_norm": 0.51171875, + "learning_rate": 6.740657218366031e-06, + "loss": 0.1234, + "step": 26805 + }, + { + "epoch": 3.58, + "grad_norm": 0.6171875, + "learning_rate": 6.736454830353367e-06, + "loss": 0.3521, + "step": 26806 + }, + { + "epoch": 3.58, + "grad_norm": 0.6328125, + "learning_rate": 6.732253707054237e-06, + "loss": 0.4238, + "step": 26807 + }, + { + "epoch": 3.58, + "grad_norm": 0.5859375, + "learning_rate": 6.728053848525606e-06, + "loss": 0.2758, + "step": 26808 + }, + { + "epoch": 3.58, + "grad_norm": 0.55859375, + "learning_rate": 6.723855254824407e-06, + "loss": 0.2868, + "step": 26809 + }, + { + "epoch": 3.58, + "grad_norm": 0.51953125, + "learning_rate": 6.719657926007605e-06, + "loss": 0.1878, + "step": 26810 + }, + { + "epoch": 3.58, + "grad_norm": 0.5703125, + "learning_rate": 6.715461862132111e-06, + "loss": 0.1499, + "step": 26811 + }, + { + "epoch": 3.58, + "grad_norm": 1.0390625, + "learning_rate": 6.711267063254844e-06, + "loss": 0.5062, + "step": 26812 + }, + { + "epoch": 3.58, + "grad_norm": 0.59375, + "learning_rate": 6.7070735294326396e-06, + "loss": 0.2781, + "step": 26813 + }, + { + "epoch": 3.58, + "grad_norm": 0.66796875, + "learning_rate": 6.702881260722416e-06, + "loss": 0.2992, + "step": 26814 + }, + { + "epoch": 3.58, + "grad_norm": 0.609375, + "learning_rate": 6.698690257180984e-06, + "loss": 0.2536, + "step": 26815 + }, + { + "epoch": 3.58, + "grad_norm": 0.73046875, + "learning_rate": 6.694500518865188e-06, + "loss": 0.3481, + "step": 26816 + }, + { + "epoch": 3.58, + "grad_norm": 1.0, + "learning_rate": 6.69031204583187e-06, + "loss": 0.3758, + "step": 26817 + }, + { + "epoch": 3.58, + "grad_norm": 0.6484375, + "learning_rate": 6.686124838137786e-06, + "loss": 0.372, + "step": 26818 + }, + { + "epoch": 3.58, + "grad_norm": 0.734375, + "learning_rate": 6.681938895839746e-06, + "loss": 0.4924, + "step": 26819 + }, + { + "epoch": 3.58, + "grad_norm": 0.65625, + "learning_rate": 6.677754218994492e-06, + "loss": 0.3338, + "step": 26820 + }, + { + "epoch": 3.58, + "grad_norm": 0.66015625, + "learning_rate": 6.673570807658813e-06, + "loss": 0.4056, + "step": 26821 + }, + { + "epoch": 3.58, + "grad_norm": 0.73828125, + "learning_rate": 6.669388661889387e-06, + "loss": 0.696, + "step": 26822 + }, + { + "epoch": 3.58, + "grad_norm": 0.765625, + "learning_rate": 6.665207781742955e-06, + "loss": 0.3147, + "step": 26823 + }, + { + "epoch": 3.58, + "grad_norm": 0.76171875, + "learning_rate": 6.661028167276207e-06, + "loss": 0.3966, + "step": 26824 + }, + { + "epoch": 3.58, + "grad_norm": 0.5546875, + "learning_rate": 6.656849818545829e-06, + "loss": 0.3597, + "step": 26825 + }, + { + "epoch": 3.58, + "grad_norm": 0.578125, + "learning_rate": 6.6526727356084896e-06, + "loss": 0.437, + "step": 26826 + }, + { + "epoch": 3.58, + "grad_norm": 0.78515625, + "learning_rate": 6.6484969185207855e-06, + "loss": 0.4159, + "step": 26827 + }, + { + "epoch": 3.58, + "grad_norm": 0.7890625, + "learning_rate": 6.644322367339384e-06, + "loss": 0.2118, + "step": 26828 + }, + { + "epoch": 3.58, + "grad_norm": 0.765625, + "learning_rate": 6.640149082120884e-06, + "loss": 0.428, + "step": 26829 + }, + { + "epoch": 3.58, + "grad_norm": 0.81640625, + "learning_rate": 6.635977062921905e-06, + "loss": 0.4583, + "step": 26830 + }, + { + "epoch": 3.58, + "grad_norm": 0.57421875, + "learning_rate": 6.631806309798971e-06, + "loss": 0.4644, + "step": 26831 + }, + { + "epoch": 3.58, + "grad_norm": 0.65234375, + "learning_rate": 6.62763682280867e-06, + "loss": 0.3444, + "step": 26832 + }, + { + "epoch": 3.58, + "grad_norm": 0.57421875, + "learning_rate": 6.623468602007543e-06, + "loss": 0.1395, + "step": 26833 + }, + { + "epoch": 3.58, + "grad_norm": 0.3984375, + "learning_rate": 6.619301647452125e-06, + "loss": 0.1058, + "step": 26834 + }, + { + "epoch": 3.58, + "grad_norm": 0.53515625, + "learning_rate": 6.615135959198893e-06, + "loss": 0.2528, + "step": 26835 + }, + { + "epoch": 3.58, + "grad_norm": 0.59765625, + "learning_rate": 6.610971537304344e-06, + "loss": 0.2473, + "step": 26836 + }, + { + "epoch": 3.58, + "grad_norm": 0.5859375, + "learning_rate": 6.606808381824958e-06, + "loss": 0.3541, + "step": 26837 + }, + { + "epoch": 3.58, + "grad_norm": 0.494140625, + "learning_rate": 6.60264649281721e-06, + "loss": 0.2619, + "step": 26838 + }, + { + "epoch": 3.58, + "grad_norm": 0.73046875, + "learning_rate": 6.5984858703375115e-06, + "loss": 0.5638, + "step": 26839 + }, + { + "epoch": 3.58, + "grad_norm": 0.59765625, + "learning_rate": 6.594326514442295e-06, + "loss": 0.3484, + "step": 26840 + }, + { + "epoch": 3.58, + "grad_norm": 0.61328125, + "learning_rate": 6.590168425187937e-06, + "loss": 0.4453, + "step": 26841 + }, + { + "epoch": 3.58, + "grad_norm": 0.51953125, + "learning_rate": 6.586011602630849e-06, + "loss": 0.3237, + "step": 26842 + }, + { + "epoch": 3.58, + "grad_norm": 0.68359375, + "learning_rate": 6.581856046827406e-06, + "loss": 0.1274, + "step": 26843 + }, + { + "epoch": 3.58, + "grad_norm": 0.66015625, + "learning_rate": 6.577701757833954e-06, + "loss": 0.4053, + "step": 26844 + }, + { + "epoch": 3.58, + "grad_norm": 0.6171875, + "learning_rate": 6.5735487357068135e-06, + "loss": 0.2188, + "step": 26845 + }, + { + "epoch": 3.58, + "grad_norm": 0.63671875, + "learning_rate": 6.569396980502318e-06, + "loss": 0.2741, + "step": 26846 + }, + { + "epoch": 3.58, + "grad_norm": 0.72265625, + "learning_rate": 6.565246492276755e-06, + "loss": 0.4364, + "step": 26847 + }, + { + "epoch": 3.58, + "grad_norm": 0.7578125, + "learning_rate": 6.561097271086436e-06, + "loss": 0.4002, + "step": 26848 + }, + { + "epoch": 3.58, + "grad_norm": 0.67578125, + "learning_rate": 6.5569493169875816e-06, + "loss": 0.3761, + "step": 26849 + }, + { + "epoch": 3.58, + "grad_norm": 0.80078125, + "learning_rate": 6.552802630036492e-06, + "loss": 0.5648, + "step": 26850 + }, + { + "epoch": 3.58, + "grad_norm": 0.80078125, + "learning_rate": 6.548657210289355e-06, + "loss": 0.3724, + "step": 26851 + }, + { + "epoch": 3.58, + "grad_norm": 0.498046875, + "learning_rate": 6.544513057802404e-06, + "loss": 0.2205, + "step": 26852 + }, + { + "epoch": 3.58, + "grad_norm": 0.73046875, + "learning_rate": 6.540370172631849e-06, + "loss": 0.3821, + "step": 26853 + }, + { + "epoch": 3.58, + "grad_norm": 0.58984375, + "learning_rate": 6.536228554833834e-06, + "loss": 0.3115, + "step": 26854 + }, + { + "epoch": 3.58, + "grad_norm": 0.5625, + "learning_rate": 6.532088204464548e-06, + "loss": 0.1511, + "step": 26855 + }, + { + "epoch": 3.58, + "grad_norm": 0.64453125, + "learning_rate": 6.527949121580146e-06, + "loss": 0.3942, + "step": 26856 + }, + { + "epoch": 3.58, + "grad_norm": 0.455078125, + "learning_rate": 6.523811306236749e-06, + "loss": 0.3376, + "step": 26857 + }, + { + "epoch": 3.58, + "grad_norm": 0.46875, + "learning_rate": 6.519674758490446e-06, + "loss": 0.1766, + "step": 26858 + }, + { + "epoch": 3.58, + "grad_norm": 0.6484375, + "learning_rate": 6.515539478397348e-06, + "loss": 0.2224, + "step": 26859 + }, + { + "epoch": 3.58, + "grad_norm": 0.73046875, + "learning_rate": 6.5114054660135315e-06, + "loss": 0.357, + "step": 26860 + }, + { + "epoch": 3.58, + "grad_norm": 0.4609375, + "learning_rate": 6.507272721395064e-06, + "loss": 0.2255, + "step": 26861 + }, + { + "epoch": 3.58, + "grad_norm": 0.7265625, + "learning_rate": 6.503141244597999e-06, + "loss": 0.3798, + "step": 26862 + }, + { + "epoch": 3.58, + "grad_norm": 0.400390625, + "learning_rate": 6.499011035678337e-06, + "loss": 0.2221, + "step": 26863 + }, + { + "epoch": 3.58, + "grad_norm": 0.66796875, + "learning_rate": 6.494882094692078e-06, + "loss": 0.3492, + "step": 26864 + }, + { + "epoch": 3.58, + "grad_norm": 0.75, + "learning_rate": 6.490754421695233e-06, + "loss": 0.5477, + "step": 26865 + }, + { + "epoch": 3.59, + "grad_norm": 0.64453125, + "learning_rate": 6.486628016743768e-06, + "loss": 0.2384, + "step": 26866 + }, + { + "epoch": 3.59, + "grad_norm": 0.76953125, + "learning_rate": 6.482502879893671e-06, + "loss": 0.2444, + "step": 26867 + }, + { + "epoch": 3.59, + "grad_norm": 0.6796875, + "learning_rate": 6.4783790112008305e-06, + "loss": 0.2524, + "step": 26868 + }, + { + "epoch": 3.59, + "grad_norm": 0.8046875, + "learning_rate": 6.474256410721191e-06, + "loss": 0.4495, + "step": 26869 + }, + { + "epoch": 3.59, + "grad_norm": 0.86328125, + "learning_rate": 6.470135078510653e-06, + "loss": 0.3939, + "step": 26870 + }, + { + "epoch": 3.59, + "grad_norm": 0.69140625, + "learning_rate": 6.4660150146251375e-06, + "loss": 0.3485, + "step": 26871 + }, + { + "epoch": 3.59, + "grad_norm": 0.77734375, + "learning_rate": 6.461896219120467e-06, + "loss": 0.3486, + "step": 26872 + }, + { + "epoch": 3.59, + "grad_norm": 0.6015625, + "learning_rate": 6.457778692052518e-06, + "loss": 0.5511, + "step": 26873 + }, + { + "epoch": 3.59, + "grad_norm": 0.6796875, + "learning_rate": 6.453662433477136e-06, + "loss": 0.1477, + "step": 26874 + }, + { + "epoch": 3.59, + "grad_norm": 0.75390625, + "learning_rate": 6.449547443450121e-06, + "loss": 0.3362, + "step": 26875 + }, + { + "epoch": 3.59, + "grad_norm": 0.66015625, + "learning_rate": 6.445433722027294e-06, + "loss": 0.2587, + "step": 26876 + }, + { + "epoch": 3.59, + "grad_norm": 0.828125, + "learning_rate": 6.441321269264411e-06, + "loss": 0.3524, + "step": 26877 + }, + { + "epoch": 3.59, + "grad_norm": 0.66015625, + "learning_rate": 6.43721008521726e-06, + "loss": 0.1844, + "step": 26878 + }, + { + "epoch": 3.59, + "grad_norm": 0.6796875, + "learning_rate": 6.433100169941586e-06, + "loss": 0.4481, + "step": 26879 + }, + { + "epoch": 3.59, + "grad_norm": 0.875, + "learning_rate": 6.428991523493144e-06, + "loss": 0.3628, + "step": 26880 + }, + { + "epoch": 3.59, + "grad_norm": 0.79296875, + "learning_rate": 6.4248841459276125e-06, + "loss": 0.5882, + "step": 26881 + }, + { + "epoch": 3.59, + "grad_norm": 0.71875, + "learning_rate": 6.420778037300712e-06, + "loss": 0.4139, + "step": 26882 + }, + { + "epoch": 3.59, + "grad_norm": 0.419921875, + "learning_rate": 6.416673197668111e-06, + "loss": 0.1778, + "step": 26883 + }, + { + "epoch": 3.59, + "grad_norm": 0.58203125, + "learning_rate": 6.412569627085485e-06, + "loss": 0.4274, + "step": 26884 + }, + { + "epoch": 3.59, + "grad_norm": 0.5703125, + "learning_rate": 6.408467325608502e-06, + "loss": 0.3805, + "step": 26885 + }, + { + "epoch": 3.59, + "grad_norm": 0.7734375, + "learning_rate": 6.404366293292752e-06, + "loss": 0.4568, + "step": 26886 + }, + { + "epoch": 3.59, + "grad_norm": 0.640625, + "learning_rate": 6.400266530193877e-06, + "loss": 0.256, + "step": 26887 + }, + { + "epoch": 3.59, + "grad_norm": 0.5625, + "learning_rate": 6.396168036367445e-06, + "loss": 0.3974, + "step": 26888 + }, + { + "epoch": 3.59, + "grad_norm": 0.6484375, + "learning_rate": 6.392070811869044e-06, + "loss": 0.4379, + "step": 26889 + }, + { + "epoch": 3.59, + "grad_norm": 0.7578125, + "learning_rate": 6.387974856754264e-06, + "loss": 0.2813, + "step": 26890 + }, + { + "epoch": 3.59, + "grad_norm": 0.6796875, + "learning_rate": 6.383880171078605e-06, + "loss": 0.4385, + "step": 26891 + }, + { + "epoch": 3.59, + "grad_norm": 0.671875, + "learning_rate": 6.37978675489761e-06, + "loss": 0.2787, + "step": 26892 + }, + { + "epoch": 3.59, + "grad_norm": 0.5703125, + "learning_rate": 6.375694608266791e-06, + "loss": 0.3254, + "step": 26893 + }, + { + "epoch": 3.59, + "grad_norm": 0.5078125, + "learning_rate": 6.3716037312416596e-06, + "loss": 0.2861, + "step": 26894 + }, + { + "epoch": 3.59, + "grad_norm": 0.859375, + "learning_rate": 6.36751412387766e-06, + "loss": 0.4113, + "step": 26895 + }, + { + "epoch": 3.59, + "grad_norm": 0.5234375, + "learning_rate": 6.363425786230259e-06, + "loss": 0.219, + "step": 26896 + }, + { + "epoch": 3.59, + "grad_norm": 0.61328125, + "learning_rate": 6.359338718354901e-06, + "loss": 0.1866, + "step": 26897 + }, + { + "epoch": 3.59, + "grad_norm": 0.60546875, + "learning_rate": 6.35525292030702e-06, + "loss": 0.1575, + "step": 26898 + }, + { + "epoch": 3.59, + "grad_norm": 0.6640625, + "learning_rate": 6.351168392142015e-06, + "loss": 0.4613, + "step": 26899 + }, + { + "epoch": 3.59, + "grad_norm": 0.5703125, + "learning_rate": 6.347085133915243e-06, + "loss": 0.33, + "step": 26900 + }, + { + "epoch": 3.59, + "grad_norm": 0.7578125, + "learning_rate": 6.343003145682114e-06, + "loss": 0.1791, + "step": 26901 + }, + { + "epoch": 3.59, + "grad_norm": 0.58984375, + "learning_rate": 6.338922427497973e-06, + "loss": 0.3235, + "step": 26902 + }, + { + "epoch": 3.59, + "grad_norm": 0.51953125, + "learning_rate": 6.334842979418165e-06, + "loss": 0.4127, + "step": 26903 + }, + { + "epoch": 3.59, + "grad_norm": 0.76953125, + "learning_rate": 6.330764801497979e-06, + "loss": 0.4622, + "step": 26904 + }, + { + "epoch": 3.59, + "grad_norm": 0.546875, + "learning_rate": 6.326687893792749e-06, + "loss": 0.2609, + "step": 26905 + }, + { + "epoch": 3.59, + "grad_norm": 0.81640625, + "learning_rate": 6.322612256357752e-06, + "loss": 0.2354, + "step": 26906 + }, + { + "epoch": 3.59, + "grad_norm": 0.58984375, + "learning_rate": 6.318537889248255e-06, + "loss": 0.2309, + "step": 26907 + }, + { + "epoch": 3.59, + "grad_norm": 0.6171875, + "learning_rate": 6.3144647925195036e-06, + "loss": 0.2579, + "step": 26908 + }, + { + "epoch": 3.59, + "grad_norm": 0.5859375, + "learning_rate": 6.310392966226741e-06, + "loss": 0.3524, + "step": 26909 + }, + { + "epoch": 3.59, + "grad_norm": 0.68359375, + "learning_rate": 6.30632241042517e-06, + "loss": 0.4736, + "step": 26910 + }, + { + "epoch": 3.59, + "grad_norm": 0.7890625, + "learning_rate": 6.3022531251700214e-06, + "loss": 0.4079, + "step": 26911 + }, + { + "epoch": 3.59, + "grad_norm": 0.59765625, + "learning_rate": 6.298185110516442e-06, + "loss": 0.421, + "step": 26912 + }, + { + "epoch": 3.59, + "grad_norm": 0.6171875, + "learning_rate": 6.294118366519619e-06, + "loss": 0.404, + "step": 26913 + }, + { + "epoch": 3.59, + "grad_norm": 0.60546875, + "learning_rate": 6.290052893234677e-06, + "loss": 0.3628, + "step": 26914 + }, + { + "epoch": 3.59, + "grad_norm": 0.625, + "learning_rate": 6.28598869071676e-06, + "loss": 0.2294, + "step": 26915 + }, + { + "epoch": 3.59, + "grad_norm": 0.64453125, + "learning_rate": 6.281925759020979e-06, + "loss": 0.183, + "step": 26916 + }, + { + "epoch": 3.59, + "grad_norm": 0.474609375, + "learning_rate": 6.2778640982024575e-06, + "loss": 0.3722, + "step": 26917 + }, + { + "epoch": 3.59, + "grad_norm": 0.6328125, + "learning_rate": 6.273803708316217e-06, + "loss": 0.3002, + "step": 26918 + }, + { + "epoch": 3.59, + "grad_norm": 0.58203125, + "learning_rate": 6.26974458941737e-06, + "loss": 0.2227, + "step": 26919 + }, + { + "epoch": 3.59, + "grad_norm": 0.7109375, + "learning_rate": 6.265686741560927e-06, + "loss": 0.2865, + "step": 26920 + }, + { + "epoch": 3.59, + "grad_norm": 0.44921875, + "learning_rate": 6.261630164801957e-06, + "loss": 0.2435, + "step": 26921 + }, + { + "epoch": 3.59, + "grad_norm": 0.58203125, + "learning_rate": 6.257574859195425e-06, + "loss": 0.2386, + "step": 26922 + }, + { + "epoch": 3.59, + "grad_norm": 0.73046875, + "learning_rate": 6.253520824796355e-06, + "loss": 0.4045, + "step": 26923 + }, + { + "epoch": 3.59, + "grad_norm": 0.73828125, + "learning_rate": 6.249468061659691e-06, + "loss": 0.3103, + "step": 26924 + }, + { + "epoch": 3.59, + "grad_norm": 0.51171875, + "learning_rate": 6.245416569840412e-06, + "loss": 0.1662, + "step": 26925 + }, + { + "epoch": 3.59, + "grad_norm": 0.7109375, + "learning_rate": 6.241366349393463e-06, + "loss": 0.3708, + "step": 26926 + }, + { + "epoch": 3.59, + "grad_norm": 0.61328125, + "learning_rate": 6.237317400373754e-06, + "loss": 0.3695, + "step": 26927 + }, + { + "epoch": 3.59, + "grad_norm": 0.56640625, + "learning_rate": 6.233269722836199e-06, + "loss": 0.2601, + "step": 26928 + }, + { + "epoch": 3.59, + "grad_norm": 0.66015625, + "learning_rate": 6.229223316835675e-06, + "loss": 0.4951, + "step": 26929 + }, + { + "epoch": 3.59, + "grad_norm": 0.625, + "learning_rate": 6.2251781824270935e-06, + "loss": 0.3813, + "step": 26930 + }, + { + "epoch": 3.59, + "grad_norm": 0.46484375, + "learning_rate": 6.221134319665256e-06, + "loss": 0.3383, + "step": 26931 + }, + { + "epoch": 3.59, + "grad_norm": 0.48828125, + "learning_rate": 6.217091728605029e-06, + "loss": 0.392, + "step": 26932 + }, + { + "epoch": 3.59, + "grad_norm": 0.72265625, + "learning_rate": 6.213050409301224e-06, + "loss": 0.2706, + "step": 26933 + }, + { + "epoch": 3.59, + "grad_norm": 0.703125, + "learning_rate": 6.209010361808643e-06, + "loss": 0.3123, + "step": 26934 + }, + { + "epoch": 3.59, + "grad_norm": 0.671875, + "learning_rate": 6.204971586182096e-06, + "loss": 0.4129, + "step": 26935 + }, + { + "epoch": 3.59, + "grad_norm": 0.72265625, + "learning_rate": 6.200934082476328e-06, + "loss": 0.2995, + "step": 26936 + }, + { + "epoch": 3.59, + "grad_norm": 0.6796875, + "learning_rate": 6.1968978507460865e-06, + "loss": 0.3767, + "step": 26937 + }, + { + "epoch": 3.59, + "grad_norm": 0.57421875, + "learning_rate": 6.192862891046114e-06, + "loss": 0.1915, + "step": 26938 + }, + { + "epoch": 3.59, + "grad_norm": 0.5625, + "learning_rate": 6.188829203431113e-06, + "loss": 0.2067, + "step": 26939 + }, + { + "epoch": 3.59, + "grad_norm": 0.53125, + "learning_rate": 6.184796787955816e-06, + "loss": 0.3104, + "step": 26940 + }, + { + "epoch": 3.6, + "grad_norm": 0.390625, + "learning_rate": 6.18076564467488e-06, + "loss": 0.1933, + "step": 26941 + }, + { + "epoch": 3.6, + "grad_norm": 0.6328125, + "learning_rate": 6.176735773642961e-06, + "loss": 0.2537, + "step": 26942 + }, + { + "epoch": 3.6, + "grad_norm": 0.9765625, + "learning_rate": 6.172707174914727e-06, + "loss": 0.3078, + "step": 26943 + }, + { + "epoch": 3.6, + "grad_norm": 0.484375, + "learning_rate": 6.168679848544811e-06, + "loss": 0.2305, + "step": 26944 + }, + { + "epoch": 3.6, + "grad_norm": 0.65234375, + "learning_rate": 6.164653794587805e-06, + "loss": 0.3388, + "step": 26945 + }, + { + "epoch": 3.6, + "grad_norm": 0.58984375, + "learning_rate": 6.160629013098318e-06, + "loss": 0.1524, + "step": 26946 + }, + { + "epoch": 3.6, + "grad_norm": 0.70703125, + "learning_rate": 6.156605504130941e-06, + "loss": 0.327, + "step": 26947 + }, + { + "epoch": 3.6, + "grad_norm": 0.73046875, + "learning_rate": 6.152583267740208e-06, + "loss": 0.548, + "step": 26948 + }, + { + "epoch": 3.6, + "grad_norm": 0.54296875, + "learning_rate": 6.148562303980698e-06, + "loss": 0.2436, + "step": 26949 + }, + { + "epoch": 3.6, + "grad_norm": 0.4921875, + "learning_rate": 6.14454261290689e-06, + "loss": 0.2189, + "step": 26950 + }, + { + "epoch": 3.6, + "grad_norm": 0.5625, + "learning_rate": 6.140524194573327e-06, + "loss": 0.2405, + "step": 26951 + }, + { + "epoch": 3.6, + "grad_norm": 0.6640625, + "learning_rate": 6.13650704903449e-06, + "loss": 0.4211, + "step": 26952 + }, + { + "epoch": 3.6, + "grad_norm": 0.62890625, + "learning_rate": 6.132491176344879e-06, + "loss": 0.3656, + "step": 26953 + }, + { + "epoch": 3.6, + "grad_norm": 0.58203125, + "learning_rate": 6.128476576558917e-06, + "loss": 0.3118, + "step": 26954 + }, + { + "epoch": 3.6, + "grad_norm": 0.62890625, + "learning_rate": 6.124463249731049e-06, + "loss": 0.2101, + "step": 26955 + }, + { + "epoch": 3.6, + "grad_norm": 0.828125, + "learning_rate": 6.120451195915722e-06, + "loss": 0.4922, + "step": 26956 + }, + { + "epoch": 3.6, + "grad_norm": 0.50390625, + "learning_rate": 6.116440415167313e-06, + "loss": 0.2764, + "step": 26957 + }, + { + "epoch": 3.6, + "grad_norm": 0.50390625, + "learning_rate": 6.112430907540257e-06, + "loss": 0.3343, + "step": 26958 + }, + { + "epoch": 3.6, + "grad_norm": 0.4921875, + "learning_rate": 6.108422673088876e-06, + "loss": 0.1989, + "step": 26959 + }, + { + "epoch": 3.6, + "grad_norm": 0.671875, + "learning_rate": 6.104415711867539e-06, + "loss": 0.2375, + "step": 26960 + }, + { + "epoch": 3.6, + "grad_norm": 0.6015625, + "learning_rate": 6.100410023930592e-06, + "loss": 0.5527, + "step": 26961 + }, + { + "epoch": 3.6, + "grad_norm": 0.68359375, + "learning_rate": 6.096405609332345e-06, + "loss": 0.4775, + "step": 26962 + }, + { + "epoch": 3.6, + "grad_norm": 0.71484375, + "learning_rate": 6.092402468127112e-06, + "loss": 0.4298, + "step": 26963 + }, + { + "epoch": 3.6, + "grad_norm": 0.59765625, + "learning_rate": 6.0884006003691705e-06, + "loss": 0.3008, + "step": 26964 + }, + { + "epoch": 3.6, + "grad_norm": 0.515625, + "learning_rate": 6.084400006112778e-06, + "loss": 0.2623, + "step": 26965 + }, + { + "epoch": 3.6, + "grad_norm": 0.53515625, + "learning_rate": 6.080400685412202e-06, + "loss": 0.2449, + "step": 26966 + }, + { + "epoch": 3.6, + "grad_norm": 0.7109375, + "learning_rate": 6.0764026383216764e-06, + "loss": 0.2874, + "step": 26967 + }, + { + "epoch": 3.6, + "grad_norm": 0.66015625, + "learning_rate": 6.072405864895403e-06, + "loss": 0.3452, + "step": 26968 + }, + { + "epoch": 3.6, + "grad_norm": 0.71875, + "learning_rate": 6.068410365187583e-06, + "loss": 0.2701, + "step": 26969 + }, + { + "epoch": 3.6, + "grad_norm": 0.55078125, + "learning_rate": 6.064416139252405e-06, + "loss": 0.3745, + "step": 26970 + }, + { + "epoch": 3.6, + "grad_norm": 0.62890625, + "learning_rate": 6.060423187144049e-06, + "loss": 0.3097, + "step": 26971 + }, + { + "epoch": 3.6, + "grad_norm": 0.76171875, + "learning_rate": 6.056431508916649e-06, + "loss": 0.5514, + "step": 26972 + }, + { + "epoch": 3.6, + "grad_norm": 0.5625, + "learning_rate": 6.052441104624307e-06, + "loss": 0.1391, + "step": 26973 + }, + { + "epoch": 3.6, + "grad_norm": 0.5703125, + "learning_rate": 6.048451974321167e-06, + "loss": 0.1528, + "step": 26974 + }, + { + "epoch": 3.6, + "grad_norm": 0.65234375, + "learning_rate": 6.044464118061311e-06, + "loss": 0.3576, + "step": 26975 + }, + { + "epoch": 3.6, + "grad_norm": 0.6875, + "learning_rate": 6.040477535898836e-06, + "loss": 0.243, + "step": 26976 + }, + { + "epoch": 3.6, + "grad_norm": 0.69921875, + "learning_rate": 6.036492227887769e-06, + "loss": 0.4276, + "step": 26977 + }, + { + "epoch": 3.6, + "grad_norm": 0.66796875, + "learning_rate": 6.032508194082176e-06, + "loss": 0.5468, + "step": 26978 + }, + { + "epoch": 3.6, + "grad_norm": 0.6796875, + "learning_rate": 6.028525434536081e-06, + "loss": 0.1587, + "step": 26979 + }, + { + "epoch": 3.6, + "grad_norm": 0.52734375, + "learning_rate": 6.024543949303507e-06, + "loss": 0.1473, + "step": 26980 + }, + { + "epoch": 3.6, + "grad_norm": 0.466796875, + "learning_rate": 6.0205637384384225e-06, + "loss": 0.2608, + "step": 26981 + }, + { + "epoch": 3.6, + "grad_norm": 0.71875, + "learning_rate": 6.016584801994796e-06, + "loss": 0.4606, + "step": 26982 + }, + { + "epoch": 3.6, + "grad_norm": 0.6328125, + "learning_rate": 6.012607140026605e-06, + "loss": 0.3646, + "step": 26983 + }, + { + "epoch": 3.6, + "grad_norm": 0.53515625, + "learning_rate": 6.008630752587796e-06, + "loss": 0.3479, + "step": 26984 + }, + { + "epoch": 3.6, + "grad_norm": 0.5859375, + "learning_rate": 6.004655639732271e-06, + "loss": 0.6091, + "step": 26985 + }, + { + "epoch": 3.6, + "grad_norm": 0.6953125, + "learning_rate": 6.000681801513941e-06, + "loss": 0.2473, + "step": 26986 + }, + { + "epoch": 3.6, + "grad_norm": 0.625, + "learning_rate": 5.996709237986686e-06, + "loss": 0.2944, + "step": 26987 + }, + { + "epoch": 3.6, + "grad_norm": 0.65625, + "learning_rate": 5.992737949204385e-06, + "loss": 0.4316, + "step": 26988 + }, + { + "epoch": 3.6, + "grad_norm": 0.609375, + "learning_rate": 5.988767935220885e-06, + "loss": 0.2372, + "step": 26989 + }, + { + "epoch": 3.6, + "grad_norm": 0.77734375, + "learning_rate": 5.984799196090052e-06, + "loss": 0.3352, + "step": 26990 + }, + { + "epoch": 3.6, + "grad_norm": 0.546875, + "learning_rate": 5.9808317318656655e-06, + "loss": 0.2591, + "step": 26991 + }, + { + "epoch": 3.6, + "grad_norm": 0.66015625, + "learning_rate": 5.97686554260154e-06, + "loss": 0.1707, + "step": 26992 + }, + { + "epoch": 3.6, + "grad_norm": 0.5390625, + "learning_rate": 5.972900628351452e-06, + "loss": 0.2522, + "step": 26993 + }, + { + "epoch": 3.6, + "grad_norm": 0.703125, + "learning_rate": 5.968936989169205e-06, + "loss": 0.3355, + "step": 26994 + }, + { + "epoch": 3.6, + "grad_norm": 0.609375, + "learning_rate": 5.964974625108499e-06, + "loss": 0.2049, + "step": 26995 + }, + { + "epoch": 3.6, + "grad_norm": 0.91015625, + "learning_rate": 5.9610135362231036e-06, + "loss": 0.4444, + "step": 26996 + }, + { + "epoch": 3.6, + "grad_norm": 0.70703125, + "learning_rate": 5.957053722566708e-06, + "loss": 0.4297, + "step": 26997 + }, + { + "epoch": 3.6, + "grad_norm": 0.67578125, + "learning_rate": 5.9530951841930135e-06, + "loss": 0.4177, + "step": 26998 + }, + { + "epoch": 3.6, + "grad_norm": 0.6015625, + "learning_rate": 5.949137921155723e-06, + "loss": 0.388, + "step": 26999 + }, + { + "epoch": 3.6, + "grad_norm": 0.69140625, + "learning_rate": 5.94518193350847e-06, + "loss": 0.2647, + "step": 27000 + }, + { + "epoch": 3.6, + "grad_norm": 0.7265625, + "learning_rate": 5.941227221304924e-06, + "loss": 0.2572, + "step": 27001 + }, + { + "epoch": 3.6, + "grad_norm": 0.6484375, + "learning_rate": 5.937273784598696e-06, + "loss": 0.3919, + "step": 27002 + }, + { + "epoch": 3.6, + "grad_norm": 0.6640625, + "learning_rate": 5.9333216234434105e-06, + "loss": 0.3565, + "step": 27003 + }, + { + "epoch": 3.6, + "grad_norm": 0.578125, + "learning_rate": 5.929370737892648e-06, + "loss": 0.3001, + "step": 27004 + }, + { + "epoch": 3.6, + "grad_norm": 0.78125, + "learning_rate": 5.925421127999997e-06, + "loss": 0.5157, + "step": 27005 + }, + { + "epoch": 3.6, + "grad_norm": 0.58984375, + "learning_rate": 5.921472793819005e-06, + "loss": 0.1919, + "step": 27006 + }, + { + "epoch": 3.6, + "grad_norm": 0.640625, + "learning_rate": 5.917525735403229e-06, + "loss": 0.2446, + "step": 27007 + }, + { + "epoch": 3.6, + "grad_norm": 0.8125, + "learning_rate": 5.913579952806192e-06, + "loss": 0.5035, + "step": 27008 + }, + { + "epoch": 3.6, + "grad_norm": 0.84375, + "learning_rate": 5.909635446081408e-06, + "loss": 0.2443, + "step": 27009 + }, + { + "epoch": 3.6, + "grad_norm": 0.482421875, + "learning_rate": 5.9056922152823215e-06, + "loss": 0.2914, + "step": 27010 + }, + { + "epoch": 3.6, + "grad_norm": 0.703125, + "learning_rate": 5.9017502604624465e-06, + "loss": 0.408, + "step": 27011 + }, + { + "epoch": 3.6, + "grad_norm": 0.58984375, + "learning_rate": 5.897809581675229e-06, + "loss": 0.5025, + "step": 27012 + }, + { + "epoch": 3.6, + "grad_norm": 0.8046875, + "learning_rate": 5.893870178974126e-06, + "loss": 0.5278, + "step": 27013 + }, + { + "epoch": 3.6, + "grad_norm": 0.76171875, + "learning_rate": 5.889932052412528e-06, + "loss": 0.4824, + "step": 27014 + }, + { + "epoch": 3.6, + "grad_norm": 0.76953125, + "learning_rate": 5.885995202043848e-06, + "loss": 0.402, + "step": 27015 + }, + { + "epoch": 3.61, + "grad_norm": 0.68359375, + "learning_rate": 5.882059627921465e-06, + "loss": 0.2969, + "step": 27016 + }, + { + "epoch": 3.61, + "grad_norm": 0.64453125, + "learning_rate": 5.8781253300987825e-06, + "loss": 0.5334, + "step": 27017 + }, + { + "epoch": 3.61, + "grad_norm": 0.78515625, + "learning_rate": 5.874192308629112e-06, + "loss": 0.3867, + "step": 27018 + }, + { + "epoch": 3.61, + "grad_norm": 0.60546875, + "learning_rate": 5.870260563565799e-06, + "loss": 0.4972, + "step": 27019 + }, + { + "epoch": 3.61, + "grad_norm": 0.640625, + "learning_rate": 5.8663300949621915e-06, + "loss": 0.3921, + "step": 27020 + }, + { + "epoch": 3.61, + "grad_norm": 0.46484375, + "learning_rate": 5.862400902871534e-06, + "loss": 0.195, + "step": 27021 + }, + { + "epoch": 3.61, + "grad_norm": 0.6328125, + "learning_rate": 5.8584729873471525e-06, + "loss": 0.1861, + "step": 27022 + }, + { + "epoch": 3.61, + "grad_norm": 0.69921875, + "learning_rate": 5.854546348442291e-06, + "loss": 0.358, + "step": 27023 + }, + { + "epoch": 3.61, + "grad_norm": 0.7421875, + "learning_rate": 5.850620986210198e-06, + "loss": 0.3654, + "step": 27024 + }, + { + "epoch": 3.61, + "grad_norm": 0.51953125, + "learning_rate": 5.846696900704108e-06, + "loss": 0.2804, + "step": 27025 + }, + { + "epoch": 3.61, + "grad_norm": 0.640625, + "learning_rate": 5.8427740919772544e-06, + "loss": 0.4264, + "step": 27026 + }, + { + "epoch": 3.61, + "grad_norm": 0.486328125, + "learning_rate": 5.838852560082797e-06, + "loss": 0.2093, + "step": 27027 + }, + { + "epoch": 3.61, + "grad_norm": 0.55859375, + "learning_rate": 5.834932305073937e-06, + "loss": 0.3933, + "step": 27028 + }, + { + "epoch": 3.61, + "grad_norm": 0.6328125, + "learning_rate": 5.8310133270038204e-06, + "loss": 0.1544, + "step": 27029 + }, + { + "epoch": 3.61, + "grad_norm": 0.671875, + "learning_rate": 5.827095625925605e-06, + "loss": 0.4145, + "step": 27030 + }, + { + "epoch": 3.61, + "grad_norm": 0.462890625, + "learning_rate": 5.823179201892426e-06, + "loss": 0.2538, + "step": 27031 + }, + { + "epoch": 3.61, + "grad_norm": 0.60546875, + "learning_rate": 5.819264054957374e-06, + "loss": 0.3188, + "step": 27032 + }, + { + "epoch": 3.61, + "grad_norm": 0.58203125, + "learning_rate": 5.81535018517354e-06, + "loss": 0.4998, + "step": 27033 + }, + { + "epoch": 3.61, + "grad_norm": 0.59765625, + "learning_rate": 5.811437592594005e-06, + "loss": 0.5226, + "step": 27034 + }, + { + "epoch": 3.61, + "grad_norm": 0.671875, + "learning_rate": 5.807526277271813e-06, + "loss": 0.3363, + "step": 27035 + }, + { + "epoch": 3.61, + "grad_norm": 0.62890625, + "learning_rate": 5.803616239260046e-06, + "loss": 0.5091, + "step": 27036 + }, + { + "epoch": 3.61, + "grad_norm": 0.6640625, + "learning_rate": 5.799707478611682e-06, + "loss": 0.3576, + "step": 27037 + }, + { + "epoch": 3.61, + "grad_norm": 0.60546875, + "learning_rate": 5.795799995379736e-06, + "loss": 0.3812, + "step": 27038 + }, + { + "epoch": 3.61, + "grad_norm": 0.8828125, + "learning_rate": 5.791893789617198e-06, + "loss": 0.3398, + "step": 27039 + }, + { + "epoch": 3.61, + "grad_norm": 0.83203125, + "learning_rate": 5.787988861377069e-06, + "loss": 0.4446, + "step": 27040 + }, + { + "epoch": 3.61, + "grad_norm": 0.6328125, + "learning_rate": 5.784085210712254e-06, + "loss": 0.3279, + "step": 27041 + }, + { + "epoch": 3.61, + "grad_norm": 0.69140625, + "learning_rate": 5.7801828376757075e-06, + "loss": 0.5292, + "step": 27042 + }, + { + "epoch": 3.61, + "grad_norm": 0.484375, + "learning_rate": 5.776281742320367e-06, + "loss": 0.1702, + "step": 27043 + }, + { + "epoch": 3.61, + "grad_norm": 0.57421875, + "learning_rate": 5.772381924699122e-06, + "loss": 0.446, + "step": 27044 + }, + { + "epoch": 3.61, + "grad_norm": 0.65625, + "learning_rate": 5.7684833848648535e-06, + "loss": 0.2701, + "step": 27045 + }, + { + "epoch": 3.61, + "grad_norm": 0.416015625, + "learning_rate": 5.764586122870408e-06, + "loss": 0.1886, + "step": 27046 + }, + { + "epoch": 3.61, + "grad_norm": 0.57421875, + "learning_rate": 5.760690138768654e-06, + "loss": 0.5533, + "step": 27047 + }, + { + "epoch": 3.61, + "grad_norm": 0.5859375, + "learning_rate": 5.756795432612439e-06, + "loss": 0.1551, + "step": 27048 + }, + { + "epoch": 3.61, + "grad_norm": 0.578125, + "learning_rate": 5.752902004454563e-06, + "loss": 0.2557, + "step": 27049 + }, + { + "epoch": 3.61, + "grad_norm": 0.44140625, + "learning_rate": 5.7490098543478196e-06, + "loss": 0.1432, + "step": 27050 + }, + { + "epoch": 3.61, + "grad_norm": 0.65625, + "learning_rate": 5.745118982344977e-06, + "loss": 0.3983, + "step": 27051 + }, + { + "epoch": 3.61, + "grad_norm": 0.62109375, + "learning_rate": 5.741229388498826e-06, + "loss": 0.5055, + "step": 27052 + }, + { + "epoch": 3.61, + "grad_norm": 0.69921875, + "learning_rate": 5.737341072862113e-06, + "loss": 0.4828, + "step": 27053 + }, + { + "epoch": 3.61, + "grad_norm": 0.6953125, + "learning_rate": 5.73345403548754e-06, + "loss": 0.5066, + "step": 27054 + }, + { + "epoch": 3.61, + "grad_norm": 0.61328125, + "learning_rate": 5.729568276427821e-06, + "loss": 0.4185, + "step": 27055 + }, + { + "epoch": 3.61, + "grad_norm": 0.6875, + "learning_rate": 5.725683795735659e-06, + "loss": 0.3445, + "step": 27056 + }, + { + "epoch": 3.61, + "grad_norm": 0.48828125, + "learning_rate": 5.7218005934637556e-06, + "loss": 0.3794, + "step": 27057 + }, + { + "epoch": 3.61, + "grad_norm": 0.6796875, + "learning_rate": 5.717918669664723e-06, + "loss": 0.2412, + "step": 27058 + }, + { + "epoch": 3.61, + "grad_norm": 0.65625, + "learning_rate": 5.714038024391233e-06, + "loss": 0.3411, + "step": 27059 + }, + { + "epoch": 3.61, + "grad_norm": 0.7578125, + "learning_rate": 5.710158657695885e-06, + "loss": 0.3647, + "step": 27060 + }, + { + "epoch": 3.61, + "grad_norm": 0.58984375, + "learning_rate": 5.706280569631306e-06, + "loss": 0.175, + "step": 27061 + }, + { + "epoch": 3.61, + "grad_norm": 0.56640625, + "learning_rate": 5.7024037602500855e-06, + "loss": 0.2767, + "step": 27062 + }, + { + "epoch": 3.61, + "grad_norm": 0.65625, + "learning_rate": 5.698528229604794e-06, + "loss": 0.5007, + "step": 27063 + }, + { + "epoch": 3.61, + "grad_norm": 0.51953125, + "learning_rate": 5.694653977747966e-06, + "loss": 0.1886, + "step": 27064 + }, + { + "epoch": 3.61, + "grad_norm": 0.61328125, + "learning_rate": 5.69078100473216e-06, + "loss": 0.5284, + "step": 27065 + }, + { + "epoch": 3.61, + "grad_norm": 0.5859375, + "learning_rate": 5.686909310609889e-06, + "loss": 0.4814, + "step": 27066 + }, + { + "epoch": 3.61, + "grad_norm": 0.6875, + "learning_rate": 5.683038895433679e-06, + "loss": 0.4552, + "step": 27067 + }, + { + "epoch": 3.61, + "grad_norm": 0.7890625, + "learning_rate": 5.679169759255965e-06, + "loss": 0.4661, + "step": 27068 + }, + { + "epoch": 3.61, + "grad_norm": 0.765625, + "learning_rate": 5.675301902129271e-06, + "loss": 0.464, + "step": 27069 + }, + { + "epoch": 3.61, + "grad_norm": 0.671875, + "learning_rate": 5.6714353241060005e-06, + "loss": 0.4448, + "step": 27070 + }, + { + "epoch": 3.61, + "grad_norm": 0.546875, + "learning_rate": 5.667570025238611e-06, + "loss": 0.2, + "step": 27071 + }, + { + "epoch": 3.61, + "grad_norm": 0.435546875, + "learning_rate": 5.663706005579516e-06, + "loss": 0.0933, + "step": 27072 + }, + { + "epoch": 3.61, + "grad_norm": 0.69921875, + "learning_rate": 5.6598432651811085e-06, + "loss": 0.6832, + "step": 27073 + }, + { + "epoch": 3.61, + "grad_norm": 0.53125, + "learning_rate": 5.655981804095778e-06, + "loss": 0.2727, + "step": 27074 + }, + { + "epoch": 3.61, + "grad_norm": 0.61328125, + "learning_rate": 5.652121622375872e-06, + "loss": 0.2738, + "step": 27075 + }, + { + "epoch": 3.61, + "grad_norm": 0.7265625, + "learning_rate": 5.6482627200737716e-06, + "loss": 0.3847, + "step": 27076 + }, + { + "epoch": 3.61, + "grad_norm": 0.78125, + "learning_rate": 5.644405097241768e-06, + "loss": 0.4064, + "step": 27077 + }, + { + "epoch": 3.61, + "grad_norm": 0.50390625, + "learning_rate": 5.640548753932184e-06, + "loss": 0.3559, + "step": 27078 + }, + { + "epoch": 3.61, + "grad_norm": 0.703125, + "learning_rate": 5.636693690197325e-06, + "loss": 0.1853, + "step": 27079 + }, + { + "epoch": 3.61, + "grad_norm": 0.8359375, + "learning_rate": 5.632839906089449e-06, + "loss": 0.662, + "step": 27080 + }, + { + "epoch": 3.61, + "grad_norm": 0.64453125, + "learning_rate": 5.628987401660857e-06, + "loss": 0.4044, + "step": 27081 + }, + { + "epoch": 3.61, + "grad_norm": 0.6796875, + "learning_rate": 5.625136176963752e-06, + "loss": 0.2072, + "step": 27082 + }, + { + "epoch": 3.61, + "grad_norm": 0.62109375, + "learning_rate": 5.62128623205036e-06, + "loss": 0.3772, + "step": 27083 + }, + { + "epoch": 3.61, + "grad_norm": 0.6484375, + "learning_rate": 5.617437566972905e-06, + "loss": 0.5952, + "step": 27084 + }, + { + "epoch": 3.61, + "grad_norm": 0.58984375, + "learning_rate": 5.613590181783557e-06, + "loss": 0.1799, + "step": 27085 + }, + { + "epoch": 3.61, + "grad_norm": 0.51953125, + "learning_rate": 5.609744076534529e-06, + "loss": 0.2683, + "step": 27086 + }, + { + "epoch": 3.61, + "grad_norm": 0.6640625, + "learning_rate": 5.605899251277935e-06, + "loss": 0.2264, + "step": 27087 + }, + { + "epoch": 3.61, + "grad_norm": 0.455078125, + "learning_rate": 5.602055706065923e-06, + "loss": 0.1223, + "step": 27088 + }, + { + "epoch": 3.61, + "grad_norm": 0.5546875, + "learning_rate": 5.598213440950617e-06, + "loss": 0.2118, + "step": 27089 + }, + { + "epoch": 3.61, + "grad_norm": 0.51171875, + "learning_rate": 5.594372455984154e-06, + "loss": 0.1582, + "step": 27090 + }, + { + "epoch": 3.62, + "grad_norm": 0.63671875, + "learning_rate": 5.590532751218558e-06, + "loss": 0.4501, + "step": 27091 + }, + { + "epoch": 3.62, + "grad_norm": 0.56640625, + "learning_rate": 5.586694326705933e-06, + "loss": 0.3836, + "step": 27092 + }, + { + "epoch": 3.62, + "grad_norm": 0.6796875, + "learning_rate": 5.582857182498347e-06, + "loss": 0.4888, + "step": 27093 + }, + { + "epoch": 3.62, + "grad_norm": 0.6328125, + "learning_rate": 5.579021318647804e-06, + "loss": 0.2572, + "step": 27094 + }, + { + "epoch": 3.62, + "grad_norm": 0.765625, + "learning_rate": 5.575186735206339e-06, + "loss": 0.3336, + "step": 27095 + }, + { + "epoch": 3.62, + "grad_norm": 0.6484375, + "learning_rate": 5.571353432225923e-06, + "loss": 0.2897, + "step": 27096 + }, + { + "epoch": 3.62, + "grad_norm": 0.69140625, + "learning_rate": 5.567521409758569e-06, + "loss": 0.4365, + "step": 27097 + }, + { + "epoch": 3.62, + "grad_norm": 0.53515625, + "learning_rate": 5.563690667856225e-06, + "loss": 0.2558, + "step": 27098 + }, + { + "epoch": 3.62, + "grad_norm": 0.494140625, + "learning_rate": 5.5598612065708596e-06, + "loss": 0.2645, + "step": 27099 + }, + { + "epoch": 3.62, + "grad_norm": 0.51171875, + "learning_rate": 5.556033025954366e-06, + "loss": 0.1647, + "step": 27100 + }, + { + "epoch": 3.62, + "grad_norm": 0.70703125, + "learning_rate": 5.55220612605869e-06, + "loss": 0.4323, + "step": 27101 + }, + { + "epoch": 3.62, + "grad_norm": 0.50390625, + "learning_rate": 5.548380506935713e-06, + "loss": 0.1301, + "step": 27102 + }, + { + "epoch": 3.62, + "grad_norm": 0.6640625, + "learning_rate": 5.544556168637305e-06, + "loss": 0.5564, + "step": 27103 + }, + { + "epoch": 3.62, + "grad_norm": 0.73046875, + "learning_rate": 5.540733111215357e-06, + "loss": 0.5248, + "step": 27104 + }, + { + "epoch": 3.62, + "grad_norm": 0.6640625, + "learning_rate": 5.536911334721695e-06, + "loss": 0.2255, + "step": 27105 + }, + { + "epoch": 3.62, + "grad_norm": 0.5234375, + "learning_rate": 5.533090839208133e-06, + "loss": 0.4034, + "step": 27106 + }, + { + "epoch": 3.62, + "grad_norm": 0.48046875, + "learning_rate": 5.529271624726473e-06, + "loss": 0.2352, + "step": 27107 + }, + { + "epoch": 3.62, + "grad_norm": 0.7578125, + "learning_rate": 5.525453691328531e-06, + "loss": 0.279, + "step": 27108 + }, + { + "epoch": 3.62, + "grad_norm": 0.45703125, + "learning_rate": 5.5216370390660745e-06, + "loss": 0.1784, + "step": 27109 + }, + { + "epoch": 3.62, + "grad_norm": 0.59765625, + "learning_rate": 5.517821667990852e-06, + "loss": 0.4103, + "step": 27110 + }, + { + "epoch": 3.62, + "grad_norm": 0.66015625, + "learning_rate": 5.514007578154601e-06, + "loss": 0.4185, + "step": 27111 + }, + { + "epoch": 3.62, + "grad_norm": 0.5234375, + "learning_rate": 5.510194769609045e-06, + "loss": 0.2552, + "step": 27112 + }, + { + "epoch": 3.62, + "grad_norm": 0.51953125, + "learning_rate": 5.50638324240591e-06, + "loss": 0.2917, + "step": 27113 + }, + { + "epoch": 3.62, + "grad_norm": 0.81640625, + "learning_rate": 5.502572996596855e-06, + "loss": 0.5061, + "step": 27114 + }, + { + "epoch": 3.62, + "grad_norm": 0.6640625, + "learning_rate": 5.49876403223355e-06, + "loss": 0.4369, + "step": 27115 + }, + { + "epoch": 3.62, + "grad_norm": 0.5390625, + "learning_rate": 5.494956349367653e-06, + "loss": 0.3002, + "step": 27116 + }, + { + "epoch": 3.62, + "grad_norm": 0.53515625, + "learning_rate": 5.491149948050822e-06, + "loss": 0.2182, + "step": 27117 + }, + { + "epoch": 3.62, + "grad_norm": 0.61328125, + "learning_rate": 5.48734482833465e-06, + "loss": 0.2839, + "step": 27118 + }, + { + "epoch": 3.62, + "grad_norm": 0.734375, + "learning_rate": 5.483540990270719e-06, + "loss": 0.528, + "step": 27119 + }, + { + "epoch": 3.62, + "grad_norm": 0.447265625, + "learning_rate": 5.479738433910642e-06, + "loss": 0.1603, + "step": 27120 + }, + { + "epoch": 3.62, + "grad_norm": 0.486328125, + "learning_rate": 5.475937159305977e-06, + "loss": 0.17, + "step": 27121 + }, + { + "epoch": 3.62, + "grad_norm": 0.462890625, + "learning_rate": 5.472137166508273e-06, + "loss": 0.1656, + "step": 27122 + }, + { + "epoch": 3.62, + "grad_norm": 0.71875, + "learning_rate": 5.4683384555690444e-06, + "loss": 0.3612, + "step": 27123 + }, + { + "epoch": 3.62, + "grad_norm": 0.67578125, + "learning_rate": 5.464541026539826e-06, + "loss": 0.2137, + "step": 27124 + }, + { + "epoch": 3.62, + "grad_norm": 0.59375, + "learning_rate": 5.46074487947209e-06, + "loss": 0.3389, + "step": 27125 + }, + { + "epoch": 3.62, + "grad_norm": 1.0390625, + "learning_rate": 5.45695001441735e-06, + "loss": 0.4772, + "step": 27126 + }, + { + "epoch": 3.62, + "grad_norm": 0.63671875, + "learning_rate": 5.453156431427031e-06, + "loss": 0.4735, + "step": 27127 + }, + { + "epoch": 3.62, + "grad_norm": 0.59765625, + "learning_rate": 5.449364130552592e-06, + "loss": 0.3202, + "step": 27128 + }, + { + "epoch": 3.62, + "grad_norm": 0.58984375, + "learning_rate": 5.44557311184547e-06, + "loss": 0.3636, + "step": 27129 + }, + { + "epoch": 3.62, + "grad_norm": 0.578125, + "learning_rate": 5.441783375357046e-06, + "loss": 0.2219, + "step": 27130 + }, + { + "epoch": 3.62, + "grad_norm": 0.69140625, + "learning_rate": 5.437994921138734e-06, + "loss": 0.3752, + "step": 27131 + }, + { + "epoch": 3.62, + "grad_norm": 0.48046875, + "learning_rate": 5.434207749241904e-06, + "loss": 0.2828, + "step": 27132 + }, + { + "epoch": 3.62, + "grad_norm": 0.60546875, + "learning_rate": 5.430421859717905e-06, + "loss": 0.465, + "step": 27133 + }, + { + "epoch": 3.62, + "grad_norm": 0.59375, + "learning_rate": 5.426637252618072e-06, + "loss": 0.2637, + "step": 27134 + }, + { + "epoch": 3.62, + "grad_norm": 0.443359375, + "learning_rate": 5.422853927993732e-06, + "loss": 0.2253, + "step": 27135 + }, + { + "epoch": 3.62, + "grad_norm": 0.77734375, + "learning_rate": 5.41907188589621e-06, + "loss": 0.2437, + "step": 27136 + }, + { + "epoch": 3.62, + "grad_norm": 0.5703125, + "learning_rate": 5.415291126376764e-06, + "loss": 0.3803, + "step": 27137 + }, + { + "epoch": 3.62, + "grad_norm": 0.76171875, + "learning_rate": 5.411511649486667e-06, + "loss": 0.5238, + "step": 27138 + }, + { + "epoch": 3.62, + "grad_norm": 0.65234375, + "learning_rate": 5.407733455277186e-06, + "loss": 0.2752, + "step": 27139 + }, + { + "epoch": 3.62, + "grad_norm": 0.546875, + "learning_rate": 5.40395654379956e-06, + "loss": 0.2725, + "step": 27140 + }, + { + "epoch": 3.62, + "grad_norm": 0.6640625, + "learning_rate": 5.4001809151049795e-06, + "loss": 0.4022, + "step": 27141 + }, + { + "epoch": 3.62, + "grad_norm": 0.484375, + "learning_rate": 5.3964065692446715e-06, + "loss": 0.414, + "step": 27142 + }, + { + "epoch": 3.62, + "grad_norm": 0.57421875, + "learning_rate": 5.392633506269795e-06, + "loss": 0.1827, + "step": 27143 + }, + { + "epoch": 3.62, + "grad_norm": 0.90625, + "learning_rate": 5.388861726231531e-06, + "loss": 0.4902, + "step": 27144 + }, + { + "epoch": 3.62, + "grad_norm": 0.84375, + "learning_rate": 5.385091229181028e-06, + "loss": 0.2229, + "step": 27145 + }, + { + "epoch": 3.62, + "grad_norm": 0.65625, + "learning_rate": 5.3813220151693985e-06, + "loss": 0.4174, + "step": 27146 + }, + { + "epoch": 3.62, + "grad_norm": 0.421875, + "learning_rate": 5.377554084247771e-06, + "loss": 0.1802, + "step": 27147 + }, + { + "epoch": 3.62, + "grad_norm": 0.56640625, + "learning_rate": 5.373787436467248e-06, + "loss": 0.3802, + "step": 27148 + }, + { + "epoch": 3.62, + "grad_norm": 0.498046875, + "learning_rate": 5.3700220718788996e-06, + "loss": 0.243, + "step": 27149 + }, + { + "epoch": 3.62, + "grad_norm": 0.66015625, + "learning_rate": 5.366257990533774e-06, + "loss": 0.2691, + "step": 27150 + }, + { + "epoch": 3.62, + "grad_norm": 0.67578125, + "learning_rate": 5.36249519248293e-06, + "loss": 0.316, + "step": 27151 + }, + { + "epoch": 3.62, + "grad_norm": 0.55078125, + "learning_rate": 5.358733677777383e-06, + "loss": 0.3817, + "step": 27152 + }, + { + "epoch": 3.62, + "grad_norm": 0.6953125, + "learning_rate": 5.3549734464681475e-06, + "loss": 0.2905, + "step": 27153 + }, + { + "epoch": 3.62, + "grad_norm": 0.578125, + "learning_rate": 5.351214498606239e-06, + "loss": 0.4303, + "step": 27154 + }, + { + "epoch": 3.62, + "grad_norm": 0.79296875, + "learning_rate": 5.347456834242593e-06, + "loss": 0.6052, + "step": 27155 + }, + { + "epoch": 3.62, + "grad_norm": 0.62890625, + "learning_rate": 5.343700453428168e-06, + "loss": 0.2444, + "step": 27156 + }, + { + "epoch": 3.62, + "grad_norm": 0.68359375, + "learning_rate": 5.339945356213927e-06, + "loss": 0.6334, + "step": 27157 + }, + { + "epoch": 3.62, + "grad_norm": 0.6015625, + "learning_rate": 5.33619154265077e-06, + "loss": 0.3244, + "step": 27158 + }, + { + "epoch": 3.62, + "grad_norm": 0.55078125, + "learning_rate": 5.332439012789625e-06, + "loss": 0.3358, + "step": 27159 + }, + { + "epoch": 3.62, + "grad_norm": 0.7421875, + "learning_rate": 5.328687766681351e-06, + "loss": 0.2693, + "step": 27160 + }, + { + "epoch": 3.62, + "grad_norm": 0.703125, + "learning_rate": 5.324937804376828e-06, + "loss": 0.391, + "step": 27161 + }, + { + "epoch": 3.62, + "grad_norm": 0.57421875, + "learning_rate": 5.321189125926918e-06, + "loss": 0.4346, + "step": 27162 + }, + { + "epoch": 3.62, + "grad_norm": 0.5625, + "learning_rate": 5.317441731382456e-06, + "loss": 0.1382, + "step": 27163 + }, + { + "epoch": 3.62, + "grad_norm": 0.4296875, + "learning_rate": 5.313695620794234e-06, + "loss": 0.202, + "step": 27164 + }, + { + "epoch": 3.62, + "grad_norm": 0.7109375, + "learning_rate": 5.309950794213081e-06, + "loss": 0.5625, + "step": 27165 + }, + { + "epoch": 3.63, + "grad_norm": 0.796875, + "learning_rate": 5.3062072516897655e-06, + "loss": 0.1951, + "step": 27166 + }, + { + "epoch": 3.63, + "grad_norm": 0.62890625, + "learning_rate": 5.302464993275047e-06, + "loss": 0.3482, + "step": 27167 + }, + { + "epoch": 3.63, + "grad_norm": 0.578125, + "learning_rate": 5.298724019019697e-06, + "loss": 0.5103, + "step": 27168 + }, + { + "epoch": 3.63, + "grad_norm": 0.578125, + "learning_rate": 5.294984328974406e-06, + "loss": 0.1863, + "step": 27169 + }, + { + "epoch": 3.63, + "grad_norm": 0.79296875, + "learning_rate": 5.291245923189913e-06, + "loss": 0.3172, + "step": 27170 + }, + { + "epoch": 3.63, + "grad_norm": 0.5, + "learning_rate": 5.2875088017169116e-06, + "loss": 0.1253, + "step": 27171 + }, + { + "epoch": 3.63, + "grad_norm": 0.640625, + "learning_rate": 5.283772964606093e-06, + "loss": 0.5156, + "step": 27172 + }, + { + "epoch": 3.63, + "grad_norm": 0.5546875, + "learning_rate": 5.280038411908084e-06, + "loss": 0.3264, + "step": 27173 + }, + { + "epoch": 3.63, + "grad_norm": 0.64453125, + "learning_rate": 5.276305143673543e-06, + "loss": 0.4552, + "step": 27174 + }, + { + "epoch": 3.63, + "grad_norm": 0.640625, + "learning_rate": 5.272573159953098e-06, + "loss": 0.4398, + "step": 27175 + }, + { + "epoch": 3.63, + "grad_norm": 0.5078125, + "learning_rate": 5.268842460797363e-06, + "loss": 0.3575, + "step": 27176 + }, + { + "epoch": 3.63, + "grad_norm": 0.7109375, + "learning_rate": 5.265113046256931e-06, + "loss": 0.4753, + "step": 27177 + }, + { + "epoch": 3.63, + "grad_norm": 0.5625, + "learning_rate": 5.261384916382362e-06, + "loss": 0.2388, + "step": 27178 + }, + { + "epoch": 3.63, + "grad_norm": 0.5625, + "learning_rate": 5.2576580712242025e-06, + "loss": 0.2036, + "step": 27179 + }, + { + "epoch": 3.63, + "grad_norm": 0.6171875, + "learning_rate": 5.253932510833015e-06, + "loss": 0.396, + "step": 27180 + }, + { + "epoch": 3.63, + "grad_norm": 0.69140625, + "learning_rate": 5.250208235259302e-06, + "loss": 0.4044, + "step": 27181 + }, + { + "epoch": 3.63, + "grad_norm": 0.609375, + "learning_rate": 5.246485244553589e-06, + "loss": 0.4403, + "step": 27182 + }, + { + "epoch": 3.63, + "grad_norm": 0.55078125, + "learning_rate": 5.2427635387663375e-06, + "loss": 0.2909, + "step": 27183 + }, + { + "epoch": 3.63, + "grad_norm": 0.62109375, + "learning_rate": 5.239043117948028e-06, + "loss": 0.4249, + "step": 27184 + }, + { + "epoch": 3.63, + "grad_norm": 0.53125, + "learning_rate": 5.23532398214911e-06, + "loss": 0.3446, + "step": 27185 + }, + { + "epoch": 3.63, + "grad_norm": 0.5390625, + "learning_rate": 5.2316061314200304e-06, + "loss": 0.1462, + "step": 27186 + }, + { + "epoch": 3.63, + "grad_norm": 0.9375, + "learning_rate": 5.227889565811184e-06, + "loss": 0.5835, + "step": 27187 + }, + { + "epoch": 3.63, + "grad_norm": 0.65234375, + "learning_rate": 5.224174285372974e-06, + "loss": 0.4324, + "step": 27188 + }, + { + "epoch": 3.63, + "grad_norm": 0.494140625, + "learning_rate": 5.220460290155793e-06, + "loss": 0.4548, + "step": 27189 + }, + { + "epoch": 3.63, + "grad_norm": 0.453125, + "learning_rate": 5.216747580210013e-06, + "loss": 0.2742, + "step": 27190 + }, + { + "epoch": 3.63, + "grad_norm": 0.515625, + "learning_rate": 5.2130361555859595e-06, + "loss": 0.3387, + "step": 27191 + }, + { + "epoch": 3.63, + "grad_norm": 0.6953125, + "learning_rate": 5.2093260163339594e-06, + "loss": 0.4505, + "step": 27192 + }, + { + "epoch": 3.63, + "grad_norm": 0.59765625, + "learning_rate": 5.2056171625043395e-06, + "loss": 0.3516, + "step": 27193 + }, + { + "epoch": 3.63, + "grad_norm": 0.734375, + "learning_rate": 5.20190959414738e-06, + "loss": 0.4469, + "step": 27194 + }, + { + "epoch": 3.63, + "grad_norm": 0.68359375, + "learning_rate": 5.198203311313399e-06, + "loss": 0.1831, + "step": 27195 + }, + { + "epoch": 3.63, + "grad_norm": 0.62109375, + "learning_rate": 5.194498314052599e-06, + "loss": 0.2894, + "step": 27196 + }, + { + "epoch": 3.63, + "grad_norm": 0.466796875, + "learning_rate": 5.1907946024152385e-06, + "loss": 0.2194, + "step": 27197 + }, + { + "epoch": 3.63, + "grad_norm": 0.55078125, + "learning_rate": 5.187092176451569e-06, + "loss": 0.4993, + "step": 27198 + }, + { + "epoch": 3.63, + "grad_norm": 0.52734375, + "learning_rate": 5.183391036211782e-06, + "loss": 0.1321, + "step": 27199 + }, + { + "epoch": 3.63, + "grad_norm": 0.412109375, + "learning_rate": 5.1796911817460605e-06, + "loss": 0.251, + "step": 27200 + }, + { + "epoch": 3.63, + "grad_norm": 0.482421875, + "learning_rate": 5.175992613104574e-06, + "loss": 0.2801, + "step": 27201 + }, + { + "epoch": 3.63, + "grad_norm": 0.69140625, + "learning_rate": 5.172295330337495e-06, + "loss": 0.2776, + "step": 27202 + }, + { + "epoch": 3.63, + "grad_norm": 0.55078125, + "learning_rate": 5.16859933349495e-06, + "loss": 0.3179, + "step": 27203 + }, + { + "epoch": 3.63, + "grad_norm": 0.76171875, + "learning_rate": 5.164904622627054e-06, + "loss": 0.411, + "step": 27204 + }, + { + "epoch": 3.63, + "grad_norm": 0.53125, + "learning_rate": 5.161211197783933e-06, + "loss": 0.2779, + "step": 27205 + }, + { + "epoch": 3.63, + "grad_norm": 0.82421875, + "learning_rate": 5.157519059015636e-06, + "loss": 0.4745, + "step": 27206 + }, + { + "epoch": 3.63, + "grad_norm": 0.5234375, + "learning_rate": 5.153828206372258e-06, + "loss": 0.2294, + "step": 27207 + }, + { + "epoch": 3.63, + "grad_norm": 0.73046875, + "learning_rate": 5.150138639903834e-06, + "loss": 0.4318, + "step": 27208 + }, + { + "epoch": 3.63, + "grad_norm": 0.62109375, + "learning_rate": 5.146450359660415e-06, + "loss": 0.4366, + "step": 27209 + }, + { + "epoch": 3.63, + "grad_norm": 0.66015625, + "learning_rate": 5.142763365692005e-06, + "loss": 0.2736, + "step": 27210 + }, + { + "epoch": 3.63, + "grad_norm": 0.62890625, + "learning_rate": 5.1390776580485855e-06, + "loss": 0.3505, + "step": 27211 + }, + { + "epoch": 3.63, + "grad_norm": 0.5, + "learning_rate": 5.135393236780173e-06, + "loss": 0.1724, + "step": 27212 + }, + { + "epoch": 3.63, + "grad_norm": 0.66796875, + "learning_rate": 5.1317101019367156e-06, + "loss": 0.3501, + "step": 27213 + }, + { + "epoch": 3.63, + "grad_norm": 0.640625, + "learning_rate": 5.128028253568151e-06, + "loss": 0.3805, + "step": 27214 + }, + { + "epoch": 3.63, + "grad_norm": 0.88671875, + "learning_rate": 5.124347691724407e-06, + "loss": 0.31, + "step": 27215 + }, + { + "epoch": 3.63, + "grad_norm": 0.5078125, + "learning_rate": 5.1206684164553985e-06, + "loss": 0.3446, + "step": 27216 + }, + { + "epoch": 3.63, + "grad_norm": 0.6640625, + "learning_rate": 5.116990427811008e-06, + "loss": 0.1807, + "step": 27217 + }, + { + "epoch": 3.63, + "grad_norm": 0.62890625, + "learning_rate": 5.1133137258411515e-06, + "loss": 0.2366, + "step": 27218 + }, + { + "epoch": 3.63, + "grad_norm": 0.66796875, + "learning_rate": 5.109638310595632e-06, + "loss": 0.5947, + "step": 27219 + }, + { + "epoch": 3.63, + "grad_norm": 0.59375, + "learning_rate": 5.105964182124323e-06, + "loss": 0.404, + "step": 27220 + }, + { + "epoch": 3.63, + "grad_norm": 0.65625, + "learning_rate": 5.10229134047705e-06, + "loss": 0.1701, + "step": 27221 + }, + { + "epoch": 3.63, + "grad_norm": 0.75390625, + "learning_rate": 5.098619785703617e-06, + "loss": 0.3484, + "step": 27222 + }, + { + "epoch": 3.63, + "grad_norm": 0.625, + "learning_rate": 5.094949517853786e-06, + "loss": 0.4135, + "step": 27223 + }, + { + "epoch": 3.63, + "grad_norm": 0.484375, + "learning_rate": 5.091280536977361e-06, + "loss": 0.1987, + "step": 27224 + }, + { + "epoch": 3.63, + "grad_norm": 0.439453125, + "learning_rate": 5.087612843124079e-06, + "loss": 0.1322, + "step": 27225 + }, + { + "epoch": 3.63, + "grad_norm": 0.7109375, + "learning_rate": 5.083946436343701e-06, + "loss": 0.249, + "step": 27226 + }, + { + "epoch": 3.63, + "grad_norm": 0.7265625, + "learning_rate": 5.0802813166859085e-06, + "loss": 0.4463, + "step": 27227 + }, + { + "epoch": 3.63, + "grad_norm": 0.59765625, + "learning_rate": 5.07661748420043e-06, + "loss": 0.2124, + "step": 27228 + }, + { + "epoch": 3.63, + "grad_norm": 0.609375, + "learning_rate": 5.0729549389369245e-06, + "loss": 0.4823, + "step": 27229 + }, + { + "epoch": 3.63, + "grad_norm": 0.703125, + "learning_rate": 5.069293680945086e-06, + "loss": 0.4832, + "step": 27230 + }, + { + "epoch": 3.63, + "grad_norm": 0.498046875, + "learning_rate": 5.065633710274542e-06, + "loss": 0.2343, + "step": 27231 + }, + { + "epoch": 3.63, + "grad_norm": 0.7265625, + "learning_rate": 5.0619750269749525e-06, + "loss": 0.2776, + "step": 27232 + }, + { + "epoch": 3.63, + "grad_norm": 0.61328125, + "learning_rate": 5.058317631095888e-06, + "loss": 0.2872, + "step": 27233 + }, + { + "epoch": 3.63, + "grad_norm": 0.69140625, + "learning_rate": 5.054661522686977e-06, + "loss": 0.425, + "step": 27234 + }, + { + "epoch": 3.63, + "grad_norm": 0.7265625, + "learning_rate": 5.051006701797789e-06, + "loss": 0.359, + "step": 27235 + }, + { + "epoch": 3.63, + "grad_norm": 0.58203125, + "learning_rate": 5.047353168477908e-06, + "loss": 0.1788, + "step": 27236 + }, + { + "epoch": 3.63, + "grad_norm": 0.6796875, + "learning_rate": 5.04370092277684e-06, + "loss": 0.4339, + "step": 27237 + }, + { + "epoch": 3.63, + "grad_norm": 0.78515625, + "learning_rate": 5.040049964744132e-06, + "loss": 0.4068, + "step": 27238 + }, + { + "epoch": 3.63, + "grad_norm": 0.765625, + "learning_rate": 5.0364002944293e-06, + "loss": 0.4477, + "step": 27239 + }, + { + "epoch": 3.63, + "grad_norm": 0.58203125, + "learning_rate": 5.032751911881816e-06, + "loss": 0.3597, + "step": 27240 + }, + { + "epoch": 3.64, + "grad_norm": 0.765625, + "learning_rate": 5.029104817151187e-06, + "loss": 0.2119, + "step": 27241 + }, + { + "epoch": 3.64, + "grad_norm": 0.484375, + "learning_rate": 5.025459010286826e-06, + "loss": 0.28, + "step": 27242 + }, + { + "epoch": 3.64, + "grad_norm": 0.6953125, + "learning_rate": 5.021814491338195e-06, + "loss": 0.3472, + "step": 27243 + }, + { + "epoch": 3.64, + "grad_norm": 0.6875, + "learning_rate": 5.0181712603547205e-06, + "loss": 0.3414, + "step": 27244 + }, + { + "epoch": 3.64, + "grad_norm": 0.609375, + "learning_rate": 5.014529317385808e-06, + "loss": 0.4395, + "step": 27245 + }, + { + "epoch": 3.64, + "grad_norm": 0.8125, + "learning_rate": 5.010888662480839e-06, + "loss": 0.7386, + "step": 27246 + }, + { + "epoch": 3.64, + "grad_norm": 0.66796875, + "learning_rate": 5.0072492956891755e-06, + "loss": 0.3583, + "step": 27247 + }, + { + "epoch": 3.64, + "grad_norm": 0.82421875, + "learning_rate": 5.003611217060178e-06, + "loss": 0.477, + "step": 27248 + }, + { + "epoch": 3.64, + "grad_norm": 0.73046875, + "learning_rate": 4.999974426643172e-06, + "loss": 0.2228, + "step": 27249 + }, + { + "epoch": 3.64, + "grad_norm": 0.546875, + "learning_rate": 4.996338924487509e-06, + "loss": 0.1769, + "step": 27250 + }, + { + "epoch": 3.64, + "grad_norm": 0.703125, + "learning_rate": 4.992704710642459e-06, + "loss": 0.2856, + "step": 27251 + }, + { + "epoch": 3.64, + "grad_norm": 0.54296875, + "learning_rate": 4.9890717851572955e-06, + "loss": 0.2383, + "step": 27252 + }, + { + "epoch": 3.64, + "grad_norm": 0.52734375, + "learning_rate": 4.985440148081299e-06, + "loss": 0.3771, + "step": 27253 + }, + { + "epoch": 3.64, + "grad_norm": 0.5625, + "learning_rate": 4.981809799463711e-06, + "loss": 0.1974, + "step": 27254 + }, + { + "epoch": 3.64, + "grad_norm": 0.494140625, + "learning_rate": 4.9781807393537774e-06, + "loss": 0.2435, + "step": 27255 + }, + { + "epoch": 3.64, + "grad_norm": 0.66015625, + "learning_rate": 4.974552967800683e-06, + "loss": 0.2814, + "step": 27256 + }, + { + "epoch": 3.64, + "grad_norm": 0.65625, + "learning_rate": 4.970926484853644e-06, + "loss": 0.3532, + "step": 27257 + }, + { + "epoch": 3.64, + "grad_norm": 0.5625, + "learning_rate": 4.967301290561821e-06, + "loss": 0.3157, + "step": 27258 + }, + { + "epoch": 3.64, + "grad_norm": 0.734375, + "learning_rate": 4.963677384974408e-06, + "loss": 0.4122, + "step": 27259 + }, + { + "epoch": 3.64, + "grad_norm": 0.61328125, + "learning_rate": 4.960054768140499e-06, + "loss": 0.435, + "step": 27260 + }, + { + "epoch": 3.64, + "grad_norm": 0.5, + "learning_rate": 4.9564334401092535e-06, + "loss": 0.3156, + "step": 27261 + }, + { + "epoch": 3.64, + "grad_norm": 0.625, + "learning_rate": 4.9528134009297675e-06, + "loss": 0.3743, + "step": 27262 + }, + { + "epoch": 3.64, + "grad_norm": 0.640625, + "learning_rate": 4.949194650651145e-06, + "loss": 0.1915, + "step": 27263 + }, + { + "epoch": 3.64, + "grad_norm": 0.84375, + "learning_rate": 4.9455771893224365e-06, + "loss": 0.5878, + "step": 27264 + }, + { + "epoch": 3.64, + "grad_norm": 0.71484375, + "learning_rate": 4.941961016992703e-06, + "loss": 0.4581, + "step": 27265 + }, + { + "epoch": 3.64, + "grad_norm": 0.498046875, + "learning_rate": 4.938346133710991e-06, + "loss": 0.3924, + "step": 27266 + }, + { + "epoch": 3.64, + "grad_norm": 0.78515625, + "learning_rate": 4.93473253952631e-06, + "loss": 0.3465, + "step": 27267 + }, + { + "epoch": 3.64, + "grad_norm": 0.546875, + "learning_rate": 4.931120234487674e-06, + "loss": 0.1533, + "step": 27268 + }, + { + "epoch": 3.64, + "grad_norm": 0.65234375, + "learning_rate": 4.927509218644055e-06, + "loss": 0.5641, + "step": 27269 + }, + { + "epoch": 3.64, + "grad_norm": 0.69140625, + "learning_rate": 4.923899492044437e-06, + "loss": 0.1801, + "step": 27270 + }, + { + "epoch": 3.64, + "grad_norm": 0.6640625, + "learning_rate": 4.920291054737747e-06, + "loss": 0.332, + "step": 27271 + }, + { + "epoch": 3.64, + "grad_norm": 0.703125, + "learning_rate": 4.916683906772956e-06, + "loss": 0.4925, + "step": 27272 + }, + { + "epoch": 3.64, + "grad_norm": 0.69921875, + "learning_rate": 4.913078048198938e-06, + "loss": 0.2191, + "step": 27273 + }, + { + "epoch": 3.64, + "grad_norm": 0.57421875, + "learning_rate": 4.909473479064608e-06, + "loss": 0.3079, + "step": 27274 + }, + { + "epoch": 3.64, + "grad_norm": 0.66015625, + "learning_rate": 4.90587019941886e-06, + "loss": 0.1776, + "step": 27275 + }, + { + "epoch": 3.64, + "grad_norm": 0.64453125, + "learning_rate": 4.902268209310523e-06, + "loss": 0.3482, + "step": 27276 + }, + { + "epoch": 3.64, + "grad_norm": 0.68359375, + "learning_rate": 4.898667508788468e-06, + "loss": 0.4974, + "step": 27277 + }, + { + "epoch": 3.64, + "grad_norm": 0.59375, + "learning_rate": 4.895068097901534e-06, + "loss": 0.2801, + "step": 27278 + }, + { + "epoch": 3.64, + "grad_norm": 0.66796875, + "learning_rate": 4.891469976698493e-06, + "loss": 0.2835, + "step": 27279 + }, + { + "epoch": 3.64, + "grad_norm": 0.68359375, + "learning_rate": 4.887873145228173e-06, + "loss": 0.4517, + "step": 27280 + }, + { + "epoch": 3.64, + "grad_norm": 0.80859375, + "learning_rate": 4.884277603539322e-06, + "loss": 0.4588, + "step": 27281 + }, + { + "epoch": 3.64, + "grad_norm": 0.51953125, + "learning_rate": 4.8806833516807375e-06, + "loss": 0.2419, + "step": 27282 + }, + { + "epoch": 3.64, + "grad_norm": 0.57421875, + "learning_rate": 4.877090389701111e-06, + "loss": 0.3057, + "step": 27283 + }, + { + "epoch": 3.64, + "grad_norm": 0.52734375, + "learning_rate": 4.873498717649183e-06, + "loss": 0.2055, + "step": 27284 + }, + { + "epoch": 3.64, + "grad_norm": 0.625, + "learning_rate": 4.869908335573681e-06, + "loss": 0.4354, + "step": 27285 + }, + { + "epoch": 3.64, + "grad_norm": 0.7734375, + "learning_rate": 4.8663192435232765e-06, + "loss": 0.645, + "step": 27286 + }, + { + "epoch": 3.64, + "grad_norm": 0.5546875, + "learning_rate": 4.862731441546642e-06, + "loss": 0.3365, + "step": 27287 + }, + { + "epoch": 3.64, + "grad_norm": 0.53515625, + "learning_rate": 4.859144929692416e-06, + "loss": 0.3677, + "step": 27288 + }, + { + "epoch": 3.64, + "grad_norm": 0.578125, + "learning_rate": 4.855559708009249e-06, + "loss": 0.3207, + "step": 27289 + }, + { + "epoch": 3.64, + "grad_norm": 0.53125, + "learning_rate": 4.851975776545747e-06, + "loss": 0.2678, + "step": 27290 + }, + { + "epoch": 3.64, + "grad_norm": 0.8359375, + "learning_rate": 4.848393135350537e-06, + "loss": 0.4148, + "step": 27291 + }, + { + "epoch": 3.64, + "grad_norm": 0.81640625, + "learning_rate": 4.844811784472169e-06, + "loss": 0.5255, + "step": 27292 + }, + { + "epoch": 3.64, + "grad_norm": 0.6328125, + "learning_rate": 4.841231723959228e-06, + "loss": 0.3078, + "step": 27293 + }, + { + "epoch": 3.64, + "grad_norm": 0.6015625, + "learning_rate": 4.83765295386025e-06, + "loss": 0.1761, + "step": 27294 + }, + { + "epoch": 3.64, + "grad_norm": 0.5234375, + "learning_rate": 4.834075474223798e-06, + "loss": 0.3409, + "step": 27295 + }, + { + "epoch": 3.64, + "grad_norm": 0.68359375, + "learning_rate": 4.830499285098333e-06, + "loss": 0.3796, + "step": 27296 + }, + { + "epoch": 3.64, + "grad_norm": 0.80078125, + "learning_rate": 4.8269243865323836e-06, + "loss": 0.3109, + "step": 27297 + }, + { + "epoch": 3.64, + "grad_norm": 0.58984375, + "learning_rate": 4.823350778574421e-06, + "loss": 0.2878, + "step": 27298 + }, + { + "epoch": 3.64, + "grad_norm": 0.4609375, + "learning_rate": 4.819778461272917e-06, + "loss": 0.1924, + "step": 27299 + }, + { + "epoch": 3.64, + "grad_norm": 0.71875, + "learning_rate": 4.816207434676301e-06, + "loss": 0.5214, + "step": 27300 + }, + { + "epoch": 3.64, + "grad_norm": 0.58203125, + "learning_rate": 4.812637698833e-06, + "loss": 0.4785, + "step": 27301 + }, + { + "epoch": 3.64, + "grad_norm": 0.46875, + "learning_rate": 4.8090692537914096e-06, + "loss": 0.3064, + "step": 27302 + }, + { + "epoch": 3.64, + "grad_norm": 0.65625, + "learning_rate": 4.805502099599946e-06, + "loss": 0.4642, + "step": 27303 + }, + { + "epoch": 3.64, + "grad_norm": 0.71875, + "learning_rate": 4.801936236306958e-06, + "loss": 0.3531, + "step": 27304 + }, + { + "epoch": 3.64, + "grad_norm": 0.50390625, + "learning_rate": 4.7983716639608326e-06, + "loss": 0.3761, + "step": 27305 + }, + { + "epoch": 3.64, + "grad_norm": 0.6640625, + "learning_rate": 4.7948083826098726e-06, + "loss": 0.2844, + "step": 27306 + }, + { + "epoch": 3.64, + "grad_norm": 0.78515625, + "learning_rate": 4.7912463923024066e-06, + "loss": 0.1851, + "step": 27307 + }, + { + "epoch": 3.64, + "grad_norm": 0.79296875, + "learning_rate": 4.787685693086752e-06, + "loss": 0.6017, + "step": 27308 + }, + { + "epoch": 3.64, + "grad_norm": 0.6796875, + "learning_rate": 4.784126285011193e-06, + "loss": 0.2722, + "step": 27309 + }, + { + "epoch": 3.64, + "grad_norm": 0.66015625, + "learning_rate": 4.780568168123978e-06, + "loss": 0.3819, + "step": 27310 + }, + { + "epoch": 3.64, + "grad_norm": 0.73046875, + "learning_rate": 4.777011342473392e-06, + "loss": 0.6622, + "step": 27311 + }, + { + "epoch": 3.64, + "grad_norm": 0.42578125, + "learning_rate": 4.773455808107619e-06, + "loss": 0.1786, + "step": 27312 + }, + { + "epoch": 3.64, + "grad_norm": 0.5859375, + "learning_rate": 4.769901565074908e-06, + "loss": 0.2773, + "step": 27313 + }, + { + "epoch": 3.64, + "grad_norm": 0.7421875, + "learning_rate": 4.766348613423466e-06, + "loss": 0.6143, + "step": 27314 + }, + { + "epoch": 3.64, + "grad_norm": 0.671875, + "learning_rate": 4.762796953201443e-06, + "loss": 0.3167, + "step": 27315 + }, + { + "epoch": 3.65, + "grad_norm": 0.46875, + "learning_rate": 4.759246584457011e-06, + "loss": 0.3177, + "step": 27316 + }, + { + "epoch": 3.65, + "grad_norm": 0.458984375, + "learning_rate": 4.755697507238333e-06, + "loss": 0.2796, + "step": 27317 + }, + { + "epoch": 3.65, + "grad_norm": 0.54296875, + "learning_rate": 4.752149721593524e-06, + "loss": 0.2329, + "step": 27318 + }, + { + "epoch": 3.65, + "grad_norm": 0.69921875, + "learning_rate": 4.7486032275706915e-06, + "loss": 0.579, + "step": 27319 + }, + { + "epoch": 3.65, + "grad_norm": 0.703125, + "learning_rate": 4.745058025217919e-06, + "loss": 0.4887, + "step": 27320 + }, + { + "epoch": 3.65, + "grad_norm": 0.69140625, + "learning_rate": 4.741514114583301e-06, + "loss": 0.3164, + "step": 27321 + }, + { + "epoch": 3.65, + "grad_norm": 0.71875, + "learning_rate": 4.737971495714888e-06, + "loss": 0.3921, + "step": 27322 + }, + { + "epoch": 3.65, + "grad_norm": 0.74609375, + "learning_rate": 4.734430168660731e-06, + "loss": 0.3461, + "step": 27323 + }, + { + "epoch": 3.65, + "grad_norm": 0.79296875, + "learning_rate": 4.730890133468846e-06, + "loss": 0.4017, + "step": 27324 + }, + { + "epoch": 3.65, + "grad_norm": 0.62109375, + "learning_rate": 4.727351390187218e-06, + "loss": 0.3783, + "step": 27325 + }, + { + "epoch": 3.65, + "grad_norm": 0.58203125, + "learning_rate": 4.723813938863852e-06, + "loss": 0.3813, + "step": 27326 + }, + { + "epoch": 3.65, + "grad_norm": 0.76953125, + "learning_rate": 4.720277779546722e-06, + "loss": 0.3622, + "step": 27327 + }, + { + "epoch": 3.65, + "grad_norm": 0.455078125, + "learning_rate": 4.716742912283778e-06, + "loss": 0.2699, + "step": 27328 + }, + { + "epoch": 3.65, + "grad_norm": 0.6015625, + "learning_rate": 4.713209337122948e-06, + "loss": 0.3502, + "step": 27329 + }, + { + "epoch": 3.65, + "grad_norm": 0.69921875, + "learning_rate": 4.70967705411215e-06, + "loss": 0.3328, + "step": 27330 + }, + { + "epoch": 3.65, + "grad_norm": 0.55859375, + "learning_rate": 4.7061460632992886e-06, + "loss": 0.2969, + "step": 27331 + }, + { + "epoch": 3.65, + "grad_norm": 0.57421875, + "learning_rate": 4.702616364732259e-06, + "loss": 0.2706, + "step": 27332 + }, + { + "epoch": 3.65, + "grad_norm": 0.72265625, + "learning_rate": 4.699087958458903e-06, + "loss": 0.4367, + "step": 27333 + }, + { + "epoch": 3.65, + "grad_norm": 0.43359375, + "learning_rate": 4.695560844527069e-06, + "loss": 0.2198, + "step": 27334 + }, + { + "epoch": 3.65, + "grad_norm": 0.6484375, + "learning_rate": 4.692035022984609e-06, + "loss": 0.2269, + "step": 27335 + }, + { + "epoch": 3.65, + "grad_norm": 0.5546875, + "learning_rate": 4.688510493879328e-06, + "loss": 0.3054, + "step": 27336 + }, + { + "epoch": 3.65, + "grad_norm": 0.5859375, + "learning_rate": 4.684987257259011e-06, + "loss": 0.2291, + "step": 27337 + }, + { + "epoch": 3.65, + "grad_norm": 0.5234375, + "learning_rate": 4.681465313171429e-06, + "loss": 0.167, + "step": 27338 + }, + { + "epoch": 3.65, + "grad_norm": 0.5703125, + "learning_rate": 4.677944661664357e-06, + "loss": 0.3359, + "step": 27339 + }, + { + "epoch": 3.65, + "grad_norm": 0.8203125, + "learning_rate": 4.674425302785534e-06, + "loss": 0.2348, + "step": 27340 + }, + { + "epoch": 3.65, + "grad_norm": 0.7265625, + "learning_rate": 4.670907236582689e-06, + "loss": 0.5228, + "step": 27341 + }, + { + "epoch": 3.65, + "grad_norm": 0.76953125, + "learning_rate": 4.667390463103516e-06, + "loss": 0.4032, + "step": 27342 + }, + { + "epoch": 3.65, + "grad_norm": 0.578125, + "learning_rate": 4.6638749823957105e-06, + "loss": 0.462, + "step": 27343 + }, + { + "epoch": 3.65, + "grad_norm": 0.486328125, + "learning_rate": 4.660360794506946e-06, + "loss": 0.3206, + "step": 27344 + }, + { + "epoch": 3.65, + "grad_norm": 0.59765625, + "learning_rate": 4.656847899484884e-06, + "loss": 0.413, + "step": 27345 + }, + { + "epoch": 3.65, + "grad_norm": 0.59375, + "learning_rate": 4.653336297377153e-06, + "loss": 0.3325, + "step": 27346 + }, + { + "epoch": 3.65, + "grad_norm": 0.56640625, + "learning_rate": 4.649825988231371e-06, + "loss": 0.2014, + "step": 27347 + }, + { + "epoch": 3.65, + "grad_norm": 0.53125, + "learning_rate": 4.646316972095155e-06, + "loss": 0.3056, + "step": 27348 + }, + { + "epoch": 3.65, + "grad_norm": 0.66796875, + "learning_rate": 4.642809249016067e-06, + "loss": 0.4046, + "step": 27349 + }, + { + "epoch": 3.65, + "grad_norm": 0.6171875, + "learning_rate": 4.639302819041691e-06, + "loss": 0.534, + "step": 27350 + }, + { + "epoch": 3.65, + "grad_norm": 0.65234375, + "learning_rate": 4.6357976822195785e-06, + "loss": 0.3976, + "step": 27351 + }, + { + "epoch": 3.65, + "grad_norm": 0.5546875, + "learning_rate": 4.632293838597246e-06, + "loss": 0.4517, + "step": 27352 + }, + { + "epoch": 3.65, + "grad_norm": 0.6171875, + "learning_rate": 4.628791288222201e-06, + "loss": 0.4233, + "step": 27353 + }, + { + "epoch": 3.65, + "grad_norm": 0.5234375, + "learning_rate": 4.625290031141971e-06, + "loss": 0.4202, + "step": 27354 + }, + { + "epoch": 3.65, + "grad_norm": 0.55859375, + "learning_rate": 4.6217900674040305e-06, + "loss": 0.204, + "step": 27355 + }, + { + "epoch": 3.65, + "grad_norm": 0.62109375, + "learning_rate": 4.618291397055807e-06, + "loss": 0.4173, + "step": 27356 + }, + { + "epoch": 3.65, + "grad_norm": 0.484375, + "learning_rate": 4.614794020144775e-06, + "loss": 0.191, + "step": 27357 + }, + { + "epoch": 3.65, + "grad_norm": 0.80078125, + "learning_rate": 4.61129793671835e-06, + "loss": 0.7271, + "step": 27358 + }, + { + "epoch": 3.65, + "grad_norm": 0.56640625, + "learning_rate": 4.6078031468239635e-06, + "loss": 0.3294, + "step": 27359 + }, + { + "epoch": 3.65, + "grad_norm": 0.64453125, + "learning_rate": 4.6043096505089865e-06, + "loss": 0.2783, + "step": 27360 + }, + { + "epoch": 3.65, + "grad_norm": 0.78125, + "learning_rate": 4.600817447820782e-06, + "loss": 0.4731, + "step": 27361 + }, + { + "epoch": 3.65, + "grad_norm": 0.546875, + "learning_rate": 4.597326538806712e-06, + "loss": 0.3402, + "step": 27362 + }, + { + "epoch": 3.65, + "grad_norm": 0.57421875, + "learning_rate": 4.593836923514128e-06, + "loss": 0.337, + "step": 27363 + }, + { + "epoch": 3.65, + "grad_norm": 0.64453125, + "learning_rate": 4.590348601990368e-06, + "loss": 0.2778, + "step": 27364 + }, + { + "epoch": 3.65, + "grad_norm": 0.66796875, + "learning_rate": 4.586861574282697e-06, + "loss": 0.2749, + "step": 27365 + }, + { + "epoch": 3.65, + "grad_norm": 0.486328125, + "learning_rate": 4.583375840438409e-06, + "loss": 0.2185, + "step": 27366 + }, + { + "epoch": 3.65, + "grad_norm": 0.69140625, + "learning_rate": 4.579891400504788e-06, + "loss": 0.5513, + "step": 27367 + }, + { + "epoch": 3.65, + "grad_norm": 0.546875, + "learning_rate": 4.576408254529096e-06, + "loss": 0.3632, + "step": 27368 + }, + { + "epoch": 3.65, + "grad_norm": 0.57421875, + "learning_rate": 4.572926402558542e-06, + "loss": 0.2673, + "step": 27369 + }, + { + "epoch": 3.65, + "grad_norm": 0.490234375, + "learning_rate": 4.569445844640341e-06, + "loss": 0.2017, + "step": 27370 + }, + { + "epoch": 3.65, + "grad_norm": 0.64453125, + "learning_rate": 4.565966580821712e-06, + "loss": 0.2422, + "step": 27371 + }, + { + "epoch": 3.65, + "grad_norm": 0.56640625, + "learning_rate": 4.5624886111498285e-06, + "loss": 0.277, + "step": 27372 + }, + { + "epoch": 3.65, + "grad_norm": 0.6015625, + "learning_rate": 4.559011935671842e-06, + "loss": 0.4429, + "step": 27373 + }, + { + "epoch": 3.65, + "grad_norm": 0.703125, + "learning_rate": 4.555536554434913e-06, + "loss": 0.28, + "step": 27374 + }, + { + "epoch": 3.65, + "grad_norm": 0.369140625, + "learning_rate": 4.5520624674861625e-06, + "loss": 0.1368, + "step": 27375 + }, + { + "epoch": 3.65, + "grad_norm": 0.5546875, + "learning_rate": 4.548589674872694e-06, + "loss": 0.1317, + "step": 27376 + }, + { + "epoch": 3.65, + "grad_norm": 0.625, + "learning_rate": 4.545118176641616e-06, + "loss": 0.3507, + "step": 27377 + }, + { + "epoch": 3.65, + "grad_norm": 0.5703125, + "learning_rate": 4.541647972840013e-06, + "loss": 0.2466, + "step": 27378 + }, + { + "epoch": 3.65, + "grad_norm": 0.52734375, + "learning_rate": 4.538179063514914e-06, + "loss": 0.3398, + "step": 27379 + }, + { + "epoch": 3.65, + "grad_norm": 0.66796875, + "learning_rate": 4.5347114487133694e-06, + "loss": 0.182, + "step": 27380 + }, + { + "epoch": 3.65, + "grad_norm": 0.62109375, + "learning_rate": 4.531245128482408e-06, + "loss": 0.3119, + "step": 27381 + }, + { + "epoch": 3.65, + "grad_norm": 0.478515625, + "learning_rate": 4.52778010286905e-06, + "loss": 0.3945, + "step": 27382 + }, + { + "epoch": 3.65, + "grad_norm": 0.6640625, + "learning_rate": 4.524316371920246e-06, + "loss": 0.4801, + "step": 27383 + }, + { + "epoch": 3.65, + "grad_norm": 0.52734375, + "learning_rate": 4.5208539356830114e-06, + "loss": 0.1271, + "step": 27384 + }, + { + "epoch": 3.65, + "grad_norm": 0.48828125, + "learning_rate": 4.517392794204256e-06, + "loss": 0.328, + "step": 27385 + }, + { + "epoch": 3.65, + "grad_norm": 0.6484375, + "learning_rate": 4.513932947530941e-06, + "loss": 0.4339, + "step": 27386 + }, + { + "epoch": 3.65, + "grad_norm": 0.671875, + "learning_rate": 4.5104743957099845e-06, + "loss": 0.3185, + "step": 27387 + }, + { + "epoch": 3.65, + "grad_norm": 0.6640625, + "learning_rate": 4.507017138788272e-06, + "loss": 0.5038, + "step": 27388 + }, + { + "epoch": 3.65, + "grad_norm": 0.625, + "learning_rate": 4.503561176812688e-06, + "loss": 0.3705, + "step": 27389 + }, + { + "epoch": 3.65, + "grad_norm": 0.439453125, + "learning_rate": 4.500106509830104e-06, + "loss": 0.2615, + "step": 27390 + }, + { + "epoch": 3.66, + "grad_norm": 0.67578125, + "learning_rate": 4.496653137887386e-06, + "loss": 0.1482, + "step": 27391 + }, + { + "epoch": 3.66, + "grad_norm": 0.408203125, + "learning_rate": 4.493201061031327e-06, + "loss": 0.1495, + "step": 27392 + }, + { + "epoch": 3.66, + "grad_norm": 0.515625, + "learning_rate": 4.489750279308757e-06, + "loss": 0.3393, + "step": 27393 + }, + { + "epoch": 3.66, + "grad_norm": 0.5234375, + "learning_rate": 4.486300792766473e-06, + "loss": 0.1576, + "step": 27394 + }, + { + "epoch": 3.66, + "grad_norm": 0.765625, + "learning_rate": 4.4828526014512485e-06, + "loss": 0.1722, + "step": 27395 + }, + { + "epoch": 3.66, + "grad_norm": 0.5703125, + "learning_rate": 4.479405705409878e-06, + "loss": 0.2305, + "step": 27396 + }, + { + "epoch": 3.66, + "grad_norm": 0.54296875, + "learning_rate": 4.475960104689037e-06, + "loss": 0.4847, + "step": 27397 + }, + { + "epoch": 3.66, + "grad_norm": 0.4765625, + "learning_rate": 4.472515799335486e-06, + "loss": 0.1611, + "step": 27398 + }, + { + "epoch": 3.66, + "grad_norm": 0.6640625, + "learning_rate": 4.469072789395934e-06, + "loss": 0.2266, + "step": 27399 + }, + { + "epoch": 3.66, + "grad_norm": 0.69140625, + "learning_rate": 4.465631074917065e-06, + "loss": 0.297, + "step": 27400 + }, + { + "epoch": 3.66, + "grad_norm": 0.609375, + "learning_rate": 4.462190655945564e-06, + "loss": 0.5261, + "step": 27401 + }, + { + "epoch": 3.66, + "grad_norm": 0.57421875, + "learning_rate": 4.45875153252806e-06, + "loss": 0.4333, + "step": 27402 + }, + { + "epoch": 3.66, + "grad_norm": 0.484375, + "learning_rate": 4.455313704711206e-06, + "loss": 0.1347, + "step": 27403 + }, + { + "epoch": 3.66, + "grad_norm": 0.76171875, + "learning_rate": 4.451877172541619e-06, + "loss": 0.3883, + "step": 27404 + }, + { + "epoch": 3.66, + "grad_norm": 0.55078125, + "learning_rate": 4.448441936065906e-06, + "loss": 0.4128, + "step": 27405 + }, + { + "epoch": 3.66, + "grad_norm": 0.486328125, + "learning_rate": 4.445007995330641e-06, + "loss": 0.2301, + "step": 27406 + }, + { + "epoch": 3.66, + "grad_norm": 0.39453125, + "learning_rate": 4.441575350382388e-06, + "loss": 0.1594, + "step": 27407 + }, + { + "epoch": 3.66, + "grad_norm": 0.51953125, + "learning_rate": 4.438144001267697e-06, + "loss": 0.3042, + "step": 27408 + }, + { + "epoch": 3.66, + "grad_norm": 0.83203125, + "learning_rate": 4.434713948033131e-06, + "loss": 0.5718, + "step": 27409 + }, + { + "epoch": 3.66, + "grad_norm": 0.7109375, + "learning_rate": 4.431285190725165e-06, + "loss": 0.3784, + "step": 27410 + }, + { + "epoch": 3.66, + "grad_norm": 0.66015625, + "learning_rate": 4.427857729390306e-06, + "loss": 0.2287, + "step": 27411 + }, + { + "epoch": 3.66, + "grad_norm": 0.79296875, + "learning_rate": 4.424431564075027e-06, + "loss": 0.7319, + "step": 27412 + }, + { + "epoch": 3.66, + "grad_norm": 0.640625, + "learning_rate": 4.4210066948257915e-06, + "loss": 0.2674, + "step": 27413 + }, + { + "epoch": 3.66, + "grad_norm": 0.412109375, + "learning_rate": 4.417583121689062e-06, + "loss": 0.1821, + "step": 27414 + }, + { + "epoch": 3.66, + "grad_norm": 0.671875, + "learning_rate": 4.414160844711246e-06, + "loss": 0.6455, + "step": 27415 + }, + { + "epoch": 3.66, + "grad_norm": 0.7890625, + "learning_rate": 4.410739863938751e-06, + "loss": 0.7245, + "step": 27416 + }, + { + "epoch": 3.66, + "grad_norm": 0.64453125, + "learning_rate": 4.407320179417962e-06, + "loss": 0.392, + "step": 27417 + }, + { + "epoch": 3.66, + "grad_norm": 0.62890625, + "learning_rate": 4.4039017911952865e-06, + "loss": 0.4354, + "step": 27418 + }, + { + "epoch": 3.66, + "grad_norm": 0.56640625, + "learning_rate": 4.400484699317042e-06, + "loss": 0.2252, + "step": 27419 + }, + { + "epoch": 3.66, + "grad_norm": 0.384765625, + "learning_rate": 4.397068903829582e-06, + "loss": 0.1691, + "step": 27420 + }, + { + "epoch": 3.66, + "grad_norm": 0.69921875, + "learning_rate": 4.393654404779235e-06, + "loss": 0.5642, + "step": 27421 + }, + { + "epoch": 3.66, + "grad_norm": 0.81640625, + "learning_rate": 4.390241202212275e-06, + "loss": 0.2532, + "step": 27422 + }, + { + "epoch": 3.66, + "grad_norm": 0.7421875, + "learning_rate": 4.386829296175021e-06, + "loss": 0.3263, + "step": 27423 + }, + { + "epoch": 3.66, + "grad_norm": 0.57421875, + "learning_rate": 4.383418686713725e-06, + "loss": 0.1577, + "step": 27424 + }, + { + "epoch": 3.66, + "grad_norm": 0.5234375, + "learning_rate": 4.380009373874627e-06, + "loss": 0.3164, + "step": 27425 + }, + { + "epoch": 3.66, + "grad_norm": 0.73046875, + "learning_rate": 4.3766013577039796e-06, + "loss": 0.3072, + "step": 27426 + }, + { + "epoch": 3.66, + "grad_norm": 0.67578125, + "learning_rate": 4.373194638247979e-06, + "loss": 0.3913, + "step": 27427 + }, + { + "epoch": 3.66, + "grad_norm": 0.61328125, + "learning_rate": 4.3697892155528444e-06, + "loss": 0.3342, + "step": 27428 + }, + { + "epoch": 3.66, + "grad_norm": 0.57421875, + "learning_rate": 4.366385089664738e-06, + "loss": 0.3398, + "step": 27429 + }, + { + "epoch": 3.66, + "grad_norm": 0.58984375, + "learning_rate": 4.362982260629822e-06, + "loss": 0.3238, + "step": 27430 + }, + { + "epoch": 3.66, + "grad_norm": 0.46484375, + "learning_rate": 4.359580728494251e-06, + "loss": 0.1816, + "step": 27431 + }, + { + "epoch": 3.66, + "grad_norm": 0.546875, + "learning_rate": 4.356180493304152e-06, + "loss": 0.3886, + "step": 27432 + }, + { + "epoch": 3.66, + "grad_norm": 0.7265625, + "learning_rate": 4.352781555105634e-06, + "loss": 0.6485, + "step": 27433 + }, + { + "epoch": 3.66, + "grad_norm": 0.953125, + "learning_rate": 4.349383913944771e-06, + "loss": 0.2209, + "step": 27434 + }, + { + "epoch": 3.66, + "grad_norm": 0.77734375, + "learning_rate": 4.3459875698676486e-06, + "loss": 0.2891, + "step": 27435 + }, + { + "epoch": 3.66, + "grad_norm": 0.5703125, + "learning_rate": 4.3425925229203304e-06, + "loss": 0.3867, + "step": 27436 + }, + { + "epoch": 3.66, + "grad_norm": 0.7734375, + "learning_rate": 4.339198773148856e-06, + "loss": 0.4681, + "step": 27437 + }, + { + "epoch": 3.66, + "grad_norm": 0.83984375, + "learning_rate": 4.3358063205992336e-06, + "loss": 0.4735, + "step": 27438 + }, + { + "epoch": 3.66, + "grad_norm": 0.53515625, + "learning_rate": 4.332415165317471e-06, + "loss": 0.264, + "step": 27439 + }, + { + "epoch": 3.66, + "grad_norm": 0.498046875, + "learning_rate": 4.329025307349554e-06, + "loss": 0.1961, + "step": 27440 + }, + { + "epoch": 3.66, + "grad_norm": 0.66015625, + "learning_rate": 4.32563674674148e-06, + "loss": 0.4159, + "step": 27441 + }, + { + "epoch": 3.66, + "grad_norm": 0.51953125, + "learning_rate": 4.322249483539153e-06, + "loss": 0.2066, + "step": 27442 + }, + { + "epoch": 3.66, + "grad_norm": 0.640625, + "learning_rate": 4.318863517788529e-06, + "loss": 0.4024, + "step": 27443 + }, + { + "epoch": 3.66, + "grad_norm": 0.51953125, + "learning_rate": 4.315478849535526e-06, + "loss": 0.2663, + "step": 27444 + }, + { + "epoch": 3.66, + "grad_norm": 0.71484375, + "learning_rate": 4.312095478826051e-06, + "loss": 0.317, + "step": 27445 + }, + { + "epoch": 3.66, + "grad_norm": 0.640625, + "learning_rate": 4.3087134057059555e-06, + "loss": 0.1787, + "step": 27446 + }, + { + "epoch": 3.66, + "grad_norm": 0.9140625, + "learning_rate": 4.3053326302211375e-06, + "loss": 0.2888, + "step": 27447 + }, + { + "epoch": 3.66, + "grad_norm": 0.671875, + "learning_rate": 4.301953152417415e-06, + "loss": 0.3762, + "step": 27448 + }, + { + "epoch": 3.66, + "grad_norm": 0.6484375, + "learning_rate": 4.29857497234063e-06, + "loss": 0.3473, + "step": 27449 + }, + { + "epoch": 3.66, + "grad_norm": 0.83203125, + "learning_rate": 4.295198090036579e-06, + "loss": 0.3246, + "step": 27450 + }, + { + "epoch": 3.66, + "grad_norm": 0.78125, + "learning_rate": 4.2918225055510796e-06, + "loss": 0.2288, + "step": 27451 + }, + { + "epoch": 3.66, + "grad_norm": 0.546875, + "learning_rate": 4.288448218929886e-06, + "loss": 0.283, + "step": 27452 + }, + { + "epoch": 3.66, + "grad_norm": 0.625, + "learning_rate": 4.28507523021876e-06, + "loss": 0.3236, + "step": 27453 + }, + { + "epoch": 3.66, + "grad_norm": 0.59765625, + "learning_rate": 4.281703539463444e-06, + "loss": 0.3427, + "step": 27454 + }, + { + "epoch": 3.66, + "grad_norm": 0.76953125, + "learning_rate": 4.278333146709668e-06, + "loss": 0.5983, + "step": 27455 + }, + { + "epoch": 3.66, + "grad_norm": 0.431640625, + "learning_rate": 4.2749640520031164e-06, + "loss": 0.177, + "step": 27456 + }, + { + "epoch": 3.66, + "grad_norm": 0.6953125, + "learning_rate": 4.271596255389498e-06, + "loss": 0.6086, + "step": 27457 + }, + { + "epoch": 3.66, + "grad_norm": 0.55078125, + "learning_rate": 4.268229756914466e-06, + "loss": 0.2354, + "step": 27458 + }, + { + "epoch": 3.66, + "grad_norm": 0.435546875, + "learning_rate": 4.264864556623671e-06, + "loss": 0.2644, + "step": 27459 + }, + { + "epoch": 3.66, + "grad_norm": 0.5390625, + "learning_rate": 4.261500654562778e-06, + "loss": 0.4193, + "step": 27460 + }, + { + "epoch": 3.66, + "grad_norm": 0.6171875, + "learning_rate": 4.258138050777361e-06, + "loss": 0.2916, + "step": 27461 + }, + { + "epoch": 3.66, + "grad_norm": 0.5625, + "learning_rate": 4.254776745313038e-06, + "loss": 0.4362, + "step": 27462 + }, + { + "epoch": 3.66, + "grad_norm": 0.578125, + "learning_rate": 4.251416738215397e-06, + "loss": 0.4846, + "step": 27463 + }, + { + "epoch": 3.66, + "grad_norm": 0.51953125, + "learning_rate": 4.248058029530011e-06, + "loss": 0.2872, + "step": 27464 + }, + { + "epoch": 3.66, + "grad_norm": 0.53515625, + "learning_rate": 4.244700619302388e-06, + "loss": 0.3636, + "step": 27465 + }, + { + "epoch": 3.67, + "grad_norm": 0.53515625, + "learning_rate": 4.241344507578082e-06, + "loss": 0.3012, + "step": 27466 + }, + { + "epoch": 3.67, + "grad_norm": 0.80859375, + "learning_rate": 4.237989694402611e-06, + "loss": 0.5156, + "step": 27467 + }, + { + "epoch": 3.67, + "grad_norm": 0.58203125, + "learning_rate": 4.234636179821449e-06, + "loss": 0.175, + "step": 27468 + }, + { + "epoch": 3.67, + "grad_norm": 0.68359375, + "learning_rate": 4.231283963880117e-06, + "loss": 0.458, + "step": 27469 + }, + { + "epoch": 3.67, + "grad_norm": 0.59765625, + "learning_rate": 4.2279330466240105e-06, + "loss": 0.4746, + "step": 27470 + }, + { + "epoch": 3.67, + "grad_norm": 0.74609375, + "learning_rate": 4.224583428098594e-06, + "loss": 0.3338, + "step": 27471 + }, + { + "epoch": 3.67, + "grad_norm": 0.48046875, + "learning_rate": 4.221235108349297e-06, + "loss": 0.3971, + "step": 27472 + }, + { + "epoch": 3.67, + "grad_norm": 0.6640625, + "learning_rate": 4.217888087421518e-06, + "loss": 0.3669, + "step": 27473 + }, + { + "epoch": 3.67, + "grad_norm": 0.796875, + "learning_rate": 4.214542365360663e-06, + "loss": 0.1949, + "step": 27474 + }, + { + "epoch": 3.67, + "grad_norm": 0.4609375, + "learning_rate": 4.211197942212086e-06, + "loss": 0.238, + "step": 27475 + }, + { + "epoch": 3.67, + "grad_norm": 0.53515625, + "learning_rate": 4.207854818021129e-06, + "loss": 0.1547, + "step": 27476 + }, + { + "epoch": 3.67, + "grad_norm": 0.61328125, + "learning_rate": 4.204512992833143e-06, + "loss": 0.2605, + "step": 27477 + }, + { + "epoch": 3.67, + "grad_norm": 0.71875, + "learning_rate": 4.2011724666934595e-06, + "loss": 0.28, + "step": 27478 + }, + { + "epoch": 3.67, + "grad_norm": 0.7578125, + "learning_rate": 4.197833239647342e-06, + "loss": 0.4785, + "step": 27479 + }, + { + "epoch": 3.67, + "grad_norm": 0.56640625, + "learning_rate": 4.194495311740098e-06, + "loss": 0.3061, + "step": 27480 + }, + { + "epoch": 3.67, + "grad_norm": 0.578125, + "learning_rate": 4.191158683016994e-06, + "loss": 0.4336, + "step": 27481 + }, + { + "epoch": 3.67, + "grad_norm": 0.72265625, + "learning_rate": 4.1878233535232456e-06, + "loss": 0.3437, + "step": 27482 + }, + { + "epoch": 3.67, + "grad_norm": 0.578125, + "learning_rate": 4.18448932330413e-06, + "loss": 0.3375, + "step": 27483 + }, + { + "epoch": 3.67, + "grad_norm": 0.478515625, + "learning_rate": 4.181156592404811e-06, + "loss": 0.1322, + "step": 27484 + }, + { + "epoch": 3.67, + "grad_norm": 0.625, + "learning_rate": 4.1778251608705185e-06, + "loss": 0.1556, + "step": 27485 + }, + { + "epoch": 3.67, + "grad_norm": 0.60546875, + "learning_rate": 4.174495028746406e-06, + "loss": 0.2074, + "step": 27486 + }, + { + "epoch": 3.67, + "grad_norm": 0.77734375, + "learning_rate": 4.171166196077647e-06, + "loss": 0.4858, + "step": 27487 + }, + { + "epoch": 3.67, + "grad_norm": 0.78125, + "learning_rate": 4.167838662909374e-06, + "loss": 0.3382, + "step": 27488 + }, + { + "epoch": 3.67, + "grad_norm": 0.51953125, + "learning_rate": 4.164512429286715e-06, + "loss": 0.2409, + "step": 27489 + }, + { + "epoch": 3.67, + "grad_norm": 0.8515625, + "learning_rate": 4.16118749525477e-06, + "loss": 0.566, + "step": 27490 + }, + { + "epoch": 3.67, + "grad_norm": 0.53515625, + "learning_rate": 4.157863860858646e-06, + "loss": 0.2022, + "step": 27491 + }, + { + "epoch": 3.67, + "grad_norm": 0.5390625, + "learning_rate": 4.154541526143385e-06, + "loss": 0.1835, + "step": 27492 + }, + { + "epoch": 3.67, + "grad_norm": 0.65234375, + "learning_rate": 4.151220491154051e-06, + "loss": 0.3861, + "step": 27493 + }, + { + "epoch": 3.67, + "grad_norm": 0.58984375, + "learning_rate": 4.147900755935696e-06, + "loss": 0.26, + "step": 27494 + }, + { + "epoch": 3.67, + "grad_norm": 0.7109375, + "learning_rate": 4.144582320533319e-06, + "loss": 0.6597, + "step": 27495 + }, + { + "epoch": 3.67, + "grad_norm": 0.62890625, + "learning_rate": 4.141265184991916e-06, + "loss": 0.2804, + "step": 27496 + }, + { + "epoch": 3.67, + "grad_norm": 0.546875, + "learning_rate": 4.137949349356496e-06, + "loss": 0.1506, + "step": 27497 + }, + { + "epoch": 3.67, + "grad_norm": 0.41015625, + "learning_rate": 4.1346348136720005e-06, + "loss": 0.2018, + "step": 27498 + }, + { + "epoch": 3.67, + "grad_norm": 0.578125, + "learning_rate": 4.131321577983372e-06, + "loss": 0.242, + "step": 27499 + }, + { + "epoch": 3.67, + "grad_norm": 0.30078125, + "learning_rate": 4.1280096423355616e-06, + "loss": 0.1101, + "step": 27500 + }, + { + "epoch": 3.67, + "grad_norm": 0.80078125, + "learning_rate": 4.1246990067734695e-06, + "loss": 0.4235, + "step": 27501 + }, + { + "epoch": 3.67, + "grad_norm": 0.466796875, + "learning_rate": 4.121389671341991e-06, + "loss": 0.323, + "step": 27502 + }, + { + "epoch": 3.67, + "grad_norm": 0.6953125, + "learning_rate": 4.118081636085991e-06, + "loss": 0.3745, + "step": 27503 + }, + { + "epoch": 3.67, + "grad_norm": 0.7734375, + "learning_rate": 4.1147749010503556e-06, + "loss": 0.6295, + "step": 27504 + }, + { + "epoch": 3.67, + "grad_norm": 0.5078125, + "learning_rate": 4.111469466279916e-06, + "loss": 0.4262, + "step": 27505 + }, + { + "epoch": 3.67, + "grad_norm": 0.609375, + "learning_rate": 4.108165331819491e-06, + "loss": 0.3913, + "step": 27506 + }, + { + "epoch": 3.67, + "grad_norm": 0.462890625, + "learning_rate": 4.1048624977138684e-06, + "loss": 0.1793, + "step": 27507 + }, + { + "epoch": 3.67, + "grad_norm": 0.5625, + "learning_rate": 4.1015609640078665e-06, + "loss": 0.1891, + "step": 27508 + }, + { + "epoch": 3.67, + "grad_norm": 0.474609375, + "learning_rate": 4.098260730746239e-06, + "loss": 0.2252, + "step": 27509 + }, + { + "epoch": 3.67, + "grad_norm": 0.50390625, + "learning_rate": 4.094961797973762e-06, + "loss": 0.2193, + "step": 27510 + }, + { + "epoch": 3.67, + "grad_norm": 0.55859375, + "learning_rate": 4.091664165735132e-06, + "loss": 0.4147, + "step": 27511 + }, + { + "epoch": 3.67, + "grad_norm": 0.72265625, + "learning_rate": 4.088367834075102e-06, + "loss": 0.2156, + "step": 27512 + }, + { + "epoch": 3.67, + "grad_norm": 0.66015625, + "learning_rate": 4.0850728030383475e-06, + "loss": 0.3423, + "step": 27513 + }, + { + "epoch": 3.67, + "grad_norm": 0.4140625, + "learning_rate": 4.08177907266959e-06, + "loss": 0.1628, + "step": 27514 + }, + { + "epoch": 3.67, + "grad_norm": 0.87109375, + "learning_rate": 4.078486643013446e-06, + "loss": 0.4388, + "step": 27515 + }, + { + "epoch": 3.67, + "grad_norm": 0.482421875, + "learning_rate": 4.075195514114593e-06, + "loss": 0.3653, + "step": 27516 + }, + { + "epoch": 3.67, + "grad_norm": 0.4140625, + "learning_rate": 4.071905686017641e-06, + "loss": 0.1334, + "step": 27517 + }, + { + "epoch": 3.67, + "grad_norm": 0.640625, + "learning_rate": 4.068617158767241e-06, + "loss": 0.2283, + "step": 27518 + }, + { + "epoch": 3.67, + "grad_norm": 0.58984375, + "learning_rate": 4.065329932407935e-06, + "loss": 0.3699, + "step": 27519 + }, + { + "epoch": 3.67, + "grad_norm": 0.5859375, + "learning_rate": 4.062044006984345e-06, + "loss": 0.3607, + "step": 27520 + }, + { + "epoch": 3.67, + "grad_norm": 0.458984375, + "learning_rate": 4.058759382540999e-06, + "loss": 0.3092, + "step": 27521 + }, + { + "epoch": 3.67, + "grad_norm": 0.474609375, + "learning_rate": 4.055476059122443e-06, + "loss": 0.2484, + "step": 27522 + }, + { + "epoch": 3.67, + "grad_norm": 0.72265625, + "learning_rate": 4.052194036773216e-06, + "loss": 0.3004, + "step": 27523 + }, + { + "epoch": 3.67, + "grad_norm": 0.80078125, + "learning_rate": 4.048913315537827e-06, + "loss": 0.3609, + "step": 27524 + }, + { + "epoch": 3.67, + "grad_norm": 0.56640625, + "learning_rate": 4.045633895460743e-06, + "loss": 0.1857, + "step": 27525 + }, + { + "epoch": 3.67, + "grad_norm": 0.53515625, + "learning_rate": 4.0423557765864485e-06, + "loss": 0.3559, + "step": 27526 + }, + { + "epoch": 3.67, + "grad_norm": 0.5859375, + "learning_rate": 4.039078958959386e-06, + "loss": 0.2568, + "step": 27527 + }, + { + "epoch": 3.67, + "grad_norm": 0.412109375, + "learning_rate": 4.03580344262402e-06, + "loss": 0.2819, + "step": 27528 + }, + { + "epoch": 3.67, + "grad_norm": 0.609375, + "learning_rate": 4.032529227624737e-06, + "loss": 0.434, + "step": 27529 + }, + { + "epoch": 3.67, + "grad_norm": 0.9453125, + "learning_rate": 4.029256314005958e-06, + "loss": 0.3637, + "step": 27530 + }, + { + "epoch": 3.67, + "grad_norm": 0.79296875, + "learning_rate": 4.025984701812047e-06, + "loss": 0.4866, + "step": 27531 + }, + { + "epoch": 3.67, + "grad_norm": 0.7421875, + "learning_rate": 4.022714391087379e-06, + "loss": 0.6616, + "step": 27532 + }, + { + "epoch": 3.67, + "grad_norm": 0.36328125, + "learning_rate": 4.0194453818763074e-06, + "loss": 0.1424, + "step": 27533 + }, + { + "epoch": 3.67, + "grad_norm": 0.67578125, + "learning_rate": 4.016177674223154e-06, + "loss": 0.2479, + "step": 27534 + }, + { + "epoch": 3.67, + "grad_norm": 0.53515625, + "learning_rate": 4.0129112681722255e-06, + "loss": 0.2419, + "step": 27535 + }, + { + "epoch": 3.67, + "grad_norm": 0.5546875, + "learning_rate": 4.009646163767833e-06, + "loss": 0.3794, + "step": 27536 + }, + { + "epoch": 3.67, + "grad_norm": 0.65625, + "learning_rate": 4.006382361054251e-06, + "loss": 0.2813, + "step": 27537 + }, + { + "epoch": 3.67, + "grad_norm": 0.70703125, + "learning_rate": 4.003119860075722e-06, + "loss": 0.4534, + "step": 27538 + }, + { + "epoch": 3.67, + "grad_norm": 0.51171875, + "learning_rate": 3.999858660876499e-06, + "loss": 0.3729, + "step": 27539 + }, + { + "epoch": 3.67, + "grad_norm": 0.59765625, + "learning_rate": 3.996598763500803e-06, + "loss": 0.3255, + "step": 27540 + }, + { + "epoch": 3.68, + "grad_norm": 0.79296875, + "learning_rate": 3.993340167992843e-06, + "loss": 0.2109, + "step": 27541 + }, + { + "epoch": 3.68, + "grad_norm": 0.66015625, + "learning_rate": 3.990082874396828e-06, + "loss": 0.5061, + "step": 27542 + }, + { + "epoch": 3.68, + "grad_norm": 0.5859375, + "learning_rate": 3.986826882756889e-06, + "loss": 0.412, + "step": 27543 + }, + { + "epoch": 3.68, + "grad_norm": 0.63671875, + "learning_rate": 3.983572193117191e-06, + "loss": 0.3934, + "step": 27544 + }, + { + "epoch": 3.68, + "grad_norm": 0.62109375, + "learning_rate": 3.980318805521888e-06, + "loss": 0.3182, + "step": 27545 + }, + { + "epoch": 3.68, + "grad_norm": 0.64453125, + "learning_rate": 3.977066720015077e-06, + "loss": 0.322, + "step": 27546 + }, + { + "epoch": 3.68, + "grad_norm": 0.5390625, + "learning_rate": 3.973815936640879e-06, + "loss": 0.2265, + "step": 27547 + }, + { + "epoch": 3.68, + "grad_norm": 0.765625, + "learning_rate": 3.970566455443359e-06, + "loss": 0.2466, + "step": 27548 + }, + { + "epoch": 3.68, + "grad_norm": 0.6953125, + "learning_rate": 3.967318276466591e-06, + "loss": 0.329, + "step": 27549 + }, + { + "epoch": 3.68, + "grad_norm": 0.62109375, + "learning_rate": 3.964071399754621e-06, + "loss": 0.1661, + "step": 27550 + }, + { + "epoch": 3.68, + "grad_norm": 0.62109375, + "learning_rate": 3.960825825351488e-06, + "loss": 0.1834, + "step": 27551 + }, + { + "epoch": 3.68, + "grad_norm": 0.66015625, + "learning_rate": 3.9575815533011815e-06, + "loss": 0.3147, + "step": 27552 + }, + { + "epoch": 3.68, + "grad_norm": 0.55859375, + "learning_rate": 3.9543385836477094e-06, + "loss": 0.3594, + "step": 27553 + }, + { + "epoch": 3.68, + "grad_norm": 1.5078125, + "learning_rate": 3.951096916435059e-06, + "loss": 0.4287, + "step": 27554 + }, + { + "epoch": 3.68, + "grad_norm": 0.73828125, + "learning_rate": 3.947856551707174e-06, + "loss": 0.229, + "step": 27555 + }, + { + "epoch": 3.68, + "grad_norm": 0.490234375, + "learning_rate": 3.944617489508007e-06, + "loss": 0.3089, + "step": 27556 + }, + { + "epoch": 3.68, + "grad_norm": 0.57421875, + "learning_rate": 3.941379729881456e-06, + "loss": 0.2978, + "step": 27557 + }, + { + "epoch": 3.68, + "grad_norm": 0.765625, + "learning_rate": 3.938143272871453e-06, + "loss": 0.2472, + "step": 27558 + }, + { + "epoch": 3.68, + "grad_norm": 0.703125, + "learning_rate": 3.934908118521874e-06, + "loss": 0.2023, + "step": 27559 + }, + { + "epoch": 3.68, + "grad_norm": 0.470703125, + "learning_rate": 3.931674266876617e-06, + "loss": 0.3092, + "step": 27560 + }, + { + "epoch": 3.68, + "grad_norm": 0.734375, + "learning_rate": 3.928441717979492e-06, + "loss": 0.4846, + "step": 27561 + }, + { + "epoch": 3.68, + "grad_norm": 0.53515625, + "learning_rate": 3.925210471874352e-06, + "loss": 0.1378, + "step": 27562 + }, + { + "epoch": 3.68, + "grad_norm": 0.67578125, + "learning_rate": 3.921980528605029e-06, + "loss": 0.2731, + "step": 27563 + }, + { + "epoch": 3.68, + "grad_norm": 0.71875, + "learning_rate": 3.918751888215322e-06, + "loss": 0.4146, + "step": 27564 + }, + { + "epoch": 3.68, + "grad_norm": 0.474609375, + "learning_rate": 3.915524550748983e-06, + "loss": 0.1624, + "step": 27565 + }, + { + "epoch": 3.68, + "grad_norm": 0.734375, + "learning_rate": 3.912298516249824e-06, + "loss": 0.5178, + "step": 27566 + }, + { + "epoch": 3.68, + "grad_norm": 0.625, + "learning_rate": 3.909073784761541e-06, + "loss": 0.4044, + "step": 27567 + }, + { + "epoch": 3.68, + "grad_norm": 0.453125, + "learning_rate": 3.9058503563279e-06, + "loss": 0.1004, + "step": 27568 + }, + { + "epoch": 3.68, + "grad_norm": 0.71875, + "learning_rate": 3.902628230992589e-06, + "loss": 0.384, + "step": 27569 + }, + { + "epoch": 3.68, + "grad_norm": 0.61328125, + "learning_rate": 3.899407408799338e-06, + "loss": 0.3007, + "step": 27570 + }, + { + "epoch": 3.68, + "grad_norm": 0.5703125, + "learning_rate": 3.896187889791781e-06, + "loss": 0.3007, + "step": 27571 + }, + { + "epoch": 3.68, + "grad_norm": 0.72265625, + "learning_rate": 3.892969674013591e-06, + "loss": 0.4256, + "step": 27572 + }, + { + "epoch": 3.68, + "grad_norm": 0.59375, + "learning_rate": 3.889752761508414e-06, + "loss": 0.1951, + "step": 27573 + }, + { + "epoch": 3.68, + "grad_norm": 0.5234375, + "learning_rate": 3.886537152319891e-06, + "loss": 0.1554, + "step": 27574 + }, + { + "epoch": 3.68, + "grad_norm": 0.6171875, + "learning_rate": 3.883322846491599e-06, + "loss": 0.339, + "step": 27575 + }, + { + "epoch": 3.68, + "grad_norm": 0.66796875, + "learning_rate": 3.880109844067126e-06, + "loss": 0.3666, + "step": 27576 + }, + { + "epoch": 3.68, + "grad_norm": 0.6640625, + "learning_rate": 3.876898145090058e-06, + "loss": 0.5762, + "step": 27577 + }, + { + "epoch": 3.68, + "grad_norm": 0.76953125, + "learning_rate": 3.87368774960396e-06, + "loss": 0.3227, + "step": 27578 + }, + { + "epoch": 3.68, + "grad_norm": 0.6484375, + "learning_rate": 3.870478657652344e-06, + "loss": 0.3204, + "step": 27579 + }, + { + "epoch": 3.68, + "grad_norm": 0.703125, + "learning_rate": 3.8672708692787275e-06, + "loss": 0.478, + "step": 27580 + }, + { + "epoch": 3.68, + "grad_norm": 0.78125, + "learning_rate": 3.864064384526611e-06, + "loss": 0.2612, + "step": 27581 + }, + { + "epoch": 3.68, + "grad_norm": 0.78515625, + "learning_rate": 3.8608592034394704e-06, + "loss": 0.3129, + "step": 27582 + }, + { + "epoch": 3.68, + "grad_norm": 0.59765625, + "learning_rate": 3.857655326060805e-06, + "loss": 0.4013, + "step": 27583 + }, + { + "epoch": 3.68, + "grad_norm": 0.419921875, + "learning_rate": 3.854452752434024e-06, + "loss": 0.1217, + "step": 27584 + }, + { + "epoch": 3.68, + "grad_norm": 0.4765625, + "learning_rate": 3.851251482602569e-06, + "loss": 0.165, + "step": 27585 + }, + { + "epoch": 3.68, + "grad_norm": 0.55078125, + "learning_rate": 3.848051516609852e-06, + "loss": 0.2299, + "step": 27586 + }, + { + "epoch": 3.68, + "grad_norm": 0.7578125, + "learning_rate": 3.8448528544992816e-06, + "loss": 0.4118, + "step": 27587 + }, + { + "epoch": 3.68, + "grad_norm": 0.625, + "learning_rate": 3.8416554963142004e-06, + "loss": 0.1655, + "step": 27588 + }, + { + "epoch": 3.68, + "grad_norm": 0.609375, + "learning_rate": 3.838459442097985e-06, + "loss": 0.1642, + "step": 27589 + }, + { + "epoch": 3.68, + "grad_norm": 0.7734375, + "learning_rate": 3.835264691893981e-06, + "loss": 0.3963, + "step": 27590 + }, + { + "epoch": 3.68, + "grad_norm": 0.60546875, + "learning_rate": 3.832071245745506e-06, + "loss": 0.2125, + "step": 27591 + }, + { + "epoch": 3.68, + "grad_norm": 0.55078125, + "learning_rate": 3.82887910369586e-06, + "loss": 0.22, + "step": 27592 + }, + { + "epoch": 3.68, + "grad_norm": 0.58203125, + "learning_rate": 3.825688265788352e-06, + "loss": 0.2038, + "step": 27593 + }, + { + "epoch": 3.68, + "grad_norm": 0.64453125, + "learning_rate": 3.822498732066215e-06, + "loss": 0.5344, + "step": 27594 + }, + { + "epoch": 3.68, + "grad_norm": 0.5703125, + "learning_rate": 3.819310502572726e-06, + "loss": 0.2451, + "step": 27595 + }, + { + "epoch": 3.68, + "grad_norm": 0.484375, + "learning_rate": 3.816123577351116e-06, + "loss": 0.2322, + "step": 27596 + }, + { + "epoch": 3.68, + "grad_norm": 0.6484375, + "learning_rate": 3.8129379564446178e-06, + "loss": 0.2839, + "step": 27597 + }, + { + "epoch": 3.68, + "grad_norm": 0.53515625, + "learning_rate": 3.8097536398963963e-06, + "loss": 0.2257, + "step": 27598 + }, + { + "epoch": 3.68, + "grad_norm": 0.490234375, + "learning_rate": 3.8065706277496504e-06, + "loss": 0.183, + "step": 27599 + }, + { + "epoch": 3.68, + "grad_norm": 0.50390625, + "learning_rate": 3.803388920047535e-06, + "loss": 0.33, + "step": 27600 + }, + { + "epoch": 3.68, + "grad_norm": 0.6875, + "learning_rate": 3.800208516833226e-06, + "loss": 0.2663, + "step": 27601 + }, + { + "epoch": 3.68, + "grad_norm": 0.51171875, + "learning_rate": 3.7970294181498224e-06, + "loss": 0.278, + "step": 27602 + }, + { + "epoch": 3.68, + "grad_norm": 0.48828125, + "learning_rate": 3.7938516240404455e-06, + "loss": 0.1923, + "step": 27603 + }, + { + "epoch": 3.68, + "grad_norm": 0.73046875, + "learning_rate": 3.7906751345481715e-06, + "loss": 0.2782, + "step": 27604 + }, + { + "epoch": 3.68, + "grad_norm": 0.65625, + "learning_rate": 3.787499949716089e-06, + "loss": 0.2118, + "step": 27605 + }, + { + "epoch": 3.68, + "grad_norm": 0.63671875, + "learning_rate": 3.784326069587274e-06, + "loss": 0.2817, + "step": 27606 + }, + { + "epoch": 3.68, + "grad_norm": 0.6640625, + "learning_rate": 3.7811534942047367e-06, + "loss": 0.4088, + "step": 27607 + }, + { + "epoch": 3.68, + "grad_norm": 0.61328125, + "learning_rate": 3.777982223611509e-06, + "loss": 0.2074, + "step": 27608 + }, + { + "epoch": 3.68, + "grad_norm": 0.8828125, + "learning_rate": 3.7748122578506016e-06, + "loss": 0.4994, + "step": 27609 + }, + { + "epoch": 3.68, + "grad_norm": 0.58984375, + "learning_rate": 3.771643596965002e-06, + "loss": 0.3328, + "step": 27610 + }, + { + "epoch": 3.68, + "grad_norm": 0.609375, + "learning_rate": 3.768476240997665e-06, + "loss": 0.4241, + "step": 27611 + }, + { + "epoch": 3.68, + "grad_norm": 0.52734375, + "learning_rate": 3.7653101899915446e-06, + "loss": 0.2302, + "step": 27612 + }, + { + "epoch": 3.68, + "grad_norm": 0.76171875, + "learning_rate": 3.7621454439895955e-06, + "loss": 0.3351, + "step": 27613 + }, + { + "epoch": 3.68, + "grad_norm": 0.47265625, + "learning_rate": 3.7589820030347054e-06, + "loss": 0.1956, + "step": 27614 + }, + { + "epoch": 3.68, + "grad_norm": 0.490234375, + "learning_rate": 3.7558198671698185e-06, + "loss": 0.1333, + "step": 27615 + }, + { + "epoch": 3.69, + "grad_norm": 0.67578125, + "learning_rate": 3.7526590364377557e-06, + "loss": 0.3831, + "step": 27616 + }, + { + "epoch": 3.69, + "grad_norm": 0.5859375, + "learning_rate": 3.7494995108814047e-06, + "loss": 0.175, + "step": 27617 + }, + { + "epoch": 3.69, + "grad_norm": 0.59765625, + "learning_rate": 3.7463412905436203e-06, + "loss": 0.3323, + "step": 27618 + }, + { + "epoch": 3.69, + "grad_norm": 0.609375, + "learning_rate": 3.7431843754672236e-06, + "loss": 0.5921, + "step": 27619 + }, + { + "epoch": 3.69, + "grad_norm": 0.734375, + "learning_rate": 3.740028765695025e-06, + "loss": 0.2191, + "step": 27620 + }, + { + "epoch": 3.69, + "grad_norm": 0.63671875, + "learning_rate": 3.7368744612698124e-06, + "loss": 0.3455, + "step": 27621 + }, + { + "epoch": 3.69, + "grad_norm": 0.625, + "learning_rate": 3.733721462234363e-06, + "loss": 0.5012, + "step": 27622 + }, + { + "epoch": 3.69, + "grad_norm": 0.6015625, + "learning_rate": 3.7305697686314312e-06, + "loss": 0.309, + "step": 27623 + }, + { + "epoch": 3.69, + "grad_norm": 0.79296875, + "learning_rate": 3.7274193805037717e-06, + "loss": 0.6697, + "step": 27624 + }, + { + "epoch": 3.69, + "grad_norm": 0.69921875, + "learning_rate": 3.7242702978940837e-06, + "loss": 0.301, + "step": 27625 + }, + { + "epoch": 3.69, + "grad_norm": 0.75, + "learning_rate": 3.7211225208450774e-06, + "loss": 0.4131, + "step": 27626 + }, + { + "epoch": 3.69, + "grad_norm": 0.8203125, + "learning_rate": 3.717976049399452e-06, + "loss": 0.3538, + "step": 27627 + }, + { + "epoch": 3.69, + "grad_norm": 0.58984375, + "learning_rate": 3.7148308835998623e-06, + "loss": 0.2985, + "step": 27628 + }, + { + "epoch": 3.69, + "grad_norm": 0.60546875, + "learning_rate": 3.7116870234889633e-06, + "loss": 0.2918, + "step": 27629 + }, + { + "epoch": 3.69, + "grad_norm": 0.76171875, + "learning_rate": 3.708544469109376e-06, + "loss": 0.4368, + "step": 27630 + }, + { + "epoch": 3.69, + "grad_norm": 0.66015625, + "learning_rate": 3.7054032205037224e-06, + "loss": 0.3673, + "step": 27631 + }, + { + "epoch": 3.69, + "grad_norm": 0.5546875, + "learning_rate": 3.7022632777146017e-06, + "loss": 0.3354, + "step": 27632 + }, + { + "epoch": 3.69, + "grad_norm": 0.75390625, + "learning_rate": 3.6991246407846124e-06, + "loss": 0.3098, + "step": 27633 + }, + { + "epoch": 3.69, + "grad_norm": 0.458984375, + "learning_rate": 3.695987309756277e-06, + "loss": 0.1918, + "step": 27634 + }, + { + "epoch": 3.69, + "grad_norm": 0.60546875, + "learning_rate": 3.692851284672172e-06, + "loss": 0.6006, + "step": 27635 + }, + { + "epoch": 3.69, + "grad_norm": 0.6171875, + "learning_rate": 3.689716565574808e-06, + "loss": 0.4277, + "step": 27636 + }, + { + "epoch": 3.69, + "grad_norm": 0.48046875, + "learning_rate": 3.6865831525067064e-06, + "loss": 0.2749, + "step": 27637 + }, + { + "epoch": 3.69, + "grad_norm": 0.51171875, + "learning_rate": 3.6834510455103443e-06, + "loss": 0.3737, + "step": 27638 + }, + { + "epoch": 3.69, + "grad_norm": 0.5390625, + "learning_rate": 3.6803202446282214e-06, + "loss": 0.5195, + "step": 27639 + }, + { + "epoch": 3.69, + "grad_norm": 0.578125, + "learning_rate": 3.677190749902748e-06, + "loss": 0.1957, + "step": 27640 + }, + { + "epoch": 3.69, + "grad_norm": 0.333984375, + "learning_rate": 3.6740625613763902e-06, + "loss": 0.1616, + "step": 27641 + }, + { + "epoch": 3.69, + "grad_norm": 0.59375, + "learning_rate": 3.6709356790915696e-06, + "loss": 0.3749, + "step": 27642 + }, + { + "epoch": 3.69, + "grad_norm": 0.625, + "learning_rate": 3.6678101030906962e-06, + "loss": 0.253, + "step": 27643 + }, + { + "epoch": 3.69, + "grad_norm": 0.71875, + "learning_rate": 3.664685833416137e-06, + "loss": 0.4728, + "step": 27644 + }, + { + "epoch": 3.69, + "grad_norm": 0.494140625, + "learning_rate": 3.661562870110258e-06, + "loss": 0.1975, + "step": 27645 + }, + { + "epoch": 3.69, + "grad_norm": 0.7265625, + "learning_rate": 3.6584412132154135e-06, + "loss": 0.6687, + "step": 27646 + }, + { + "epoch": 3.69, + "grad_norm": 0.71875, + "learning_rate": 3.6553208627739586e-06, + "loss": 0.4804, + "step": 27647 + }, + { + "epoch": 3.69, + "grad_norm": 0.80859375, + "learning_rate": 3.65220181882816e-06, + "loss": 0.2507, + "step": 27648 + }, + { + "epoch": 3.69, + "grad_norm": 0.5078125, + "learning_rate": 3.6490840814203508e-06, + "loss": 0.4331, + "step": 27649 + }, + { + "epoch": 3.69, + "grad_norm": 0.56640625, + "learning_rate": 3.6459676505927965e-06, + "loss": 0.6014, + "step": 27650 + }, + { + "epoch": 3.69, + "grad_norm": 0.66796875, + "learning_rate": 3.6428525263877745e-06, + "loss": 0.2276, + "step": 27651 + }, + { + "epoch": 3.69, + "grad_norm": 0.62890625, + "learning_rate": 3.6397387088475177e-06, + "loss": 0.2201, + "step": 27652 + }, + { + "epoch": 3.69, + "grad_norm": 0.75390625, + "learning_rate": 3.636626198014226e-06, + "loss": 0.3528, + "step": 27653 + }, + { + "epoch": 3.69, + "grad_norm": 0.73828125, + "learning_rate": 3.6335149939301426e-06, + "loss": 0.349, + "step": 27654 + }, + { + "epoch": 3.69, + "grad_norm": 0.92578125, + "learning_rate": 3.630405096637435e-06, + "loss": 0.1953, + "step": 27655 + }, + { + "epoch": 3.69, + "grad_norm": 0.63671875, + "learning_rate": 3.6272965061782905e-06, + "loss": 0.2293, + "step": 27656 + }, + { + "epoch": 3.69, + "grad_norm": 0.58203125, + "learning_rate": 3.624189222594854e-06, + "loss": 0.444, + "step": 27657 + }, + { + "epoch": 3.69, + "grad_norm": 0.48828125, + "learning_rate": 3.6210832459292574e-06, + "loss": 0.2234, + "step": 27658 + }, + { + "epoch": 3.69, + "grad_norm": 0.57421875, + "learning_rate": 3.617978576223635e-06, + "loss": 0.3889, + "step": 27659 + }, + { + "epoch": 3.69, + "grad_norm": 0.46484375, + "learning_rate": 3.614875213520086e-06, + "loss": 0.1184, + "step": 27660 + }, + { + "epoch": 3.69, + "grad_norm": 0.6640625, + "learning_rate": 3.6117731578606763e-06, + "loss": 0.2416, + "step": 27661 + }, + { + "epoch": 3.69, + "grad_norm": 0.65234375, + "learning_rate": 3.6086724092874834e-06, + "loss": 0.4876, + "step": 27662 + }, + { + "epoch": 3.69, + "grad_norm": 0.6796875, + "learning_rate": 3.6055729678425633e-06, + "loss": 0.2282, + "step": 27663 + }, + { + "epoch": 3.69, + "grad_norm": 0.53125, + "learning_rate": 3.602474833567948e-06, + "loss": 0.3027, + "step": 27664 + }, + { + "epoch": 3.69, + "grad_norm": 0.6015625, + "learning_rate": 3.5993780065056273e-06, + "loss": 0.2634, + "step": 27665 + }, + { + "epoch": 3.69, + "grad_norm": 0.640625, + "learning_rate": 3.596282486697633e-06, + "loss": 0.459, + "step": 27666 + }, + { + "epoch": 3.69, + "grad_norm": 0.59375, + "learning_rate": 3.593188274185899e-06, + "loss": 0.198, + "step": 27667 + }, + { + "epoch": 3.69, + "grad_norm": 0.439453125, + "learning_rate": 3.590095369012403e-06, + "loss": 0.2379, + "step": 27668 + }, + { + "epoch": 3.69, + "grad_norm": 0.80078125, + "learning_rate": 3.5870037712190884e-06, + "loss": 0.2356, + "step": 27669 + }, + { + "epoch": 3.69, + "grad_norm": 0.6640625, + "learning_rate": 3.5839134808479e-06, + "loss": 0.4366, + "step": 27670 + }, + { + "epoch": 3.69, + "grad_norm": 0.70703125, + "learning_rate": 3.5808244979407045e-06, + "loss": 0.3549, + "step": 27671 + }, + { + "epoch": 3.69, + "grad_norm": 0.71484375, + "learning_rate": 3.5777368225394126e-06, + "loss": 0.3293, + "step": 27672 + }, + { + "epoch": 3.69, + "grad_norm": 0.71875, + "learning_rate": 3.5746504546859018e-06, + "loss": 0.2877, + "step": 27673 + }, + { + "epoch": 3.69, + "grad_norm": 0.609375, + "learning_rate": 3.5715653944220162e-06, + "loss": 0.3755, + "step": 27674 + }, + { + "epoch": 3.69, + "grad_norm": 0.79296875, + "learning_rate": 3.5684816417895783e-06, + "loss": 0.3778, + "step": 27675 + }, + { + "epoch": 3.69, + "grad_norm": 0.58203125, + "learning_rate": 3.5653991968304436e-06, + "loss": 0.4633, + "step": 27676 + }, + { + "epoch": 3.69, + "grad_norm": 0.66015625, + "learning_rate": 3.562318059586367e-06, + "loss": 0.3413, + "step": 27677 + }, + { + "epoch": 3.69, + "grad_norm": 0.62890625, + "learning_rate": 3.5592382300991487e-06, + "loss": 0.4152, + "step": 27678 + }, + { + "epoch": 3.69, + "grad_norm": 0.66015625, + "learning_rate": 3.556159708410567e-06, + "loss": 0.3125, + "step": 27679 + }, + { + "epoch": 3.69, + "grad_norm": 0.5546875, + "learning_rate": 3.5530824945623542e-06, + "loss": 0.202, + "step": 27680 + }, + { + "epoch": 3.69, + "grad_norm": 0.8046875, + "learning_rate": 3.550006588596233e-06, + "loss": 0.3209, + "step": 27681 + }, + { + "epoch": 3.69, + "grad_norm": 0.47265625, + "learning_rate": 3.5469319905539257e-06, + "loss": 0.2718, + "step": 27682 + }, + { + "epoch": 3.69, + "grad_norm": 0.625, + "learning_rate": 3.5438587004771317e-06, + "loss": 0.3537, + "step": 27683 + }, + { + "epoch": 3.69, + "grad_norm": 0.6875, + "learning_rate": 3.540786718407507e-06, + "loss": 0.3635, + "step": 27684 + }, + { + "epoch": 3.69, + "grad_norm": 0.875, + "learning_rate": 3.5377160443867296e-06, + "loss": 0.3565, + "step": 27685 + }, + { + "epoch": 3.69, + "grad_norm": 0.5703125, + "learning_rate": 3.5346466784564323e-06, + "loss": 0.1612, + "step": 27686 + }, + { + "epoch": 3.69, + "grad_norm": 0.4765625, + "learning_rate": 3.531578620658227e-06, + "loss": 0.2753, + "step": 27687 + }, + { + "epoch": 3.69, + "grad_norm": 0.50390625, + "learning_rate": 3.5285118710337574e-06, + "loss": 0.2126, + "step": 27688 + }, + { + "epoch": 3.69, + "grad_norm": 0.63671875, + "learning_rate": 3.525446429624557e-06, + "loss": 0.3207, + "step": 27689 + }, + { + "epoch": 3.69, + "grad_norm": 0.51171875, + "learning_rate": 3.5223822964722265e-06, + "loss": 0.1787, + "step": 27690 + }, + { + "epoch": 3.7, + "grad_norm": 0.5859375, + "learning_rate": 3.51931947161831e-06, + "loss": 0.4594, + "step": 27691 + }, + { + "epoch": 3.7, + "grad_norm": 0.61328125, + "learning_rate": 3.5162579551043296e-06, + "loss": 0.2273, + "step": 27692 + }, + { + "epoch": 3.7, + "grad_norm": 0.50390625, + "learning_rate": 3.5131977469718413e-06, + "loss": 0.2715, + "step": 27693 + }, + { + "epoch": 3.7, + "grad_norm": 0.51171875, + "learning_rate": 3.5101388472622897e-06, + "loss": 0.3134, + "step": 27694 + }, + { + "epoch": 3.7, + "grad_norm": 0.7109375, + "learning_rate": 3.5070812560171973e-06, + "loss": 0.5318, + "step": 27695 + }, + { + "epoch": 3.7, + "grad_norm": 0.65625, + "learning_rate": 3.5040249732780086e-06, + "loss": 0.3462, + "step": 27696 + }, + { + "epoch": 3.7, + "grad_norm": 0.62109375, + "learning_rate": 3.500969999086179e-06, + "loss": 0.3832, + "step": 27697 + }, + { + "epoch": 3.7, + "grad_norm": 0.470703125, + "learning_rate": 3.497916333483109e-06, + "loss": 0.1351, + "step": 27698 + }, + { + "epoch": 3.7, + "grad_norm": 0.50390625, + "learning_rate": 3.4948639765102433e-06, + "loss": 0.1792, + "step": 27699 + }, + { + "epoch": 3.7, + "grad_norm": 0.64453125, + "learning_rate": 3.4918129282089597e-06, + "loss": 0.3276, + "step": 27700 + }, + { + "epoch": 3.7, + "grad_norm": 0.69140625, + "learning_rate": 3.488763188620625e-06, + "loss": 0.7594, + "step": 27701 + }, + { + "epoch": 3.7, + "grad_norm": 0.69140625, + "learning_rate": 3.485714757786618e-06, + "loss": 0.3307, + "step": 27702 + }, + { + "epoch": 3.7, + "grad_norm": 0.54296875, + "learning_rate": 3.482667635748238e-06, + "loss": 0.2721, + "step": 27703 + }, + { + "epoch": 3.7, + "grad_norm": 0.67578125, + "learning_rate": 3.4796218225468414e-06, + "loss": 0.1969, + "step": 27704 + }, + { + "epoch": 3.7, + "grad_norm": 0.6328125, + "learning_rate": 3.476577318223717e-06, + "loss": 0.3216, + "step": 27705 + }, + { + "epoch": 3.7, + "grad_norm": 0.62109375, + "learning_rate": 3.4735341228201547e-06, + "loss": 0.2007, + "step": 27706 + }, + { + "epoch": 3.7, + "grad_norm": 0.69140625, + "learning_rate": 3.470492236377421e-06, + "loss": 0.1745, + "step": 27707 + }, + { + "epoch": 3.7, + "grad_norm": 0.67578125, + "learning_rate": 3.467451658936749e-06, + "loss": 0.283, + "step": 27708 + }, + { + "epoch": 3.7, + "grad_norm": 0.671875, + "learning_rate": 3.4644123905393953e-06, + "loss": 0.3892, + "step": 27709 + }, + { + "epoch": 3.7, + "grad_norm": 0.58203125, + "learning_rate": 3.4613744312265827e-06, + "loss": 0.3217, + "step": 27710 + }, + { + "epoch": 3.7, + "grad_norm": 0.515625, + "learning_rate": 3.458337781039478e-06, + "loss": 0.3508, + "step": 27711 + }, + { + "epoch": 3.7, + "grad_norm": 0.57421875, + "learning_rate": 3.455302440019281e-06, + "loss": 0.4651, + "step": 27712 + }, + { + "epoch": 3.7, + "grad_norm": 0.59375, + "learning_rate": 3.452268408207127e-06, + "loss": 0.2639, + "step": 27713 + }, + { + "epoch": 3.7, + "grad_norm": 0.71484375, + "learning_rate": 3.449235685644192e-06, + "loss": 0.281, + "step": 27714 + }, + { + "epoch": 3.7, + "grad_norm": 0.703125, + "learning_rate": 3.446204272371578e-06, + "loss": 0.4054, + "step": 27715 + }, + { + "epoch": 3.7, + "grad_norm": 0.58984375, + "learning_rate": 3.4431741684304073e-06, + "loss": 0.3716, + "step": 27716 + }, + { + "epoch": 3.7, + "grad_norm": 0.7265625, + "learning_rate": 3.4401453738617582e-06, + "loss": 0.4021, + "step": 27717 + }, + { + "epoch": 3.7, + "grad_norm": 0.6796875, + "learning_rate": 3.43711788870672e-06, + "loss": 0.4378, + "step": 27718 + }, + { + "epoch": 3.7, + "grad_norm": 0.79296875, + "learning_rate": 3.4340917130063267e-06, + "loss": 0.3038, + "step": 27719 + }, + { + "epoch": 3.7, + "grad_norm": 0.515625, + "learning_rate": 3.431066846801634e-06, + "loss": 0.1963, + "step": 27720 + }, + { + "epoch": 3.7, + "grad_norm": 0.490234375, + "learning_rate": 3.4280432901336425e-06, + "loss": 0.1395, + "step": 27721 + }, + { + "epoch": 3.7, + "grad_norm": 0.70703125, + "learning_rate": 3.425021043043375e-06, + "loss": 0.2957, + "step": 27722 + }, + { + "epoch": 3.7, + "grad_norm": 0.58984375, + "learning_rate": 3.4220001055717878e-06, + "loss": 0.2918, + "step": 27723 + }, + { + "epoch": 3.7, + "grad_norm": 0.84765625, + "learning_rate": 3.418980477759881e-06, + "loss": 0.368, + "step": 27724 + }, + { + "epoch": 3.7, + "grad_norm": 0.4765625, + "learning_rate": 3.4159621596485893e-06, + "loss": 0.1849, + "step": 27725 + }, + { + "epoch": 3.7, + "grad_norm": 0.53515625, + "learning_rate": 3.4129451512788123e-06, + "loss": 0.3119, + "step": 27726 + }, + { + "epoch": 3.7, + "grad_norm": 0.53515625, + "learning_rate": 3.409929452691507e-06, + "loss": 0.2717, + "step": 27727 + }, + { + "epoch": 3.7, + "grad_norm": 0.515625, + "learning_rate": 3.4069150639275295e-06, + "loss": 0.2227, + "step": 27728 + }, + { + "epoch": 3.7, + "grad_norm": 0.59375, + "learning_rate": 3.403901985027802e-06, + "loss": 0.1971, + "step": 27729 + }, + { + "epoch": 3.7, + "grad_norm": 0.46875, + "learning_rate": 3.4008902160331478e-06, + "loss": 0.1799, + "step": 27730 + }, + { + "epoch": 3.7, + "grad_norm": 0.66796875, + "learning_rate": 3.3978797569844123e-06, + "loss": 0.2711, + "step": 27731 + }, + { + "epoch": 3.7, + "grad_norm": 0.5, + "learning_rate": 3.394870607922429e-06, + "loss": 0.223, + "step": 27732 + }, + { + "epoch": 3.7, + "grad_norm": 0.65234375, + "learning_rate": 3.39186276888801e-06, + "loss": 0.2706, + "step": 27733 + }, + { + "epoch": 3.7, + "grad_norm": 1.109375, + "learning_rate": 3.3888562399219226e-06, + "loss": 0.5984, + "step": 27734 + }, + { + "epoch": 3.7, + "grad_norm": 0.7421875, + "learning_rate": 3.385851021064956e-06, + "loss": 0.3287, + "step": 27735 + }, + { + "epoch": 3.7, + "grad_norm": 0.69921875, + "learning_rate": 3.3828471123578564e-06, + "loss": 0.5228, + "step": 27736 + }, + { + "epoch": 3.7, + "grad_norm": 0.921875, + "learning_rate": 3.3798445138413572e-06, + "loss": 0.2822, + "step": 27737 + }, + { + "epoch": 3.7, + "grad_norm": 0.490234375, + "learning_rate": 3.37684322555617e-06, + "loss": 0.1811, + "step": 27738 + }, + { + "epoch": 3.7, + "grad_norm": 0.890625, + "learning_rate": 3.373843247543018e-06, + "loss": 0.5355, + "step": 27739 + }, + { + "epoch": 3.7, + "grad_norm": 0.58984375, + "learning_rate": 3.370844579842558e-06, + "loss": 0.3719, + "step": 27740 + }, + { + "epoch": 3.7, + "grad_norm": 0.61328125, + "learning_rate": 3.3678472224954573e-06, + "loss": 0.4093, + "step": 27741 + }, + { + "epoch": 3.7, + "grad_norm": 0.5078125, + "learning_rate": 3.3648511755423605e-06, + "loss": 0.2362, + "step": 27742 + }, + { + "epoch": 3.7, + "grad_norm": 0.58203125, + "learning_rate": 3.361856439023914e-06, + "loss": 0.3554, + "step": 27743 + }, + { + "epoch": 3.7, + "grad_norm": 0.81640625, + "learning_rate": 3.3588630129807064e-06, + "loss": 0.4321, + "step": 27744 + }, + { + "epoch": 3.7, + "grad_norm": 0.5078125, + "learning_rate": 3.3558708974533504e-06, + "loss": 0.2298, + "step": 27745 + }, + { + "epoch": 3.7, + "grad_norm": 0.56640625, + "learning_rate": 3.352880092482402e-06, + "loss": 0.2332, + "step": 27746 + }, + { + "epoch": 3.7, + "grad_norm": 0.62890625, + "learning_rate": 3.3498905981084404e-06, + "loss": 0.3427, + "step": 27747 + }, + { + "epoch": 3.7, + "grad_norm": 0.4609375, + "learning_rate": 3.3469024143719772e-06, + "loss": 0.3416, + "step": 27748 + }, + { + "epoch": 3.7, + "grad_norm": 0.56640625, + "learning_rate": 3.3439155413135583e-06, + "loss": 0.1966, + "step": 27749 + }, + { + "epoch": 3.7, + "grad_norm": 0.5546875, + "learning_rate": 3.3409299789736726e-06, + "loss": 0.2851, + "step": 27750 + }, + { + "epoch": 3.7, + "grad_norm": 0.6015625, + "learning_rate": 3.3379457273928105e-06, + "loss": 0.2839, + "step": 27751 + }, + { + "epoch": 3.7, + "grad_norm": 0.6484375, + "learning_rate": 3.334962786611451e-06, + "loss": 0.4019, + "step": 27752 + }, + { + "epoch": 3.7, + "grad_norm": 0.75, + "learning_rate": 3.3319811566700275e-06, + "loss": 0.2699, + "step": 27753 + }, + { + "epoch": 3.7, + "grad_norm": 0.5703125, + "learning_rate": 3.3290008376089754e-06, + "loss": 0.4191, + "step": 27754 + }, + { + "epoch": 3.7, + "grad_norm": 0.58984375, + "learning_rate": 3.3260218294687174e-06, + "loss": 0.2728, + "step": 27755 + }, + { + "epoch": 3.7, + "grad_norm": 0.6796875, + "learning_rate": 3.3230441322896656e-06, + "loss": 0.4958, + "step": 27756 + }, + { + "epoch": 3.7, + "grad_norm": 0.6328125, + "learning_rate": 3.3200677461121652e-06, + "loss": 0.1866, + "step": 27757 + }, + { + "epoch": 3.7, + "grad_norm": 0.54296875, + "learning_rate": 3.317092670976596e-06, + "loss": 0.1624, + "step": 27758 + }, + { + "epoch": 3.7, + "grad_norm": 0.53125, + "learning_rate": 3.3141189069232913e-06, + "loss": 0.2029, + "step": 27759 + }, + { + "epoch": 3.7, + "grad_norm": 0.6484375, + "learning_rate": 3.3111464539925975e-06, + "loss": 0.2716, + "step": 27760 + }, + { + "epoch": 3.7, + "grad_norm": 0.625, + "learning_rate": 3.308175312224837e-06, + "loss": 0.4139, + "step": 27761 + }, + { + "epoch": 3.7, + "grad_norm": 0.64453125, + "learning_rate": 3.3052054816602452e-06, + "loss": 0.1672, + "step": 27762 + }, + { + "epoch": 3.7, + "grad_norm": 0.90234375, + "learning_rate": 3.3022369623391337e-06, + "loss": 0.4505, + "step": 27763 + }, + { + "epoch": 3.7, + "grad_norm": 0.53515625, + "learning_rate": 3.299269754301748e-06, + "loss": 0.1291, + "step": 27764 + }, + { + "epoch": 3.7, + "grad_norm": 0.78515625, + "learning_rate": 3.2963038575883233e-06, + "loss": 0.3401, + "step": 27765 + }, + { + "epoch": 3.71, + "grad_norm": 0.80859375, + "learning_rate": 3.2933392722390933e-06, + "loss": 0.3339, + "step": 27766 + }, + { + "epoch": 3.71, + "grad_norm": 0.625, + "learning_rate": 3.290375998294237e-06, + "loss": 0.23, + "step": 27767 + }, + { + "epoch": 3.71, + "grad_norm": 0.6328125, + "learning_rate": 3.2874140357939454e-06, + "loss": 0.2406, + "step": 27768 + }, + { + "epoch": 3.71, + "grad_norm": 0.57421875, + "learning_rate": 3.2844533847783965e-06, + "loss": 0.2424, + "step": 27769 + }, + { + "epoch": 3.71, + "grad_norm": 0.55078125, + "learning_rate": 3.2814940452877364e-06, + "loss": 0.3331, + "step": 27770 + }, + { + "epoch": 3.71, + "grad_norm": 0.68359375, + "learning_rate": 3.278536017362088e-06, + "loss": 0.2965, + "step": 27771 + }, + { + "epoch": 3.71, + "grad_norm": 0.58984375, + "learning_rate": 3.2755793010415646e-06, + "loss": 0.4357, + "step": 27772 + }, + { + "epoch": 3.71, + "grad_norm": 0.5390625, + "learning_rate": 3.272623896366267e-06, + "loss": 0.1822, + "step": 27773 + }, + { + "epoch": 3.71, + "grad_norm": 0.69140625, + "learning_rate": 3.269669803376263e-06, + "loss": 0.3184, + "step": 27774 + }, + { + "epoch": 3.71, + "grad_norm": 0.53515625, + "learning_rate": 3.266717022111632e-06, + "loss": 0.3901, + "step": 27775 + }, + { + "epoch": 3.71, + "grad_norm": 0.50390625, + "learning_rate": 3.2637655526123857e-06, + "loss": 0.3719, + "step": 27776 + }, + { + "epoch": 3.71, + "grad_norm": 0.6171875, + "learning_rate": 3.2608153949185593e-06, + "loss": 0.2657, + "step": 27777 + }, + { + "epoch": 3.71, + "grad_norm": 0.51953125, + "learning_rate": 3.2578665490701655e-06, + "loss": 0.1647, + "step": 27778 + }, + { + "epoch": 3.71, + "grad_norm": 0.6015625, + "learning_rate": 3.254919015107205e-06, + "loss": 0.1721, + "step": 27779 + }, + { + "epoch": 3.71, + "grad_norm": 0.6875, + "learning_rate": 3.2519727930696242e-06, + "loss": 0.5368, + "step": 27780 + }, + { + "epoch": 3.71, + "grad_norm": 0.57421875, + "learning_rate": 3.24902788299738e-06, + "loss": 0.4983, + "step": 27781 + }, + { + "epoch": 3.71, + "grad_norm": 0.6796875, + "learning_rate": 3.2460842849304063e-06, + "loss": 0.4042, + "step": 27782 + }, + { + "epoch": 3.71, + "grad_norm": 0.64453125, + "learning_rate": 3.24314199890865e-06, + "loss": 0.2059, + "step": 27783 + }, + { + "epoch": 3.71, + "grad_norm": 0.546875, + "learning_rate": 3.240201024971967e-06, + "loss": 0.2033, + "step": 27784 + }, + { + "epoch": 3.71, + "grad_norm": 0.59375, + "learning_rate": 3.237261363160271e-06, + "loss": 0.365, + "step": 27785 + }, + { + "epoch": 3.71, + "grad_norm": 0.75390625, + "learning_rate": 3.234323013513396e-06, + "loss": 0.3382, + "step": 27786 + }, + { + "epoch": 3.71, + "grad_norm": 0.48046875, + "learning_rate": 3.2313859760712105e-06, + "loss": 0.2435, + "step": 27787 + }, + { + "epoch": 3.71, + "grad_norm": 0.65234375, + "learning_rate": 3.228450250873538e-06, + "loss": 0.359, + "step": 27788 + }, + { + "epoch": 3.71, + "grad_norm": 0.59765625, + "learning_rate": 3.2255158379602025e-06, + "loss": 0.4389, + "step": 27789 + }, + { + "epoch": 3.71, + "grad_norm": 0.62109375, + "learning_rate": 3.2225827373709604e-06, + "loss": 0.3271, + "step": 27790 + }, + { + "epoch": 3.71, + "grad_norm": 0.75, + "learning_rate": 3.219650949145614e-06, + "loss": 0.2699, + "step": 27791 + }, + { + "epoch": 3.71, + "grad_norm": 0.70703125, + "learning_rate": 3.21672047332392e-06, + "loss": 0.3054, + "step": 27792 + }, + { + "epoch": 3.71, + "grad_norm": 0.58984375, + "learning_rate": 3.213791309945613e-06, + "loss": 0.1833, + "step": 27793 + }, + { + "epoch": 3.71, + "grad_norm": 0.546875, + "learning_rate": 3.2108634590504062e-06, + "loss": 0.2684, + "step": 27794 + }, + { + "epoch": 3.71, + "grad_norm": 0.71484375, + "learning_rate": 3.2079369206780007e-06, + "loss": 0.3952, + "step": 27795 + }, + { + "epoch": 3.71, + "grad_norm": 0.486328125, + "learning_rate": 3.2050116948680987e-06, + "loss": 0.181, + "step": 27796 + }, + { + "epoch": 3.71, + "grad_norm": 0.82421875, + "learning_rate": 3.202087781660368e-06, + "loss": 0.3484, + "step": 27797 + }, + { + "epoch": 3.71, + "grad_norm": 0.578125, + "learning_rate": 3.1991651810944544e-06, + "loss": 0.293, + "step": 27798 + }, + { + "epoch": 3.71, + "grad_norm": 0.56640625, + "learning_rate": 3.1962438932099827e-06, + "loss": 0.2353, + "step": 27799 + }, + { + "epoch": 3.71, + "grad_norm": 0.70703125, + "learning_rate": 3.1933239180465645e-06, + "loss": 0.2592, + "step": 27800 + }, + { + "epoch": 3.71, + "grad_norm": 0.6328125, + "learning_rate": 3.1904052556438023e-06, + "loss": 0.4181, + "step": 27801 + }, + { + "epoch": 3.71, + "grad_norm": 0.703125, + "learning_rate": 3.1874879060412975e-06, + "loss": 0.257, + "step": 27802 + }, + { + "epoch": 3.71, + "grad_norm": 0.73046875, + "learning_rate": 3.184571869278574e-06, + "loss": 0.2164, + "step": 27803 + }, + { + "epoch": 3.71, + "grad_norm": 0.5703125, + "learning_rate": 3.1816571453951894e-06, + "loss": 0.2418, + "step": 27804 + }, + { + "epoch": 3.71, + "grad_norm": 0.49609375, + "learning_rate": 3.178743734430667e-06, + "loss": 0.2705, + "step": 27805 + }, + { + "epoch": 3.71, + "grad_norm": 0.56640625, + "learning_rate": 3.1758316364245424e-06, + "loss": 0.1185, + "step": 27806 + }, + { + "epoch": 3.71, + "grad_norm": 0.55078125, + "learning_rate": 3.1729208514162613e-06, + "loss": 0.3361, + "step": 27807 + }, + { + "epoch": 3.71, + "grad_norm": 0.60546875, + "learning_rate": 3.1700113794453145e-06, + "loss": 0.1613, + "step": 27808 + }, + { + "epoch": 3.71, + "grad_norm": 0.515625, + "learning_rate": 3.167103220551182e-06, + "loss": 0.3519, + "step": 27809 + }, + { + "epoch": 3.71, + "grad_norm": 0.54296875, + "learning_rate": 3.1641963747732537e-06, + "loss": 0.2432, + "step": 27810 + }, + { + "epoch": 3.71, + "grad_norm": 0.53125, + "learning_rate": 3.161290842150977e-06, + "loss": 0.2855, + "step": 27811 + }, + { + "epoch": 3.71, + "grad_norm": 0.6484375, + "learning_rate": 3.1583866227237635e-06, + "loss": 0.3823, + "step": 27812 + }, + { + "epoch": 3.71, + "grad_norm": 0.65234375, + "learning_rate": 3.1554837165309604e-06, + "loss": 0.1942, + "step": 27813 + }, + { + "epoch": 3.71, + "grad_norm": 0.78515625, + "learning_rate": 3.1525821236119577e-06, + "loss": 0.2991, + "step": 27814 + }, + { + "epoch": 3.71, + "grad_norm": 0.51953125, + "learning_rate": 3.1496818440060917e-06, + "loss": 0.2705, + "step": 27815 + }, + { + "epoch": 3.71, + "grad_norm": 0.77734375, + "learning_rate": 3.1467828777527075e-06, + "loss": 0.235, + "step": 27816 + }, + { + "epoch": 3.71, + "grad_norm": 0.60546875, + "learning_rate": 3.1438852248911077e-06, + "loss": 0.4959, + "step": 27817 + }, + { + "epoch": 3.71, + "grad_norm": 0.5546875, + "learning_rate": 3.140988885460572e-06, + "loss": 0.3411, + "step": 27818 + }, + { + "epoch": 3.71, + "grad_norm": 0.65625, + "learning_rate": 3.1380938595004016e-06, + "loss": 0.452, + "step": 27819 + }, + { + "epoch": 3.71, + "grad_norm": 0.4609375, + "learning_rate": 3.135200147049855e-06, + "loss": 0.2655, + "step": 27820 + }, + { + "epoch": 3.71, + "grad_norm": 0.482421875, + "learning_rate": 3.1323077481481556e-06, + "loss": 0.2951, + "step": 27821 + }, + { + "epoch": 3.71, + "grad_norm": 0.67578125, + "learning_rate": 3.129416662834528e-06, + "loss": 0.3875, + "step": 27822 + }, + { + "epoch": 3.71, + "grad_norm": 0.5390625, + "learning_rate": 3.126526891148174e-06, + "loss": 0.1336, + "step": 27823 + }, + { + "epoch": 3.71, + "grad_norm": 0.466796875, + "learning_rate": 3.1236384331282954e-06, + "loss": 0.2085, + "step": 27824 + }, + { + "epoch": 3.71, + "grad_norm": 0.5703125, + "learning_rate": 3.120751288814061e-06, + "loss": 0.4395, + "step": 27825 + }, + { + "epoch": 3.71, + "grad_norm": 0.439453125, + "learning_rate": 3.1178654582446067e-06, + "loss": 0.3501, + "step": 27826 + }, + { + "epoch": 3.71, + "grad_norm": 0.6015625, + "learning_rate": 3.114980941459078e-06, + "loss": 0.1863, + "step": 27827 + }, + { + "epoch": 3.71, + "grad_norm": 0.74609375, + "learning_rate": 3.1120977384965776e-06, + "loss": 0.1901, + "step": 27828 + }, + { + "epoch": 3.71, + "grad_norm": 0.498046875, + "learning_rate": 3.1092158493962408e-06, + "loss": 0.1436, + "step": 27829 + }, + { + "epoch": 3.71, + "grad_norm": 0.53125, + "learning_rate": 3.106335274197092e-06, + "loss": 0.2128, + "step": 27830 + }, + { + "epoch": 3.71, + "grad_norm": 0.59765625, + "learning_rate": 3.1034560129382327e-06, + "loss": 0.1985, + "step": 27831 + }, + { + "epoch": 3.71, + "grad_norm": 0.5703125, + "learning_rate": 3.100578065658699e-06, + "loss": 0.2285, + "step": 27832 + }, + { + "epoch": 3.71, + "grad_norm": 0.546875, + "learning_rate": 3.0977014323975152e-06, + "loss": 0.1874, + "step": 27833 + }, + { + "epoch": 3.71, + "grad_norm": 0.625, + "learning_rate": 3.094826113193716e-06, + "loss": 0.4026, + "step": 27834 + }, + { + "epoch": 3.71, + "grad_norm": 0.609375, + "learning_rate": 3.091952108086238e-06, + "loss": 0.3805, + "step": 27835 + }, + { + "epoch": 3.71, + "grad_norm": 0.462890625, + "learning_rate": 3.0890794171140936e-06, + "loss": 0.1796, + "step": 27836 + }, + { + "epoch": 3.71, + "grad_norm": 0.5859375, + "learning_rate": 3.0862080403162297e-06, + "loss": 0.3635, + "step": 27837 + }, + { + "epoch": 3.71, + "grad_norm": 0.640625, + "learning_rate": 3.0833379777315707e-06, + "loss": 0.4188, + "step": 27838 + }, + { + "epoch": 3.71, + "grad_norm": 0.70703125, + "learning_rate": 3.0804692293990746e-06, + "loss": 0.4064, + "step": 27839 + }, + { + "epoch": 3.71, + "grad_norm": 0.72265625, + "learning_rate": 3.0776017953575985e-06, + "loss": 0.386, + "step": 27840 + }, + { + "epoch": 3.72, + "grad_norm": 0.63671875, + "learning_rate": 3.0747356756460565e-06, + "loss": 0.228, + "step": 27841 + }, + { + "epoch": 3.72, + "grad_norm": 0.70703125, + "learning_rate": 3.071870870303295e-06, + "loss": 0.2803, + "step": 27842 + }, + { + "epoch": 3.72, + "grad_norm": 0.6640625, + "learning_rate": 3.0690073793681827e-06, + "loss": 0.1919, + "step": 27843 + }, + { + "epoch": 3.72, + "grad_norm": 0.388671875, + "learning_rate": 3.0661452028795336e-06, + "loss": 0.123, + "step": 27844 + }, + { + "epoch": 3.72, + "grad_norm": 0.72265625, + "learning_rate": 3.063284340876171e-06, + "loss": 0.4323, + "step": 27845 + }, + { + "epoch": 3.72, + "grad_norm": 0.640625, + "learning_rate": 3.060424793396888e-06, + "loss": 0.1917, + "step": 27846 + }, + { + "epoch": 3.72, + "grad_norm": 0.63671875, + "learning_rate": 3.0575665604804625e-06, + "loss": 0.2099, + "step": 27847 + }, + { + "epoch": 3.72, + "grad_norm": 0.5078125, + "learning_rate": 3.054709642165654e-06, + "loss": 0.2158, + "step": 27848 + }, + { + "epoch": 3.72, + "grad_norm": 0.640625, + "learning_rate": 3.0518540384911867e-06, + "loss": 0.3935, + "step": 27849 + }, + { + "epoch": 3.72, + "grad_norm": 0.65625, + "learning_rate": 3.0489997494958067e-06, + "loss": 0.2117, + "step": 27850 + }, + { + "epoch": 3.72, + "grad_norm": 0.62109375, + "learning_rate": 3.046146775218206e-06, + "loss": 0.3747, + "step": 27851 + }, + { + "epoch": 3.72, + "grad_norm": 0.671875, + "learning_rate": 3.043295115697098e-06, + "loss": 0.4213, + "step": 27852 + }, + { + "epoch": 3.72, + "grad_norm": 0.73046875, + "learning_rate": 3.0404447709711183e-06, + "loss": 0.2995, + "step": 27853 + }, + { + "epoch": 3.72, + "grad_norm": 0.58203125, + "learning_rate": 3.037595741078936e-06, + "loss": 0.3329, + "step": 27854 + }, + { + "epoch": 3.72, + "grad_norm": 0.58203125, + "learning_rate": 3.0347480260591867e-06, + "loss": 0.4536, + "step": 27855 + }, + { + "epoch": 3.72, + "grad_norm": 0.5, + "learning_rate": 3.031901625950506e-06, + "loss": 0.352, + "step": 27856 + }, + { + "epoch": 3.72, + "grad_norm": 0.7109375, + "learning_rate": 3.029056540791453e-06, + "loss": 0.3737, + "step": 27857 + }, + { + "epoch": 3.72, + "grad_norm": 0.56640625, + "learning_rate": 3.02621277062064e-06, + "loss": 0.2081, + "step": 27858 + }, + { + "epoch": 3.72, + "grad_norm": 0.63671875, + "learning_rate": 3.0233703154766036e-06, + "loss": 0.2271, + "step": 27859 + }, + { + "epoch": 3.72, + "grad_norm": 0.44921875, + "learning_rate": 3.0205291753979126e-06, + "loss": 0.2448, + "step": 27860 + }, + { + "epoch": 3.72, + "grad_norm": 0.439453125, + "learning_rate": 3.0176893504230807e-06, + "loss": 0.1305, + "step": 27861 + }, + { + "epoch": 3.72, + "grad_norm": 0.71875, + "learning_rate": 3.014850840590644e-06, + "loss": 0.3185, + "step": 27862 + }, + { + "epoch": 3.72, + "grad_norm": 0.8984375, + "learning_rate": 3.01201364593906e-06, + "loss": 0.2974, + "step": 27863 + }, + { + "epoch": 3.72, + "grad_norm": 0.5234375, + "learning_rate": 3.00917776650681e-06, + "loss": 0.1722, + "step": 27864 + }, + { + "epoch": 3.72, + "grad_norm": 0.77734375, + "learning_rate": 3.0063432023323623e-06, + "loss": 0.5036, + "step": 27865 + }, + { + "epoch": 3.72, + "grad_norm": 0.6484375, + "learning_rate": 3.0035099534541644e-06, + "loss": 0.2722, + "step": 27866 + }, + { + "epoch": 3.72, + "grad_norm": 0.8515625, + "learning_rate": 3.0006780199106188e-06, + "loss": 0.4504, + "step": 27867 + }, + { + "epoch": 3.72, + "grad_norm": 0.5859375, + "learning_rate": 2.997847401740117e-06, + "loss": 0.2468, + "step": 27868 + }, + { + "epoch": 3.72, + "grad_norm": 0.640625, + "learning_rate": 2.995018098981073e-06, + "loss": 0.3493, + "step": 27869 + }, + { + "epoch": 3.72, + "grad_norm": 0.703125, + "learning_rate": 2.9921901116718442e-06, + "loss": 0.1298, + "step": 27870 + }, + { + "epoch": 3.72, + "grad_norm": 0.54296875, + "learning_rate": 2.9893634398507787e-06, + "loss": 0.2708, + "step": 27871 + }, + { + "epoch": 3.72, + "grad_norm": 0.65625, + "learning_rate": 2.9865380835562007e-06, + "loss": 0.6275, + "step": 27872 + }, + { + "epoch": 3.72, + "grad_norm": 0.6875, + "learning_rate": 2.9837140428264244e-06, + "loss": 0.4101, + "step": 27873 + }, + { + "epoch": 3.72, + "grad_norm": 0.7421875, + "learning_rate": 2.980891317699752e-06, + "loss": 0.5493, + "step": 27874 + }, + { + "epoch": 3.72, + "grad_norm": 0.5703125, + "learning_rate": 2.978069908214465e-06, + "loss": 0.3947, + "step": 27875 + }, + { + "epoch": 3.72, + "grad_norm": 0.50390625, + "learning_rate": 2.9752498144088094e-06, + "loss": 0.3299, + "step": 27876 + }, + { + "epoch": 3.72, + "grad_norm": 0.5234375, + "learning_rate": 2.9724310363210437e-06, + "loss": 0.2574, + "step": 27877 + }, + { + "epoch": 3.72, + "grad_norm": 0.65625, + "learning_rate": 2.9696135739893827e-06, + "loss": 0.3923, + "step": 27878 + }, + { + "epoch": 3.72, + "grad_norm": 0.62109375, + "learning_rate": 2.9667974274520393e-06, + "loss": 0.4688, + "step": 27879 + }, + { + "epoch": 3.72, + "grad_norm": 0.6796875, + "learning_rate": 2.9639825967471948e-06, + "loss": 0.2733, + "step": 27880 + }, + { + "epoch": 3.72, + "grad_norm": 0.81640625, + "learning_rate": 2.9611690819130178e-06, + "loss": 0.257, + "step": 27881 + }, + { + "epoch": 3.72, + "grad_norm": 0.59375, + "learning_rate": 2.9583568829876785e-06, + "loss": 0.2764, + "step": 27882 + }, + { + "epoch": 3.72, + "grad_norm": 0.75, + "learning_rate": 2.9555460000092906e-06, + "loss": 0.4774, + "step": 27883 + }, + { + "epoch": 3.72, + "grad_norm": 0.64453125, + "learning_rate": 2.9527364330159903e-06, + "loss": 0.312, + "step": 27884 + }, + { + "epoch": 3.72, + "grad_norm": 0.57421875, + "learning_rate": 2.9499281820458692e-06, + "loss": 0.5264, + "step": 27885 + }, + { + "epoch": 3.72, + "grad_norm": 0.6640625, + "learning_rate": 2.947121247136997e-06, + "loss": 0.3692, + "step": 27886 + }, + { + "epoch": 3.72, + "grad_norm": 0.70703125, + "learning_rate": 2.9443156283274543e-06, + "loss": 0.5564, + "step": 27887 + }, + { + "epoch": 3.72, + "grad_norm": 0.6015625, + "learning_rate": 2.9415113256552774e-06, + "loss": 0.3169, + "step": 27888 + }, + { + "epoch": 3.72, + "grad_norm": 0.6171875, + "learning_rate": 2.9387083391585135e-06, + "loss": 0.3566, + "step": 27889 + }, + { + "epoch": 3.72, + "grad_norm": 0.5, + "learning_rate": 2.9359066688751436e-06, + "loss": 0.2922, + "step": 27890 + }, + { + "epoch": 3.72, + "grad_norm": 0.62890625, + "learning_rate": 2.9331063148431812e-06, + "loss": 0.4445, + "step": 27891 + }, + { + "epoch": 3.72, + "grad_norm": 0.58203125, + "learning_rate": 2.9303072771005858e-06, + "loss": 0.4695, + "step": 27892 + }, + { + "epoch": 3.72, + "grad_norm": 0.75390625, + "learning_rate": 2.9275095556853373e-06, + "loss": 0.1451, + "step": 27893 + }, + { + "epoch": 3.72, + "grad_norm": 0.578125, + "learning_rate": 2.924713150635361e-06, + "loss": 0.4248, + "step": 27894 + }, + { + "epoch": 3.72, + "grad_norm": 0.6015625, + "learning_rate": 2.92191806198856e-06, + "loss": 0.2575, + "step": 27895 + }, + { + "epoch": 3.72, + "grad_norm": 0.498046875, + "learning_rate": 2.9191242897828597e-06, + "loss": 0.2731, + "step": 27896 + }, + { + "epoch": 3.72, + "grad_norm": 0.65625, + "learning_rate": 2.916331834056141e-06, + "loss": 0.2584, + "step": 27897 + }, + { + "epoch": 3.72, + "grad_norm": 0.57421875, + "learning_rate": 2.913540694846273e-06, + "loss": 0.4151, + "step": 27898 + }, + { + "epoch": 3.72, + "grad_norm": 0.75, + "learning_rate": 2.9107508721911036e-06, + "loss": 0.4824, + "step": 27899 + }, + { + "epoch": 3.72, + "grad_norm": 0.49609375, + "learning_rate": 2.9079623661284585e-06, + "loss": 0.1552, + "step": 27900 + }, + { + "epoch": 3.72, + "grad_norm": 0.859375, + "learning_rate": 2.9051751766961623e-06, + "loss": 0.5067, + "step": 27901 + }, + { + "epoch": 3.72, + "grad_norm": 0.55859375, + "learning_rate": 2.902389303932007e-06, + "loss": 0.211, + "step": 27902 + }, + { + "epoch": 3.72, + "grad_norm": 0.578125, + "learning_rate": 2.8996047478737633e-06, + "loss": 0.3661, + "step": 27903 + }, + { + "epoch": 3.72, + "grad_norm": 0.6328125, + "learning_rate": 2.8968215085591998e-06, + "loss": 0.4058, + "step": 27904 + }, + { + "epoch": 3.72, + "grad_norm": 0.51171875, + "learning_rate": 2.894039586026054e-06, + "loss": 0.3211, + "step": 27905 + }, + { + "epoch": 3.72, + "grad_norm": 0.55859375, + "learning_rate": 2.8912589803120614e-06, + "loss": 0.1887, + "step": 27906 + }, + { + "epoch": 3.72, + "grad_norm": 0.498046875, + "learning_rate": 2.8884796914549262e-06, + "loss": 0.1184, + "step": 27907 + }, + { + "epoch": 3.72, + "grad_norm": 0.50390625, + "learning_rate": 2.8857017194923173e-06, + "loss": 0.2793, + "step": 27908 + }, + { + "epoch": 3.72, + "grad_norm": 0.69140625, + "learning_rate": 2.8829250644619275e-06, + "loss": 0.5012, + "step": 27909 + }, + { + "epoch": 3.72, + "grad_norm": 0.4921875, + "learning_rate": 2.880149726401404e-06, + "loss": 0.2125, + "step": 27910 + }, + { + "epoch": 3.72, + "grad_norm": 0.64453125, + "learning_rate": 2.877375705348373e-06, + "loss": 0.2171, + "step": 27911 + }, + { + "epoch": 3.72, + "grad_norm": 0.46484375, + "learning_rate": 2.87460300134047e-06, + "loss": 0.1727, + "step": 27912 + }, + { + "epoch": 3.72, + "grad_norm": 0.59765625, + "learning_rate": 2.871831614415277e-06, + "loss": 0.2935, + "step": 27913 + }, + { + "epoch": 3.72, + "grad_norm": 0.8046875, + "learning_rate": 2.869061544610385e-06, + "loss": 0.2265, + "step": 27914 + }, + { + "epoch": 3.72, + "grad_norm": 0.7109375, + "learning_rate": 2.8662927919633655e-06, + "loss": 0.2367, + "step": 27915 + }, + { + "epoch": 3.73, + "grad_norm": 0.51171875, + "learning_rate": 2.863525356511754e-06, + "loss": 0.2596, + "step": 27916 + }, + { + "epoch": 3.73, + "grad_norm": 0.83984375, + "learning_rate": 2.860759238293076e-06, + "loss": 0.5249, + "step": 27917 + }, + { + "epoch": 3.73, + "grad_norm": 0.62890625, + "learning_rate": 2.857994437344846e-06, + "loss": 0.2185, + "step": 27918 + }, + { + "epoch": 3.73, + "grad_norm": 0.388671875, + "learning_rate": 2.8552309537045575e-06, + "loss": 0.1968, + "step": 27919 + }, + { + "epoch": 3.73, + "grad_norm": 0.412109375, + "learning_rate": 2.85246878740969e-06, + "loss": 0.15, + "step": 27920 + }, + { + "epoch": 3.73, + "grad_norm": 0.6171875, + "learning_rate": 2.849707938497692e-06, + "loss": 0.3553, + "step": 27921 + }, + { + "epoch": 3.73, + "grad_norm": 0.5703125, + "learning_rate": 2.8469484070060005e-06, + "loss": 0.3905, + "step": 27922 + }, + { + "epoch": 3.73, + "grad_norm": 0.83203125, + "learning_rate": 2.8441901929720404e-06, + "loss": 0.3258, + "step": 27923 + }, + { + "epoch": 3.73, + "grad_norm": 0.66015625, + "learning_rate": 2.841433296433216e-06, + "loss": 0.4499, + "step": 27924 + }, + { + "epoch": 3.73, + "grad_norm": 0.62890625, + "learning_rate": 2.8386777174269186e-06, + "loss": 0.1868, + "step": 27925 + }, + { + "epoch": 3.73, + "grad_norm": 0.478515625, + "learning_rate": 2.835923455990508e-06, + "loss": 0.1941, + "step": 27926 + }, + { + "epoch": 3.73, + "grad_norm": 0.546875, + "learning_rate": 2.8331705121613204e-06, + "loss": 0.2937, + "step": 27927 + }, + { + "epoch": 3.73, + "grad_norm": 0.69921875, + "learning_rate": 2.8304188859767156e-06, + "loss": 0.6079, + "step": 27928 + }, + { + "epoch": 3.73, + "grad_norm": 0.6484375, + "learning_rate": 2.8276685774739966e-06, + "loss": 0.2023, + "step": 27929 + }, + { + "epoch": 3.73, + "grad_norm": 0.65234375, + "learning_rate": 2.8249195866904445e-06, + "loss": 0.415, + "step": 27930 + }, + { + "epoch": 3.73, + "grad_norm": 0.625, + "learning_rate": 2.8221719136633627e-06, + "loss": 0.3059, + "step": 27931 + }, + { + "epoch": 3.73, + "grad_norm": 0.7421875, + "learning_rate": 2.819425558429989e-06, + "loss": 0.3192, + "step": 27932 + }, + { + "epoch": 3.73, + "grad_norm": 0.6796875, + "learning_rate": 2.8166805210275705e-06, + "loss": 0.2743, + "step": 27933 + }, + { + "epoch": 3.73, + "grad_norm": 0.65625, + "learning_rate": 2.8139368014933444e-06, + "loss": 0.2066, + "step": 27934 + }, + { + "epoch": 3.73, + "grad_norm": 0.427734375, + "learning_rate": 2.811194399864514e-06, + "loss": 0.1918, + "step": 27935 + }, + { + "epoch": 3.73, + "grad_norm": 0.54296875, + "learning_rate": 2.80845331617825e-06, + "loss": 0.3327, + "step": 27936 + }, + { + "epoch": 3.73, + "grad_norm": 0.57421875, + "learning_rate": 2.8057135504717447e-06, + "loss": 0.2876, + "step": 27937 + }, + { + "epoch": 3.73, + "grad_norm": 0.4921875, + "learning_rate": 2.8029751027821348e-06, + "loss": 0.1566, + "step": 27938 + }, + { + "epoch": 3.73, + "grad_norm": 0.83984375, + "learning_rate": 2.8002379731465688e-06, + "loss": 0.4034, + "step": 27939 + }, + { + "epoch": 3.73, + "grad_norm": 0.609375, + "learning_rate": 2.7975021616021612e-06, + "loss": 0.3357, + "step": 27940 + }, + { + "epoch": 3.73, + "grad_norm": 0.75, + "learning_rate": 2.7947676681860046e-06, + "loss": 0.4729, + "step": 27941 + }, + { + "epoch": 3.73, + "grad_norm": 0.51171875, + "learning_rate": 2.7920344929351806e-06, + "loss": 0.1801, + "step": 27942 + }, + { + "epoch": 3.73, + "grad_norm": 0.58984375, + "learning_rate": 2.7893026358867593e-06, + "loss": 0.1361, + "step": 27943 + }, + { + "epoch": 3.73, + "grad_norm": 0.71484375, + "learning_rate": 2.7865720970777996e-06, + "loss": 0.2973, + "step": 27944 + }, + { + "epoch": 3.73, + "grad_norm": 0.62109375, + "learning_rate": 2.7838428765452953e-06, + "loss": 0.4247, + "step": 27945 + }, + { + "epoch": 3.73, + "grad_norm": 0.66015625, + "learning_rate": 2.781114974326271e-06, + "loss": 0.3308, + "step": 27946 + }, + { + "epoch": 3.73, + "grad_norm": 0.79296875, + "learning_rate": 2.778388390457731e-06, + "loss": 0.4116, + "step": 27947 + }, + { + "epoch": 3.73, + "grad_norm": 0.8515625, + "learning_rate": 2.7756631249766464e-06, + "loss": 0.2735, + "step": 27948 + }, + { + "epoch": 3.73, + "grad_norm": 0.53125, + "learning_rate": 2.772939177919953e-06, + "loss": 0.2025, + "step": 27949 + }, + { + "epoch": 3.73, + "grad_norm": 0.625, + "learning_rate": 2.7702165493246114e-06, + "loss": 0.3841, + "step": 27950 + }, + { + "epoch": 3.73, + "grad_norm": 0.70703125, + "learning_rate": 2.7674952392275244e-06, + "loss": 0.3912, + "step": 27951 + }, + { + "epoch": 3.73, + "grad_norm": 0.83984375, + "learning_rate": 2.7647752476656297e-06, + "loss": 0.1948, + "step": 27952 + }, + { + "epoch": 3.73, + "grad_norm": 0.53125, + "learning_rate": 2.762056574675775e-06, + "loss": 0.2241, + "step": 27953 + }, + { + "epoch": 3.73, + "grad_norm": 0.578125, + "learning_rate": 2.7593392202948322e-06, + "loss": 0.3305, + "step": 27954 + }, + { + "epoch": 3.73, + "grad_norm": 0.54296875, + "learning_rate": 2.75662318455967e-06, + "loss": 0.5362, + "step": 27955 + }, + { + "epoch": 3.73, + "grad_norm": 0.7421875, + "learning_rate": 2.7539084675071046e-06, + "loss": 0.2934, + "step": 27956 + }, + { + "epoch": 3.73, + "grad_norm": 0.78125, + "learning_rate": 2.7511950691739507e-06, + "loss": 0.3437, + "step": 27957 + }, + { + "epoch": 3.73, + "grad_norm": 0.66796875, + "learning_rate": 2.748482989597012e-06, + "loss": 0.2998, + "step": 27958 + }, + { + "epoch": 3.73, + "grad_norm": 0.88671875, + "learning_rate": 2.7457722288130484e-06, + "loss": 0.5027, + "step": 27959 + }, + { + "epoch": 3.73, + "grad_norm": 0.42578125, + "learning_rate": 2.74306278685883e-06, + "loss": 0.3157, + "step": 27960 + }, + { + "epoch": 3.73, + "grad_norm": 0.4375, + "learning_rate": 2.7403546637711054e-06, + "loss": 0.2589, + "step": 27961 + }, + { + "epoch": 3.73, + "grad_norm": 0.458984375, + "learning_rate": 2.7376478595866007e-06, + "loss": 0.2424, + "step": 27962 + }, + { + "epoch": 3.73, + "grad_norm": 0.79296875, + "learning_rate": 2.7349423743419976e-06, + "loss": 0.6279, + "step": 27963 + }, + { + "epoch": 3.73, + "grad_norm": 0.6640625, + "learning_rate": 2.7322382080740006e-06, + "loss": 0.2965, + "step": 27964 + }, + { + "epoch": 3.73, + "grad_norm": 0.66796875, + "learning_rate": 2.7295353608192907e-06, + "loss": 0.2621, + "step": 27965 + }, + { + "epoch": 3.73, + "grad_norm": 0.6015625, + "learning_rate": 2.726833832614506e-06, + "loss": 0.1174, + "step": 27966 + }, + { + "epoch": 3.73, + "grad_norm": 0.609375, + "learning_rate": 2.7241336234962944e-06, + "loss": 0.1805, + "step": 27967 + }, + { + "epoch": 3.73, + "grad_norm": 0.65625, + "learning_rate": 2.721434733501249e-06, + "loss": 0.3405, + "step": 27968 + }, + { + "epoch": 3.73, + "grad_norm": 0.466796875, + "learning_rate": 2.718737162665974e-06, + "loss": 0.1278, + "step": 27969 + }, + { + "epoch": 3.73, + "grad_norm": 0.80078125, + "learning_rate": 2.7160409110270735e-06, + "loss": 0.2134, + "step": 27970 + }, + { + "epoch": 3.73, + "grad_norm": 0.76171875, + "learning_rate": 2.713345978621096e-06, + "loss": 0.5622, + "step": 27971 + }, + { + "epoch": 3.73, + "grad_norm": 0.515625, + "learning_rate": 2.7106523654845673e-06, + "loss": 0.1838, + "step": 27972 + }, + { + "epoch": 3.73, + "grad_norm": 0.56640625, + "learning_rate": 2.7079600716540363e-06, + "loss": 0.2176, + "step": 27973 + }, + { + "epoch": 3.73, + "grad_norm": 0.56640625, + "learning_rate": 2.7052690971660187e-06, + "loss": 0.2606, + "step": 27974 + }, + { + "epoch": 3.73, + "grad_norm": 0.7734375, + "learning_rate": 2.702579442056996e-06, + "loss": 0.268, + "step": 27975 + }, + { + "epoch": 3.73, + "grad_norm": 0.69140625, + "learning_rate": 2.699891106363428e-06, + "loss": 0.3458, + "step": 27976 + }, + { + "epoch": 3.73, + "grad_norm": 0.59765625, + "learning_rate": 2.6972040901217966e-06, + "loss": 0.3644, + "step": 27977 + }, + { + "epoch": 3.73, + "grad_norm": 0.546875, + "learning_rate": 2.694518393368517e-06, + "loss": 0.2237, + "step": 27978 + }, + { + "epoch": 3.73, + "grad_norm": 0.5625, + "learning_rate": 2.6918340161400268e-06, + "loss": 0.2937, + "step": 27979 + }, + { + "epoch": 3.73, + "grad_norm": 0.8046875, + "learning_rate": 2.689150958472719e-06, + "loss": 0.6613, + "step": 27980 + }, + { + "epoch": 3.73, + "grad_norm": 0.70703125, + "learning_rate": 2.686469220402976e-06, + "loss": 0.2653, + "step": 27981 + }, + { + "epoch": 3.73, + "grad_norm": 0.56640625, + "learning_rate": 2.6837888019671574e-06, + "loss": 0.4314, + "step": 27982 + }, + { + "epoch": 3.73, + "grad_norm": 0.69921875, + "learning_rate": 2.681109703201623e-06, + "loss": 0.4343, + "step": 27983 + }, + { + "epoch": 3.73, + "grad_norm": 0.482421875, + "learning_rate": 2.678431924142699e-06, + "loss": 0.2408, + "step": 27984 + }, + { + "epoch": 3.73, + "grad_norm": 0.62109375, + "learning_rate": 2.6757554648267123e-06, + "loss": 0.3384, + "step": 27985 + }, + { + "epoch": 3.73, + "grad_norm": 0.71875, + "learning_rate": 2.673080325289934e-06, + "loss": 0.3858, + "step": 27986 + }, + { + "epoch": 3.73, + "grad_norm": 0.62890625, + "learning_rate": 2.6704065055686455e-06, + "loss": 0.2278, + "step": 27987 + }, + { + "epoch": 3.73, + "grad_norm": 0.796875, + "learning_rate": 2.6677340056991184e-06, + "loss": 0.3455, + "step": 27988 + }, + { + "epoch": 3.73, + "grad_norm": 0.94921875, + "learning_rate": 2.66506282571759e-06, + "loss": 0.2696, + "step": 27989 + }, + { + "epoch": 3.73, + "grad_norm": 0.60546875, + "learning_rate": 2.6623929656602655e-06, + "loss": 0.3043, + "step": 27990 + }, + { + "epoch": 3.74, + "grad_norm": 0.6171875, + "learning_rate": 2.65972442556337e-06, + "loss": 0.51, + "step": 27991 + }, + { + "epoch": 3.74, + "grad_norm": 0.76171875, + "learning_rate": 2.657057205463087e-06, + "loss": 0.2803, + "step": 27992 + }, + { + "epoch": 3.74, + "grad_norm": 0.72265625, + "learning_rate": 2.654391305395576e-06, + "loss": 0.2998, + "step": 27993 + }, + { + "epoch": 3.74, + "grad_norm": 0.72265625, + "learning_rate": 2.651726725397008e-06, + "loss": 0.5857, + "step": 27994 + }, + { + "epoch": 3.74, + "grad_norm": 0.73828125, + "learning_rate": 2.6490634655034875e-06, + "loss": 0.4185, + "step": 27995 + }, + { + "epoch": 3.74, + "grad_norm": 0.63671875, + "learning_rate": 2.646401525751141e-06, + "loss": 0.2273, + "step": 27996 + }, + { + "epoch": 3.74, + "grad_norm": 0.703125, + "learning_rate": 2.6437409061760843e-06, + "loss": 0.2282, + "step": 27997 + }, + { + "epoch": 3.74, + "grad_norm": 0.625, + "learning_rate": 2.6410816068143883e-06, + "loss": 0.34, + "step": 27998 + }, + { + "epoch": 3.74, + "grad_norm": 0.5390625, + "learning_rate": 2.6384236277021025e-06, + "loss": 0.1689, + "step": 27999 + }, + { + "epoch": 3.74, + "grad_norm": 0.431640625, + "learning_rate": 2.635766968875275e-06, + "loss": 0.1916, + "step": 28000 + }, + { + "epoch": 3.74, + "grad_norm": 0.79296875, + "learning_rate": 2.633111630369933e-06, + "loss": 0.4663, + "step": 28001 + }, + { + "epoch": 3.74, + "grad_norm": 0.44140625, + "learning_rate": 2.6304576122221035e-06, + "loss": 0.2224, + "step": 28002 + }, + { + "epoch": 3.74, + "grad_norm": 0.5703125, + "learning_rate": 2.6278049144677463e-06, + "loss": 0.2573, + "step": 28003 + }, + { + "epoch": 3.74, + "grad_norm": 0.7265625, + "learning_rate": 2.625153537142855e-06, + "loss": 0.618, + "step": 28004 + }, + { + "epoch": 3.74, + "grad_norm": 0.72265625, + "learning_rate": 2.622503480283367e-06, + "loss": 0.3228, + "step": 28005 + }, + { + "epoch": 3.74, + "grad_norm": 0.453125, + "learning_rate": 2.619854743925232e-06, + "loss": 0.1176, + "step": 28006 + }, + { + "epoch": 3.74, + "grad_norm": 0.62890625, + "learning_rate": 2.617207328104365e-06, + "loss": 0.476, + "step": 28007 + }, + { + "epoch": 3.74, + "grad_norm": 0.65625, + "learning_rate": 2.6145612328566717e-06, + "loss": 0.5115, + "step": 28008 + }, + { + "epoch": 3.74, + "grad_norm": 0.69921875, + "learning_rate": 2.611916458218022e-06, + "loss": 0.3203, + "step": 28009 + }, + { + "epoch": 3.74, + "grad_norm": 0.734375, + "learning_rate": 2.6092730042242885e-06, + "loss": 0.3441, + "step": 28010 + }, + { + "epoch": 3.74, + "grad_norm": 0.62890625, + "learning_rate": 2.6066308709113195e-06, + "loss": 0.3033, + "step": 28011 + }, + { + "epoch": 3.74, + "grad_norm": 0.53125, + "learning_rate": 2.6039900583149536e-06, + "loss": 0.3452, + "step": 28012 + }, + { + "epoch": 3.74, + "grad_norm": 0.48828125, + "learning_rate": 2.6013505664709836e-06, + "loss": 0.1812, + "step": 28013 + }, + { + "epoch": 3.74, + "grad_norm": 0.6640625, + "learning_rate": 2.5987123954152037e-06, + "loss": 0.3702, + "step": 28014 + }, + { + "epoch": 3.74, + "grad_norm": 0.66015625, + "learning_rate": 2.5960755451833963e-06, + "loss": 0.5487, + "step": 28015 + }, + { + "epoch": 3.74, + "grad_norm": 0.486328125, + "learning_rate": 2.5934400158113326e-06, + "loss": 0.1103, + "step": 28016 + }, + { + "epoch": 3.74, + "grad_norm": 0.58984375, + "learning_rate": 2.5908058073347286e-06, + "loss": 0.3178, + "step": 28017 + }, + { + "epoch": 3.74, + "grad_norm": 0.6953125, + "learning_rate": 2.5881729197893112e-06, + "loss": 0.3102, + "step": 28018 + }, + { + "epoch": 3.74, + "grad_norm": 0.51171875, + "learning_rate": 2.5855413532107855e-06, + "loss": 0.3016, + "step": 28019 + }, + { + "epoch": 3.74, + "grad_norm": 0.6796875, + "learning_rate": 2.5829111076348444e-06, + "loss": 0.3707, + "step": 28020 + }, + { + "epoch": 3.74, + "grad_norm": 0.71875, + "learning_rate": 2.580282183097149e-06, + "loss": 0.2247, + "step": 28021 + }, + { + "epoch": 3.74, + "grad_norm": 0.59765625, + "learning_rate": 2.5776545796333485e-06, + "loss": 0.5367, + "step": 28022 + }, + { + "epoch": 3.74, + "grad_norm": 0.61328125, + "learning_rate": 2.575028297279081e-06, + "loss": 0.2337, + "step": 28023 + }, + { + "epoch": 3.74, + "grad_norm": 0.5625, + "learning_rate": 2.5724033360699505e-06, + "loss": 0.2453, + "step": 28024 + }, + { + "epoch": 3.74, + "grad_norm": 0.75, + "learning_rate": 2.569779696041563e-06, + "loss": 0.287, + "step": 28025 + }, + { + "epoch": 3.74, + "grad_norm": 0.56640625, + "learning_rate": 2.567157377229501e-06, + "loss": 0.2747, + "step": 28026 + }, + { + "epoch": 3.74, + "grad_norm": 0.80078125, + "learning_rate": 2.5645363796693023e-06, + "loss": 0.4103, + "step": 28027 + }, + { + "epoch": 3.74, + "grad_norm": 0.7421875, + "learning_rate": 2.5619167033965386e-06, + "loss": 0.6993, + "step": 28028 + }, + { + "epoch": 3.74, + "grad_norm": 0.5, + "learning_rate": 2.5592983484467037e-06, + "loss": 0.1765, + "step": 28029 + }, + { + "epoch": 3.74, + "grad_norm": 0.73046875, + "learning_rate": 2.5566813148553358e-06, + "loss": 0.3493, + "step": 28030 + }, + { + "epoch": 3.74, + "grad_norm": 0.703125, + "learning_rate": 2.5540656026579068e-06, + "loss": 0.6207, + "step": 28031 + }, + { + "epoch": 3.74, + "grad_norm": 0.55078125, + "learning_rate": 2.551451211889877e-06, + "loss": 0.1671, + "step": 28032 + }, + { + "epoch": 3.74, + "grad_norm": 0.6640625, + "learning_rate": 2.548838142586718e-06, + "loss": 0.2886, + "step": 28033 + }, + { + "epoch": 3.74, + "grad_norm": 0.5859375, + "learning_rate": 2.546226394783846e-06, + "loss": 0.1817, + "step": 28034 + }, + { + "epoch": 3.74, + "grad_norm": 0.80078125, + "learning_rate": 2.5436159685167104e-06, + "loss": 0.3585, + "step": 28035 + }, + { + "epoch": 3.74, + "grad_norm": 0.5546875, + "learning_rate": 2.541006863820672e-06, + "loss": 0.271, + "step": 28036 + }, + { + "epoch": 3.74, + "grad_norm": 0.57421875, + "learning_rate": 2.5383990807311353e-06, + "loss": 0.3682, + "step": 28037 + }, + { + "epoch": 3.74, + "grad_norm": 0.55859375, + "learning_rate": 2.5357926192834612e-06, + "loss": 0.3539, + "step": 28038 + }, + { + "epoch": 3.74, + "grad_norm": 0.5234375, + "learning_rate": 2.5331874795129883e-06, + "loss": 0.3081, + "step": 28039 + }, + { + "epoch": 3.74, + "grad_norm": 0.5390625, + "learning_rate": 2.5305836614550548e-06, + "loss": 0.1768, + "step": 28040 + }, + { + "epoch": 3.74, + "grad_norm": 0.54296875, + "learning_rate": 2.527981165144955e-06, + "loss": 0.2757, + "step": 28041 + }, + { + "epoch": 3.74, + "grad_norm": 0.921875, + "learning_rate": 2.525379990617982e-06, + "loss": 0.4204, + "step": 28042 + }, + { + "epoch": 3.74, + "grad_norm": 0.734375, + "learning_rate": 2.5227801379094195e-06, + "loss": 0.3472, + "step": 28043 + }, + { + "epoch": 3.74, + "grad_norm": 0.73828125, + "learning_rate": 2.520181607054528e-06, + "loss": 0.3975, + "step": 28044 + }, + { + "epoch": 3.74, + "grad_norm": 0.40234375, + "learning_rate": 2.517584398088524e-06, + "loss": 0.1484, + "step": 28045 + }, + { + "epoch": 3.74, + "grad_norm": 0.5078125, + "learning_rate": 2.5149885110466342e-06, + "loss": 0.2936, + "step": 28046 + }, + { + "epoch": 3.74, + "grad_norm": 0.46875, + "learning_rate": 2.5123939459640754e-06, + "loss": 0.1448, + "step": 28047 + }, + { + "epoch": 3.74, + "grad_norm": 0.703125, + "learning_rate": 2.5098007028760306e-06, + "loss": 0.6302, + "step": 28048 + }, + { + "epoch": 3.74, + "grad_norm": 0.392578125, + "learning_rate": 2.5072087818176382e-06, + "loss": 0.1594, + "step": 28049 + }, + { + "epoch": 3.74, + "grad_norm": 0.60546875, + "learning_rate": 2.50461818282407e-06, + "loss": 0.3696, + "step": 28050 + }, + { + "epoch": 3.74, + "grad_norm": 0.6171875, + "learning_rate": 2.5020289059304535e-06, + "loss": 0.2499, + "step": 28051 + }, + { + "epoch": 3.74, + "grad_norm": 0.5390625, + "learning_rate": 2.4994409511719165e-06, + "loss": 0.126, + "step": 28052 + }, + { + "epoch": 3.74, + "grad_norm": 0.8046875, + "learning_rate": 2.496854318583519e-06, + "loss": 0.2024, + "step": 28053 + }, + { + "epoch": 3.74, + "grad_norm": 0.53125, + "learning_rate": 2.494269008200356e-06, + "loss": 0.1923, + "step": 28054 + }, + { + "epoch": 3.74, + "grad_norm": 0.66015625, + "learning_rate": 2.491685020057477e-06, + "loss": 0.3564, + "step": 28055 + }, + { + "epoch": 3.74, + "grad_norm": 0.71484375, + "learning_rate": 2.4891023541899318e-06, + "loss": 0.3472, + "step": 28056 + }, + { + "epoch": 3.74, + "grad_norm": 0.474609375, + "learning_rate": 2.4865210106327476e-06, + "loss": 0.2491, + "step": 28057 + }, + { + "epoch": 3.74, + "grad_norm": 0.98828125, + "learning_rate": 2.4839409894209296e-06, + "loss": 0.2676, + "step": 28058 + }, + { + "epoch": 3.74, + "grad_norm": 0.546875, + "learning_rate": 2.4813622905894395e-06, + "loss": 0.3086, + "step": 28059 + }, + { + "epoch": 3.74, + "grad_norm": 0.71875, + "learning_rate": 2.478784914173271e-06, + "loss": 0.5265, + "step": 28060 + }, + { + "epoch": 3.74, + "grad_norm": 0.62109375, + "learning_rate": 2.476208860207363e-06, + "loss": 0.2341, + "step": 28061 + }, + { + "epoch": 3.74, + "grad_norm": 0.609375, + "learning_rate": 2.473634128726665e-06, + "loss": 0.2323, + "step": 28062 + }, + { + "epoch": 3.74, + "grad_norm": 0.765625, + "learning_rate": 2.4710607197660717e-06, + "loss": 0.4048, + "step": 28063 + }, + { + "epoch": 3.74, + "grad_norm": 0.68359375, + "learning_rate": 2.4684886333604997e-06, + "loss": 0.3762, + "step": 28064 + }, + { + "epoch": 3.74, + "grad_norm": 0.57421875, + "learning_rate": 2.4659178695447984e-06, + "loss": 0.4034, + "step": 28065 + }, + { + "epoch": 3.75, + "grad_norm": 0.55859375, + "learning_rate": 2.463348428353851e-06, + "loss": 0.2278, + "step": 28066 + }, + { + "epoch": 3.75, + "grad_norm": 0.7109375, + "learning_rate": 2.4607803098225192e-06, + "loss": 0.3559, + "step": 28067 + }, + { + "epoch": 3.75, + "grad_norm": 0.6171875, + "learning_rate": 2.4582135139855853e-06, + "loss": 0.4002, + "step": 28068 + }, + { + "epoch": 3.75, + "grad_norm": 0.67578125, + "learning_rate": 2.455648040877878e-06, + "loss": 0.308, + "step": 28069 + }, + { + "epoch": 3.75, + "grad_norm": 0.68359375, + "learning_rate": 2.453083890534191e-06, + "loss": 0.2802, + "step": 28070 + }, + { + "epoch": 3.75, + "grad_norm": 0.58984375, + "learning_rate": 2.4505210629892973e-06, + "loss": 0.4124, + "step": 28071 + }, + { + "epoch": 3.75, + "grad_norm": 0.62109375, + "learning_rate": 2.447959558277946e-06, + "loss": 0.3996, + "step": 28072 + }, + { + "epoch": 3.75, + "grad_norm": 0.5625, + "learning_rate": 2.4453993764348538e-06, + "loss": 0.2643, + "step": 28073 + }, + { + "epoch": 3.75, + "grad_norm": 0.7578125, + "learning_rate": 2.442840517494771e-06, + "loss": 0.2471, + "step": 28074 + }, + { + "epoch": 3.75, + "grad_norm": 0.546875, + "learning_rate": 2.4402829814923923e-06, + "loss": 0.3568, + "step": 28075 + }, + { + "epoch": 3.75, + "grad_norm": 0.859375, + "learning_rate": 2.4377267684623784e-06, + "loss": 0.4689, + "step": 28076 + }, + { + "epoch": 3.75, + "grad_norm": 0.56640625, + "learning_rate": 2.4351718784394016e-06, + "loss": 0.2506, + "step": 28077 + }, + { + "epoch": 3.75, + "grad_norm": 0.7578125, + "learning_rate": 2.4326183114581125e-06, + "loss": 0.5147, + "step": 28078 + }, + { + "epoch": 3.75, + "grad_norm": 0.55078125, + "learning_rate": 2.4300660675531384e-06, + "loss": 0.3005, + "step": 28079 + }, + { + "epoch": 3.75, + "grad_norm": 0.734375, + "learning_rate": 2.4275151467590852e-06, + "loss": 0.4652, + "step": 28080 + }, + { + "epoch": 3.75, + "grad_norm": 0.69140625, + "learning_rate": 2.424965549110547e-06, + "loss": 0.2714, + "step": 28081 + }, + { + "epoch": 3.75, + "grad_norm": 0.57421875, + "learning_rate": 2.4224172746421082e-06, + "loss": 0.4027, + "step": 28082 + }, + { + "epoch": 3.75, + "grad_norm": 0.447265625, + "learning_rate": 2.419870323388307e-06, + "loss": 0.1749, + "step": 28083 + }, + { + "epoch": 3.75, + "grad_norm": 0.59765625, + "learning_rate": 2.4173246953836937e-06, + "loss": 0.3464, + "step": 28084 + }, + { + "epoch": 3.75, + "grad_norm": 0.62890625, + "learning_rate": 2.4147803906627855e-06, + "loss": 0.2631, + "step": 28085 + }, + { + "epoch": 3.75, + "grad_norm": 0.42578125, + "learning_rate": 2.4122374092600874e-06, + "loss": 0.1726, + "step": 28086 + }, + { + "epoch": 3.75, + "grad_norm": 0.60546875, + "learning_rate": 2.4096957512100836e-06, + "loss": 0.4476, + "step": 28087 + }, + { + "epoch": 3.75, + "grad_norm": 0.54296875, + "learning_rate": 2.4071554165472244e-06, + "loss": 0.129, + "step": 28088 + }, + { + "epoch": 3.75, + "grad_norm": 0.4375, + "learning_rate": 2.4046164053059927e-06, + "loss": 0.2258, + "step": 28089 + }, + { + "epoch": 3.75, + "grad_norm": 0.61328125, + "learning_rate": 2.402078717520795e-06, + "loss": 0.2917, + "step": 28090 + }, + { + "epoch": 3.75, + "grad_norm": 0.69140625, + "learning_rate": 2.3995423532260365e-06, + "loss": 0.2556, + "step": 28091 + }, + { + "epoch": 3.75, + "grad_norm": 0.6328125, + "learning_rate": 2.3970073124561344e-06, + "loss": 0.2917, + "step": 28092 + }, + { + "epoch": 3.75, + "grad_norm": 0.416015625, + "learning_rate": 2.394473595245439e-06, + "loss": 0.218, + "step": 28093 + }, + { + "epoch": 3.75, + "grad_norm": 0.75390625, + "learning_rate": 2.391941201628345e-06, + "loss": 0.4682, + "step": 28094 + }, + { + "epoch": 3.75, + "grad_norm": 0.609375, + "learning_rate": 2.3894101316391583e-06, + "loss": 0.2713, + "step": 28095 + }, + { + "epoch": 3.75, + "grad_norm": 0.5, + "learning_rate": 2.386880385312218e-06, + "loss": 0.2119, + "step": 28096 + }, + { + "epoch": 3.75, + "grad_norm": 0.67578125, + "learning_rate": 2.3843519626818188e-06, + "loss": 0.4871, + "step": 28097 + }, + { + "epoch": 3.75, + "grad_norm": 0.62109375, + "learning_rate": 2.381824863782278e-06, + "loss": 0.3991, + "step": 28098 + }, + { + "epoch": 3.75, + "grad_norm": 0.52734375, + "learning_rate": 2.379299088647824e-06, + "loss": 0.2863, + "step": 28099 + }, + { + "epoch": 3.75, + "grad_norm": 0.62890625, + "learning_rate": 2.37677463731274e-06, + "loss": 0.2196, + "step": 28100 + }, + { + "epoch": 3.75, + "grad_norm": 0.484375, + "learning_rate": 2.374251509811243e-06, + "loss": 0.1771, + "step": 28101 + }, + { + "epoch": 3.75, + "grad_norm": 0.59765625, + "learning_rate": 2.37172970617755e-06, + "loss": 0.2402, + "step": 28102 + }, + { + "epoch": 3.75, + "grad_norm": 0.53125, + "learning_rate": 2.3692092264458565e-06, + "loss": 0.303, + "step": 28103 + }, + { + "epoch": 3.75, + "grad_norm": 0.73046875, + "learning_rate": 2.366690070650346e-06, + "loss": 0.4225, + "step": 28104 + }, + { + "epoch": 3.75, + "grad_norm": 0.66796875, + "learning_rate": 2.3641722388251795e-06, + "loss": 0.2432, + "step": 28105 + }, + { + "epoch": 3.75, + "grad_norm": 0.63671875, + "learning_rate": 2.361655731004497e-06, + "loss": 0.6017, + "step": 28106 + }, + { + "epoch": 3.75, + "grad_norm": 0.65234375, + "learning_rate": 2.3591405472224273e-06, + "loss": 0.4396, + "step": 28107 + }, + { + "epoch": 3.75, + "grad_norm": 0.431640625, + "learning_rate": 2.356626687513086e-06, + "loss": 0.1259, + "step": 28108 + }, + { + "epoch": 3.75, + "grad_norm": 0.53515625, + "learning_rate": 2.354114151910547e-06, + "loss": 0.273, + "step": 28109 + }, + { + "epoch": 3.75, + "grad_norm": 0.6640625, + "learning_rate": 2.3516029404488827e-06, + "loss": 0.4826, + "step": 28110 + }, + { + "epoch": 3.75, + "grad_norm": 0.703125, + "learning_rate": 2.3490930531621545e-06, + "loss": 0.4847, + "step": 28111 + }, + { + "epoch": 3.75, + "grad_norm": 0.76171875, + "learning_rate": 2.3465844900844136e-06, + "loss": 0.3364, + "step": 28112 + }, + { + "epoch": 3.75, + "grad_norm": 0.51171875, + "learning_rate": 2.3440772512496544e-06, + "loss": 0.25, + "step": 28113 + }, + { + "epoch": 3.75, + "grad_norm": 0.76171875, + "learning_rate": 2.3415713366918726e-06, + "loss": 0.6932, + "step": 28114 + }, + { + "epoch": 3.75, + "grad_norm": 0.5859375, + "learning_rate": 2.339066746445062e-06, + "loss": 0.5493, + "step": 28115 + }, + { + "epoch": 3.75, + "grad_norm": 0.515625, + "learning_rate": 2.3365634805431856e-06, + "loss": 0.1444, + "step": 28116 + }, + { + "epoch": 3.75, + "grad_norm": 0.58984375, + "learning_rate": 2.334061539020194e-06, + "loss": 0.1758, + "step": 28117 + }, + { + "epoch": 3.75, + "grad_norm": 0.6953125, + "learning_rate": 2.3315609219100033e-06, + "loss": 0.4224, + "step": 28118 + }, + { + "epoch": 3.75, + "grad_norm": 0.7734375, + "learning_rate": 2.329061629246532e-06, + "loss": 0.2091, + "step": 28119 + }, + { + "epoch": 3.75, + "grad_norm": 0.58984375, + "learning_rate": 2.326563661063663e-06, + "loss": 0.3072, + "step": 28120 + }, + { + "epoch": 3.75, + "grad_norm": 0.443359375, + "learning_rate": 2.324067017395293e-06, + "loss": 0.184, + "step": 28121 + }, + { + "epoch": 3.75, + "grad_norm": 0.5703125, + "learning_rate": 2.321571698275249e-06, + "loss": 0.1904, + "step": 28122 + }, + { + "epoch": 3.75, + "grad_norm": 0.50390625, + "learning_rate": 2.3190777037373825e-06, + "loss": 0.1539, + "step": 28123 + }, + { + "epoch": 3.75, + "grad_norm": 0.73046875, + "learning_rate": 2.3165850338155104e-06, + "loss": 0.4292, + "step": 28124 + }, + { + "epoch": 3.75, + "grad_norm": 0.51953125, + "learning_rate": 2.3140936885434504e-06, + "loss": 0.3545, + "step": 28125 + }, + { + "epoch": 3.75, + "grad_norm": 0.55078125, + "learning_rate": 2.3116036679549757e-06, + "loss": 0.3123, + "step": 28126 + }, + { + "epoch": 3.75, + "grad_norm": 0.640625, + "learning_rate": 2.309114972083848e-06, + "loss": 0.4852, + "step": 28127 + }, + { + "epoch": 3.75, + "grad_norm": 0.58203125, + "learning_rate": 2.3066276009638064e-06, + "loss": 0.3593, + "step": 28128 + }, + { + "epoch": 3.75, + "grad_norm": 0.6015625, + "learning_rate": 2.304141554628603e-06, + "loss": 0.3203, + "step": 28129 + }, + { + "epoch": 3.75, + "grad_norm": 0.59375, + "learning_rate": 2.3016568331119425e-06, + "loss": 0.3325, + "step": 28130 + }, + { + "epoch": 3.75, + "grad_norm": 0.62890625, + "learning_rate": 2.2991734364475215e-06, + "loss": 0.2238, + "step": 28131 + }, + { + "epoch": 3.75, + "grad_norm": 0.60546875, + "learning_rate": 2.2966913646690014e-06, + "loss": 0.2866, + "step": 28132 + }, + { + "epoch": 3.75, + "grad_norm": 0.40625, + "learning_rate": 2.294210617810055e-06, + "loss": 0.2403, + "step": 28133 + }, + { + "epoch": 3.75, + "grad_norm": 0.60546875, + "learning_rate": 2.291731195904323e-06, + "loss": 0.2383, + "step": 28134 + }, + { + "epoch": 3.75, + "grad_norm": 0.7578125, + "learning_rate": 2.289253098985422e-06, + "loss": 0.2956, + "step": 28135 + }, + { + "epoch": 3.75, + "grad_norm": 0.77734375, + "learning_rate": 2.286776327086959e-06, + "loss": 0.2472, + "step": 28136 + }, + { + "epoch": 3.75, + "grad_norm": 0.69921875, + "learning_rate": 2.2843008802425294e-06, + "loss": 0.3163, + "step": 28137 + }, + { + "epoch": 3.75, + "grad_norm": 0.5703125, + "learning_rate": 2.2818267584856834e-06, + "loss": 0.3262, + "step": 28138 + }, + { + "epoch": 3.75, + "grad_norm": 0.57421875, + "learning_rate": 2.279353961849984e-06, + "loss": 0.3615, + "step": 28139 + }, + { + "epoch": 3.76, + "grad_norm": 0.71875, + "learning_rate": 2.27688249036897e-06, + "loss": 0.218, + "step": 28140 + }, + { + "epoch": 3.76, + "grad_norm": 0.494140625, + "learning_rate": 2.2744123440761378e-06, + "loss": 0.3262, + "step": 28141 + }, + { + "epoch": 3.76, + "grad_norm": 0.515625, + "learning_rate": 2.2719435230049935e-06, + "loss": 0.1414, + "step": 28142 + }, + { + "epoch": 3.76, + "grad_norm": 0.625, + "learning_rate": 2.2694760271890213e-06, + "loss": 0.2474, + "step": 28143 + }, + { + "epoch": 3.76, + "grad_norm": 0.58984375, + "learning_rate": 2.2670098566616837e-06, + "loss": 0.2523, + "step": 28144 + }, + { + "epoch": 3.76, + "grad_norm": 0.625, + "learning_rate": 2.2645450114564093e-06, + "loss": 0.3881, + "step": 28145 + }, + { + "epoch": 3.76, + "grad_norm": 0.5234375, + "learning_rate": 2.262081491606627e-06, + "loss": 0.243, + "step": 28146 + }, + { + "epoch": 3.76, + "grad_norm": 0.451171875, + "learning_rate": 2.259619297145743e-06, + "loss": 0.4476, + "step": 28147 + }, + { + "epoch": 3.76, + "grad_norm": 0.74609375, + "learning_rate": 2.2571584281071646e-06, + "loss": 0.4465, + "step": 28148 + }, + { + "epoch": 3.76, + "grad_norm": 0.703125, + "learning_rate": 2.2546988845242532e-06, + "loss": 0.3764, + "step": 28149 + }, + { + "epoch": 3.76, + "grad_norm": 0.6796875, + "learning_rate": 2.2522406664303386e-06, + "loss": 0.3742, + "step": 28150 + }, + { + "epoch": 3.76, + "grad_norm": 0.703125, + "learning_rate": 2.2497837738587825e-06, + "loss": 0.5289, + "step": 28151 + }, + { + "epoch": 3.76, + "grad_norm": 0.490234375, + "learning_rate": 2.2473282068428915e-06, + "loss": 0.5219, + "step": 28152 + }, + { + "epoch": 3.76, + "grad_norm": 1.0234375, + "learning_rate": 2.244873965415972e-06, + "loss": 0.2203, + "step": 28153 + }, + { + "epoch": 3.76, + "grad_norm": 0.77734375, + "learning_rate": 2.2424210496113095e-06, + "loss": 0.5184, + "step": 28154 + }, + { + "epoch": 3.76, + "grad_norm": 0.671875, + "learning_rate": 2.2399694594621433e-06, + "loss": 0.3473, + "step": 28155 + }, + { + "epoch": 3.76, + "grad_norm": 0.62109375, + "learning_rate": 2.237519195001736e-06, + "loss": 0.2715, + "step": 28156 + }, + { + "epoch": 3.76, + "grad_norm": 0.69140625, + "learning_rate": 2.2350702562633054e-06, + "loss": 0.3079, + "step": 28157 + }, + { + "epoch": 3.76, + "grad_norm": 0.4765625, + "learning_rate": 2.23262264328008e-06, + "loss": 0.1953, + "step": 28158 + }, + { + "epoch": 3.76, + "grad_norm": 0.71484375, + "learning_rate": 2.230176356085234e-06, + "loss": 0.2242, + "step": 28159 + }, + { + "epoch": 3.76, + "grad_norm": 0.6796875, + "learning_rate": 2.2277313947119405e-06, + "loss": 0.3222, + "step": 28160 + }, + { + "epoch": 3.76, + "grad_norm": 0.466796875, + "learning_rate": 2.225287759193373e-06, + "loss": 0.2045, + "step": 28161 + }, + { + "epoch": 3.76, + "grad_norm": 0.5234375, + "learning_rate": 2.2228454495626384e-06, + "loss": 0.445, + "step": 28162 + }, + { + "epoch": 3.76, + "grad_norm": 0.66796875, + "learning_rate": 2.2204044658528876e-06, + "loss": 0.1742, + "step": 28163 + }, + { + "epoch": 3.76, + "grad_norm": 0.373046875, + "learning_rate": 2.217964808097195e-06, + "loss": 0.1293, + "step": 28164 + }, + { + "epoch": 3.76, + "grad_norm": 0.6328125, + "learning_rate": 2.2155264763286553e-06, + "loss": 0.4978, + "step": 28165 + }, + { + "epoch": 3.76, + "grad_norm": 0.58203125, + "learning_rate": 2.2130894705803317e-06, + "loss": 0.3533, + "step": 28166 + }, + { + "epoch": 3.76, + "grad_norm": 0.671875, + "learning_rate": 2.2106537908852864e-06, + "loss": 0.3428, + "step": 28167 + }, + { + "epoch": 3.76, + "grad_norm": 0.73828125, + "learning_rate": 2.208219437276526e-06, + "loss": 0.4434, + "step": 28168 + }, + { + "epoch": 3.76, + "grad_norm": 0.65625, + "learning_rate": 2.2057864097870696e-06, + "loss": 0.3291, + "step": 28169 + }, + { + "epoch": 3.76, + "grad_norm": 0.48046875, + "learning_rate": 2.2033547084499227e-06, + "loss": 0.1241, + "step": 28170 + }, + { + "epoch": 3.76, + "grad_norm": 0.45703125, + "learning_rate": 2.20092433329806e-06, + "loss": 0.3387, + "step": 28171 + }, + { + "epoch": 3.76, + "grad_norm": 0.80859375, + "learning_rate": 2.19849528436441e-06, + "loss": 0.2409, + "step": 28172 + }, + { + "epoch": 3.76, + "grad_norm": 0.73046875, + "learning_rate": 2.196067561681936e-06, + "loss": 0.5769, + "step": 28173 + }, + { + "epoch": 3.76, + "grad_norm": 0.474609375, + "learning_rate": 2.1936411652835775e-06, + "loss": 0.3236, + "step": 28174 + }, + { + "epoch": 3.76, + "grad_norm": 0.640625, + "learning_rate": 2.1912160952021977e-06, + "loss": 0.1242, + "step": 28175 + }, + { + "epoch": 3.76, + "grad_norm": 0.4609375, + "learning_rate": 2.188792351470692e-06, + "loss": 0.1018, + "step": 28176 + }, + { + "epoch": 3.76, + "grad_norm": 0.7734375, + "learning_rate": 2.1863699341219566e-06, + "loss": 0.4024, + "step": 28177 + }, + { + "epoch": 3.76, + "grad_norm": 0.4765625, + "learning_rate": 2.18394884318881e-06, + "loss": 0.2043, + "step": 28178 + }, + { + "epoch": 3.76, + "grad_norm": 0.640625, + "learning_rate": 2.1815290787041034e-06, + "loss": 0.3994, + "step": 28179 + }, + { + "epoch": 3.76, + "grad_norm": 0.5859375, + "learning_rate": 2.1791106407006323e-06, + "loss": 0.3746, + "step": 28180 + }, + { + "epoch": 3.76, + "grad_norm": 0.5546875, + "learning_rate": 2.1766935292112157e-06, + "loss": 0.1749, + "step": 28181 + }, + { + "epoch": 3.76, + "grad_norm": 0.66796875, + "learning_rate": 2.1742777442686046e-06, + "loss": 0.418, + "step": 28182 + }, + { + "epoch": 3.76, + "grad_norm": 0.59765625, + "learning_rate": 2.171863285905573e-06, + "loss": 0.2714, + "step": 28183 + }, + { + "epoch": 3.76, + "grad_norm": 0.6484375, + "learning_rate": 2.1694501541548507e-06, + "loss": 0.3973, + "step": 28184 + }, + { + "epoch": 3.76, + "grad_norm": 0.60546875, + "learning_rate": 2.1670383490491998e-06, + "loss": 0.3449, + "step": 28185 + }, + { + "epoch": 3.76, + "grad_norm": 0.53125, + "learning_rate": 2.164627870621283e-06, + "loss": 0.1406, + "step": 28186 + }, + { + "epoch": 3.76, + "grad_norm": 0.5234375, + "learning_rate": 2.1622187189037966e-06, + "loss": 0.5481, + "step": 28187 + }, + { + "epoch": 3.76, + "grad_norm": 0.68359375, + "learning_rate": 2.159810893929426e-06, + "loss": 0.4303, + "step": 28188 + }, + { + "epoch": 3.76, + "grad_norm": 0.5546875, + "learning_rate": 2.1574043957308e-06, + "loss": 0.264, + "step": 28189 + }, + { + "epoch": 3.76, + "grad_norm": 0.44140625, + "learning_rate": 2.1549992243405816e-06, + "loss": 0.1674, + "step": 28190 + }, + { + "epoch": 3.76, + "grad_norm": 0.484375, + "learning_rate": 2.1525953797913556e-06, + "loss": 0.3342, + "step": 28191 + }, + { + "epoch": 3.76, + "grad_norm": 0.828125, + "learning_rate": 2.1501928621157406e-06, + "loss": 0.2311, + "step": 28192 + }, + { + "epoch": 3.76, + "grad_norm": 0.6484375, + "learning_rate": 2.147791671346311e-06, + "loss": 0.4401, + "step": 28193 + }, + { + "epoch": 3.76, + "grad_norm": 0.58984375, + "learning_rate": 2.1453918075156286e-06, + "loss": 0.1596, + "step": 28194 + }, + { + "epoch": 3.76, + "grad_norm": 0.80859375, + "learning_rate": 2.142993270656235e-06, + "loss": 0.4475, + "step": 28195 + }, + { + "epoch": 3.76, + "grad_norm": 0.69921875, + "learning_rate": 2.140596060800648e-06, + "loss": 0.4917, + "step": 28196 + }, + { + "epoch": 3.76, + "grad_norm": 0.52734375, + "learning_rate": 2.1382001779813866e-06, + "loss": 0.256, + "step": 28197 + }, + { + "epoch": 3.76, + "grad_norm": 0.640625, + "learning_rate": 2.1358056222309573e-06, + "loss": 0.1305, + "step": 28198 + }, + { + "epoch": 3.76, + "grad_norm": 0.5546875, + "learning_rate": 2.1334123935818017e-06, + "loss": 0.2132, + "step": 28199 + }, + { + "epoch": 3.76, + "grad_norm": 0.6015625, + "learning_rate": 2.131020492066382e-06, + "loss": 0.2443, + "step": 28200 + }, + { + "epoch": 3.76, + "grad_norm": 0.76171875, + "learning_rate": 2.128629917717129e-06, + "loss": 0.3602, + "step": 28201 + }, + { + "epoch": 3.76, + "grad_norm": 0.625, + "learning_rate": 2.1262406705664707e-06, + "loss": 0.4895, + "step": 28202 + }, + { + "epoch": 3.76, + "grad_norm": 0.77734375, + "learning_rate": 2.123852750646804e-06, + "loss": 0.4146, + "step": 28203 + }, + { + "epoch": 3.76, + "grad_norm": 0.546875, + "learning_rate": 2.121466157990526e-06, + "loss": 0.4339, + "step": 28204 + }, + { + "epoch": 3.76, + "grad_norm": 0.63671875, + "learning_rate": 2.1190808926299654e-06, + "loss": 0.3403, + "step": 28205 + }, + { + "epoch": 3.76, + "grad_norm": 0.8984375, + "learning_rate": 2.1166969545974968e-06, + "loss": 0.3991, + "step": 28206 + }, + { + "epoch": 3.76, + "grad_norm": 0.66796875, + "learning_rate": 2.1143143439254274e-06, + "loss": 0.5097, + "step": 28207 + }, + { + "epoch": 3.76, + "grad_norm": 0.7265625, + "learning_rate": 2.1119330606460986e-06, + "loss": 0.4107, + "step": 28208 + }, + { + "epoch": 3.76, + "grad_norm": 0.52734375, + "learning_rate": 2.1095531047917617e-06, + "loss": 0.2844, + "step": 28209 + }, + { + "epoch": 3.76, + "grad_norm": 0.65625, + "learning_rate": 2.1071744763947352e-06, + "loss": 0.1908, + "step": 28210 + }, + { + "epoch": 3.76, + "grad_norm": 0.71875, + "learning_rate": 2.1047971754872275e-06, + "loss": 0.4177, + "step": 28211 + }, + { + "epoch": 3.76, + "grad_norm": 0.9140625, + "learning_rate": 2.1024212021015012e-06, + "loss": 0.2858, + "step": 28212 + }, + { + "epoch": 3.76, + "grad_norm": 0.734375, + "learning_rate": 2.1000465562697856e-06, + "loss": 0.3403, + "step": 28213 + }, + { + "epoch": 3.76, + "grad_norm": 0.671875, + "learning_rate": 2.097673238024256e-06, + "loss": 0.398, + "step": 28214 + }, + { + "epoch": 3.77, + "grad_norm": 0.7109375, + "learning_rate": 2.095301247397119e-06, + "loss": 0.387, + "step": 28215 + }, + { + "epoch": 3.77, + "grad_norm": 0.63671875, + "learning_rate": 2.092930584420527e-06, + "loss": 0.3294, + "step": 28216 + }, + { + "epoch": 3.77, + "grad_norm": 0.443359375, + "learning_rate": 2.0905612491266436e-06, + "loss": 0.1351, + "step": 28217 + }, + { + "epoch": 3.77, + "grad_norm": 0.61328125, + "learning_rate": 2.0881932415475756e-06, + "loss": 0.4606, + "step": 28218 + }, + { + "epoch": 3.77, + "grad_norm": 0.66796875, + "learning_rate": 2.0858265617154427e-06, + "loss": 0.351, + "step": 28219 + }, + { + "epoch": 3.77, + "grad_norm": 0.66796875, + "learning_rate": 2.0834612096623516e-06, + "loss": 0.1556, + "step": 28220 + }, + { + "epoch": 3.77, + "grad_norm": 0.478515625, + "learning_rate": 2.0810971854203775e-06, + "loss": 0.2581, + "step": 28221 + }, + { + "epoch": 3.77, + "grad_norm": 0.69140625, + "learning_rate": 2.078734489021561e-06, + "loss": 0.4699, + "step": 28222 + }, + { + "epoch": 3.77, + "grad_norm": 0.7578125, + "learning_rate": 2.076373120497943e-06, + "loss": 0.2824, + "step": 28223 + }, + { + "epoch": 3.77, + "grad_norm": 0.423828125, + "learning_rate": 2.074013079881565e-06, + "loss": 0.1484, + "step": 28224 + }, + { + "epoch": 3.77, + "grad_norm": 0.5546875, + "learning_rate": 2.071654367204401e-06, + "loss": 0.3458, + "step": 28225 + }, + { + "epoch": 3.77, + "grad_norm": 0.6328125, + "learning_rate": 2.0692969824984697e-06, + "loss": 0.4497, + "step": 28226 + }, + { + "epoch": 3.77, + "grad_norm": 0.8671875, + "learning_rate": 2.066940925795724e-06, + "loss": 0.5597, + "step": 28227 + }, + { + "epoch": 3.77, + "grad_norm": 0.62109375, + "learning_rate": 2.0645861971280933e-06, + "loss": 0.3306, + "step": 28228 + }, + { + "epoch": 3.77, + "grad_norm": 0.4921875, + "learning_rate": 2.0622327965275413e-06, + "loss": 0.3411, + "step": 28229 + }, + { + "epoch": 3.77, + "grad_norm": 0.62109375, + "learning_rate": 2.0598807240259644e-06, + "loss": 0.4914, + "step": 28230 + }, + { + "epoch": 3.77, + "grad_norm": 0.515625, + "learning_rate": 2.0575299796552704e-06, + "loss": 0.3663, + "step": 28231 + }, + { + "epoch": 3.77, + "grad_norm": 0.80859375, + "learning_rate": 2.0551805634473233e-06, + "loss": 0.2897, + "step": 28232 + }, + { + "epoch": 3.77, + "grad_norm": 0.65625, + "learning_rate": 2.0528324754339856e-06, + "loss": 0.441, + "step": 28233 + }, + { + "epoch": 3.77, + "grad_norm": 0.51953125, + "learning_rate": 2.0504857156471214e-06, + "loss": 0.1911, + "step": 28234 + }, + { + "epoch": 3.77, + "grad_norm": 0.74609375, + "learning_rate": 2.048140284118516e-06, + "loss": 0.2893, + "step": 28235 + }, + { + "epoch": 3.77, + "grad_norm": 0.51171875, + "learning_rate": 2.0457961808799996e-06, + "loss": 0.3757, + "step": 28236 + }, + { + "epoch": 3.77, + "grad_norm": 0.578125, + "learning_rate": 2.043453405963347e-06, + "loss": 0.3501, + "step": 28237 + }, + { + "epoch": 3.77, + "grad_norm": 0.56640625, + "learning_rate": 2.0411119594003324e-06, + "loss": 0.4497, + "step": 28238 + }, + { + "epoch": 3.77, + "grad_norm": 0.640625, + "learning_rate": 2.0387718412227088e-06, + "loss": 0.3121, + "step": 28239 + }, + { + "epoch": 3.77, + "grad_norm": 0.53125, + "learning_rate": 2.036433051462228e-06, + "loss": 0.1463, + "step": 28240 + }, + { + "epoch": 3.77, + "grad_norm": 0.64453125, + "learning_rate": 2.0340955901505754e-06, + "loss": 0.3509, + "step": 28241 + }, + { + "epoch": 3.77, + "grad_norm": 0.451171875, + "learning_rate": 2.0317594573194487e-06, + "loss": 0.1575, + "step": 28242 + }, + { + "epoch": 3.77, + "grad_norm": 0.60546875, + "learning_rate": 2.0294246530005556e-06, + "loss": 0.4074, + "step": 28243 + }, + { + "epoch": 3.77, + "grad_norm": 0.494140625, + "learning_rate": 2.0270911772255375e-06, + "loss": 0.2958, + "step": 28244 + }, + { + "epoch": 3.77, + "grad_norm": 0.490234375, + "learning_rate": 2.0247590300260356e-06, + "loss": 0.3497, + "step": 28245 + }, + { + "epoch": 3.77, + "grad_norm": 0.5859375, + "learning_rate": 2.0224282114337023e-06, + "loss": 0.1452, + "step": 28246 + }, + { + "epoch": 3.77, + "grad_norm": 0.4375, + "learning_rate": 2.0200987214801015e-06, + "loss": 0.2021, + "step": 28247 + }, + { + "epoch": 3.77, + "grad_norm": 0.671875, + "learning_rate": 2.017770560196852e-06, + "loss": 0.3032, + "step": 28248 + }, + { + "epoch": 3.77, + "grad_norm": 0.6640625, + "learning_rate": 2.0154437276155182e-06, + "loss": 0.2676, + "step": 28249 + }, + { + "epoch": 3.77, + "grad_norm": 0.45703125, + "learning_rate": 2.0131182237676515e-06, + "loss": 0.1731, + "step": 28250 + }, + { + "epoch": 3.77, + "grad_norm": 0.84375, + "learning_rate": 2.0107940486847943e-06, + "loss": 0.5003, + "step": 28251 + }, + { + "epoch": 3.77, + "grad_norm": 0.828125, + "learning_rate": 2.0084712023984543e-06, + "loss": 0.3781, + "step": 28252 + }, + { + "epoch": 3.77, + "grad_norm": 0.390625, + "learning_rate": 2.0061496849401286e-06, + "loss": 0.1714, + "step": 28253 + }, + { + "epoch": 3.77, + "grad_norm": 0.68359375, + "learning_rate": 2.003829496341325e-06, + "loss": 0.3105, + "step": 28254 + }, + { + "epoch": 3.77, + "grad_norm": 0.60546875, + "learning_rate": 2.0015106366334746e-06, + "loss": 0.3617, + "step": 28255 + }, + { + "epoch": 3.77, + "grad_norm": 0.67578125, + "learning_rate": 1.9991931058480296e-06, + "loss": 0.213, + "step": 28256 + }, + { + "epoch": 3.77, + "grad_norm": 0.65625, + "learning_rate": 1.9968769040164205e-06, + "loss": 0.3666, + "step": 28257 + }, + { + "epoch": 3.77, + "grad_norm": 0.69921875, + "learning_rate": 1.9945620311700774e-06, + "loss": 0.4057, + "step": 28258 + }, + { + "epoch": 3.77, + "grad_norm": 0.515625, + "learning_rate": 1.9922484873403645e-06, + "loss": 0.2224, + "step": 28259 + }, + { + "epoch": 3.77, + "grad_norm": 0.58203125, + "learning_rate": 1.989936272558668e-06, + "loss": 0.3584, + "step": 28260 + }, + { + "epoch": 3.77, + "grad_norm": 0.72265625, + "learning_rate": 1.987625386856318e-06, + "loss": 0.5314, + "step": 28261 + }, + { + "epoch": 3.77, + "grad_norm": 0.55078125, + "learning_rate": 1.9853158302646892e-06, + "loss": 0.2177, + "step": 28262 + }, + { + "epoch": 3.77, + "grad_norm": 0.53125, + "learning_rate": 1.9830076028150792e-06, + "loss": 0.2749, + "step": 28263 + }, + { + "epoch": 3.77, + "grad_norm": 0.51171875, + "learning_rate": 1.980700704538796e-06, + "loss": 0.2845, + "step": 28264 + }, + { + "epoch": 3.77, + "grad_norm": 0.74609375, + "learning_rate": 1.9783951354671148e-06, + "loss": 0.4285, + "step": 28265 + }, + { + "epoch": 3.77, + "grad_norm": 0.443359375, + "learning_rate": 1.9760908956312997e-06, + "loss": 0.3027, + "step": 28266 + }, + { + "epoch": 3.77, + "grad_norm": 0.67578125, + "learning_rate": 1.973787985062614e-06, + "loss": 0.2457, + "step": 28267 + }, + { + "epoch": 3.77, + "grad_norm": 0.53515625, + "learning_rate": 1.9714864037922663e-06, + "loss": 0.2936, + "step": 28268 + }, + { + "epoch": 3.77, + "grad_norm": 0.54296875, + "learning_rate": 1.9691861518514875e-06, + "loss": 0.2613, + "step": 28269 + }, + { + "epoch": 3.77, + "grad_norm": 0.451171875, + "learning_rate": 1.9668872292714524e-06, + "loss": 0.1536, + "step": 28270 + }, + { + "epoch": 3.77, + "grad_norm": 0.5703125, + "learning_rate": 1.964589636083347e-06, + "loss": 0.2454, + "step": 28271 + }, + { + "epoch": 3.77, + "grad_norm": 0.63671875, + "learning_rate": 1.9622933723183357e-06, + "loss": 0.2983, + "step": 28272 + }, + { + "epoch": 3.77, + "grad_norm": 0.71484375, + "learning_rate": 1.9599984380075377e-06, + "loss": 0.5039, + "step": 28273 + }, + { + "epoch": 3.77, + "grad_norm": 0.4375, + "learning_rate": 1.9577048331820722e-06, + "loss": 0.1235, + "step": 28274 + }, + { + "epoch": 3.77, + "grad_norm": 0.734375, + "learning_rate": 1.95541255787306e-06, + "loss": 0.2442, + "step": 28275 + }, + { + "epoch": 3.77, + "grad_norm": 0.65625, + "learning_rate": 1.9531216121115746e-06, + "loss": 0.5605, + "step": 28276 + }, + { + "epoch": 3.77, + "grad_norm": 0.57421875, + "learning_rate": 1.9508319959286926e-06, + "loss": 0.4885, + "step": 28277 + }, + { + "epoch": 3.77, + "grad_norm": 0.69921875, + "learning_rate": 1.948543709355455e-06, + "loss": 0.6188, + "step": 28278 + }, + { + "epoch": 3.77, + "grad_norm": 0.671875, + "learning_rate": 1.9462567524228924e-06, + "loss": 0.6329, + "step": 28279 + }, + { + "epoch": 3.77, + "grad_norm": 0.54296875, + "learning_rate": 1.9439711251620142e-06, + "loss": 0.2058, + "step": 28280 + }, + { + "epoch": 3.77, + "grad_norm": 0.68359375, + "learning_rate": 1.941686827603828e-06, + "loss": 0.2311, + "step": 28281 + }, + { + "epoch": 3.77, + "grad_norm": 0.5625, + "learning_rate": 1.9394038597792984e-06, + "loss": 0.175, + "step": 28282 + }, + { + "epoch": 3.77, + "grad_norm": 0.58203125, + "learning_rate": 1.93712222171939e-06, + "loss": 0.228, + "step": 28283 + }, + { + "epoch": 3.77, + "grad_norm": 0.33984375, + "learning_rate": 1.934841913455032e-06, + "loss": 0.2622, + "step": 28284 + }, + { + "epoch": 3.77, + "grad_norm": 0.76171875, + "learning_rate": 1.932562935017168e-06, + "loss": 0.3401, + "step": 28285 + }, + { + "epoch": 3.77, + "grad_norm": 0.6640625, + "learning_rate": 1.9302852864366838e-06, + "loss": 0.4669, + "step": 28286 + }, + { + "epoch": 3.77, + "grad_norm": 0.7421875, + "learning_rate": 1.9280089677444658e-06, + "loss": 0.3652, + "step": 28287 + }, + { + "epoch": 3.77, + "grad_norm": 0.7265625, + "learning_rate": 1.925733978971389e-06, + "loss": 0.3593, + "step": 28288 + }, + { + "epoch": 3.77, + "grad_norm": 0.66015625, + "learning_rate": 1.923460320148307e-06, + "loss": 0.2128, + "step": 28289 + }, + { + "epoch": 3.78, + "grad_norm": 0.6328125, + "learning_rate": 1.9211879913060615e-06, + "loss": 0.2915, + "step": 28290 + }, + { + "epoch": 3.78, + "grad_norm": 0.51171875, + "learning_rate": 1.918916992475439e-06, + "loss": 0.4397, + "step": 28291 + }, + { + "epoch": 3.78, + "grad_norm": 0.5546875, + "learning_rate": 1.9166473236872485e-06, + "loss": 0.2999, + "step": 28292 + }, + { + "epoch": 3.78, + "grad_norm": 0.94921875, + "learning_rate": 1.914378984972265e-06, + "loss": 0.5991, + "step": 28293 + }, + { + "epoch": 3.78, + "grad_norm": 0.66796875, + "learning_rate": 1.912111976361264e-06, + "loss": 0.3482, + "step": 28294 + }, + { + "epoch": 3.78, + "grad_norm": 0.58203125, + "learning_rate": 1.9098462978849873e-06, + "loss": 0.309, + "step": 28295 + }, + { + "epoch": 3.78, + "grad_norm": 0.73046875, + "learning_rate": 1.907581949574133e-06, + "loss": 0.29, + "step": 28296 + }, + { + "epoch": 3.78, + "grad_norm": 0.4453125, + "learning_rate": 1.905318931459421e-06, + "loss": 0.1415, + "step": 28297 + }, + { + "epoch": 3.78, + "grad_norm": 0.67578125, + "learning_rate": 1.9030572435715377e-06, + "loss": 0.3995, + "step": 28298 + }, + { + "epoch": 3.78, + "grad_norm": 0.69140625, + "learning_rate": 1.9007968859411695e-06, + "loss": 0.4375, + "step": 28299 + }, + { + "epoch": 3.78, + "grad_norm": 0.6875, + "learning_rate": 1.8985378585989478e-06, + "loss": 0.2512, + "step": 28300 + }, + { + "epoch": 3.78, + "grad_norm": 0.4921875, + "learning_rate": 1.8962801615755143e-06, + "loss": 0.2979, + "step": 28301 + }, + { + "epoch": 3.78, + "grad_norm": 0.515625, + "learning_rate": 1.8940237949014895e-06, + "loss": 0.2626, + "step": 28302 + }, + { + "epoch": 3.78, + "grad_norm": 0.44921875, + "learning_rate": 1.8917687586074594e-06, + "loss": 0.0852, + "step": 28303 + }, + { + "epoch": 3.78, + "grad_norm": 0.4765625, + "learning_rate": 1.8895150527240224e-06, + "loss": 0.3371, + "step": 28304 + }, + { + "epoch": 3.78, + "grad_norm": 0.61328125, + "learning_rate": 1.8872626772817093e-06, + "loss": 0.5932, + "step": 28305 + }, + { + "epoch": 3.78, + "grad_norm": 0.703125, + "learning_rate": 1.8850116323110955e-06, + "loss": 0.2447, + "step": 28306 + }, + { + "epoch": 3.78, + "grad_norm": 0.6171875, + "learning_rate": 1.8827619178426902e-06, + "loss": 0.4163, + "step": 28307 + }, + { + "epoch": 3.78, + "grad_norm": 0.71875, + "learning_rate": 1.8805135339070024e-06, + "loss": 0.3729, + "step": 28308 + }, + { + "epoch": 3.78, + "grad_norm": 0.57421875, + "learning_rate": 1.8782664805345407e-06, + "loss": 0.2455, + "step": 28309 + }, + { + "epoch": 3.78, + "grad_norm": 0.462890625, + "learning_rate": 1.8760207577557365e-06, + "loss": 0.1169, + "step": 28310 + }, + { + "epoch": 3.78, + "grad_norm": 0.447265625, + "learning_rate": 1.8737763656010765e-06, + "loss": 0.2292, + "step": 28311 + }, + { + "epoch": 3.78, + "grad_norm": 0.51953125, + "learning_rate": 1.8715333041009808e-06, + "loss": 0.2159, + "step": 28312 + }, + { + "epoch": 3.78, + "grad_norm": 0.70703125, + "learning_rate": 1.8692915732858807e-06, + "loss": 0.445, + "step": 28313 + }, + { + "epoch": 3.78, + "grad_norm": 0.5625, + "learning_rate": 1.867051173186163e-06, + "loss": 0.2393, + "step": 28314 + }, + { + "epoch": 3.78, + "grad_norm": 0.5859375, + "learning_rate": 1.864812103832203e-06, + "loss": 0.21, + "step": 28315 + }, + { + "epoch": 3.78, + "grad_norm": 0.6640625, + "learning_rate": 1.862574365254377e-06, + "loss": 0.5901, + "step": 28316 + }, + { + "epoch": 3.78, + "grad_norm": 0.76171875, + "learning_rate": 1.8603379574830383e-06, + "loss": 0.7033, + "step": 28317 + }, + { + "epoch": 3.78, + "grad_norm": 0.50390625, + "learning_rate": 1.8581028805484957e-06, + "loss": 0.3397, + "step": 28318 + }, + { + "epoch": 3.78, + "grad_norm": 0.64453125, + "learning_rate": 1.8558691344810698e-06, + "loss": 0.2202, + "step": 28319 + }, + { + "epoch": 3.78, + "grad_norm": 0.63671875, + "learning_rate": 1.853636719311047e-06, + "loss": 0.2062, + "step": 28320 + }, + { + "epoch": 3.78, + "grad_norm": 0.546875, + "learning_rate": 1.8514056350686927e-06, + "loss": 0.4622, + "step": 28321 + }, + { + "epoch": 3.78, + "grad_norm": 0.59765625, + "learning_rate": 1.8491758817842709e-06, + "loss": 0.5294, + "step": 28322 + }, + { + "epoch": 3.78, + "grad_norm": 0.76953125, + "learning_rate": 1.8469474594880243e-06, + "loss": 0.3154, + "step": 28323 + }, + { + "epoch": 3.78, + "grad_norm": 0.6640625, + "learning_rate": 1.844720368210151e-06, + "loss": 0.5578, + "step": 28324 + }, + { + "epoch": 3.78, + "grad_norm": 0.470703125, + "learning_rate": 1.8424946079808713e-06, + "loss": 0.2712, + "step": 28325 + }, + { + "epoch": 3.78, + "grad_norm": 0.6484375, + "learning_rate": 1.840270178830361e-06, + "loss": 0.1991, + "step": 28326 + }, + { + "epoch": 3.78, + "grad_norm": 0.50390625, + "learning_rate": 1.8380470807887962e-06, + "loss": 0.2192, + "step": 28327 + }, + { + "epoch": 3.78, + "grad_norm": 0.87890625, + "learning_rate": 1.8358253138862968e-06, + "loss": 0.2951, + "step": 28328 + }, + { + "epoch": 3.78, + "grad_norm": 0.59375, + "learning_rate": 1.833604878153017e-06, + "loss": 0.4392, + "step": 28329 + }, + { + "epoch": 3.78, + "grad_norm": 0.55078125, + "learning_rate": 1.8313857736190544e-06, + "loss": 0.349, + "step": 28330 + }, + { + "epoch": 3.78, + "grad_norm": 0.5078125, + "learning_rate": 1.8291680003145073e-06, + "loss": 0.3891, + "step": 28331 + }, + { + "epoch": 3.78, + "grad_norm": 0.53515625, + "learning_rate": 1.8269515582694518e-06, + "loss": 0.4467, + "step": 28332 + }, + { + "epoch": 3.78, + "grad_norm": 0.6015625, + "learning_rate": 1.8247364475139305e-06, + "loss": 0.1737, + "step": 28333 + }, + { + "epoch": 3.78, + "grad_norm": 0.62890625, + "learning_rate": 1.822522668077997e-06, + "loss": 0.3157, + "step": 28334 + }, + { + "epoch": 3.78, + "grad_norm": 0.5859375, + "learning_rate": 1.8203102199916611e-06, + "loss": 0.3333, + "step": 28335 + }, + { + "epoch": 3.78, + "grad_norm": 0.61328125, + "learning_rate": 1.8180991032849426e-06, + "loss": 0.3601, + "step": 28336 + }, + { + "epoch": 3.78, + "grad_norm": 0.70703125, + "learning_rate": 1.8158893179878068e-06, + "loss": 0.6698, + "step": 28337 + }, + { + "epoch": 3.78, + "grad_norm": 0.59765625, + "learning_rate": 1.8136808641302183e-06, + "loss": 0.3847, + "step": 28338 + }, + { + "epoch": 3.78, + "grad_norm": 0.76953125, + "learning_rate": 1.8114737417421312e-06, + "loss": 0.4874, + "step": 28339 + }, + { + "epoch": 3.78, + "grad_norm": 0.4921875, + "learning_rate": 1.8092679508534882e-06, + "loss": 0.2775, + "step": 28340 + }, + { + "epoch": 3.78, + "grad_norm": 0.67578125, + "learning_rate": 1.8070634914941874e-06, + "loss": 0.4344, + "step": 28341 + }, + { + "epoch": 3.78, + "grad_norm": 0.6328125, + "learning_rate": 1.8048603636941164e-06, + "loss": 0.3575, + "step": 28342 + }, + { + "epoch": 3.78, + "grad_norm": 0.55078125, + "learning_rate": 1.802658567483173e-06, + "loss": 0.4688, + "step": 28343 + }, + { + "epoch": 3.78, + "grad_norm": 0.8671875, + "learning_rate": 1.8004581028912004e-06, + "loss": 0.429, + "step": 28344 + }, + { + "epoch": 3.78, + "grad_norm": 0.87109375, + "learning_rate": 1.7982589699480411e-06, + "loss": 0.4449, + "step": 28345 + }, + { + "epoch": 3.78, + "grad_norm": 0.6953125, + "learning_rate": 1.796061168683505e-06, + "loss": 0.1949, + "step": 28346 + }, + { + "epoch": 3.78, + "grad_norm": 0.4921875, + "learning_rate": 1.7938646991274122e-06, + "loss": 0.1911, + "step": 28347 + }, + { + "epoch": 3.78, + "grad_norm": 0.453125, + "learning_rate": 1.791669561309539e-06, + "loss": 0.2807, + "step": 28348 + }, + { + "epoch": 3.78, + "grad_norm": 0.63671875, + "learning_rate": 1.789475755259662e-06, + "loss": 0.2825, + "step": 28349 + }, + { + "epoch": 3.78, + "grad_norm": 0.5390625, + "learning_rate": 1.7872832810075236e-06, + "loss": 0.1573, + "step": 28350 + }, + { + "epoch": 3.78, + "grad_norm": 0.54296875, + "learning_rate": 1.7850921385828556e-06, + "loss": 0.1831, + "step": 28351 + }, + { + "epoch": 3.78, + "grad_norm": 0.63671875, + "learning_rate": 1.7829023280153678e-06, + "loss": 0.3005, + "step": 28352 + }, + { + "epoch": 3.78, + "grad_norm": 0.5390625, + "learning_rate": 1.7807138493347585e-06, + "loss": 0.2862, + "step": 28353 + }, + { + "epoch": 3.78, + "grad_norm": 0.5234375, + "learning_rate": 1.7785267025707152e-06, + "loss": 0.1883, + "step": 28354 + }, + { + "epoch": 3.78, + "grad_norm": 0.66796875, + "learning_rate": 1.7763408877528808e-06, + "loss": 0.507, + "step": 28355 + }, + { + "epoch": 3.78, + "grad_norm": 0.58984375, + "learning_rate": 1.7741564049109205e-06, + "loss": 0.2645, + "step": 28356 + }, + { + "epoch": 3.78, + "grad_norm": 0.75390625, + "learning_rate": 1.7719732540744217e-06, + "loss": 0.1672, + "step": 28357 + }, + { + "epoch": 3.78, + "grad_norm": 0.625, + "learning_rate": 1.769791435273005e-06, + "loss": 0.3332, + "step": 28358 + }, + { + "epoch": 3.78, + "grad_norm": 0.69140625, + "learning_rate": 1.7676109485362802e-06, + "loss": 0.3492, + "step": 28359 + }, + { + "epoch": 3.78, + "grad_norm": 0.53515625, + "learning_rate": 1.7654317938937793e-06, + "loss": 0.2399, + "step": 28360 + }, + { + "epoch": 3.78, + "grad_norm": 0.48046875, + "learning_rate": 1.7632539713750674e-06, + "loss": 0.1456, + "step": 28361 + }, + { + "epoch": 3.78, + "grad_norm": 0.56640625, + "learning_rate": 1.7610774810096874e-06, + "loss": 0.3415, + "step": 28362 + }, + { + "epoch": 3.78, + "grad_norm": 0.66015625, + "learning_rate": 1.7589023228271494e-06, + "loss": 0.3662, + "step": 28363 + }, + { + "epoch": 3.78, + "grad_norm": 0.8125, + "learning_rate": 1.7567284968569408e-06, + "loss": 0.6652, + "step": 28364 + }, + { + "epoch": 3.79, + "grad_norm": 0.57421875, + "learning_rate": 1.754556003128549e-06, + "loss": 0.286, + "step": 28365 + }, + { + "epoch": 3.79, + "grad_norm": 0.59765625, + "learning_rate": 1.7523848416714173e-06, + "loss": 0.3133, + "step": 28366 + }, + { + "epoch": 3.79, + "grad_norm": 0.51953125, + "learning_rate": 1.7502150125150219e-06, + "loss": 0.2503, + "step": 28367 + }, + { + "epoch": 3.79, + "grad_norm": 0.54296875, + "learning_rate": 1.7480465156887726e-06, + "loss": 0.2939, + "step": 28368 + }, + { + "epoch": 3.79, + "grad_norm": 0.734375, + "learning_rate": 1.7458793512220461e-06, + "loss": 0.1598, + "step": 28369 + }, + { + "epoch": 3.79, + "grad_norm": 0.64453125, + "learning_rate": 1.7437135191442632e-06, + "loss": 0.64, + "step": 28370 + }, + { + "epoch": 3.79, + "grad_norm": 0.609375, + "learning_rate": 1.7415490194847894e-06, + "loss": 0.366, + "step": 28371 + }, + { + "epoch": 3.79, + "grad_norm": 0.470703125, + "learning_rate": 1.7393858522729678e-06, + "loss": 0.3318, + "step": 28372 + }, + { + "epoch": 3.79, + "grad_norm": 0.68359375, + "learning_rate": 1.7372240175381527e-06, + "loss": 0.4833, + "step": 28373 + }, + { + "epoch": 3.79, + "grad_norm": 0.455078125, + "learning_rate": 1.7350635153096316e-06, + "loss": 0.2793, + "step": 28374 + }, + { + "epoch": 3.79, + "grad_norm": 0.494140625, + "learning_rate": 1.7329043456167039e-06, + "loss": 0.3307, + "step": 28375 + }, + { + "epoch": 3.79, + "grad_norm": 0.69921875, + "learning_rate": 1.7307465084886787e-06, + "loss": 0.3685, + "step": 28376 + }, + { + "epoch": 3.79, + "grad_norm": 0.470703125, + "learning_rate": 1.7285900039547998e-06, + "loss": 0.2399, + "step": 28377 + }, + { + "epoch": 3.79, + "grad_norm": 0.44921875, + "learning_rate": 1.7264348320442992e-06, + "loss": 0.1576, + "step": 28378 + }, + { + "epoch": 3.79, + "grad_norm": 0.4765625, + "learning_rate": 1.7242809927864201e-06, + "loss": 0.2218, + "step": 28379 + }, + { + "epoch": 3.79, + "grad_norm": 0.47265625, + "learning_rate": 1.7221284862103615e-06, + "loss": 0.1618, + "step": 28380 + }, + { + "epoch": 3.79, + "grad_norm": 0.57421875, + "learning_rate": 1.7199773123453223e-06, + "loss": 0.2442, + "step": 28381 + }, + { + "epoch": 3.79, + "grad_norm": 0.7421875, + "learning_rate": 1.7178274712204567e-06, + "loss": 0.4131, + "step": 28382 + }, + { + "epoch": 3.79, + "grad_norm": 0.625, + "learning_rate": 1.7156789628649306e-06, + "loss": 0.1614, + "step": 28383 + }, + { + "epoch": 3.79, + "grad_norm": 0.52734375, + "learning_rate": 1.713531787307876e-06, + "loss": 0.2335, + "step": 28384 + }, + { + "epoch": 3.79, + "grad_norm": 0.6328125, + "learning_rate": 1.7113859445784141e-06, + "loss": 0.3965, + "step": 28385 + }, + { + "epoch": 3.79, + "grad_norm": 0.625, + "learning_rate": 1.709241434705644e-06, + "loss": 0.281, + "step": 28386 + }, + { + "epoch": 3.79, + "grad_norm": 0.421875, + "learning_rate": 1.7070982577186313e-06, + "loss": 0.1826, + "step": 28387 + }, + { + "epoch": 3.79, + "grad_norm": 0.68359375, + "learning_rate": 1.7049564136464523e-06, + "loss": 0.4511, + "step": 28388 + }, + { + "epoch": 3.79, + "grad_norm": 0.51953125, + "learning_rate": 1.7028159025181511e-06, + "loss": 0.3708, + "step": 28389 + }, + { + "epoch": 3.79, + "grad_norm": 0.68359375, + "learning_rate": 1.7006767243627598e-06, + "loss": 0.4457, + "step": 28390 + }, + { + "epoch": 3.79, + "grad_norm": 0.6015625, + "learning_rate": 1.698538879209266e-06, + "loss": 0.497, + "step": 28391 + }, + { + "epoch": 3.79, + "grad_norm": 0.8125, + "learning_rate": 1.6964023670866913e-06, + "loss": 0.4454, + "step": 28392 + }, + { + "epoch": 3.79, + "grad_norm": 0.640625, + "learning_rate": 1.694267188023979e-06, + "loss": 0.2672, + "step": 28393 + }, + { + "epoch": 3.79, + "grad_norm": 0.5078125, + "learning_rate": 1.6921333420500951e-06, + "loss": 0.315, + "step": 28394 + }, + { + "epoch": 3.79, + "grad_norm": 0.6875, + "learning_rate": 1.6900008291939829e-06, + "loss": 0.3113, + "step": 28395 + }, + { + "epoch": 3.79, + "grad_norm": 0.474609375, + "learning_rate": 1.6878696494845637e-06, + "loss": 0.2919, + "step": 28396 + }, + { + "epoch": 3.79, + "grad_norm": 0.5546875, + "learning_rate": 1.6857398029507144e-06, + "loss": 0.2338, + "step": 28397 + }, + { + "epoch": 3.79, + "grad_norm": 0.80859375, + "learning_rate": 1.683611289621323e-06, + "loss": 0.4682, + "step": 28398 + }, + { + "epoch": 3.79, + "grad_norm": 0.55078125, + "learning_rate": 1.6814841095252775e-06, + "loss": 0.2038, + "step": 28399 + }, + { + "epoch": 3.79, + "grad_norm": 0.73046875, + "learning_rate": 1.6793582626914105e-06, + "loss": 0.4, + "step": 28400 + }, + { + "epoch": 3.79, + "grad_norm": 0.515625, + "learning_rate": 1.6772337491485325e-06, + "loss": 0.1504, + "step": 28401 + }, + { + "epoch": 3.79, + "grad_norm": 0.46484375, + "learning_rate": 1.6751105689254753e-06, + "loss": 0.1707, + "step": 28402 + }, + { + "epoch": 3.79, + "grad_norm": 0.474609375, + "learning_rate": 1.6729887220510164e-06, + "loss": 0.1856, + "step": 28403 + }, + { + "epoch": 3.79, + "grad_norm": 0.73046875, + "learning_rate": 1.6708682085539551e-06, + "loss": 0.323, + "step": 28404 + }, + { + "epoch": 3.79, + "grad_norm": 0.494140625, + "learning_rate": 1.6687490284630125e-06, + "loss": 0.2509, + "step": 28405 + }, + { + "epoch": 3.79, + "grad_norm": 0.54296875, + "learning_rate": 1.6666311818069435e-06, + "loss": 0.3704, + "step": 28406 + }, + { + "epoch": 3.79, + "grad_norm": 0.65234375, + "learning_rate": 1.6645146686144696e-06, + "loss": 0.3518, + "step": 28407 + }, + { + "epoch": 3.79, + "grad_norm": 0.46875, + "learning_rate": 1.6623994889142791e-06, + "loss": 0.2585, + "step": 28408 + }, + { + "epoch": 3.79, + "grad_norm": 0.80078125, + "learning_rate": 1.660285642735082e-06, + "loss": 0.4374, + "step": 28409 + }, + { + "epoch": 3.79, + "grad_norm": 0.84765625, + "learning_rate": 1.6581731301055115e-06, + "loss": 0.3254, + "step": 28410 + }, + { + "epoch": 3.79, + "grad_norm": 0.65234375, + "learning_rate": 1.6560619510542218e-06, + "loss": 0.2802, + "step": 28411 + }, + { + "epoch": 3.79, + "grad_norm": 0.375, + "learning_rate": 1.6539521056098573e-06, + "loss": 0.1376, + "step": 28412 + }, + { + "epoch": 3.79, + "grad_norm": 0.73828125, + "learning_rate": 1.651843593801028e-06, + "loss": 0.4213, + "step": 28413 + }, + { + "epoch": 3.79, + "grad_norm": 0.70703125, + "learning_rate": 1.6497364156563111e-06, + "loss": 0.631, + "step": 28414 + }, + { + "epoch": 3.79, + "grad_norm": 0.55078125, + "learning_rate": 1.647630571204295e-06, + "loss": 0.4715, + "step": 28415 + }, + { + "epoch": 3.79, + "grad_norm": 0.58984375, + "learning_rate": 1.6455260604735234e-06, + "loss": 0.243, + "step": 28416 + }, + { + "epoch": 3.79, + "grad_norm": 0.796875, + "learning_rate": 1.6434228834925513e-06, + "loss": 0.2937, + "step": 28417 + }, + { + "epoch": 3.79, + "grad_norm": 0.62109375, + "learning_rate": 1.6413210402898893e-06, + "loss": 0.4113, + "step": 28418 + }, + { + "epoch": 3.79, + "grad_norm": 0.59765625, + "learning_rate": 1.6392205308940255e-06, + "loss": 0.4566, + "step": 28419 + }, + { + "epoch": 3.79, + "grad_norm": 0.55859375, + "learning_rate": 1.6371213553334708e-06, + "loss": 0.3993, + "step": 28420 + }, + { + "epoch": 3.79, + "grad_norm": 0.5625, + "learning_rate": 1.6350235136366688e-06, + "loss": 0.2089, + "step": 28421 + }, + { + "epoch": 3.79, + "grad_norm": 0.57421875, + "learning_rate": 1.6329270058320857e-06, + "loss": 0.4731, + "step": 28422 + }, + { + "epoch": 3.79, + "grad_norm": 0.70703125, + "learning_rate": 1.6308318319481431e-06, + "loss": 0.464, + "step": 28423 + }, + { + "epoch": 3.79, + "grad_norm": 0.5703125, + "learning_rate": 1.6287379920132518e-06, + "loss": 0.1749, + "step": 28424 + }, + { + "epoch": 3.79, + "grad_norm": 0.5078125, + "learning_rate": 1.626645486055811e-06, + "loss": 0.3225, + "step": 28425 + }, + { + "epoch": 3.79, + "grad_norm": 0.5625, + "learning_rate": 1.6245543141041875e-06, + "loss": 0.3581, + "step": 28426 + }, + { + "epoch": 3.79, + "grad_norm": 0.734375, + "learning_rate": 1.622464476186758e-06, + "loss": 0.5796, + "step": 28427 + }, + { + "epoch": 3.79, + "grad_norm": 0.7109375, + "learning_rate": 1.6203759723318445e-06, + "loss": 0.631, + "step": 28428 + }, + { + "epoch": 3.79, + "grad_norm": 0.4609375, + "learning_rate": 1.6182888025677578e-06, + "loss": 0.1857, + "step": 28429 + }, + { + "epoch": 3.79, + "grad_norm": 0.625, + "learning_rate": 1.6162029669228306e-06, + "loss": 0.1833, + "step": 28430 + }, + { + "epoch": 3.79, + "grad_norm": 0.6484375, + "learning_rate": 1.6141184654253182e-06, + "loss": 0.2216, + "step": 28431 + }, + { + "epoch": 3.79, + "grad_norm": 0.671875, + "learning_rate": 1.6120352981035203e-06, + "loss": 0.1999, + "step": 28432 + }, + { + "epoch": 3.79, + "grad_norm": 0.54296875, + "learning_rate": 1.6099534649856584e-06, + "loss": 0.1949, + "step": 28433 + }, + { + "epoch": 3.79, + "grad_norm": 0.451171875, + "learning_rate": 1.6078729660999769e-06, + "loss": 0.2531, + "step": 28434 + }, + { + "epoch": 3.79, + "grad_norm": 0.46484375, + "learning_rate": 1.6057938014746866e-06, + "loss": 0.1951, + "step": 28435 + }, + { + "epoch": 3.79, + "grad_norm": 0.78125, + "learning_rate": 1.6037159711379868e-06, + "loss": 0.3772, + "step": 28436 + }, + { + "epoch": 3.79, + "grad_norm": 0.734375, + "learning_rate": 1.601639475118033e-06, + "loss": 0.4311, + "step": 28437 + }, + { + "epoch": 3.79, + "grad_norm": 0.671875, + "learning_rate": 1.5995643134430138e-06, + "loss": 0.3162, + "step": 28438 + }, + { + "epoch": 3.79, + "grad_norm": 0.91015625, + "learning_rate": 1.59749048614104e-06, + "loss": 0.2957, + "step": 28439 + }, + { + "epoch": 3.8, + "grad_norm": 0.59765625, + "learning_rate": 1.5954179932402668e-06, + "loss": 0.7392, + "step": 28440 + }, + { + "epoch": 3.8, + "grad_norm": 0.74609375, + "learning_rate": 1.5933468347687829e-06, + "loss": 0.439, + "step": 28441 + }, + { + "epoch": 3.8, + "grad_norm": 0.5625, + "learning_rate": 1.5912770107546659e-06, + "loss": 0.2414, + "step": 28442 + }, + { + "epoch": 3.8, + "grad_norm": 0.58984375, + "learning_rate": 1.589208521225982e-06, + "loss": 0.2056, + "step": 28443 + }, + { + "epoch": 3.8, + "grad_norm": 0.54296875, + "learning_rate": 1.587141366210798e-06, + "loss": 0.2598, + "step": 28444 + }, + { + "epoch": 3.8, + "grad_norm": 0.6875, + "learning_rate": 1.5850755457371357e-06, + "loss": 0.3409, + "step": 28445 + }, + { + "epoch": 3.8, + "grad_norm": 0.59375, + "learning_rate": 1.5830110598330172e-06, + "loss": 0.2806, + "step": 28446 + }, + { + "epoch": 3.8, + "grad_norm": 0.7890625, + "learning_rate": 1.580947908526431e-06, + "loss": 0.7619, + "step": 28447 + }, + { + "epoch": 3.8, + "grad_norm": 0.5234375, + "learning_rate": 1.578886091845344e-06, + "loss": 0.2179, + "step": 28448 + }, + { + "epoch": 3.8, + "grad_norm": 0.63671875, + "learning_rate": 1.5768256098177336e-06, + "loss": 0.4517, + "step": 28449 + }, + { + "epoch": 3.8, + "grad_norm": 0.7109375, + "learning_rate": 1.5747664624715441e-06, + "loss": 0.4977, + "step": 28450 + }, + { + "epoch": 3.8, + "grad_norm": 0.74609375, + "learning_rate": 1.5727086498346865e-06, + "loss": 0.5334, + "step": 28451 + }, + { + "epoch": 3.8, + "grad_norm": 0.486328125, + "learning_rate": 1.570652171935072e-06, + "loss": 0.332, + "step": 28452 + }, + { + "epoch": 3.8, + "grad_norm": 0.609375, + "learning_rate": 1.5685970288005892e-06, + "loss": 0.1674, + "step": 28453 + }, + { + "epoch": 3.8, + "grad_norm": 0.41015625, + "learning_rate": 1.5665432204590936e-06, + "loss": 0.1973, + "step": 28454 + }, + { + "epoch": 3.8, + "grad_norm": 0.55078125, + "learning_rate": 1.5644907469384518e-06, + "loss": 0.1286, + "step": 28455 + }, + { + "epoch": 3.8, + "grad_norm": 0.5546875, + "learning_rate": 1.562439608266486e-06, + "loss": 0.2979, + "step": 28456 + }, + { + "epoch": 3.8, + "grad_norm": 0.6484375, + "learning_rate": 1.5603898044710185e-06, + "loss": 0.1418, + "step": 28457 + }, + { + "epoch": 3.8, + "grad_norm": 0.68359375, + "learning_rate": 1.5583413355798382e-06, + "loss": 0.3689, + "step": 28458 + }, + { + "epoch": 3.8, + "grad_norm": 0.72265625, + "learning_rate": 1.5562942016207338e-06, + "loss": 0.4199, + "step": 28459 + }, + { + "epoch": 3.8, + "grad_norm": 0.53515625, + "learning_rate": 1.554248402621461e-06, + "loss": 0.2878, + "step": 28460 + }, + { + "epoch": 3.8, + "grad_norm": 0.59375, + "learning_rate": 1.5522039386097642e-06, + "loss": 0.5862, + "step": 28461 + }, + { + "epoch": 3.8, + "grad_norm": 0.59765625, + "learning_rate": 1.5501608096133546e-06, + "loss": 0.136, + "step": 28462 + }, + { + "epoch": 3.8, + "grad_norm": 0.65625, + "learning_rate": 1.5481190156599656e-06, + "loss": 0.3134, + "step": 28463 + }, + { + "epoch": 3.8, + "grad_norm": 0.7578125, + "learning_rate": 1.546078556777253e-06, + "loss": 0.4186, + "step": 28464 + }, + { + "epoch": 3.8, + "grad_norm": 0.474609375, + "learning_rate": 1.5440394329929164e-06, + "loss": 0.2719, + "step": 28465 + }, + { + "epoch": 3.8, + "grad_norm": 0.62890625, + "learning_rate": 1.5420016443345897e-06, + "loss": 0.1878, + "step": 28466 + }, + { + "epoch": 3.8, + "grad_norm": 0.91015625, + "learning_rate": 1.5399651908299062e-06, + "loss": 0.5838, + "step": 28467 + }, + { + "epoch": 3.8, + "grad_norm": 0.50390625, + "learning_rate": 1.5379300725064772e-06, + "loss": 0.3424, + "step": 28468 + }, + { + "epoch": 3.8, + "grad_norm": 0.62890625, + "learning_rate": 1.5358962893919249e-06, + "loss": 0.3785, + "step": 28469 + }, + { + "epoch": 3.8, + "grad_norm": 0.58203125, + "learning_rate": 1.5338638415138052e-06, + "loss": 0.3271, + "step": 28470 + }, + { + "epoch": 3.8, + "grad_norm": 0.63671875, + "learning_rate": 1.531832728899696e-06, + "loss": 0.4136, + "step": 28471 + }, + { + "epoch": 3.8, + "grad_norm": 0.65234375, + "learning_rate": 1.5298029515771195e-06, + "loss": 0.4189, + "step": 28472 + }, + { + "epoch": 3.8, + "grad_norm": 0.65625, + "learning_rate": 1.5277745095736206e-06, + "loss": 0.4758, + "step": 28473 + }, + { + "epoch": 3.8, + "grad_norm": 0.6484375, + "learning_rate": 1.5257474029166996e-06, + "loss": 0.4698, + "step": 28474 + }, + { + "epoch": 3.8, + "grad_norm": 0.7265625, + "learning_rate": 1.5237216316338343e-06, + "loss": 0.5399, + "step": 28475 + }, + { + "epoch": 3.8, + "grad_norm": 0.640625, + "learning_rate": 1.5216971957525029e-06, + "loss": 0.3591, + "step": 28476 + }, + { + "epoch": 3.8, + "grad_norm": 0.4609375, + "learning_rate": 1.5196740953001832e-06, + "loss": 0.2847, + "step": 28477 + }, + { + "epoch": 3.8, + "grad_norm": 0.4609375, + "learning_rate": 1.5176523303042756e-06, + "loss": 0.3188, + "step": 28478 + }, + { + "epoch": 3.8, + "grad_norm": 0.54296875, + "learning_rate": 1.5156319007922026e-06, + "loss": 0.3674, + "step": 28479 + }, + { + "epoch": 3.8, + "grad_norm": 0.5, + "learning_rate": 1.5136128067913645e-06, + "loss": 0.3543, + "step": 28480 + }, + { + "epoch": 3.8, + "grad_norm": 0.5390625, + "learning_rate": 1.5115950483291507e-06, + "loss": 0.2959, + "step": 28481 + }, + { + "epoch": 3.8, + "grad_norm": 0.578125, + "learning_rate": 1.509578625432917e-06, + "loss": 0.215, + "step": 28482 + }, + { + "epoch": 3.8, + "grad_norm": 0.57421875, + "learning_rate": 1.507563538130008e-06, + "loss": 0.1266, + "step": 28483 + }, + { + "epoch": 3.8, + "grad_norm": 0.55859375, + "learning_rate": 1.5055497864477463e-06, + "loss": 0.4563, + "step": 28484 + }, + { + "epoch": 3.8, + "grad_norm": 0.55078125, + "learning_rate": 1.5035373704134437e-06, + "loss": 0.2342, + "step": 28485 + }, + { + "epoch": 3.8, + "grad_norm": 0.5078125, + "learning_rate": 1.5015262900544003e-06, + "loss": 0.3275, + "step": 28486 + }, + { + "epoch": 3.8, + "grad_norm": 0.625, + "learning_rate": 1.499516545397861e-06, + "loss": 0.4365, + "step": 28487 + }, + { + "epoch": 3.8, + "grad_norm": 0.84375, + "learning_rate": 1.4975081364711042e-06, + "loss": 0.375, + "step": 28488 + }, + { + "epoch": 3.8, + "grad_norm": 0.58203125, + "learning_rate": 1.4955010633013521e-06, + "loss": 0.5129, + "step": 28489 + }, + { + "epoch": 3.8, + "grad_norm": 0.703125, + "learning_rate": 1.493495325915828e-06, + "loss": 0.2458, + "step": 28490 + }, + { + "epoch": 3.8, + "grad_norm": 0.62890625, + "learning_rate": 1.4914909243417318e-06, + "loss": 0.5263, + "step": 28491 + }, + { + "epoch": 3.8, + "grad_norm": 0.796875, + "learning_rate": 1.4894878586062311e-06, + "loss": 0.4048, + "step": 28492 + }, + { + "epoch": 3.8, + "grad_norm": 0.44140625, + "learning_rate": 1.4874861287365039e-06, + "loss": 0.0938, + "step": 28493 + }, + { + "epoch": 3.8, + "grad_norm": 0.55859375, + "learning_rate": 1.485485734759684e-06, + "loss": 0.342, + "step": 28494 + }, + { + "epoch": 3.8, + "grad_norm": 0.5390625, + "learning_rate": 1.4834866767029054e-06, + "loss": 0.2782, + "step": 28495 + }, + { + "epoch": 3.8, + "grad_norm": 0.5859375, + "learning_rate": 1.4814889545932797e-06, + "loss": 0.3101, + "step": 28496 + }, + { + "epoch": 3.8, + "grad_norm": 0.7734375, + "learning_rate": 1.4794925684578854e-06, + "loss": 0.3479, + "step": 28497 + }, + { + "epoch": 3.8, + "grad_norm": 0.62890625, + "learning_rate": 1.4774975183238115e-06, + "loss": 0.302, + "step": 28498 + }, + { + "epoch": 3.8, + "grad_norm": 0.5390625, + "learning_rate": 1.4755038042180925e-06, + "loss": 0.3909, + "step": 28499 + }, + { + "epoch": 3.8, + "grad_norm": 0.5390625, + "learning_rate": 1.473511426167784e-06, + "loss": 0.1982, + "step": 28500 + }, + { + "epoch": 3.8, + "grad_norm": 0.578125, + "learning_rate": 1.4715203841998981e-06, + "loss": 0.273, + "step": 28501 + }, + { + "epoch": 3.8, + "grad_norm": 0.6640625, + "learning_rate": 1.4695306783414241e-06, + "loss": 0.2822, + "step": 28502 + }, + { + "epoch": 3.8, + "grad_norm": 0.58203125, + "learning_rate": 1.4675423086193406e-06, + "loss": 0.2445, + "step": 28503 + }, + { + "epoch": 3.8, + "grad_norm": 0.515625, + "learning_rate": 1.465555275060626e-06, + "loss": 0.1756, + "step": 28504 + }, + { + "epoch": 3.8, + "grad_norm": 0.7734375, + "learning_rate": 1.4635695776922365e-06, + "loss": 0.3117, + "step": 28505 + }, + { + "epoch": 3.8, + "grad_norm": 0.8203125, + "learning_rate": 1.4615852165410726e-06, + "loss": 0.2155, + "step": 28506 + }, + { + "epoch": 3.8, + "grad_norm": 0.6875, + "learning_rate": 1.4596021916340463e-06, + "loss": 0.2719, + "step": 28507 + }, + { + "epoch": 3.8, + "grad_norm": 0.7109375, + "learning_rate": 1.4576205029980694e-06, + "loss": 0.3571, + "step": 28508 + }, + { + "epoch": 3.8, + "grad_norm": 0.75390625, + "learning_rate": 1.4556401506599982e-06, + "loss": 0.2957, + "step": 28509 + }, + { + "epoch": 3.8, + "grad_norm": 0.62890625, + "learning_rate": 1.453661134646689e-06, + "loss": 0.5182, + "step": 28510 + }, + { + "epoch": 3.8, + "grad_norm": 0.546875, + "learning_rate": 1.4516834549849868e-06, + "loss": 0.3683, + "step": 28511 + }, + { + "epoch": 3.8, + "grad_norm": 0.69921875, + "learning_rate": 1.4497071117017036e-06, + "loss": 0.4294, + "step": 28512 + }, + { + "epoch": 3.8, + "grad_norm": 0.6328125, + "learning_rate": 1.4477321048236403e-06, + "loss": 0.2963, + "step": 28513 + }, + { + "epoch": 3.8, + "grad_norm": 0.486328125, + "learning_rate": 1.4457584343775865e-06, + "loss": 0.2374, + "step": 28514 + }, + { + "epoch": 3.81, + "grad_norm": 0.6328125, + "learning_rate": 1.4437861003902874e-06, + "loss": 0.3805, + "step": 28515 + }, + { + "epoch": 3.81, + "grad_norm": 0.486328125, + "learning_rate": 1.4418151028885107e-06, + "loss": 0.2289, + "step": 28516 + }, + { + "epoch": 3.81, + "grad_norm": 0.625, + "learning_rate": 1.4398454418989683e-06, + "loss": 0.417, + "step": 28517 + }, + { + "epoch": 3.81, + "grad_norm": 0.64453125, + "learning_rate": 1.437877117448383e-06, + "loss": 0.3846, + "step": 28518 + }, + { + "epoch": 3.81, + "grad_norm": 0.50390625, + "learning_rate": 1.4359101295634447e-06, + "loss": 0.2021, + "step": 28519 + }, + { + "epoch": 3.81, + "grad_norm": 0.7265625, + "learning_rate": 1.4339444782708101e-06, + "loss": 0.2436, + "step": 28520 + }, + { + "epoch": 3.81, + "grad_norm": 0.52734375, + "learning_rate": 1.4319801635971575e-06, + "loss": 0.2787, + "step": 28521 + }, + { + "epoch": 3.81, + "grad_norm": 0.65234375, + "learning_rate": 1.430017185569099e-06, + "loss": 0.4932, + "step": 28522 + }, + { + "epoch": 3.81, + "grad_norm": 0.53125, + "learning_rate": 1.4280555442132914e-06, + "loss": 0.2701, + "step": 28523 + }, + { + "epoch": 3.81, + "grad_norm": 0.470703125, + "learning_rate": 1.426095239556302e-06, + "loss": 0.168, + "step": 28524 + }, + { + "epoch": 3.81, + "grad_norm": 0.6484375, + "learning_rate": 1.424136271624732e-06, + "loss": 0.2936, + "step": 28525 + }, + { + "epoch": 3.81, + "grad_norm": 0.78515625, + "learning_rate": 1.4221786404451376e-06, + "loss": 0.3422, + "step": 28526 + }, + { + "epoch": 3.81, + "grad_norm": 0.640625, + "learning_rate": 1.4202223460440645e-06, + "loss": 0.3205, + "step": 28527 + }, + { + "epoch": 3.81, + "grad_norm": 0.6953125, + "learning_rate": 1.4182673884480469e-06, + "loss": 0.5287, + "step": 28528 + }, + { + "epoch": 3.81, + "grad_norm": 0.578125, + "learning_rate": 1.4163137676835858e-06, + "loss": 0.2638, + "step": 28529 + }, + { + "epoch": 3.81, + "grad_norm": 0.51953125, + "learning_rate": 1.4143614837771935e-06, + "loss": 0.2003, + "step": 28530 + }, + { + "epoch": 3.81, + "grad_norm": 0.57421875, + "learning_rate": 1.4124105367553153e-06, + "loss": 0.312, + "step": 28531 + }, + { + "epoch": 3.81, + "grad_norm": 0.7578125, + "learning_rate": 1.4104609266444413e-06, + "loss": 0.6329, + "step": 28532 + }, + { + "epoch": 3.81, + "grad_norm": 0.70703125, + "learning_rate": 1.4085126534709725e-06, + "loss": 0.416, + "step": 28533 + }, + { + "epoch": 3.81, + "grad_norm": 0.66015625, + "learning_rate": 1.4065657172613656e-06, + "loss": 0.4752, + "step": 28534 + }, + { + "epoch": 3.81, + "grad_norm": 0.48046875, + "learning_rate": 1.4046201180419883e-06, + "loss": 0.3041, + "step": 28535 + }, + { + "epoch": 3.81, + "grad_norm": 0.578125, + "learning_rate": 1.4026758558392529e-06, + "loss": 0.3099, + "step": 28536 + }, + { + "epoch": 3.81, + "grad_norm": 0.66015625, + "learning_rate": 1.4007329306795047e-06, + "loss": 0.3506, + "step": 28537 + }, + { + "epoch": 3.81, + "grad_norm": 0.65234375, + "learning_rate": 1.3987913425891118e-06, + "loss": 0.244, + "step": 28538 + }, + { + "epoch": 3.81, + "grad_norm": 0.53515625, + "learning_rate": 1.3968510915943756e-06, + "loss": 0.1543, + "step": 28539 + }, + { + "epoch": 3.81, + "grad_norm": 0.73828125, + "learning_rate": 1.394912177721619e-06, + "loss": 0.4118, + "step": 28540 + }, + { + "epoch": 3.81, + "grad_norm": 0.58984375, + "learning_rate": 1.3929746009971433e-06, + "loss": 0.4674, + "step": 28541 + }, + { + "epoch": 3.81, + "grad_norm": 0.5390625, + "learning_rate": 1.3910383614472277e-06, + "loss": 0.26, + "step": 28542 + }, + { + "epoch": 3.81, + "grad_norm": 0.52734375, + "learning_rate": 1.3891034590981067e-06, + "loss": 0.1986, + "step": 28543 + }, + { + "epoch": 3.81, + "grad_norm": 0.66015625, + "learning_rate": 1.3871698939760259e-06, + "loss": 0.3763, + "step": 28544 + }, + { + "epoch": 3.81, + "grad_norm": 0.65625, + "learning_rate": 1.3852376661072197e-06, + "loss": 0.3072, + "step": 28545 + }, + { + "epoch": 3.81, + "grad_norm": 0.61328125, + "learning_rate": 1.3833067755178898e-06, + "loss": 0.4546, + "step": 28546 + }, + { + "epoch": 3.81, + "grad_norm": 0.74609375, + "learning_rate": 1.3813772222341925e-06, + "loss": 0.4569, + "step": 28547 + }, + { + "epoch": 3.81, + "grad_norm": 0.462890625, + "learning_rate": 1.3794490062823296e-06, + "loss": 0.2032, + "step": 28548 + }, + { + "epoch": 3.81, + "grad_norm": 0.6640625, + "learning_rate": 1.3775221276884132e-06, + "loss": 0.2789, + "step": 28549 + }, + { + "epoch": 3.81, + "grad_norm": 0.73046875, + "learning_rate": 1.3755965864786114e-06, + "loss": 0.5857, + "step": 28550 + }, + { + "epoch": 3.81, + "grad_norm": 0.60546875, + "learning_rate": 1.3736723826790143e-06, + "loss": 0.4267, + "step": 28551 + }, + { + "epoch": 3.81, + "grad_norm": 0.55859375, + "learning_rate": 1.3717495163157124e-06, + "loss": 0.375, + "step": 28552 + }, + { + "epoch": 3.81, + "grad_norm": 0.64453125, + "learning_rate": 1.3698279874147846e-06, + "loss": 0.2829, + "step": 28553 + }, + { + "epoch": 3.81, + "grad_norm": 0.62890625, + "learning_rate": 1.3679077960022878e-06, + "loss": 0.4686, + "step": 28554 + }, + { + "epoch": 3.81, + "grad_norm": 0.6328125, + "learning_rate": 1.3659889421042793e-06, + "loss": 0.3021, + "step": 28555 + }, + { + "epoch": 3.81, + "grad_norm": 0.671875, + "learning_rate": 1.3640714257467491e-06, + "loss": 0.3268, + "step": 28556 + }, + { + "epoch": 3.81, + "grad_norm": 0.474609375, + "learning_rate": 1.3621552469557097e-06, + "loss": 0.2035, + "step": 28557 + }, + { + "epoch": 3.81, + "grad_norm": 0.609375, + "learning_rate": 1.3602404057571628e-06, + "loss": 0.2178, + "step": 28558 + }, + { + "epoch": 3.81, + "grad_norm": 0.58203125, + "learning_rate": 1.358326902177054e-06, + "loss": 0.3405, + "step": 28559 + }, + { + "epoch": 3.81, + "grad_norm": 0.478515625, + "learning_rate": 1.3564147362413516e-06, + "loss": 0.2039, + "step": 28560 + }, + { + "epoch": 3.81, + "grad_norm": 0.8203125, + "learning_rate": 1.3545039079759569e-06, + "loss": 0.4396, + "step": 28561 + }, + { + "epoch": 3.81, + "grad_norm": 0.67578125, + "learning_rate": 1.3525944174068161e-06, + "loss": 0.2646, + "step": 28562 + }, + { + "epoch": 3.81, + "grad_norm": 0.6328125, + "learning_rate": 1.3506862645597973e-06, + "loss": 0.2018, + "step": 28563 + }, + { + "epoch": 3.81, + "grad_norm": 0.5859375, + "learning_rate": 1.3487794494607908e-06, + "loss": 0.2652, + "step": 28564 + }, + { + "epoch": 3.81, + "grad_norm": 0.69921875, + "learning_rate": 1.3468739721356428e-06, + "loss": 0.1726, + "step": 28565 + }, + { + "epoch": 3.81, + "grad_norm": 0.6015625, + "learning_rate": 1.344969832610199e-06, + "loss": 0.419, + "step": 28566 + }, + { + "epoch": 3.81, + "grad_norm": 0.59375, + "learning_rate": 1.3430670309102834e-06, + "loss": 0.2124, + "step": 28567 + }, + { + "epoch": 3.81, + "grad_norm": 0.50390625, + "learning_rate": 1.3411655670616973e-06, + "loss": 0.1693, + "step": 28568 + }, + { + "epoch": 3.81, + "grad_norm": 0.58203125, + "learning_rate": 1.3392654410902316e-06, + "loss": 0.2376, + "step": 28569 + }, + { + "epoch": 3.81, + "grad_norm": 0.5390625, + "learning_rate": 1.3373666530216323e-06, + "loss": 0.1525, + "step": 28570 + }, + { + "epoch": 3.81, + "grad_norm": 0.58984375, + "learning_rate": 1.3354692028816673e-06, + "loss": 0.3611, + "step": 28571 + }, + { + "epoch": 3.81, + "grad_norm": 0.734375, + "learning_rate": 1.3335730906960608e-06, + "loss": 0.3371, + "step": 28572 + }, + { + "epoch": 3.81, + "grad_norm": 0.75, + "learning_rate": 1.3316783164905367e-06, + "loss": 0.4984, + "step": 28573 + }, + { + "epoch": 3.81, + "grad_norm": 0.55078125, + "learning_rate": 1.3297848802907743e-06, + "loss": 0.3501, + "step": 28574 + }, + { + "epoch": 3.81, + "grad_norm": 0.51953125, + "learning_rate": 1.327892782122453e-06, + "loss": 0.1729, + "step": 28575 + }, + { + "epoch": 3.81, + "grad_norm": 0.52734375, + "learning_rate": 1.3260020220112302e-06, + "loss": 0.1773, + "step": 28576 + }, + { + "epoch": 3.81, + "grad_norm": 0.6640625, + "learning_rate": 1.324112599982752e-06, + "loss": 0.2615, + "step": 28577 + }, + { + "epoch": 3.81, + "grad_norm": 0.54296875, + "learning_rate": 1.3222245160626423e-06, + "loss": 0.3581, + "step": 28578 + }, + { + "epoch": 3.81, + "grad_norm": 0.7109375, + "learning_rate": 1.3203377702764918e-06, + "loss": 0.4768, + "step": 28579 + }, + { + "epoch": 3.81, + "grad_norm": 0.474609375, + "learning_rate": 1.3184523626498913e-06, + "loss": 0.1453, + "step": 28580 + }, + { + "epoch": 3.81, + "grad_norm": 0.5859375, + "learning_rate": 1.316568293208409e-06, + "loss": 0.3508, + "step": 28581 + }, + { + "epoch": 3.81, + "grad_norm": 0.76171875, + "learning_rate": 1.3146855619776134e-06, + "loss": 0.3438, + "step": 28582 + }, + { + "epoch": 3.81, + "grad_norm": 0.64453125, + "learning_rate": 1.3128041689829951e-06, + "loss": 0.2856, + "step": 28583 + }, + { + "epoch": 3.81, + "grad_norm": 0.5234375, + "learning_rate": 1.3109241142501005e-06, + "loss": 0.2058, + "step": 28584 + }, + { + "epoch": 3.81, + "grad_norm": 0.6015625, + "learning_rate": 1.3090453978044093e-06, + "loss": 0.3742, + "step": 28585 + }, + { + "epoch": 3.81, + "grad_norm": 0.38671875, + "learning_rate": 1.307168019671412e-06, + "loss": 0.3082, + "step": 28586 + }, + { + "epoch": 3.81, + "grad_norm": 0.69140625, + "learning_rate": 1.305291979876544e-06, + "loss": 0.5482, + "step": 28587 + }, + { + "epoch": 3.81, + "grad_norm": 0.61328125, + "learning_rate": 1.3034172784452624e-06, + "loss": 0.4443, + "step": 28588 + }, + { + "epoch": 3.81, + "grad_norm": 0.46875, + "learning_rate": 1.3015439154029917e-06, + "loss": 0.1592, + "step": 28589 + }, + { + "epoch": 3.82, + "grad_norm": 0.68359375, + "learning_rate": 1.2996718907751227e-06, + "loss": 0.5496, + "step": 28590 + }, + { + "epoch": 3.82, + "grad_norm": 0.69140625, + "learning_rate": 1.2978012045870457e-06, + "loss": 0.2825, + "step": 28591 + }, + { + "epoch": 3.82, + "grad_norm": 0.57421875, + "learning_rate": 1.295931856864141e-06, + "loss": 0.3206, + "step": 28592 + }, + { + "epoch": 3.82, + "grad_norm": 0.7265625, + "learning_rate": 1.2940638476317435e-06, + "loss": 0.4242, + "step": 28593 + }, + { + "epoch": 3.82, + "grad_norm": 0.625, + "learning_rate": 1.2921971769151774e-06, + "loss": 0.3651, + "step": 28594 + }, + { + "epoch": 3.82, + "grad_norm": 0.5390625, + "learning_rate": 1.2903318447397783e-06, + "loss": 0.2677, + "step": 28595 + }, + { + "epoch": 3.82, + "grad_norm": 0.6640625, + "learning_rate": 1.2884678511308367e-06, + "loss": 0.2691, + "step": 28596 + }, + { + "epoch": 3.82, + "grad_norm": 0.69921875, + "learning_rate": 1.2866051961136215e-06, + "loss": 0.3645, + "step": 28597 + }, + { + "epoch": 3.82, + "grad_norm": 0.80859375, + "learning_rate": 1.2847438797133904e-06, + "loss": 0.3045, + "step": 28598 + }, + { + "epoch": 3.82, + "grad_norm": 0.44921875, + "learning_rate": 1.2828839019553896e-06, + "loss": 0.232, + "step": 28599 + }, + { + "epoch": 3.82, + "grad_norm": 0.5703125, + "learning_rate": 1.2810252628648323e-06, + "loss": 0.29, + "step": 28600 + }, + { + "epoch": 3.82, + "grad_norm": 0.60546875, + "learning_rate": 1.2791679624669428e-06, + "loss": 0.3289, + "step": 28601 + }, + { + "epoch": 3.82, + "grad_norm": 0.68359375, + "learning_rate": 1.277312000786879e-06, + "loss": 0.1758, + "step": 28602 + }, + { + "epoch": 3.82, + "grad_norm": 0.79296875, + "learning_rate": 1.2754573778498313e-06, + "loss": 0.3861, + "step": 28603 + }, + { + "epoch": 3.82, + "grad_norm": 0.55078125, + "learning_rate": 1.2736040936809356e-06, + "loss": 0.1607, + "step": 28604 + }, + { + "epoch": 3.82, + "grad_norm": 0.69140625, + "learning_rate": 1.2717521483053496e-06, + "loss": 0.1766, + "step": 28605 + }, + { + "epoch": 3.82, + "grad_norm": 0.58984375, + "learning_rate": 1.269901541748153e-06, + "loss": 0.5605, + "step": 28606 + }, + { + "epoch": 3.82, + "grad_norm": 0.78125, + "learning_rate": 1.268052274034448e-06, + "loss": 0.4685, + "step": 28607 + }, + { + "epoch": 3.82, + "grad_norm": 0.67578125, + "learning_rate": 1.2662043451893369e-06, + "loss": 0.4778, + "step": 28608 + }, + { + "epoch": 3.82, + "grad_norm": 0.6484375, + "learning_rate": 1.264357755237855e-06, + "loss": 0.3058, + "step": 28609 + }, + { + "epoch": 3.82, + "grad_norm": 0.5625, + "learning_rate": 1.262512504205049e-06, + "loss": 0.1636, + "step": 28610 + }, + { + "epoch": 3.82, + "grad_norm": 0.66015625, + "learning_rate": 1.2606685921159544e-06, + "loss": 0.3504, + "step": 28611 + }, + { + "epoch": 3.82, + "grad_norm": 0.73046875, + "learning_rate": 1.2588260189955513e-06, + "loss": 0.4713, + "step": 28612 + }, + { + "epoch": 3.82, + "grad_norm": 0.56640625, + "learning_rate": 1.256984784868842e-06, + "loss": 0.3264, + "step": 28613 + }, + { + "epoch": 3.82, + "grad_norm": 0.6640625, + "learning_rate": 1.2551448897607842e-06, + "loss": 0.341, + "step": 28614 + }, + { + "epoch": 3.82, + "grad_norm": 0.65234375, + "learning_rate": 1.2533063336963468e-06, + "loss": 0.2346, + "step": 28615 + }, + { + "epoch": 3.82, + "grad_norm": 0.56640625, + "learning_rate": 1.2514691167004433e-06, + "loss": 0.4124, + "step": 28616 + }, + { + "epoch": 3.82, + "grad_norm": 0.6484375, + "learning_rate": 1.2496332387979982e-06, + "loss": 0.1855, + "step": 28617 + }, + { + "epoch": 3.82, + "grad_norm": 0.5234375, + "learning_rate": 1.2477987000139024e-06, + "loss": 0.2063, + "step": 28618 + }, + { + "epoch": 3.82, + "grad_norm": 0.57421875, + "learning_rate": 1.2459655003730475e-06, + "loss": 0.3467, + "step": 28619 + }, + { + "epoch": 3.82, + "grad_norm": 0.625, + "learning_rate": 1.244133639900269e-06, + "loss": 0.2192, + "step": 28620 + }, + { + "epoch": 3.82, + "grad_norm": 0.57421875, + "learning_rate": 1.2423031186204138e-06, + "loss": 0.4353, + "step": 28621 + }, + { + "epoch": 3.82, + "grad_norm": 0.51171875, + "learning_rate": 1.2404739365583172e-06, + "loss": 0.1734, + "step": 28622 + }, + { + "epoch": 3.82, + "grad_norm": 0.5859375, + "learning_rate": 1.2386460937387822e-06, + "loss": 0.4746, + "step": 28623 + }, + { + "epoch": 3.82, + "grad_norm": 0.7890625, + "learning_rate": 1.2368195901865887e-06, + "loss": 0.5329, + "step": 28624 + }, + { + "epoch": 3.82, + "grad_norm": 0.75390625, + "learning_rate": 1.2349944259265056e-06, + "loss": 0.5159, + "step": 28625 + }, + { + "epoch": 3.82, + "grad_norm": 0.45703125, + "learning_rate": 1.2331706009832799e-06, + "loss": 0.3087, + "step": 28626 + }, + { + "epoch": 3.82, + "grad_norm": 0.482421875, + "learning_rate": 1.2313481153816475e-06, + "loss": 0.1511, + "step": 28627 + }, + { + "epoch": 3.82, + "grad_norm": 0.6640625, + "learning_rate": 1.229526969146333e-06, + "loss": 0.2442, + "step": 28628 + }, + { + "epoch": 3.82, + "grad_norm": 0.73046875, + "learning_rate": 1.2277071623020274e-06, + "loss": 0.4857, + "step": 28629 + }, + { + "epoch": 3.82, + "grad_norm": 0.58203125, + "learning_rate": 1.2258886948733895e-06, + "loss": 0.3202, + "step": 28630 + }, + { + "epoch": 3.82, + "grad_norm": 0.5625, + "learning_rate": 1.2240715668850989e-06, + "loss": 0.3431, + "step": 28631 + }, + { + "epoch": 3.82, + "grad_norm": 0.51171875, + "learning_rate": 1.2222557783618027e-06, + "loss": 0.3326, + "step": 28632 + }, + { + "epoch": 3.82, + "grad_norm": 0.54296875, + "learning_rate": 1.2204413293281036e-06, + "loss": 0.3593, + "step": 28633 + }, + { + "epoch": 3.82, + "grad_norm": 0.7578125, + "learning_rate": 1.2186282198086152e-06, + "loss": 0.5874, + "step": 28634 + }, + { + "epoch": 3.82, + "grad_norm": 0.578125, + "learning_rate": 1.216816449827929e-06, + "loss": 0.1804, + "step": 28635 + }, + { + "epoch": 3.82, + "grad_norm": 0.67578125, + "learning_rate": 1.2150060194106028e-06, + "loss": 0.5141, + "step": 28636 + }, + { + "epoch": 3.82, + "grad_norm": 0.498046875, + "learning_rate": 1.2131969285812062e-06, + "loss": 0.3481, + "step": 28637 + }, + { + "epoch": 3.82, + "grad_norm": 0.54296875, + "learning_rate": 1.2113891773642527e-06, + "loss": 0.2504, + "step": 28638 + }, + { + "epoch": 3.82, + "grad_norm": 0.66015625, + "learning_rate": 1.2095827657842673e-06, + "loss": 0.5206, + "step": 28639 + }, + { + "epoch": 3.82, + "grad_norm": 0.57421875, + "learning_rate": 1.2077776938657416e-06, + "loss": 0.2983, + "step": 28640 + }, + { + "epoch": 3.82, + "grad_norm": 0.77734375, + "learning_rate": 1.2059739616331555e-06, + "loss": 0.5091, + "step": 28641 + }, + { + "epoch": 3.82, + "grad_norm": 0.59375, + "learning_rate": 1.2041715691109678e-06, + "loss": 0.2376, + "step": 28642 + }, + { + "epoch": 3.82, + "grad_norm": 0.494140625, + "learning_rate": 1.2023705163236143e-06, + "loss": 0.2619, + "step": 28643 + }, + { + "epoch": 3.82, + "grad_norm": 0.734375, + "learning_rate": 1.200570803295531e-06, + "loss": 0.3189, + "step": 28644 + }, + { + "epoch": 3.82, + "grad_norm": 0.67578125, + "learning_rate": 1.1987724300511093e-06, + "loss": 0.2875, + "step": 28645 + }, + { + "epoch": 3.82, + "grad_norm": 0.5859375, + "learning_rate": 1.1969753966147634e-06, + "loss": 0.3075, + "step": 28646 + }, + { + "epoch": 3.82, + "grad_norm": 0.6796875, + "learning_rate": 1.195179703010829e-06, + "loss": 0.2677, + "step": 28647 + }, + { + "epoch": 3.82, + "grad_norm": 0.5078125, + "learning_rate": 1.1933853492636648e-06, + "loss": 0.245, + "step": 28648 + }, + { + "epoch": 3.82, + "grad_norm": 0.75, + "learning_rate": 1.1915923353976178e-06, + "loss": 0.4184, + "step": 28649 + }, + { + "epoch": 3.82, + "grad_norm": 0.58203125, + "learning_rate": 1.1898006614369906e-06, + "loss": 0.3574, + "step": 28650 + }, + { + "epoch": 3.82, + "grad_norm": 0.53515625, + "learning_rate": 1.188010327406086e-06, + "loss": 0.2638, + "step": 28651 + }, + { + "epoch": 3.82, + "grad_norm": 0.68359375, + "learning_rate": 1.186221333329174e-06, + "loss": 0.2415, + "step": 28652 + }, + { + "epoch": 3.82, + "grad_norm": 0.62890625, + "learning_rate": 1.184433679230512e-06, + "loss": 0.3336, + "step": 28653 + }, + { + "epoch": 3.82, + "grad_norm": 0.5078125, + "learning_rate": 1.1826473651343594e-06, + "loss": 0.3929, + "step": 28654 + }, + { + "epoch": 3.82, + "grad_norm": 0.6015625, + "learning_rate": 1.1808623910649298e-06, + "loss": 0.2286, + "step": 28655 + }, + { + "epoch": 3.82, + "grad_norm": 0.50390625, + "learning_rate": 1.1790787570464256e-06, + "loss": 0.2594, + "step": 28656 + }, + { + "epoch": 3.82, + "grad_norm": 0.68359375, + "learning_rate": 1.1772964631030281e-06, + "loss": 0.5378, + "step": 28657 + }, + { + "epoch": 3.82, + "grad_norm": 0.58984375, + "learning_rate": 1.1755155092589177e-06, + "loss": 0.3159, + "step": 28658 + }, + { + "epoch": 3.82, + "grad_norm": 0.671875, + "learning_rate": 1.1737358955382528e-06, + "loss": 0.6245, + "step": 28659 + }, + { + "epoch": 3.82, + "grad_norm": 0.55859375, + "learning_rate": 1.1719576219651585e-06, + "loss": 0.1847, + "step": 28660 + }, + { + "epoch": 3.82, + "grad_norm": 0.50390625, + "learning_rate": 1.1701806885637268e-06, + "loss": 0.3643, + "step": 28661 + }, + { + "epoch": 3.82, + "grad_norm": 0.5390625, + "learning_rate": 1.1684050953580828e-06, + "loss": 0.2561, + "step": 28662 + }, + { + "epoch": 3.82, + "grad_norm": 0.6796875, + "learning_rate": 1.166630842372296e-06, + "loss": 0.5294, + "step": 28663 + }, + { + "epoch": 3.82, + "grad_norm": 0.7109375, + "learning_rate": 1.1648579296304253e-06, + "loss": 0.2856, + "step": 28664 + }, + { + "epoch": 3.83, + "grad_norm": 0.46484375, + "learning_rate": 1.1630863571565176e-06, + "loss": 0.2314, + "step": 28665 + }, + { + "epoch": 3.83, + "grad_norm": 0.52734375, + "learning_rate": 1.1613161249745875e-06, + "loss": 0.2259, + "step": 28666 + }, + { + "epoch": 3.83, + "grad_norm": 0.6640625, + "learning_rate": 1.1595472331086377e-06, + "loss": 0.3546, + "step": 28667 + }, + { + "epoch": 3.83, + "grad_norm": 0.4375, + "learning_rate": 1.1577796815826715e-06, + "loss": 0.1612, + "step": 28668 + }, + { + "epoch": 3.83, + "grad_norm": 0.49609375, + "learning_rate": 1.1560134704206471e-06, + "loss": 0.4296, + "step": 28669 + }, + { + "epoch": 3.83, + "grad_norm": 0.75, + "learning_rate": 1.1542485996465236e-06, + "loss": 0.1986, + "step": 28670 + }, + { + "epoch": 3.83, + "grad_norm": 0.5234375, + "learning_rate": 1.1524850692842258e-06, + "loss": 0.1903, + "step": 28671 + }, + { + "epoch": 3.83, + "grad_norm": 0.51953125, + "learning_rate": 1.1507228793576575e-06, + "loss": 0.2596, + "step": 28672 + }, + { + "epoch": 3.83, + "grad_norm": 0.59375, + "learning_rate": 1.1489620298907434e-06, + "loss": 0.289, + "step": 28673 + }, + { + "epoch": 3.83, + "grad_norm": 0.6484375, + "learning_rate": 1.1472025209073422e-06, + "loss": 0.3923, + "step": 28674 + }, + { + "epoch": 3.83, + "grad_norm": 0.498046875, + "learning_rate": 1.145444352431313e-06, + "loss": 0.1951, + "step": 28675 + }, + { + "epoch": 3.83, + "grad_norm": 0.66796875, + "learning_rate": 1.1436875244865031e-06, + "loss": 0.2638, + "step": 28676 + }, + { + "epoch": 3.83, + "grad_norm": 0.68359375, + "learning_rate": 1.1419320370967378e-06, + "loss": 0.5302, + "step": 28677 + }, + { + "epoch": 3.83, + "grad_norm": 0.5703125, + "learning_rate": 1.1401778902858319e-06, + "loss": 0.2462, + "step": 28678 + }, + { + "epoch": 3.83, + "grad_norm": 0.55078125, + "learning_rate": 1.1384250840775434e-06, + "loss": 0.3631, + "step": 28679 + }, + { + "epoch": 3.83, + "grad_norm": 0.87109375, + "learning_rate": 1.136673618495665e-06, + "loss": 0.5008, + "step": 28680 + }, + { + "epoch": 3.83, + "grad_norm": 0.546875, + "learning_rate": 1.134923493563944e-06, + "loss": 0.1489, + "step": 28681 + }, + { + "epoch": 3.83, + "grad_norm": 0.671875, + "learning_rate": 1.1331747093061173e-06, + "loss": 0.1953, + "step": 28682 + }, + { + "epoch": 3.83, + "grad_norm": 0.50390625, + "learning_rate": 1.1314272657458991e-06, + "loss": 0.2818, + "step": 28683 + }, + { + "epoch": 3.83, + "grad_norm": 0.5234375, + "learning_rate": 1.1296811629069703e-06, + "loss": 0.3052, + "step": 28684 + }, + { + "epoch": 3.83, + "grad_norm": 0.63671875, + "learning_rate": 1.1279364008130232e-06, + "loss": 0.484, + "step": 28685 + }, + { + "epoch": 3.83, + "grad_norm": 0.5078125, + "learning_rate": 1.1261929794877056e-06, + "loss": 0.1124, + "step": 28686 + }, + { + "epoch": 3.83, + "grad_norm": 0.62109375, + "learning_rate": 1.1244508989546654e-06, + "loss": 0.3099, + "step": 28687 + }, + { + "epoch": 3.83, + "grad_norm": 0.57421875, + "learning_rate": 1.12271015923755e-06, + "loss": 0.3353, + "step": 28688 + }, + { + "epoch": 3.83, + "grad_norm": 0.53515625, + "learning_rate": 1.1209707603599295e-06, + "loss": 0.2549, + "step": 28689 + }, + { + "epoch": 3.83, + "grad_norm": 0.609375, + "learning_rate": 1.1192327023453964e-06, + "loss": 0.2808, + "step": 28690 + }, + { + "epoch": 3.83, + "grad_norm": 0.60546875, + "learning_rate": 1.1174959852175427e-06, + "loss": 0.3251, + "step": 28691 + }, + { + "epoch": 3.83, + "grad_norm": 0.6328125, + "learning_rate": 1.115760608999905e-06, + "loss": 0.2109, + "step": 28692 + }, + { + "epoch": 3.83, + "grad_norm": 0.7421875, + "learning_rate": 1.1140265737160204e-06, + "loss": 0.4988, + "step": 28693 + }, + { + "epoch": 3.83, + "grad_norm": 0.44921875, + "learning_rate": 1.1122938793893923e-06, + "loss": 0.1932, + "step": 28694 + }, + { + "epoch": 3.83, + "grad_norm": 0.640625, + "learning_rate": 1.1105625260435237e-06, + "loss": 0.1776, + "step": 28695 + }, + { + "epoch": 3.83, + "grad_norm": 0.5546875, + "learning_rate": 1.1088325137018961e-06, + "loss": 0.241, + "step": 28696 + }, + { + "epoch": 3.83, + "grad_norm": 0.55078125, + "learning_rate": 1.1071038423879798e-06, + "loss": 0.2296, + "step": 28697 + }, + { + "epoch": 3.83, + "grad_norm": 0.5703125, + "learning_rate": 1.105376512125189e-06, + "loss": 0.3111, + "step": 28698 + }, + { + "epoch": 3.83, + "grad_norm": 0.6796875, + "learning_rate": 1.103650522936961e-06, + "loss": 0.3447, + "step": 28699 + }, + { + "epoch": 3.83, + "grad_norm": 0.455078125, + "learning_rate": 1.1019258748467098e-06, + "loss": 0.1623, + "step": 28700 + }, + { + "epoch": 3.83, + "grad_norm": 0.6796875, + "learning_rate": 1.1002025678778283e-06, + "loss": 0.4237, + "step": 28701 + }, + { + "epoch": 3.83, + "grad_norm": 0.4296875, + "learning_rate": 1.098480602053653e-06, + "loss": 0.1821, + "step": 28702 + }, + { + "epoch": 3.83, + "grad_norm": 0.5390625, + "learning_rate": 1.0967599773975657e-06, + "loss": 0.2437, + "step": 28703 + }, + { + "epoch": 3.83, + "grad_norm": 0.74609375, + "learning_rate": 1.0950406939328806e-06, + "loss": 0.2664, + "step": 28704 + }, + { + "epoch": 3.83, + "grad_norm": 0.65234375, + "learning_rate": 1.0933227516829347e-06, + "loss": 0.2346, + "step": 28705 + }, + { + "epoch": 3.83, + "grad_norm": 0.48828125, + "learning_rate": 1.0916061506709985e-06, + "loss": 0.1731, + "step": 28706 + }, + { + "epoch": 3.83, + "grad_norm": 0.56640625, + "learning_rate": 1.0898908909203643e-06, + "loss": 0.1994, + "step": 28707 + }, + { + "epoch": 3.83, + "grad_norm": 0.6953125, + "learning_rate": 1.0881769724542912e-06, + "loss": 0.1955, + "step": 28708 + }, + { + "epoch": 3.83, + "grad_norm": 0.921875, + "learning_rate": 1.0864643952960162e-06, + "loss": 0.3066, + "step": 28709 + }, + { + "epoch": 3.83, + "grad_norm": 0.75390625, + "learning_rate": 1.0847531594687766e-06, + "loss": 0.4436, + "step": 28710 + }, + { + "epoch": 3.83, + "grad_norm": 0.84375, + "learning_rate": 1.0830432649957534e-06, + "loss": 0.2203, + "step": 28711 + }, + { + "epoch": 3.83, + "grad_norm": 0.486328125, + "learning_rate": 1.0813347119001505e-06, + "loss": 0.17, + "step": 28712 + }, + { + "epoch": 3.83, + "grad_norm": 0.5859375, + "learning_rate": 1.079627500205138e-06, + "loss": 0.2714, + "step": 28713 + }, + { + "epoch": 3.83, + "grad_norm": 0.609375, + "learning_rate": 1.0779216299338645e-06, + "loss": 0.1727, + "step": 28714 + }, + { + "epoch": 3.83, + "grad_norm": 0.6953125, + "learning_rate": 1.0762171011094668e-06, + "loss": 0.5137, + "step": 28715 + }, + { + "epoch": 3.83, + "grad_norm": 0.51953125, + "learning_rate": 1.0745139137550486e-06, + "loss": 0.154, + "step": 28716 + }, + { + "epoch": 3.83, + "grad_norm": 0.64453125, + "learning_rate": 1.0728120678937025e-06, + "loss": 0.1835, + "step": 28717 + }, + { + "epoch": 3.83, + "grad_norm": 0.56640625, + "learning_rate": 1.0711115635485213e-06, + "loss": 0.3501, + "step": 28718 + }, + { + "epoch": 3.83, + "grad_norm": 0.7265625, + "learning_rate": 1.0694124007425753e-06, + "loss": 0.2557, + "step": 28719 + }, + { + "epoch": 3.83, + "grad_norm": 0.69140625, + "learning_rate": 1.0677145794988798e-06, + "loss": 0.4867, + "step": 28720 + }, + { + "epoch": 3.83, + "grad_norm": 0.671875, + "learning_rate": 1.0660180998404601e-06, + "loss": 0.2782, + "step": 28721 + }, + { + "epoch": 3.83, + "grad_norm": 0.70703125, + "learning_rate": 1.0643229617903428e-06, + "loss": 0.4406, + "step": 28722 + }, + { + "epoch": 3.83, + "grad_norm": 0.51171875, + "learning_rate": 1.062629165371487e-06, + "loss": 0.1689, + "step": 28723 + }, + { + "epoch": 3.83, + "grad_norm": 0.59375, + "learning_rate": 1.0609367106068968e-06, + "loss": 0.4284, + "step": 28724 + }, + { + "epoch": 3.83, + "grad_norm": 0.59375, + "learning_rate": 1.059245597519487e-06, + "loss": 0.2104, + "step": 28725 + }, + { + "epoch": 3.83, + "grad_norm": 0.8203125, + "learning_rate": 1.057555826132217e-06, + "loss": 0.6251, + "step": 28726 + }, + { + "epoch": 3.83, + "grad_norm": 0.56640625, + "learning_rate": 1.05586739646798e-06, + "loss": 0.3001, + "step": 28727 + }, + { + "epoch": 3.83, + "grad_norm": 0.44921875, + "learning_rate": 1.0541803085497014e-06, + "loss": 0.2665, + "step": 28728 + }, + { + "epoch": 3.83, + "grad_norm": 0.52734375, + "learning_rate": 1.052494562400219e-06, + "loss": 0.3042, + "step": 28729 + }, + { + "epoch": 3.83, + "grad_norm": 0.65234375, + "learning_rate": 1.0508101580424257e-06, + "loss": 0.2777, + "step": 28730 + }, + { + "epoch": 3.83, + "grad_norm": 0.7578125, + "learning_rate": 1.0491270954991472e-06, + "loss": 0.3583, + "step": 28731 + }, + { + "epoch": 3.83, + "grad_norm": 0.65234375, + "learning_rate": 1.0474453747932211e-06, + "loss": 0.2158, + "step": 28732 + }, + { + "epoch": 3.83, + "grad_norm": 0.6015625, + "learning_rate": 1.0457649959474402e-06, + "loss": 0.3481, + "step": 28733 + }, + { + "epoch": 3.83, + "grad_norm": 0.67578125, + "learning_rate": 1.0440859589845974e-06, + "loss": 0.259, + "step": 28734 + }, + { + "epoch": 3.83, + "grad_norm": 0.73828125, + "learning_rate": 1.0424082639274523e-06, + "loss": 0.429, + "step": 28735 + }, + { + "epoch": 3.83, + "grad_norm": 0.31640625, + "learning_rate": 1.0407319107987535e-06, + "loss": 0.1068, + "step": 28736 + }, + { + "epoch": 3.83, + "grad_norm": 0.51171875, + "learning_rate": 1.039056899621249e-06, + "loss": 0.1185, + "step": 28737 + }, + { + "epoch": 3.83, + "grad_norm": 0.71484375, + "learning_rate": 1.0373832304176545e-06, + "loss": 0.5232, + "step": 28738 + }, + { + "epoch": 3.83, + "grad_norm": 0.625, + "learning_rate": 1.0357109032106405e-06, + "loss": 0.3047, + "step": 28739 + }, + { + "epoch": 3.84, + "grad_norm": 0.80859375, + "learning_rate": 1.034039918022911e-06, + "loss": 0.6258, + "step": 28740 + }, + { + "epoch": 3.84, + "grad_norm": 0.734375, + "learning_rate": 1.0323702748771147e-06, + "loss": 0.5036, + "step": 28741 + }, + { + "epoch": 3.84, + "grad_norm": 0.458984375, + "learning_rate": 1.0307019737959e-06, + "loss": 0.3888, + "step": 28742 + }, + { + "epoch": 3.84, + "grad_norm": 0.5859375, + "learning_rate": 1.0290350148018823e-06, + "loss": 0.3546, + "step": 28743 + }, + { + "epoch": 3.84, + "grad_norm": 0.578125, + "learning_rate": 1.0273693979176657e-06, + "loss": 0.486, + "step": 28744 + }, + { + "epoch": 3.84, + "grad_norm": 0.5234375, + "learning_rate": 1.0257051231658432e-06, + "loss": 0.3278, + "step": 28745 + }, + { + "epoch": 3.84, + "grad_norm": 0.5390625, + "learning_rate": 1.0240421905689745e-06, + "loss": 0.1505, + "step": 28746 + }, + { + "epoch": 3.84, + "grad_norm": 0.80859375, + "learning_rate": 1.0223806001496194e-06, + "loss": 0.373, + "step": 28747 + }, + { + "epoch": 3.84, + "grad_norm": 0.6796875, + "learning_rate": 1.0207203519303154e-06, + "loss": 0.1712, + "step": 28748 + }, + { + "epoch": 3.84, + "grad_norm": 0.58203125, + "learning_rate": 1.0190614459335556e-06, + "loss": 0.2081, + "step": 28749 + }, + { + "epoch": 3.84, + "grad_norm": 0.44140625, + "learning_rate": 1.0174038821818443e-06, + "loss": 0.1673, + "step": 28750 + }, + { + "epoch": 3.84, + "grad_norm": 0.69140625, + "learning_rate": 1.0157476606976745e-06, + "loss": 0.3563, + "step": 28751 + }, + { + "epoch": 3.84, + "grad_norm": 0.640625, + "learning_rate": 1.0140927815034952e-06, + "loss": 0.439, + "step": 28752 + }, + { + "epoch": 3.84, + "grad_norm": 0.65234375, + "learning_rate": 1.0124392446217435e-06, + "loss": 0.2319, + "step": 28753 + }, + { + "epoch": 3.84, + "grad_norm": 0.51953125, + "learning_rate": 1.010787050074835e-06, + "loss": 0.2549, + "step": 28754 + }, + { + "epoch": 3.84, + "grad_norm": 0.66796875, + "learning_rate": 1.0091361978851966e-06, + "loss": 0.4564, + "step": 28755 + }, + { + "epoch": 3.84, + "grad_norm": 0.46484375, + "learning_rate": 1.0074866880752098e-06, + "loss": 0.2462, + "step": 28756 + }, + { + "epoch": 3.84, + "grad_norm": 0.62109375, + "learning_rate": 1.0058385206672239e-06, + "loss": 0.2678, + "step": 28757 + }, + { + "epoch": 3.84, + "grad_norm": 0.6015625, + "learning_rate": 1.0041916956835985e-06, + "loss": 0.2124, + "step": 28758 + }, + { + "epoch": 3.84, + "grad_norm": 0.66015625, + "learning_rate": 1.0025462131466824e-06, + "loss": 0.1591, + "step": 28759 + }, + { + "epoch": 3.84, + "grad_norm": 0.458984375, + "learning_rate": 1.0009020730787578e-06, + "loss": 0.2874, + "step": 28760 + }, + { + "epoch": 3.84, + "grad_norm": 0.55859375, + "learning_rate": 9.992592755021623e-07, + "loss": 0.3146, + "step": 28761 + }, + { + "epoch": 3.84, + "grad_norm": 0.546875, + "learning_rate": 9.97617820439134e-07, + "loss": 0.3722, + "step": 28762 + }, + { + "epoch": 3.84, + "grad_norm": 0.51953125, + "learning_rate": 9.959777079119436e-07, + "loss": 0.2609, + "step": 28763 + }, + { + "epoch": 3.84, + "grad_norm": 0.40234375, + "learning_rate": 9.943389379428403e-07, + "loss": 0.164, + "step": 28764 + }, + { + "epoch": 3.84, + "grad_norm": 0.58203125, + "learning_rate": 9.927015105540504e-07, + "loss": 0.2296, + "step": 28765 + }, + { + "epoch": 3.84, + "grad_norm": 0.515625, + "learning_rate": 9.910654257677566e-07, + "loss": 0.4045, + "step": 28766 + }, + { + "epoch": 3.84, + "grad_norm": 0.51953125, + "learning_rate": 9.894306836061629e-07, + "loss": 0.1012, + "step": 28767 + }, + { + "epoch": 3.84, + "grad_norm": 0.73828125, + "learning_rate": 9.877972840914406e-07, + "loss": 0.3045, + "step": 28768 + }, + { + "epoch": 3.84, + "grad_norm": 0.62890625, + "learning_rate": 9.861652272457278e-07, + "loss": 0.3372, + "step": 28769 + }, + { + "epoch": 3.84, + "grad_norm": 0.421875, + "learning_rate": 9.845345130911731e-07, + "loss": 0.1702, + "step": 28770 + }, + { + "epoch": 3.84, + "grad_norm": 0.419921875, + "learning_rate": 9.829051416498702e-07, + "loss": 0.1526, + "step": 28771 + }, + { + "epoch": 3.84, + "grad_norm": 0.51171875, + "learning_rate": 9.812771129439124e-07, + "loss": 0.2915, + "step": 28772 + }, + { + "epoch": 3.84, + "grad_norm": 0.625, + "learning_rate": 9.796504269953932e-07, + "loss": 0.1511, + "step": 28773 + }, + { + "epoch": 3.84, + "grad_norm": 0.609375, + "learning_rate": 9.780250838263728e-07, + "loss": 0.23, + "step": 28774 + }, + { + "epoch": 3.84, + "grad_norm": 0.69921875, + "learning_rate": 9.764010834588666e-07, + "loss": 0.2596, + "step": 28775 + }, + { + "epoch": 3.84, + "grad_norm": 0.6171875, + "learning_rate": 9.74778425914924e-07, + "loss": 0.6704, + "step": 28776 + }, + { + "epoch": 3.84, + "grad_norm": 0.68359375, + "learning_rate": 9.731571112165495e-07, + "loss": 0.5689, + "step": 28777 + }, + { + "epoch": 3.84, + "grad_norm": 0.625, + "learning_rate": 9.715371393857143e-07, + "loss": 0.497, + "step": 28778 + }, + { + "epoch": 3.84, + "grad_norm": 0.640625, + "learning_rate": 9.699185104443787e-07, + "loss": 0.28, + "step": 28779 + }, + { + "epoch": 3.84, + "grad_norm": 0.51171875, + "learning_rate": 9.683012244145139e-07, + "loss": 0.2485, + "step": 28780 + }, + { + "epoch": 3.84, + "grad_norm": 0.69921875, + "learning_rate": 9.666852813180472e-07, + "loss": 0.4557, + "step": 28781 + }, + { + "epoch": 3.84, + "grad_norm": 0.59765625, + "learning_rate": 9.650706811768829e-07, + "loss": 0.2983, + "step": 28782 + }, + { + "epoch": 3.84, + "grad_norm": 0.65234375, + "learning_rate": 9.634574240129256e-07, + "loss": 0.3137, + "step": 28783 + }, + { + "epoch": 3.84, + "grad_norm": 0.494140625, + "learning_rate": 9.618455098480471e-07, + "loss": 0.204, + "step": 28784 + }, + { + "epoch": 3.84, + "grad_norm": 0.5546875, + "learning_rate": 9.602349387040966e-07, + "loss": 0.2095, + "step": 28785 + }, + { + "epoch": 3.84, + "grad_norm": 0.48046875, + "learning_rate": 9.586257106029228e-07, + "loss": 0.2388, + "step": 28786 + }, + { + "epoch": 3.84, + "grad_norm": 0.6953125, + "learning_rate": 9.570178255663532e-07, + "loss": 0.283, + "step": 28787 + }, + { + "epoch": 3.84, + "grad_norm": 0.87890625, + "learning_rate": 9.55411283616192e-07, + "loss": 0.6184, + "step": 28788 + }, + { + "epoch": 3.84, + "grad_norm": 0.484375, + "learning_rate": 9.538060847742113e-07, + "loss": 0.2956, + "step": 28789 + }, + { + "epoch": 3.84, + "grad_norm": 0.796875, + "learning_rate": 9.522022290621935e-07, + "loss": 0.296, + "step": 28790 + }, + { + "epoch": 3.84, + "grad_norm": 0.55859375, + "learning_rate": 9.505997165018765e-07, + "loss": 0.1339, + "step": 28791 + }, + { + "epoch": 3.84, + "grad_norm": 0.80078125, + "learning_rate": 9.489985471150098e-07, + "loss": 0.321, + "step": 28792 + }, + { + "epoch": 3.84, + "grad_norm": 0.578125, + "learning_rate": 9.47398720923287e-07, + "loss": 0.4187, + "step": 28793 + }, + { + "epoch": 3.84, + "grad_norm": 0.76953125, + "learning_rate": 9.458002379484021e-07, + "loss": 0.3428, + "step": 28794 + }, + { + "epoch": 3.84, + "grad_norm": 0.625, + "learning_rate": 9.442030982120487e-07, + "loss": 0.292, + "step": 28795 + }, + { + "epoch": 3.84, + "grad_norm": 0.796875, + "learning_rate": 9.42607301735865e-07, + "loss": 0.5396, + "step": 28796 + }, + { + "epoch": 3.84, + "grad_norm": 0.46484375, + "learning_rate": 9.410128485415115e-07, + "loss": 0.2128, + "step": 28797 + }, + { + "epoch": 3.84, + "grad_norm": 0.6640625, + "learning_rate": 9.394197386505932e-07, + "loss": 0.6958, + "step": 28798 + }, + { + "epoch": 3.84, + "grad_norm": 0.53515625, + "learning_rate": 9.378279720847149e-07, + "loss": 0.4049, + "step": 28799 + }, + { + "epoch": 3.84, + "grad_norm": 0.53515625, + "learning_rate": 9.362375488654707e-07, + "loss": 0.2932, + "step": 28800 + }, + { + "epoch": 3.84, + "grad_norm": 0.58203125, + "learning_rate": 9.346484690144319e-07, + "loss": 0.3887, + "step": 28801 + }, + { + "epoch": 3.84, + "grad_norm": 0.53515625, + "learning_rate": 9.330607325531259e-07, + "loss": 0.2273, + "step": 28802 + }, + { + "epoch": 3.84, + "grad_norm": 0.46875, + "learning_rate": 9.314743395031023e-07, + "loss": 0.239, + "step": 28803 + }, + { + "epoch": 3.84, + "grad_norm": 0.59375, + "learning_rate": 9.298892898858658e-07, + "loss": 0.3115, + "step": 28804 + }, + { + "epoch": 3.84, + "grad_norm": 0.9375, + "learning_rate": 9.283055837229215e-07, + "loss": 0.4514, + "step": 28805 + }, + { + "epoch": 3.84, + "grad_norm": 0.73046875, + "learning_rate": 9.267232210357413e-07, + "loss": 0.378, + "step": 28806 + }, + { + "epoch": 3.84, + "grad_norm": 0.486328125, + "learning_rate": 9.251422018457745e-07, + "loss": 0.2059, + "step": 28807 + }, + { + "epoch": 3.84, + "grad_norm": 0.6875, + "learning_rate": 9.235625261744596e-07, + "loss": 0.4239, + "step": 28808 + }, + { + "epoch": 3.84, + "grad_norm": 0.6171875, + "learning_rate": 9.219841940432238e-07, + "loss": 0.4535, + "step": 28809 + }, + { + "epoch": 3.84, + "grad_norm": 0.58984375, + "learning_rate": 9.204072054734725e-07, + "loss": 0.4203, + "step": 28810 + }, + { + "epoch": 3.84, + "grad_norm": 0.82421875, + "learning_rate": 9.188315604865883e-07, + "loss": 0.3358, + "step": 28811 + }, + { + "epoch": 3.84, + "grad_norm": 0.66796875, + "learning_rate": 9.172572591039318e-07, + "loss": 0.2222, + "step": 28812 + }, + { + "epoch": 3.84, + "grad_norm": 0.5703125, + "learning_rate": 9.156843013468641e-07, + "loss": 0.1786, + "step": 28813 + }, + { + "epoch": 3.84, + "grad_norm": 0.61328125, + "learning_rate": 9.141126872367123e-07, + "loss": 0.3753, + "step": 28814 + }, + { + "epoch": 3.85, + "grad_norm": 0.6640625, + "learning_rate": 9.125424167947816e-07, + "loss": 0.3581, + "step": 28815 + }, + { + "epoch": 3.85, + "grad_norm": 0.69921875, + "learning_rate": 9.109734900423661e-07, + "loss": 0.4497, + "step": 28816 + }, + { + "epoch": 3.85, + "grad_norm": 0.5546875, + "learning_rate": 9.094059070007599e-07, + "loss": 0.2513, + "step": 28817 + }, + { + "epoch": 3.85, + "grad_norm": 0.78515625, + "learning_rate": 9.078396676911904e-07, + "loss": 0.3874, + "step": 28818 + }, + { + "epoch": 3.85, + "grad_norm": 0.84765625, + "learning_rate": 9.062747721349186e-07, + "loss": 0.2075, + "step": 28819 + }, + { + "epoch": 3.85, + "grad_norm": 0.68359375, + "learning_rate": 9.047112203531605e-07, + "loss": 0.392, + "step": 28820 + }, + { + "epoch": 3.85, + "grad_norm": 0.65625, + "learning_rate": 9.031490123671104e-07, + "loss": 0.3412, + "step": 28821 + }, + { + "epoch": 3.85, + "grad_norm": 0.61328125, + "learning_rate": 9.015881481979516e-07, + "loss": 0.3181, + "step": 28822 + }, + { + "epoch": 3.85, + "grad_norm": 0.69140625, + "learning_rate": 9.000286278668668e-07, + "loss": 0.2682, + "step": 28823 + }, + { + "epoch": 3.85, + "grad_norm": 0.462890625, + "learning_rate": 8.98470451395006e-07, + "loss": 0.1612, + "step": 28824 + }, + { + "epoch": 3.85, + "grad_norm": 0.63671875, + "learning_rate": 8.969136188034855e-07, + "loss": 0.2096, + "step": 28825 + }, + { + "epoch": 3.85, + "grad_norm": 0.64453125, + "learning_rate": 8.953581301134106e-07, + "loss": 0.1884, + "step": 28826 + }, + { + "epoch": 3.85, + "grad_norm": 0.57421875, + "learning_rate": 8.938039853458868e-07, + "loss": 0.2142, + "step": 28827 + }, + { + "epoch": 3.85, + "grad_norm": 0.58203125, + "learning_rate": 8.922511845219971e-07, + "loss": 0.2431, + "step": 28828 + }, + { + "epoch": 3.85, + "grad_norm": 0.73046875, + "learning_rate": 8.906997276627804e-07, + "loss": 0.1823, + "step": 28829 + }, + { + "epoch": 3.85, + "grad_norm": 0.59375, + "learning_rate": 8.891496147892863e-07, + "loss": 0.501, + "step": 28830 + }, + { + "epoch": 3.85, + "grad_norm": 0.60546875, + "learning_rate": 8.876008459225205e-07, + "loss": 0.2619, + "step": 28831 + }, + { + "epoch": 3.85, + "grad_norm": 0.51953125, + "learning_rate": 8.860534210835103e-07, + "loss": 0.3406, + "step": 28832 + }, + { + "epoch": 3.85, + "grad_norm": 0.80078125, + "learning_rate": 8.845073402932169e-07, + "loss": 0.2987, + "step": 28833 + }, + { + "epoch": 3.85, + "grad_norm": 0.578125, + "learning_rate": 8.829626035726235e-07, + "loss": 0.225, + "step": 28834 + }, + { + "epoch": 3.85, + "grad_norm": 0.75, + "learning_rate": 8.814192109426689e-07, + "loss": 0.4032, + "step": 28835 + }, + { + "epoch": 3.85, + "grad_norm": 0.50390625, + "learning_rate": 8.798771624242808e-07, + "loss": 0.2354, + "step": 28836 + }, + { + "epoch": 3.85, + "grad_norm": 0.6171875, + "learning_rate": 8.783364580383757e-07, + "loss": 0.2798, + "step": 28837 + }, + { + "epoch": 3.85, + "grad_norm": 0.56640625, + "learning_rate": 8.767970978058482e-07, + "loss": 0.4868, + "step": 28838 + }, + { + "epoch": 3.85, + "grad_norm": 0.7109375, + "learning_rate": 8.752590817475592e-07, + "loss": 0.3862, + "step": 28839 + }, + { + "epoch": 3.85, + "grad_norm": 0.64453125, + "learning_rate": 8.73722409884381e-07, + "loss": 0.4135, + "step": 28840 + }, + { + "epoch": 3.85, + "grad_norm": 0.7265625, + "learning_rate": 8.721870822371414e-07, + "loss": 0.6516, + "step": 28841 + }, + { + "epoch": 3.85, + "grad_norm": 0.56640625, + "learning_rate": 8.706530988266682e-07, + "loss": 0.3289, + "step": 28842 + }, + { + "epoch": 3.85, + "grad_norm": 0.6015625, + "learning_rate": 8.691204596737668e-07, + "loss": 0.2614, + "step": 28843 + }, + { + "epoch": 3.85, + "grad_norm": 0.703125, + "learning_rate": 8.675891647992096e-07, + "loss": 0.3357, + "step": 28844 + }, + { + "epoch": 3.85, + "grad_norm": 0.55859375, + "learning_rate": 8.660592142237578e-07, + "loss": 0.3787, + "step": 28845 + }, + { + "epoch": 3.85, + "grad_norm": 0.6875, + "learning_rate": 8.645306079681725e-07, + "loss": 0.5222, + "step": 28846 + }, + { + "epoch": 3.85, + "grad_norm": 0.578125, + "learning_rate": 8.630033460531817e-07, + "loss": 0.1549, + "step": 28847 + }, + { + "epoch": 3.85, + "grad_norm": 0.56640625, + "learning_rate": 8.614774284994798e-07, + "loss": 0.1453, + "step": 28848 + }, + { + "epoch": 3.85, + "grad_norm": 0.63671875, + "learning_rate": 8.599528553277725e-07, + "loss": 0.2684, + "step": 28849 + }, + { + "epoch": 3.85, + "grad_norm": 0.58203125, + "learning_rate": 8.584296265587322e-07, + "loss": 0.3422, + "step": 28850 + }, + { + "epoch": 3.85, + "grad_norm": 0.65234375, + "learning_rate": 8.569077422130311e-07, + "loss": 0.3916, + "step": 28851 + }, + { + "epoch": 3.85, + "grad_norm": 0.72265625, + "learning_rate": 8.553872023112752e-07, + "loss": 0.3616, + "step": 28852 + }, + { + "epoch": 3.85, + "grad_norm": 0.66796875, + "learning_rate": 8.538680068741034e-07, + "loss": 0.2229, + "step": 28853 + }, + { + "epoch": 3.85, + "grad_norm": 0.81640625, + "learning_rate": 8.523501559221104e-07, + "loss": 0.3505, + "step": 28854 + }, + { + "epoch": 3.85, + "grad_norm": 0.6015625, + "learning_rate": 8.508336494758795e-07, + "loss": 0.3966, + "step": 28855 + }, + { + "epoch": 3.85, + "grad_norm": 0.578125, + "learning_rate": 8.493184875559945e-07, + "loss": 0.2729, + "step": 28856 + }, + { + "epoch": 3.85, + "grad_norm": 0.62890625, + "learning_rate": 8.478046701829723e-07, + "loss": 0.2975, + "step": 28857 + }, + { + "epoch": 3.85, + "grad_norm": 0.68359375, + "learning_rate": 8.462921973773519e-07, + "loss": 0.4043, + "step": 28858 + }, + { + "epoch": 3.85, + "grad_norm": 0.8046875, + "learning_rate": 8.447810691596391e-07, + "loss": 0.3921, + "step": 28859 + }, + { + "epoch": 3.85, + "grad_norm": 0.65234375, + "learning_rate": 8.432712855503399e-07, + "loss": 0.3132, + "step": 28860 + }, + { + "epoch": 3.85, + "grad_norm": 0.67578125, + "learning_rate": 8.417628465699268e-07, + "loss": 0.2902, + "step": 28861 + }, + { + "epoch": 3.85, + "grad_norm": 0.578125, + "learning_rate": 8.402557522388499e-07, + "loss": 0.1822, + "step": 28862 + }, + { + "epoch": 3.85, + "grad_norm": 0.5703125, + "learning_rate": 8.387500025775375e-07, + "loss": 0.1786, + "step": 28863 + }, + { + "epoch": 3.85, + "grad_norm": 0.7578125, + "learning_rate": 8.372455976064175e-07, + "loss": 0.5716, + "step": 28864 + }, + { + "epoch": 3.85, + "grad_norm": 0.37890625, + "learning_rate": 8.35742537345896e-07, + "loss": 0.1352, + "step": 28865 + }, + { + "epoch": 3.85, + "grad_norm": 0.466796875, + "learning_rate": 8.342408218163455e-07, + "loss": 0.1575, + "step": 28866 + }, + { + "epoch": 3.85, + "grad_norm": 0.5078125, + "learning_rate": 8.327404510381276e-07, + "loss": 0.2133, + "step": 28867 + }, + { + "epoch": 3.85, + "grad_norm": 0.45703125, + "learning_rate": 8.312414250315925e-07, + "loss": 0.4321, + "step": 28868 + }, + { + "epoch": 3.85, + "grad_norm": 0.5, + "learning_rate": 8.297437438170797e-07, + "loss": 0.2565, + "step": 28869 + }, + { + "epoch": 3.85, + "grad_norm": 0.46875, + "learning_rate": 8.282474074148838e-07, + "loss": 0.1177, + "step": 28870 + }, + { + "epoch": 3.85, + "grad_norm": 0.78125, + "learning_rate": 8.267524158453e-07, + "loss": 0.4822, + "step": 28871 + }, + { + "epoch": 3.85, + "grad_norm": 0.6640625, + "learning_rate": 8.252587691286007e-07, + "loss": 0.3684, + "step": 28872 + }, + { + "epoch": 3.85, + "grad_norm": 0.55078125, + "learning_rate": 8.237664672850476e-07, + "loss": 0.1333, + "step": 28873 + }, + { + "epoch": 3.85, + "grad_norm": 0.59375, + "learning_rate": 8.2227551033488e-07, + "loss": 0.4914, + "step": 28874 + }, + { + "epoch": 3.85, + "grad_norm": 0.455078125, + "learning_rate": 8.20785898298293e-07, + "loss": 0.1785, + "step": 28875 + }, + { + "epoch": 3.85, + "grad_norm": 0.494140625, + "learning_rate": 8.192976311955036e-07, + "loss": 0.2124, + "step": 28876 + }, + { + "epoch": 3.85, + "grad_norm": 0.5078125, + "learning_rate": 8.178107090466958e-07, + "loss": 0.2976, + "step": 28877 + }, + { + "epoch": 3.85, + "grad_norm": 0.7734375, + "learning_rate": 8.16325131872031e-07, + "loss": 0.3579, + "step": 28878 + }, + { + "epoch": 3.85, + "grad_norm": 0.69140625, + "learning_rate": 8.148408996916601e-07, + "loss": 0.272, + "step": 28879 + }, + { + "epoch": 3.85, + "grad_norm": 0.56640625, + "learning_rate": 8.13358012525689e-07, + "loss": 0.275, + "step": 28880 + }, + { + "epoch": 3.85, + "grad_norm": 0.59375, + "learning_rate": 8.118764703942461e-07, + "loss": 0.2469, + "step": 28881 + }, + { + "epoch": 3.85, + "grad_norm": 0.478515625, + "learning_rate": 8.103962733174153e-07, + "loss": 0.1599, + "step": 28882 + }, + { + "epoch": 3.85, + "grad_norm": 0.56640625, + "learning_rate": 8.089174213152806e-07, + "loss": 0.3092, + "step": 28883 + }, + { + "epoch": 3.85, + "grad_norm": 0.8203125, + "learning_rate": 8.074399144078815e-07, + "loss": 0.2152, + "step": 28884 + }, + { + "epoch": 3.85, + "grad_norm": 0.69921875, + "learning_rate": 8.059637526152464e-07, + "loss": 0.2066, + "step": 28885 + }, + { + "epoch": 3.85, + "grad_norm": 0.7578125, + "learning_rate": 8.044889359574148e-07, + "loss": 0.2786, + "step": 28886 + }, + { + "epoch": 3.85, + "grad_norm": 0.73828125, + "learning_rate": 8.030154644543708e-07, + "loss": 0.1839, + "step": 28887 + }, + { + "epoch": 3.85, + "grad_norm": 0.5625, + "learning_rate": 8.015433381261095e-07, + "loss": 0.3551, + "step": 28888 + }, + { + "epoch": 3.85, + "grad_norm": 0.53515625, + "learning_rate": 8.000725569925815e-07, + "loss": 0.3155, + "step": 28889 + }, + { + "epoch": 3.86, + "grad_norm": 0.66796875, + "learning_rate": 7.986031210737488e-07, + "loss": 0.4126, + "step": 28890 + }, + { + "epoch": 3.86, + "grad_norm": 0.84375, + "learning_rate": 7.971350303895064e-07, + "loss": 0.2861, + "step": 28891 + }, + { + "epoch": 3.86, + "grad_norm": 0.74609375, + "learning_rate": 7.95668284959783e-07, + "loss": 0.1236, + "step": 28892 + }, + { + "epoch": 3.86, + "grad_norm": 0.578125, + "learning_rate": 7.942028848044736e-07, + "loss": 0.3406, + "step": 28893 + }, + { + "epoch": 3.86, + "grad_norm": 0.5, + "learning_rate": 7.927388299434402e-07, + "loss": 0.3812, + "step": 28894 + }, + { + "epoch": 3.86, + "grad_norm": 0.435546875, + "learning_rate": 7.912761203965335e-07, + "loss": 0.189, + "step": 28895 + }, + { + "epoch": 3.86, + "grad_norm": 0.63671875, + "learning_rate": 7.898147561836044e-07, + "loss": 0.2045, + "step": 28896 + }, + { + "epoch": 3.86, + "grad_norm": 0.671875, + "learning_rate": 7.883547373244593e-07, + "loss": 0.4983, + "step": 28897 + }, + { + "epoch": 3.86, + "grad_norm": 0.478515625, + "learning_rate": 7.868960638388933e-07, + "loss": 0.3446, + "step": 28898 + }, + { + "epoch": 3.86, + "grad_norm": 0.671875, + "learning_rate": 7.854387357466908e-07, + "loss": 0.4152, + "step": 28899 + }, + { + "epoch": 3.86, + "grad_norm": 0.76953125, + "learning_rate": 7.839827530676136e-07, + "loss": 0.401, + "step": 28900 + }, + { + "epoch": 3.86, + "grad_norm": 0.4609375, + "learning_rate": 7.825281158214126e-07, + "loss": 0.2717, + "step": 28901 + }, + { + "epoch": 3.86, + "grad_norm": 0.8125, + "learning_rate": 7.810748240278054e-07, + "loss": 0.4951, + "step": 28902 + }, + { + "epoch": 3.86, + "grad_norm": 0.78125, + "learning_rate": 7.796228777064984e-07, + "loss": 0.2668, + "step": 28903 + }, + { + "epoch": 3.86, + "grad_norm": 0.49609375, + "learning_rate": 7.78172276877176e-07, + "loss": 0.1912, + "step": 28904 + }, + { + "epoch": 3.86, + "grad_norm": 0.66015625, + "learning_rate": 7.767230215595333e-07, + "loss": 0.2352, + "step": 28905 + }, + { + "epoch": 3.86, + "grad_norm": 0.5078125, + "learning_rate": 7.752751117731994e-07, + "loss": 0.1556, + "step": 28906 + }, + { + "epoch": 3.86, + "grad_norm": 0.62890625, + "learning_rate": 7.738285475378249e-07, + "loss": 0.4042, + "step": 28907 + }, + { + "epoch": 3.86, + "grad_norm": 0.67578125, + "learning_rate": 7.723833288730054e-07, + "loss": 0.4807, + "step": 28908 + }, + { + "epoch": 3.86, + "grad_norm": 0.57421875, + "learning_rate": 7.709394557983585e-07, + "loss": 0.3223, + "step": 28909 + }, + { + "epoch": 3.86, + "grad_norm": 0.55078125, + "learning_rate": 7.694969283334575e-07, + "loss": 0.404, + "step": 28910 + }, + { + "epoch": 3.86, + "grad_norm": 0.59375, + "learning_rate": 7.680557464978644e-07, + "loss": 0.3856, + "step": 28911 + }, + { + "epoch": 3.86, + "grad_norm": 0.62890625, + "learning_rate": 7.666159103111192e-07, + "loss": 0.2979, + "step": 28912 + }, + { + "epoch": 3.86, + "grad_norm": 0.6484375, + "learning_rate": 7.651774197927508e-07, + "loss": 0.2621, + "step": 28913 + }, + { + "epoch": 3.86, + "grad_norm": 0.578125, + "learning_rate": 7.637402749622768e-07, + "loss": 0.2978, + "step": 28914 + }, + { + "epoch": 3.86, + "grad_norm": 0.71484375, + "learning_rate": 7.623044758391596e-07, + "loss": 0.6762, + "step": 28915 + }, + { + "epoch": 3.86, + "grad_norm": 0.59375, + "learning_rate": 7.608700224428945e-07, + "loss": 0.2043, + "step": 28916 + }, + { + "epoch": 3.86, + "grad_norm": 0.609375, + "learning_rate": 7.594369147929104e-07, + "loss": 0.277, + "step": 28917 + }, + { + "epoch": 3.86, + "grad_norm": 0.71484375, + "learning_rate": 7.580051529086584e-07, + "loss": 0.4406, + "step": 28918 + }, + { + "epoch": 3.86, + "grad_norm": 0.765625, + "learning_rate": 7.565747368095566e-07, + "loss": 0.3362, + "step": 28919 + }, + { + "epoch": 3.86, + "grad_norm": 0.5078125, + "learning_rate": 7.551456665149892e-07, + "loss": 0.3776, + "step": 28920 + }, + { + "epoch": 3.86, + "grad_norm": 0.5625, + "learning_rate": 7.537179420443407e-07, + "loss": 0.2532, + "step": 28921 + }, + { + "epoch": 3.86, + "grad_norm": 0.55078125, + "learning_rate": 7.522915634169736e-07, + "loss": 0.4102, + "step": 28922 + }, + { + "epoch": 3.86, + "grad_norm": 0.5078125, + "learning_rate": 7.508665306522278e-07, + "loss": 0.2487, + "step": 28923 + }, + { + "epoch": 3.86, + "grad_norm": 0.625, + "learning_rate": 7.494428437694323e-07, + "loss": 0.3059, + "step": 28924 + }, + { + "epoch": 3.86, + "grad_norm": 0.87109375, + "learning_rate": 7.48020502787894e-07, + "loss": 0.4153, + "step": 28925 + }, + { + "epoch": 3.86, + "grad_norm": 0.6484375, + "learning_rate": 7.465995077268972e-07, + "loss": 0.1535, + "step": 28926 + }, + { + "epoch": 3.86, + "grad_norm": 0.56640625, + "learning_rate": 7.451798586057047e-07, + "loss": 0.1685, + "step": 28927 + }, + { + "epoch": 3.86, + "grad_norm": 0.65234375, + "learning_rate": 7.437615554435784e-07, + "loss": 0.3144, + "step": 28928 + }, + { + "epoch": 3.86, + "grad_norm": 0.5859375, + "learning_rate": 7.423445982597477e-07, + "loss": 0.2283, + "step": 28929 + }, + { + "epoch": 3.86, + "grad_norm": 0.64453125, + "learning_rate": 7.409289870734304e-07, + "loss": 0.3673, + "step": 28930 + }, + { + "epoch": 3.86, + "grad_norm": 0.7421875, + "learning_rate": 7.395147219038112e-07, + "loss": 0.5912, + "step": 28931 + }, + { + "epoch": 3.86, + "grad_norm": 0.69921875, + "learning_rate": 7.381018027700859e-07, + "loss": 0.4561, + "step": 28932 + }, + { + "epoch": 3.86, + "grad_norm": 0.75, + "learning_rate": 7.366902296913947e-07, + "loss": 0.5724, + "step": 28933 + }, + { + "epoch": 3.86, + "grad_norm": 0.5703125, + "learning_rate": 7.352800026869111e-07, + "loss": 0.5036, + "step": 28934 + }, + { + "epoch": 3.86, + "grad_norm": 0.65625, + "learning_rate": 7.338711217757199e-07, + "loss": 0.2075, + "step": 28935 + }, + { + "epoch": 3.86, + "grad_norm": 0.578125, + "learning_rate": 7.324635869769502e-07, + "loss": 0.4347, + "step": 28936 + }, + { + "epoch": 3.86, + "grad_norm": 0.47265625, + "learning_rate": 7.310573983096758e-07, + "loss": 0.3293, + "step": 28937 + }, + { + "epoch": 3.86, + "grad_norm": 0.63671875, + "learning_rate": 7.296525557929923e-07, + "loss": 0.1393, + "step": 28938 + }, + { + "epoch": 3.86, + "grad_norm": 0.73828125, + "learning_rate": 7.28249059445929e-07, + "loss": 0.2806, + "step": 28939 + }, + { + "epoch": 3.86, + "grad_norm": 0.61328125, + "learning_rate": 7.268469092875041e-07, + "loss": 0.1826, + "step": 28940 + }, + { + "epoch": 3.86, + "grad_norm": 0.6171875, + "learning_rate": 7.254461053367578e-07, + "loss": 0.2985, + "step": 28941 + }, + { + "epoch": 3.86, + "grad_norm": 0.62109375, + "learning_rate": 7.24046647612675e-07, + "loss": 0.1815, + "step": 28942 + }, + { + "epoch": 3.86, + "grad_norm": 0.68359375, + "learning_rate": 7.226485361342406e-07, + "loss": 0.424, + "step": 28943 + }, + { + "epoch": 3.86, + "grad_norm": 0.466796875, + "learning_rate": 7.212517709203947e-07, + "loss": 0.3275, + "step": 28944 + }, + { + "epoch": 3.86, + "grad_norm": 0.6796875, + "learning_rate": 7.198563519901002e-07, + "loss": 0.5879, + "step": 28945 + }, + { + "epoch": 3.86, + "grad_norm": 0.65234375, + "learning_rate": 7.184622793622642e-07, + "loss": 0.5319, + "step": 28946 + }, + { + "epoch": 3.86, + "grad_norm": 0.671875, + "learning_rate": 7.170695530558158e-07, + "loss": 0.448, + "step": 28947 + }, + { + "epoch": 3.86, + "grad_norm": 0.66796875, + "learning_rate": 7.156781730896067e-07, + "loss": 0.202, + "step": 28948 + }, + { + "epoch": 3.86, + "grad_norm": 0.7890625, + "learning_rate": 7.142881394825218e-07, + "loss": 0.2718, + "step": 28949 + }, + { + "epoch": 3.86, + "grad_norm": 0.55078125, + "learning_rate": 7.128994522534127e-07, + "loss": 0.2544, + "step": 28950 + }, + { + "epoch": 3.86, + "grad_norm": 0.58984375, + "learning_rate": 7.115121114211199e-07, + "loss": 0.1409, + "step": 28951 + }, + { + "epoch": 3.86, + "grad_norm": 0.474609375, + "learning_rate": 7.101261170044394e-07, + "loss": 0.2058, + "step": 28952 + }, + { + "epoch": 3.86, + "grad_norm": 0.671875, + "learning_rate": 7.087414690221784e-07, + "loss": 0.244, + "step": 28953 + }, + { + "epoch": 3.86, + "grad_norm": 0.78515625, + "learning_rate": 7.073581674930996e-07, + "loss": 0.2928, + "step": 28954 + }, + { + "epoch": 3.86, + "grad_norm": 0.5546875, + "learning_rate": 7.05976212435966e-07, + "loss": 0.2498, + "step": 28955 + }, + { + "epoch": 3.86, + "grad_norm": 0.41796875, + "learning_rate": 7.045956038695178e-07, + "loss": 0.13, + "step": 28956 + }, + { + "epoch": 3.86, + "grad_norm": 0.546875, + "learning_rate": 7.03216341812496e-07, + "loss": 0.2712, + "step": 28957 + }, + { + "epoch": 3.86, + "grad_norm": 0.71875, + "learning_rate": 7.018384262835853e-07, + "loss": 0.2476, + "step": 28958 + }, + { + "epoch": 3.86, + "grad_norm": 0.671875, + "learning_rate": 7.004618573014709e-07, + "loss": 0.2429, + "step": 28959 + }, + { + "epoch": 3.86, + "grad_norm": 0.66796875, + "learning_rate": 6.990866348848157e-07, + "loss": 0.329, + "step": 28960 + }, + { + "epoch": 3.86, + "grad_norm": 0.6328125, + "learning_rate": 6.977127590522936e-07, + "loss": 0.2464, + "step": 28961 + }, + { + "epoch": 3.86, + "grad_norm": 0.609375, + "learning_rate": 6.963402298225008e-07, + "loss": 0.2427, + "step": 28962 + }, + { + "epoch": 3.86, + "grad_norm": 0.60546875, + "learning_rate": 6.94969047214089e-07, + "loss": 0.3407, + "step": 28963 + }, + { + "epoch": 3.86, + "grad_norm": 0.6796875, + "learning_rate": 6.935992112456102e-07, + "loss": 0.2458, + "step": 28964 + }, + { + "epoch": 3.87, + "grad_norm": 0.5546875, + "learning_rate": 6.922307219356716e-07, + "loss": 0.5109, + "step": 28965 + }, + { + "epoch": 3.87, + "grad_norm": 0.60546875, + "learning_rate": 6.90863579302814e-07, + "loss": 0.2591, + "step": 28966 + }, + { + "epoch": 3.87, + "grad_norm": 0.54296875, + "learning_rate": 6.89497783365578e-07, + "loss": 0.2382, + "step": 28967 + }, + { + "epoch": 3.87, + "grad_norm": 0.7109375, + "learning_rate": 6.881333341425044e-07, + "loss": 0.6226, + "step": 28968 + }, + { + "epoch": 3.87, + "grad_norm": 0.6484375, + "learning_rate": 6.867702316520674e-07, + "loss": 0.3132, + "step": 28969 + }, + { + "epoch": 3.87, + "grad_norm": 0.5234375, + "learning_rate": 6.854084759127743e-07, + "loss": 0.188, + "step": 28970 + }, + { + "epoch": 3.87, + "grad_norm": 0.5390625, + "learning_rate": 6.84048066943066e-07, + "loss": 0.2879, + "step": 28971 + }, + { + "epoch": 3.87, + "grad_norm": 0.67578125, + "learning_rate": 6.826890047614165e-07, + "loss": 0.3254, + "step": 28972 + }, + { + "epoch": 3.87, + "grad_norm": 0.63671875, + "learning_rate": 6.813312893862444e-07, + "loss": 0.3692, + "step": 28973 + }, + { + "epoch": 3.87, + "grad_norm": 0.46875, + "learning_rate": 6.799749208359685e-07, + "loss": 0.1307, + "step": 28974 + }, + { + "epoch": 3.87, + "grad_norm": 0.55859375, + "learning_rate": 6.786198991289738e-07, + "loss": 0.1984, + "step": 28975 + }, + { + "epoch": 3.87, + "grad_norm": 0.458984375, + "learning_rate": 6.772662242836347e-07, + "loss": 0.1773, + "step": 28976 + }, + { + "epoch": 3.87, + "grad_norm": 0.67578125, + "learning_rate": 6.759138963183142e-07, + "loss": 0.3727, + "step": 28977 + }, + { + "epoch": 3.87, + "grad_norm": 0.59765625, + "learning_rate": 6.745629152513422e-07, + "loss": 0.3954, + "step": 28978 + }, + { + "epoch": 3.87, + "grad_norm": 0.5859375, + "learning_rate": 6.732132811010372e-07, + "loss": 0.2684, + "step": 28979 + }, + { + "epoch": 3.87, + "grad_norm": 0.61328125, + "learning_rate": 6.718649938857291e-07, + "loss": 0.2616, + "step": 28980 + }, + { + "epoch": 3.87, + "grad_norm": 0.55078125, + "learning_rate": 6.705180536236588e-07, + "loss": 0.2595, + "step": 28981 + }, + { + "epoch": 3.87, + "grad_norm": 0.83984375, + "learning_rate": 6.691724603331228e-07, + "loss": 0.3615, + "step": 28982 + }, + { + "epoch": 3.87, + "grad_norm": 0.6328125, + "learning_rate": 6.678282140323511e-07, + "loss": 0.1926, + "step": 28983 + }, + { + "epoch": 3.87, + "grad_norm": 0.51953125, + "learning_rate": 6.664853147395845e-07, + "loss": 0.4634, + "step": 28984 + }, + { + "epoch": 3.87, + "grad_norm": 0.5390625, + "learning_rate": 6.651437624730306e-07, + "loss": 0.2964, + "step": 28985 + }, + { + "epoch": 3.87, + "grad_norm": 0.71484375, + "learning_rate": 6.638035572508749e-07, + "loss": 0.2314, + "step": 28986 + }, + { + "epoch": 3.87, + "grad_norm": 0.6328125, + "learning_rate": 6.62464699091303e-07, + "loss": 0.3617, + "step": 28987 + }, + { + "epoch": 3.87, + "grad_norm": 0.58984375, + "learning_rate": 6.61127188012467e-07, + "loss": 0.3634, + "step": 28988 + }, + { + "epoch": 3.87, + "grad_norm": 0.72265625, + "learning_rate": 6.597910240324967e-07, + "loss": 0.4465, + "step": 28989 + }, + { + "epoch": 3.87, + "grad_norm": 0.490234375, + "learning_rate": 6.584562071695111e-07, + "loss": 0.2309, + "step": 28990 + }, + { + "epoch": 3.87, + "grad_norm": 0.703125, + "learning_rate": 6.571227374416178e-07, + "loss": 0.1716, + "step": 28991 + }, + { + "epoch": 3.87, + "grad_norm": 0.59765625, + "learning_rate": 6.557906148669024e-07, + "loss": 0.327, + "step": 28992 + }, + { + "epoch": 3.87, + "grad_norm": 0.765625, + "learning_rate": 6.544598394634283e-07, + "loss": 0.291, + "step": 28993 + }, + { + "epoch": 3.87, + "grad_norm": 0.85546875, + "learning_rate": 6.531304112492365e-07, + "loss": 0.3632, + "step": 28994 + }, + { + "epoch": 3.87, + "grad_norm": 0.578125, + "learning_rate": 6.518023302423459e-07, + "loss": 0.1693, + "step": 28995 + }, + { + "epoch": 3.87, + "grad_norm": 0.62109375, + "learning_rate": 6.504755964607867e-07, + "loss": 0.3983, + "step": 28996 + }, + { + "epoch": 3.87, + "grad_norm": 0.6796875, + "learning_rate": 6.491502099225444e-07, + "loss": 0.1672, + "step": 28997 + }, + { + "epoch": 3.87, + "grad_norm": 0.4921875, + "learning_rate": 6.478261706455824e-07, + "loss": 0.2735, + "step": 28998 + }, + { + "epoch": 3.87, + "grad_norm": 0.58984375, + "learning_rate": 6.46503478647864e-07, + "loss": 0.3588, + "step": 28999 + }, + { + "epoch": 3.87, + "grad_norm": 0.75, + "learning_rate": 6.451821339473196e-07, + "loss": 0.3446, + "step": 29000 + }, + { + "epoch": 3.87, + "grad_norm": 0.58203125, + "learning_rate": 6.43862136561868e-07, + "loss": 0.1873, + "step": 29001 + }, + { + "epoch": 3.87, + "grad_norm": 0.73828125, + "learning_rate": 6.425434865094171e-07, + "loss": 0.3879, + "step": 29002 + }, + { + "epoch": 3.87, + "grad_norm": 0.61328125, + "learning_rate": 6.412261838078415e-07, + "loss": 0.5373, + "step": 29003 + }, + { + "epoch": 3.87, + "grad_norm": 0.48828125, + "learning_rate": 6.399102284750047e-07, + "loss": 0.3592, + "step": 29004 + }, + { + "epoch": 3.87, + "grad_norm": 0.6015625, + "learning_rate": 6.385956205287591e-07, + "loss": 0.4172, + "step": 29005 + }, + { + "epoch": 3.87, + "grad_norm": 0.6640625, + "learning_rate": 6.372823599869238e-07, + "loss": 0.171, + "step": 29006 + }, + { + "epoch": 3.87, + "grad_norm": 0.734375, + "learning_rate": 6.359704468673067e-07, + "loss": 0.3817, + "step": 29007 + }, + { + "epoch": 3.87, + "grad_norm": 0.55859375, + "learning_rate": 6.346598811877047e-07, + "loss": 0.3398, + "step": 29008 + }, + { + "epoch": 3.87, + "grad_norm": 0.7265625, + "learning_rate": 6.333506629658925e-07, + "loss": 0.4094, + "step": 29009 + }, + { + "epoch": 3.87, + "grad_norm": 0.73046875, + "learning_rate": 6.320427922196115e-07, + "loss": 0.4599, + "step": 29010 + }, + { + "epoch": 3.87, + "grad_norm": 0.5, + "learning_rate": 6.307362689666141e-07, + "loss": 0.1559, + "step": 29011 + }, + { + "epoch": 3.87, + "grad_norm": 0.6796875, + "learning_rate": 6.294310932245972e-07, + "loss": 0.3337, + "step": 29012 + }, + { + "epoch": 3.87, + "grad_norm": 0.5859375, + "learning_rate": 6.281272650112801e-07, + "loss": 0.5384, + "step": 29013 + }, + { + "epoch": 3.87, + "grad_norm": 0.8046875, + "learning_rate": 6.268247843443154e-07, + "loss": 0.2778, + "step": 29014 + }, + { + "epoch": 3.87, + "grad_norm": 0.78125, + "learning_rate": 6.255236512413997e-07, + "loss": 0.5911, + "step": 29015 + }, + { + "epoch": 3.87, + "grad_norm": 0.474609375, + "learning_rate": 6.242238657201638e-07, + "loss": 0.1694, + "step": 29016 + }, + { + "epoch": 3.87, + "grad_norm": 0.703125, + "learning_rate": 6.229254277982155e-07, + "loss": 0.3659, + "step": 29017 + }, + { + "epoch": 3.87, + "grad_norm": 0.63671875, + "learning_rate": 6.216283374931852e-07, + "loss": 0.2581, + "step": 29018 + }, + { + "epoch": 3.87, + "grad_norm": 0.6875, + "learning_rate": 6.20332594822659e-07, + "loss": 0.3304, + "step": 29019 + }, + { + "epoch": 3.87, + "grad_norm": 0.60546875, + "learning_rate": 6.190381998042116e-07, + "loss": 0.4457, + "step": 29020 + }, + { + "epoch": 3.87, + "grad_norm": 0.68359375, + "learning_rate": 6.177451524553734e-07, + "loss": 0.3801, + "step": 29021 + }, + { + "epoch": 3.87, + "grad_norm": 0.55078125, + "learning_rate": 6.164534527936972e-07, + "loss": 0.2723, + "step": 29022 + }, + { + "epoch": 3.87, + "grad_norm": 0.494140625, + "learning_rate": 6.15163100836702e-07, + "loss": 0.2859, + "step": 29023 + }, + { + "epoch": 3.87, + "grad_norm": 0.65625, + "learning_rate": 6.138740966018852e-07, + "loss": 0.3644, + "step": 29024 + }, + { + "epoch": 3.87, + "grad_norm": 0.447265625, + "learning_rate": 6.125864401067105e-07, + "loss": 0.1903, + "step": 29025 + }, + { + "epoch": 3.87, + "grad_norm": 0.609375, + "learning_rate": 6.113001313686528e-07, + "loss": 0.3439, + "step": 29026 + }, + { + "epoch": 3.87, + "grad_norm": 0.515625, + "learning_rate": 6.100151704051538e-07, + "loss": 0.1822, + "step": 29027 + }, + { + "epoch": 3.87, + "grad_norm": 0.484375, + "learning_rate": 6.087315572336328e-07, + "loss": 0.1789, + "step": 29028 + }, + { + "epoch": 3.87, + "grad_norm": 0.62890625, + "learning_rate": 6.07449291871498e-07, + "loss": 0.3699, + "step": 29029 + }, + { + "epoch": 3.87, + "grad_norm": 0.72265625, + "learning_rate": 6.061683743361579e-07, + "loss": 0.268, + "step": 29030 + }, + { + "epoch": 3.87, + "grad_norm": 0.609375, + "learning_rate": 6.04888804644943e-07, + "loss": 0.3981, + "step": 29031 + }, + { + "epoch": 3.87, + "grad_norm": 0.6328125, + "learning_rate": 6.036105828152394e-07, + "loss": 0.1259, + "step": 29032 + }, + { + "epoch": 3.87, + "grad_norm": 0.59765625, + "learning_rate": 6.023337088643665e-07, + "loss": 0.1694, + "step": 29033 + }, + { + "epoch": 3.87, + "grad_norm": 0.6328125, + "learning_rate": 6.010581828096439e-07, + "loss": 0.3103, + "step": 29034 + }, + { + "epoch": 3.87, + "grad_norm": 0.482421875, + "learning_rate": 5.997840046683689e-07, + "loss": 0.1493, + "step": 29035 + }, + { + "epoch": 3.87, + "grad_norm": 0.51171875, + "learning_rate": 5.985111744578165e-07, + "loss": 0.1946, + "step": 29036 + }, + { + "epoch": 3.87, + "grad_norm": 0.404296875, + "learning_rate": 5.972396921952395e-07, + "loss": 0.1349, + "step": 29037 + }, + { + "epoch": 3.87, + "grad_norm": 0.6484375, + "learning_rate": 5.959695578978908e-07, + "loss": 0.4757, + "step": 29038 + }, + { + "epoch": 3.87, + "grad_norm": 0.6796875, + "learning_rate": 5.947007715830011e-07, + "loss": 0.6561, + "step": 29039 + }, + { + "epoch": 3.88, + "grad_norm": 0.69140625, + "learning_rate": 5.934333332677567e-07, + "loss": 0.4115, + "step": 29040 + }, + { + "epoch": 3.88, + "grad_norm": 0.55078125, + "learning_rate": 5.921672429693548e-07, + "loss": 0.1922, + "step": 29041 + }, + { + "epoch": 3.88, + "grad_norm": 0.625, + "learning_rate": 5.909025007049707e-07, + "loss": 0.5123, + "step": 29042 + }, + { + "epoch": 3.88, + "grad_norm": 0.486328125, + "learning_rate": 5.896391064917462e-07, + "loss": 0.2625, + "step": 29043 + }, + { + "epoch": 3.88, + "grad_norm": 0.58984375, + "learning_rate": 5.88377060346812e-07, + "loss": 0.3087, + "step": 29044 + }, + { + "epoch": 3.88, + "grad_norm": 0.64453125, + "learning_rate": 5.871163622872989e-07, + "loss": 0.2624, + "step": 29045 + }, + { + "epoch": 3.88, + "grad_norm": 0.484375, + "learning_rate": 5.85857012330282e-07, + "loss": 0.3104, + "step": 29046 + }, + { + "epoch": 3.88, + "grad_norm": 0.396484375, + "learning_rate": 5.845990104928478e-07, + "loss": 0.11, + "step": 29047 + }, + { + "epoch": 3.88, + "grad_norm": 0.455078125, + "learning_rate": 5.833423567920604e-07, + "loss": 0.1517, + "step": 29048 + }, + { + "epoch": 3.88, + "grad_norm": 0.60546875, + "learning_rate": 5.820870512449505e-07, + "loss": 0.3473, + "step": 29049 + }, + { + "epoch": 3.88, + "grad_norm": 0.6484375, + "learning_rate": 5.808330938685492e-07, + "loss": 0.1759, + "step": 29050 + }, + { + "epoch": 3.88, + "grad_norm": 0.474609375, + "learning_rate": 5.795804846798536e-07, + "loss": 0.2355, + "step": 29051 + }, + { + "epoch": 3.88, + "grad_norm": 0.515625, + "learning_rate": 5.783292236958504e-07, + "loss": 0.1827, + "step": 29052 + }, + { + "epoch": 3.88, + "grad_norm": 0.8203125, + "learning_rate": 5.77079310933526e-07, + "loss": 0.4954, + "step": 29053 + }, + { + "epoch": 3.88, + "grad_norm": 0.435546875, + "learning_rate": 5.758307464098e-07, + "loss": 0.1984, + "step": 29054 + }, + { + "epoch": 3.88, + "grad_norm": 0.69140625, + "learning_rate": 5.745835301416258e-07, + "loss": 0.6181, + "step": 29055 + }, + { + "epoch": 3.88, + "grad_norm": 0.6640625, + "learning_rate": 5.733376621459119e-07, + "loss": 0.3553, + "step": 29056 + }, + { + "epoch": 3.88, + "grad_norm": 0.439453125, + "learning_rate": 5.720931424395559e-07, + "loss": 0.1806, + "step": 29057 + }, + { + "epoch": 3.88, + "grad_norm": 0.462890625, + "learning_rate": 5.708499710394222e-07, + "loss": 0.1578, + "step": 29058 + }, + { + "epoch": 3.88, + "grad_norm": 0.80078125, + "learning_rate": 5.696081479623749e-07, + "loss": 0.5209, + "step": 29059 + }, + { + "epoch": 3.88, + "grad_norm": 0.455078125, + "learning_rate": 5.683676732252563e-07, + "loss": 0.2736, + "step": 29060 + }, + { + "epoch": 3.88, + "grad_norm": 0.796875, + "learning_rate": 5.671285468448862e-07, + "loss": 0.2347, + "step": 29061 + }, + { + "epoch": 3.88, + "grad_norm": 0.8515625, + "learning_rate": 5.658907688380733e-07, + "loss": 0.4139, + "step": 29062 + }, + { + "epoch": 3.88, + "grad_norm": 0.7890625, + "learning_rate": 5.646543392215931e-07, + "loss": 0.4564, + "step": 29063 + }, + { + "epoch": 3.88, + "grad_norm": 0.6953125, + "learning_rate": 5.63419258012221e-07, + "loss": 0.3465, + "step": 29064 + }, + { + "epoch": 3.88, + "grad_norm": 0.470703125, + "learning_rate": 5.621855252266995e-07, + "loss": 0.154, + "step": 29065 + }, + { + "epoch": 3.88, + "grad_norm": 0.859375, + "learning_rate": 5.609531408817592e-07, + "loss": 0.3601, + "step": 29066 + }, + { + "epoch": 3.88, + "grad_norm": 0.6484375, + "learning_rate": 5.597221049941204e-07, + "loss": 0.4774, + "step": 29067 + }, + { + "epoch": 3.88, + "grad_norm": 0.6171875, + "learning_rate": 5.584924175804585e-07, + "loss": 0.4097, + "step": 29068 + }, + { + "epoch": 3.88, + "grad_norm": 0.5859375, + "learning_rate": 5.572640786574712e-07, + "loss": 0.3074, + "step": 29069 + }, + { + "epoch": 3.88, + "grad_norm": 0.76171875, + "learning_rate": 5.560370882418009e-07, + "loss": 0.4742, + "step": 29070 + }, + { + "epoch": 3.88, + "grad_norm": 0.484375, + "learning_rate": 5.548114463500897e-07, + "loss": 0.1866, + "step": 29071 + }, + { + "epoch": 3.88, + "grad_norm": 0.8046875, + "learning_rate": 5.535871529989578e-07, + "loss": 0.3143, + "step": 29072 + }, + { + "epoch": 3.88, + "grad_norm": 0.357421875, + "learning_rate": 5.52364208205014e-07, + "loss": 0.1018, + "step": 29073 + }, + { + "epoch": 3.88, + "grad_norm": 0.609375, + "learning_rate": 5.51142611984834e-07, + "loss": 0.4293, + "step": 29074 + }, + { + "epoch": 3.88, + "grad_norm": 0.76171875, + "learning_rate": 5.499223643549822e-07, + "loss": 0.7046, + "step": 29075 + }, + { + "epoch": 3.88, + "grad_norm": 0.546875, + "learning_rate": 5.487034653320123e-07, + "loss": 0.4791, + "step": 29076 + }, + { + "epoch": 3.88, + "grad_norm": 0.62890625, + "learning_rate": 5.474859149324552e-07, + "loss": 0.3134, + "step": 29077 + }, + { + "epoch": 3.88, + "grad_norm": 0.6875, + "learning_rate": 5.462697131728089e-07, + "loss": 0.2921, + "step": 29078 + }, + { + "epoch": 3.88, + "grad_norm": 0.51953125, + "learning_rate": 5.450548600695715e-07, + "loss": 0.3989, + "step": 29079 + }, + { + "epoch": 3.88, + "grad_norm": 0.671875, + "learning_rate": 5.438413556392297e-07, + "loss": 0.3909, + "step": 29080 + }, + { + "epoch": 3.88, + "grad_norm": 0.423828125, + "learning_rate": 5.426291998982258e-07, + "loss": 0.201, + "step": 29081 + }, + { + "epoch": 3.88, + "grad_norm": 0.5625, + "learning_rate": 5.414183928630023e-07, + "loss": 0.3729, + "step": 29082 + }, + { + "epoch": 3.88, + "grad_norm": 0.546875, + "learning_rate": 5.402089345499795e-07, + "loss": 0.3896, + "step": 29083 + }, + { + "epoch": 3.88, + "grad_norm": 0.52734375, + "learning_rate": 5.390008249755551e-07, + "loss": 0.5091, + "step": 29084 + }, + { + "epoch": 3.88, + "grad_norm": 0.73828125, + "learning_rate": 5.377940641561163e-07, + "loss": 0.4093, + "step": 29085 + }, + { + "epoch": 3.88, + "grad_norm": 0.640625, + "learning_rate": 5.365886521080166e-07, + "loss": 0.3465, + "step": 29086 + }, + { + "epoch": 3.88, + "grad_norm": 0.58203125, + "learning_rate": 5.353845888476206e-07, + "loss": 0.3387, + "step": 29087 + }, + { + "epoch": 3.88, + "grad_norm": 0.6640625, + "learning_rate": 5.341818743912374e-07, + "loss": 0.2435, + "step": 29088 + }, + { + "epoch": 3.88, + "grad_norm": 0.5546875, + "learning_rate": 5.329805087551876e-07, + "loss": 0.1483, + "step": 29089 + }, + { + "epoch": 3.88, + "grad_norm": 0.5625, + "learning_rate": 5.317804919557579e-07, + "loss": 0.4557, + "step": 29090 + }, + { + "epoch": 3.88, + "grad_norm": 0.64453125, + "learning_rate": 5.305818240092242e-07, + "loss": 0.325, + "step": 29091 + }, + { + "epoch": 3.88, + "grad_norm": 0.65625, + "learning_rate": 5.293845049318291e-07, + "loss": 0.2755, + "step": 29092 + }, + { + "epoch": 3.88, + "grad_norm": 0.55859375, + "learning_rate": 5.281885347398374e-07, + "loss": 0.3912, + "step": 29093 + }, + { + "epoch": 3.88, + "grad_norm": 0.83984375, + "learning_rate": 5.269939134494361e-07, + "loss": 0.3905, + "step": 29094 + }, + { + "epoch": 3.88, + "grad_norm": 0.734375, + "learning_rate": 5.258006410768457e-07, + "loss": 0.4061, + "step": 29095 + }, + { + "epoch": 3.88, + "grad_norm": 0.47265625, + "learning_rate": 5.24608717638242e-07, + "loss": 0.1895, + "step": 29096 + }, + { + "epoch": 3.88, + "grad_norm": 0.609375, + "learning_rate": 5.234181431497897e-07, + "loss": 0.2576, + "step": 29097 + }, + { + "epoch": 3.88, + "grad_norm": 0.53515625, + "learning_rate": 5.222289176276207e-07, + "loss": 0.3565, + "step": 29098 + }, + { + "epoch": 3.88, + "grad_norm": 0.56640625, + "learning_rate": 5.210410410878774e-07, + "loss": 0.2975, + "step": 29099 + }, + { + "epoch": 3.88, + "grad_norm": 0.6640625, + "learning_rate": 5.198545135466693e-07, + "loss": 0.51, + "step": 29100 + }, + { + "epoch": 3.88, + "grad_norm": 0.546875, + "learning_rate": 5.186693350200722e-07, + "loss": 0.337, + "step": 29101 + }, + { + "epoch": 3.88, + "grad_norm": 0.65625, + "learning_rate": 5.174855055241623e-07, + "loss": 0.393, + "step": 29102 + }, + { + "epoch": 3.88, + "grad_norm": 0.71484375, + "learning_rate": 5.163030250750044e-07, + "loss": 0.3153, + "step": 29103 + }, + { + "epoch": 3.88, + "grad_norm": 0.6328125, + "learning_rate": 5.151218936886193e-07, + "loss": 0.1757, + "step": 29104 + }, + { + "epoch": 3.88, + "grad_norm": 0.82421875, + "learning_rate": 5.139421113810272e-07, + "loss": 0.2964, + "step": 29105 + }, + { + "epoch": 3.88, + "grad_norm": 0.6953125, + "learning_rate": 5.127636781682376e-07, + "loss": 0.4051, + "step": 29106 + }, + { + "epoch": 3.88, + "grad_norm": 0.640625, + "learning_rate": 5.115865940662268e-07, + "loss": 0.4022, + "step": 29107 + }, + { + "epoch": 3.88, + "grad_norm": 0.6328125, + "learning_rate": 5.104108590909484e-07, + "loss": 0.331, + "step": 29108 + }, + { + "epoch": 3.88, + "grad_norm": 0.46484375, + "learning_rate": 5.092364732583455e-07, + "loss": 0.2143, + "step": 29109 + }, + { + "epoch": 3.88, + "grad_norm": 0.64453125, + "learning_rate": 5.080634365843606e-07, + "loss": 0.2019, + "step": 29110 + }, + { + "epoch": 3.88, + "grad_norm": 0.609375, + "learning_rate": 5.068917490848702e-07, + "loss": 0.5882, + "step": 29111 + }, + { + "epoch": 3.88, + "grad_norm": 0.66796875, + "learning_rate": 5.057214107758057e-07, + "loss": 0.2679, + "step": 29112 + }, + { + "epoch": 3.88, + "grad_norm": 0.69140625, + "learning_rate": 5.045524216729991e-07, + "loss": 0.4205, + "step": 29113 + }, + { + "epoch": 3.88, + "grad_norm": 0.703125, + "learning_rate": 5.033847817923154e-07, + "loss": 0.3639, + "step": 29114 + }, + { + "epoch": 3.89, + "grad_norm": 0.8203125, + "learning_rate": 5.022184911495864e-07, + "loss": 0.4795, + "step": 29115 + }, + { + "epoch": 3.89, + "grad_norm": 0.67578125, + "learning_rate": 5.010535497606439e-07, + "loss": 0.4045, + "step": 29116 + }, + { + "epoch": 3.89, + "grad_norm": 0.5078125, + "learning_rate": 4.998899576412753e-07, + "loss": 0.2359, + "step": 29117 + }, + { + "epoch": 3.89, + "grad_norm": 0.6171875, + "learning_rate": 4.987277148072455e-07, + "loss": 0.161, + "step": 29118 + }, + { + "epoch": 3.89, + "grad_norm": 0.73828125, + "learning_rate": 4.975668212743311e-07, + "loss": 0.3486, + "step": 29119 + }, + { + "epoch": 3.89, + "grad_norm": 0.62109375, + "learning_rate": 4.964072770582862e-07, + "loss": 0.408, + "step": 29120 + }, + { + "epoch": 3.89, + "grad_norm": 0.5625, + "learning_rate": 4.952490821748091e-07, + "loss": 0.3927, + "step": 29121 + }, + { + "epoch": 3.89, + "grad_norm": 0.5390625, + "learning_rate": 4.940922366396095e-07, + "loss": 0.2171, + "step": 29122 + }, + { + "epoch": 3.89, + "grad_norm": 0.498046875, + "learning_rate": 4.92936740468386e-07, + "loss": 0.336, + "step": 29123 + }, + { + "epoch": 3.89, + "grad_norm": 0.59765625, + "learning_rate": 4.91782593676804e-07, + "loss": 0.4204, + "step": 29124 + }, + { + "epoch": 3.89, + "grad_norm": 0.55859375, + "learning_rate": 4.906297962805062e-07, + "loss": 0.3871, + "step": 29125 + }, + { + "epoch": 3.89, + "grad_norm": 0.5859375, + "learning_rate": 4.894783482951471e-07, + "loss": 0.4237, + "step": 29126 + }, + { + "epoch": 3.89, + "grad_norm": 0.55078125, + "learning_rate": 4.88328249736314e-07, + "loss": 0.3018, + "step": 29127 + }, + { + "epoch": 3.89, + "grad_norm": 0.765625, + "learning_rate": 4.871795006196278e-07, + "loss": 0.479, + "step": 29128 + }, + { + "epoch": 3.89, + "grad_norm": 1.09375, + "learning_rate": 4.860321009606428e-07, + "loss": 0.6283, + "step": 29129 + }, + { + "epoch": 3.89, + "grad_norm": 0.80078125, + "learning_rate": 4.848860507749353e-07, + "loss": 0.7783, + "step": 29130 + }, + { + "epoch": 3.89, + "grad_norm": 0.59765625, + "learning_rate": 4.837413500780486e-07, + "loss": 0.2185, + "step": 29131 + }, + { + "epoch": 3.89, + "grad_norm": 0.58203125, + "learning_rate": 4.825979988854923e-07, + "loss": 0.5388, + "step": 29132 + }, + { + "epoch": 3.89, + "grad_norm": 0.74609375, + "learning_rate": 4.814559972127764e-07, + "loss": 0.2996, + "step": 29133 + }, + { + "epoch": 3.89, + "grad_norm": 0.609375, + "learning_rate": 4.803153450753883e-07, + "loss": 0.2736, + "step": 29134 + }, + { + "epoch": 3.89, + "grad_norm": 0.58203125, + "learning_rate": 4.791760424887936e-07, + "loss": 0.4465, + "step": 29135 + }, + { + "epoch": 3.89, + "grad_norm": 0.5234375, + "learning_rate": 4.780380894684466e-07, + "loss": 0.28, + "step": 29136 + }, + { + "epoch": 3.89, + "grad_norm": 0.62890625, + "learning_rate": 4.769014860297683e-07, + "loss": 0.2751, + "step": 29137 + }, + { + "epoch": 3.89, + "grad_norm": 0.69921875, + "learning_rate": 4.757662321881906e-07, + "loss": 0.4278, + "step": 29138 + }, + { + "epoch": 3.89, + "grad_norm": 0.6484375, + "learning_rate": 4.746323279590903e-07, + "loss": 0.3462, + "step": 29139 + }, + { + "epoch": 3.89, + "grad_norm": 0.49609375, + "learning_rate": 4.73499773357855e-07, + "loss": 0.3913, + "step": 29140 + }, + { + "epoch": 3.89, + "grad_norm": 0.5703125, + "learning_rate": 4.7236856839983913e-07, + "loss": 0.4422, + "step": 29141 + }, + { + "epoch": 3.89, + "grad_norm": 0.703125, + "learning_rate": 4.712387131003748e-07, + "loss": 0.4267, + "step": 29142 + }, + { + "epoch": 3.89, + "grad_norm": 0.6328125, + "learning_rate": 4.701102074747943e-07, + "loss": 0.5208, + "step": 29143 + }, + { + "epoch": 3.89, + "grad_norm": 0.65625, + "learning_rate": 4.689830515383964e-07, + "loss": 0.3651, + "step": 29144 + }, + { + "epoch": 3.89, + "grad_norm": 0.74609375, + "learning_rate": 4.6785724530646893e-07, + "loss": 0.2598, + "step": 29145 + }, + { + "epoch": 3.89, + "grad_norm": 0.5546875, + "learning_rate": 4.667327887942774e-07, + "loss": 0.2201, + "step": 29146 + }, + { + "epoch": 3.89, + "grad_norm": 0.5703125, + "learning_rate": 4.6560968201706524e-07, + "loss": 0.3154, + "step": 29147 + }, + { + "epoch": 3.89, + "grad_norm": 0.5390625, + "learning_rate": 4.644879249900758e-07, + "loss": 0.2617, + "step": 29148 + }, + { + "epoch": 3.89, + "grad_norm": 0.70703125, + "learning_rate": 4.6336751772849687e-07, + "loss": 0.5465, + "step": 29149 + }, + { + "epoch": 3.89, + "grad_norm": 0.58984375, + "learning_rate": 4.6224846024754964e-07, + "loss": 0.2726, + "step": 29150 + }, + { + "epoch": 3.89, + "grad_norm": 0.69140625, + "learning_rate": 4.6113075256238867e-07, + "loss": 0.4508, + "step": 29151 + }, + { + "epoch": 3.89, + "grad_norm": 0.71484375, + "learning_rate": 4.6001439468817965e-07, + "loss": 0.2407, + "step": 29152 + }, + { + "epoch": 3.89, + "grad_norm": 0.69921875, + "learning_rate": 4.5889938664005484e-07, + "loss": 0.2081, + "step": 29153 + }, + { + "epoch": 3.89, + "grad_norm": 0.58984375, + "learning_rate": 4.577857284331466e-07, + "loss": 0.3362, + "step": 29154 + }, + { + "epoch": 3.89, + "grad_norm": 0.625, + "learning_rate": 4.566734200825429e-07, + "loss": 0.3084, + "step": 29155 + }, + { + "epoch": 3.89, + "grad_norm": 0.671875, + "learning_rate": 4.555624616033427e-07, + "loss": 0.5746, + "step": 29156 + }, + { + "epoch": 3.89, + "grad_norm": 0.72265625, + "learning_rate": 4.544528530106007e-07, + "loss": 0.5336, + "step": 29157 + }, + { + "epoch": 3.89, + "grad_norm": 0.83984375, + "learning_rate": 4.533445943193715e-07, + "loss": 0.5142, + "step": 29158 + }, + { + "epoch": 3.89, + "grad_norm": 0.60546875, + "learning_rate": 4.5223768554466526e-07, + "loss": 0.3319, + "step": 29159 + }, + { + "epoch": 3.89, + "grad_norm": 0.63671875, + "learning_rate": 4.511321267015145e-07, + "loss": 0.3635, + "step": 29160 + }, + { + "epoch": 3.89, + "grad_norm": 0.71484375, + "learning_rate": 4.5002791780489605e-07, + "loss": 0.3702, + "step": 29161 + }, + { + "epoch": 3.89, + "grad_norm": 0.70703125, + "learning_rate": 4.4892505886979797e-07, + "loss": 0.3585, + "step": 29162 + }, + { + "epoch": 3.89, + "grad_norm": 0.4296875, + "learning_rate": 4.478235499111527e-07, + "loss": 0.1834, + "step": 29163 + }, + { + "epoch": 3.89, + "grad_norm": 0.51953125, + "learning_rate": 4.4672339094392614e-07, + "loss": 0.391, + "step": 29164 + }, + { + "epoch": 3.89, + "grad_norm": 0.66796875, + "learning_rate": 4.456245819830174e-07, + "loss": 0.3742, + "step": 29165 + }, + { + "epoch": 3.89, + "grad_norm": 0.66015625, + "learning_rate": 4.4452712304333676e-07, + "loss": 0.2183, + "step": 29166 + }, + { + "epoch": 3.89, + "grad_norm": 0.83203125, + "learning_rate": 4.434310141397613e-07, + "loss": 0.2535, + "step": 29167 + }, + { + "epoch": 3.89, + "grad_norm": 0.73046875, + "learning_rate": 4.4233625528714575e-07, + "loss": 0.3691, + "step": 29168 + }, + { + "epoch": 3.89, + "grad_norm": 0.609375, + "learning_rate": 4.412428465003671e-07, + "loss": 0.2583, + "step": 29169 + }, + { + "epoch": 3.89, + "grad_norm": 0.5078125, + "learning_rate": 4.401507877942135e-07, + "loss": 0.3704, + "step": 29170 + }, + { + "epoch": 3.89, + "grad_norm": 0.625, + "learning_rate": 4.390600791835286e-07, + "loss": 0.3517, + "step": 29171 + }, + { + "epoch": 3.89, + "grad_norm": 0.609375, + "learning_rate": 4.3797072068306745e-07, + "loss": 0.3419, + "step": 29172 + }, + { + "epoch": 3.89, + "grad_norm": 0.6015625, + "learning_rate": 4.368827123076291e-07, + "loss": 0.1489, + "step": 29173 + }, + { + "epoch": 3.89, + "grad_norm": 0.490234375, + "learning_rate": 4.3579605407196853e-07, + "loss": 0.2998, + "step": 29174 + }, + { + "epoch": 3.89, + "grad_norm": 0.625, + "learning_rate": 4.347107459908073e-07, + "loss": 0.3433, + "step": 29175 + }, + { + "epoch": 3.89, + "grad_norm": 0.55859375, + "learning_rate": 4.336267880788669e-07, + "loss": 0.4294, + "step": 29176 + }, + { + "epoch": 3.89, + "grad_norm": 0.66796875, + "learning_rate": 4.3254418035084677e-07, + "loss": 0.2756, + "step": 29177 + }, + { + "epoch": 3.89, + "grad_norm": 0.5703125, + "learning_rate": 4.3146292282143507e-07, + "loss": 0.1196, + "step": 29178 + }, + { + "epoch": 3.89, + "grad_norm": 0.59765625, + "learning_rate": 4.303830155052868e-07, + "loss": 0.3928, + "step": 29179 + }, + { + "epoch": 3.89, + "grad_norm": 0.5078125, + "learning_rate": 4.2930445841705693e-07, + "loss": 0.2866, + "step": 29180 + }, + { + "epoch": 3.89, + "grad_norm": 0.67578125, + "learning_rate": 4.282272515713559e-07, + "loss": 0.1935, + "step": 29181 + }, + { + "epoch": 3.89, + "grad_norm": 0.85546875, + "learning_rate": 4.271513949827943e-07, + "loss": 0.4184, + "step": 29182 + }, + { + "epoch": 3.89, + "grad_norm": 0.5859375, + "learning_rate": 4.260768886659716e-07, + "loss": 0.221, + "step": 29183 + }, + { + "epoch": 3.89, + "grad_norm": 0.75390625, + "learning_rate": 4.250037326354428e-07, + "loss": 0.5415, + "step": 29184 + }, + { + "epoch": 3.89, + "grad_norm": 0.478515625, + "learning_rate": 4.23931926905774e-07, + "loss": 0.2529, + "step": 29185 + }, + { + "epoch": 3.89, + "grad_norm": 0.625, + "learning_rate": 4.22861471491498e-07, + "loss": 0.3999, + "step": 29186 + }, + { + "epoch": 3.89, + "grad_norm": 0.66796875, + "learning_rate": 4.217923664071255e-07, + "loss": 0.5463, + "step": 29187 + }, + { + "epoch": 3.89, + "grad_norm": 0.75, + "learning_rate": 4.2072461166716706e-07, + "loss": 0.1736, + "step": 29188 + }, + { + "epoch": 3.89, + "grad_norm": 0.5390625, + "learning_rate": 4.1965820728608887e-07, + "loss": 0.1434, + "step": 29189 + }, + { + "epoch": 3.9, + "grad_norm": 0.71875, + "learning_rate": 4.1859315327834603e-07, + "loss": 0.6777, + "step": 29190 + }, + { + "epoch": 3.9, + "grad_norm": 0.39453125, + "learning_rate": 4.175294496583937e-07, + "loss": 0.2256, + "step": 29191 + }, + { + "epoch": 3.9, + "grad_norm": 0.66015625, + "learning_rate": 4.164670964406536e-07, + "loss": 0.3782, + "step": 29192 + }, + { + "epoch": 3.9, + "grad_norm": 0.47265625, + "learning_rate": 4.154060936395476e-07, + "loss": 0.1211, + "step": 29193 + }, + { + "epoch": 3.9, + "grad_norm": 0.52734375, + "learning_rate": 4.143464412694309e-07, + "loss": 0.3233, + "step": 29194 + }, + { + "epoch": 3.9, + "grad_norm": 0.73828125, + "learning_rate": 4.132881393446919e-07, + "loss": 0.4426, + "step": 29195 + }, + { + "epoch": 3.9, + "grad_norm": 0.5546875, + "learning_rate": 4.1223118787968585e-07, + "loss": 0.3865, + "step": 29196 + }, + { + "epoch": 3.9, + "grad_norm": 0.5234375, + "learning_rate": 4.111755868887346e-07, + "loss": 0.2189, + "step": 29197 + }, + { + "epoch": 3.9, + "grad_norm": 0.5859375, + "learning_rate": 4.1012133638616e-07, + "loss": 0.1336, + "step": 29198 + }, + { + "epoch": 3.9, + "grad_norm": 0.5234375, + "learning_rate": 4.090684363862618e-07, + "loss": 0.4452, + "step": 29199 + }, + { + "epoch": 3.9, + "grad_norm": 0.66796875, + "learning_rate": 4.080168869033063e-07, + "loss": 0.5442, + "step": 29200 + }, + { + "epoch": 3.9, + "grad_norm": 0.671875, + "learning_rate": 4.069666879515599e-07, + "loss": 0.2798, + "step": 29201 + }, + { + "epoch": 3.9, + "grad_norm": 0.7265625, + "learning_rate": 4.0591783954526673e-07, + "loss": 0.5431, + "step": 29202 + }, + { + "epoch": 3.9, + "grad_norm": 0.765625, + "learning_rate": 4.048703416986488e-07, + "loss": 0.4096, + "step": 29203 + }, + { + "epoch": 3.9, + "grad_norm": 0.6328125, + "learning_rate": 4.038241944258947e-07, + "loss": 0.2097, + "step": 29204 + }, + { + "epoch": 3.9, + "grad_norm": 0.58984375, + "learning_rate": 4.0277939774121534e-07, + "loss": 0.3759, + "step": 29205 + }, + { + "epoch": 3.9, + "grad_norm": 0.5546875, + "learning_rate": 4.017359516587771e-07, + "loss": 0.1596, + "step": 29206 + }, + { + "epoch": 3.9, + "grad_norm": 0.73828125, + "learning_rate": 4.0069385619270204e-07, + "loss": 0.4296, + "step": 29207 + }, + { + "epoch": 3.9, + "grad_norm": 0.59765625, + "learning_rate": 3.9965311135715665e-07, + "loss": 0.2489, + "step": 29208 + }, + { + "epoch": 3.9, + "grad_norm": 0.455078125, + "learning_rate": 3.9861371716622966e-07, + "loss": 0.1455, + "step": 29209 + }, + { + "epoch": 3.9, + "grad_norm": 0.498046875, + "learning_rate": 3.975756736340319e-07, + "loss": 0.1406, + "step": 29210 + }, + { + "epoch": 3.9, + "grad_norm": 0.671875, + "learning_rate": 3.965389807746189e-07, + "loss": 0.3441, + "step": 29211 + }, + { + "epoch": 3.9, + "grad_norm": 0.73046875, + "learning_rate": 3.955036386020794e-07, + "loss": 0.3821, + "step": 29212 + }, + { + "epoch": 3.9, + "grad_norm": 0.5390625, + "learning_rate": 3.9446964713042433e-07, + "loss": 0.1065, + "step": 29213 + }, + { + "epoch": 3.9, + "grad_norm": 0.55078125, + "learning_rate": 3.9343700637368695e-07, + "loss": 0.2239, + "step": 29214 + }, + { + "epoch": 3.9, + "grad_norm": 0.8203125, + "learning_rate": 3.924057163458783e-07, + "loss": 0.4268, + "step": 29215 + }, + { + "epoch": 3.9, + "grad_norm": 0.482421875, + "learning_rate": 3.9137577706097603e-07, + "loss": 0.2585, + "step": 29216 + }, + { + "epoch": 3.9, + "grad_norm": 0.50390625, + "learning_rate": 3.9034718853293573e-07, + "loss": 0.2673, + "step": 29217 + }, + { + "epoch": 3.9, + "grad_norm": 0.423828125, + "learning_rate": 3.893199507757239e-07, + "loss": 0.1196, + "step": 29218 + }, + { + "epoch": 3.9, + "grad_norm": 0.640625, + "learning_rate": 3.8829406380326285e-07, + "loss": 0.2902, + "step": 29219 + }, + { + "epoch": 3.9, + "grad_norm": 0.7578125, + "learning_rate": 3.8726952762946357e-07, + "loss": 0.2679, + "step": 29220 + }, + { + "epoch": 3.9, + "grad_norm": 0.625, + "learning_rate": 3.862463422682261e-07, + "loss": 0.225, + "step": 29221 + }, + { + "epoch": 3.9, + "grad_norm": 0.58984375, + "learning_rate": 3.8522450773341713e-07, + "loss": 0.5666, + "step": 29222 + }, + { + "epoch": 3.9, + "grad_norm": 0.7265625, + "learning_rate": 3.842040240388922e-07, + "loss": 0.3543, + "step": 29223 + }, + { + "epoch": 3.9, + "grad_norm": 0.58984375, + "learning_rate": 3.831848911984959e-07, + "loss": 0.1937, + "step": 29224 + }, + { + "epoch": 3.9, + "grad_norm": 0.77734375, + "learning_rate": 3.8216710922603925e-07, + "loss": 0.4278, + "step": 29225 + }, + { + "epoch": 3.9, + "grad_norm": 0.51171875, + "learning_rate": 3.8115067813534466e-07, + "loss": 0.3712, + "step": 29226 + }, + { + "epoch": 3.9, + "grad_norm": 0.69921875, + "learning_rate": 3.801355979401788e-07, + "loss": 0.4875, + "step": 29227 + }, + { + "epoch": 3.9, + "grad_norm": 0.60546875, + "learning_rate": 3.7912186865429745e-07, + "loss": 0.1506, + "step": 29228 + }, + { + "epoch": 3.9, + "grad_norm": 0.53515625, + "learning_rate": 3.7810949029146726e-07, + "loss": 0.4973, + "step": 29229 + }, + { + "epoch": 3.9, + "grad_norm": 0.58203125, + "learning_rate": 3.7709846286541064e-07, + "loss": 0.4356, + "step": 29230 + }, + { + "epoch": 3.9, + "grad_norm": 0.6015625, + "learning_rate": 3.760887863898388e-07, + "loss": 0.2146, + "step": 29231 + }, + { + "epoch": 3.9, + "grad_norm": 0.71484375, + "learning_rate": 3.7508046087844084e-07, + "loss": 0.3943, + "step": 29232 + }, + { + "epoch": 3.9, + "grad_norm": 0.498046875, + "learning_rate": 3.740734863448836e-07, + "loss": 0.2089, + "step": 29233 + }, + { + "epoch": 3.9, + "grad_norm": 0.337890625, + "learning_rate": 3.730678628028339e-07, + "loss": 0.1608, + "step": 29234 + }, + { + "epoch": 3.9, + "grad_norm": 0.50390625, + "learning_rate": 3.720635902659253e-07, + "loss": 0.3074, + "step": 29235 + }, + { + "epoch": 3.9, + "grad_norm": 0.55078125, + "learning_rate": 3.7106066874778025e-07, + "loss": 0.3349, + "step": 29236 + }, + { + "epoch": 3.9, + "grad_norm": 0.41015625, + "learning_rate": 3.700590982619878e-07, + "loss": 0.123, + "step": 29237 + }, + { + "epoch": 3.9, + "grad_norm": 0.55859375, + "learning_rate": 3.6905887882213717e-07, + "loss": 0.2692, + "step": 29238 + }, + { + "epoch": 3.9, + "grad_norm": 0.51171875, + "learning_rate": 3.680600104417842e-07, + "loss": 0.3851, + "step": 29239 + }, + { + "epoch": 3.9, + "grad_norm": 0.58984375, + "learning_rate": 3.670624931344846e-07, + "loss": 0.4695, + "step": 29240 + }, + { + "epoch": 3.9, + "grad_norm": 0.5859375, + "learning_rate": 3.6606632691377206e-07, + "loss": 0.3282, + "step": 29241 + }, + { + "epoch": 3.9, + "grad_norm": 0.59375, + "learning_rate": 3.650715117931358e-07, + "loss": 0.2573, + "step": 29242 + }, + { + "epoch": 3.9, + "grad_norm": 0.51953125, + "learning_rate": 3.6407804778607614e-07, + "loss": 0.4499, + "step": 29243 + }, + { + "epoch": 3.9, + "grad_norm": 0.69140625, + "learning_rate": 3.6308593490606e-07, + "loss": 0.3255, + "step": 29244 + }, + { + "epoch": 3.9, + "grad_norm": 1.1015625, + "learning_rate": 3.620951731665545e-07, + "loss": 0.2113, + "step": 29245 + }, + { + "epoch": 3.9, + "grad_norm": 0.50390625, + "learning_rate": 3.6110576258097107e-07, + "loss": 0.2787, + "step": 29246 + }, + { + "epoch": 3.9, + "grad_norm": 0.74609375, + "learning_rate": 3.601177031627545e-07, + "loss": 0.2358, + "step": 29247 + }, + { + "epoch": 3.9, + "grad_norm": 0.7890625, + "learning_rate": 3.5913099492528304e-07, + "loss": 0.5729, + "step": 29248 + }, + { + "epoch": 3.9, + "grad_norm": 0.69140625, + "learning_rate": 3.5814563788194586e-07, + "loss": 0.5596, + "step": 29249 + }, + { + "epoch": 3.9, + "grad_norm": 0.54296875, + "learning_rate": 3.571616320461102e-07, + "loss": 0.4514, + "step": 29250 + }, + { + "epoch": 3.9, + "grad_norm": 0.439453125, + "learning_rate": 3.5617897743109863e-07, + "loss": 0.1333, + "step": 29251 + }, + { + "epoch": 3.9, + "grad_norm": 0.62890625, + "learning_rate": 3.5519767405026715e-07, + "loss": 0.4239, + "step": 29252 + }, + { + "epoch": 3.9, + "grad_norm": 0.67578125, + "learning_rate": 3.5421772191689405e-07, + "loss": 0.3648, + "step": 29253 + }, + { + "epoch": 3.9, + "grad_norm": 0.546875, + "learning_rate": 3.532391210442909e-07, + "loss": 0.217, + "step": 29254 + }, + { + "epoch": 3.9, + "grad_norm": 0.609375, + "learning_rate": 3.522618714457138e-07, + "loss": 0.1226, + "step": 29255 + }, + { + "epoch": 3.9, + "grad_norm": 0.62890625, + "learning_rate": 3.512859731344187e-07, + "loss": 0.1492, + "step": 29256 + }, + { + "epoch": 3.9, + "grad_norm": 0.78515625, + "learning_rate": 3.5031142612363955e-07, + "loss": 0.2754, + "step": 29257 + }, + { + "epoch": 3.9, + "grad_norm": 0.7421875, + "learning_rate": 3.4933823042659905e-07, + "loss": 0.3625, + "step": 29258 + }, + { + "epoch": 3.9, + "grad_norm": 0.65234375, + "learning_rate": 3.4836638605647567e-07, + "loss": 0.3735, + "step": 29259 + }, + { + "epoch": 3.9, + "grad_norm": 0.455078125, + "learning_rate": 3.473958930264698e-07, + "loss": 0.2207, + "step": 29260 + }, + { + "epoch": 3.9, + "grad_norm": 0.65234375, + "learning_rate": 3.464267513497266e-07, + "loss": 0.2908, + "step": 29261 + }, + { + "epoch": 3.9, + "grad_norm": 0.54296875, + "learning_rate": 3.45458961039391e-07, + "loss": 0.3293, + "step": 29262 + }, + { + "epoch": 3.9, + "grad_norm": 0.63671875, + "learning_rate": 3.4449252210859706e-07, + "loss": 0.3237, + "step": 29263 + }, + { + "epoch": 3.9, + "grad_norm": 0.6640625, + "learning_rate": 3.4352743457044537e-07, + "loss": 0.4782, + "step": 29264 + }, + { + "epoch": 3.91, + "grad_norm": 0.6015625, + "learning_rate": 3.425636984380143e-07, + "loss": 0.3738, + "step": 29265 + }, + { + "epoch": 3.91, + "grad_norm": 0.65625, + "learning_rate": 3.416013137243823e-07, + "loss": 0.4175, + "step": 29266 + }, + { + "epoch": 3.91, + "grad_norm": 0.61328125, + "learning_rate": 3.4064028044259454e-07, + "loss": 0.5099, + "step": 29267 + }, + { + "epoch": 3.91, + "grad_norm": 0.60546875, + "learning_rate": 3.39680598605685e-07, + "loss": 0.3397, + "step": 29268 + }, + { + "epoch": 3.91, + "grad_norm": 0.84765625, + "learning_rate": 3.387222682266655e-07, + "loss": 0.2359, + "step": 29269 + }, + { + "epoch": 3.91, + "grad_norm": 0.61328125, + "learning_rate": 3.377652893185368e-07, + "loss": 0.3803, + "step": 29270 + }, + { + "epoch": 3.91, + "grad_norm": 0.69140625, + "learning_rate": 3.368096618942773e-07, + "loss": 0.6322, + "step": 29271 + }, + { + "epoch": 3.91, + "grad_norm": 0.55078125, + "learning_rate": 3.358553859668434e-07, + "loss": 0.2011, + "step": 29272 + }, + { + "epoch": 3.91, + "grad_norm": 0.82421875, + "learning_rate": 3.3490246154916915e-07, + "loss": 0.219, + "step": 29273 + }, + { + "epoch": 3.91, + "grad_norm": 0.515625, + "learning_rate": 3.3395088865417756e-07, + "loss": 0.1857, + "step": 29274 + }, + { + "epoch": 3.91, + "grad_norm": 0.578125, + "learning_rate": 3.330006672947805e-07, + "loss": 0.2716, + "step": 29275 + }, + { + "epoch": 3.91, + "grad_norm": 0.63671875, + "learning_rate": 3.320517974838677e-07, + "loss": 0.2037, + "step": 29276 + }, + { + "epoch": 3.91, + "grad_norm": 0.61328125, + "learning_rate": 3.3110427923429556e-07, + "loss": 0.3211, + "step": 29277 + }, + { + "epoch": 3.91, + "grad_norm": 0.765625, + "learning_rate": 3.301581125589204e-07, + "loss": 0.3282, + "step": 29278 + }, + { + "epoch": 3.91, + "grad_norm": 0.578125, + "learning_rate": 3.292132974705653e-07, + "loss": 0.3315, + "step": 29279 + }, + { + "epoch": 3.91, + "grad_norm": 0.56640625, + "learning_rate": 3.2826983398204226e-07, + "loss": 0.3304, + "step": 29280 + }, + { + "epoch": 3.91, + "grad_norm": 0.59765625, + "learning_rate": 3.2732772210615213e-07, + "loss": 0.3031, + "step": 29281 + }, + { + "epoch": 3.91, + "grad_norm": 0.71484375, + "learning_rate": 3.263869618556625e-07, + "loss": 0.2488, + "step": 29282 + }, + { + "epoch": 3.91, + "grad_norm": 0.59375, + "learning_rate": 3.25447553243341e-07, + "loss": 0.4005, + "step": 29283 + }, + { + "epoch": 3.91, + "grad_norm": 0.6484375, + "learning_rate": 3.2450949628192174e-07, + "loss": 0.4646, + "step": 29284 + }, + { + "epoch": 3.91, + "grad_norm": 0.78125, + "learning_rate": 3.2357279098412797e-07, + "loss": 0.3936, + "step": 29285 + }, + { + "epoch": 3.91, + "grad_norm": 0.66015625, + "learning_rate": 3.226374373626495e-07, + "loss": 0.3279, + "step": 29286 + }, + { + "epoch": 3.91, + "grad_norm": 0.6875, + "learning_rate": 3.2170343543018733e-07, + "loss": 0.3695, + "step": 29287 + }, + { + "epoch": 3.91, + "grad_norm": 0.65234375, + "learning_rate": 3.20770785199398e-07, + "loss": 0.3264, + "step": 29288 + }, + { + "epoch": 3.91, + "grad_norm": 0.625, + "learning_rate": 3.1983948668292685e-07, + "loss": 0.2319, + "step": 29289 + }, + { + "epoch": 3.91, + "grad_norm": 0.7890625, + "learning_rate": 3.1890953989340833e-07, + "loss": 0.5139, + "step": 29290 + }, + { + "epoch": 3.91, + "grad_norm": 0.515625, + "learning_rate": 3.1798094484345453e-07, + "loss": 0.2771, + "step": 29291 + }, + { + "epoch": 3.91, + "grad_norm": 0.7109375, + "learning_rate": 3.1705370154565537e-07, + "loss": 0.2083, + "step": 29292 + }, + { + "epoch": 3.91, + "grad_norm": 0.58984375, + "learning_rate": 3.161278100125786e-07, + "loss": 0.1969, + "step": 29293 + }, + { + "epoch": 3.91, + "grad_norm": 0.54296875, + "learning_rate": 3.15203270256792e-07, + "loss": 0.1258, + "step": 29294 + }, + { + "epoch": 3.91, + "grad_norm": 0.515625, + "learning_rate": 3.142800822908187e-07, + "loss": 0.3167, + "step": 29295 + }, + { + "epoch": 3.91, + "grad_norm": 0.46875, + "learning_rate": 3.133582461271933e-07, + "loss": 0.2653, + "step": 29296 + }, + { + "epoch": 3.91, + "grad_norm": 0.63671875, + "learning_rate": 3.124377617784058e-07, + "loss": 0.3791, + "step": 29297 + }, + { + "epoch": 3.91, + "grad_norm": 0.5390625, + "learning_rate": 3.11518629256935e-07, + "loss": 0.3466, + "step": 29298 + }, + { + "epoch": 3.91, + "grad_norm": 0.703125, + "learning_rate": 3.106008485752598e-07, + "loss": 0.3879, + "step": 29299 + }, + { + "epoch": 3.91, + "grad_norm": 0.56640625, + "learning_rate": 3.0968441974581483e-07, + "loss": 0.201, + "step": 29300 + }, + { + "epoch": 3.91, + "grad_norm": 0.578125, + "learning_rate": 3.0876934278102343e-07, + "loss": 0.658, + "step": 29301 + }, + { + "epoch": 3.91, + "grad_norm": 0.54296875, + "learning_rate": 3.0785561769330895e-07, + "loss": 0.2378, + "step": 29302 + }, + { + "epoch": 3.91, + "grad_norm": 0.4921875, + "learning_rate": 3.069432444950504e-07, + "loss": 0.1772, + "step": 29303 + }, + { + "epoch": 3.91, + "grad_norm": 0.578125, + "learning_rate": 3.060322231986268e-07, + "loss": 0.2599, + "step": 29304 + }, + { + "epoch": 3.91, + "grad_norm": 0.6328125, + "learning_rate": 3.051225538163949e-07, + "loss": 0.2464, + "step": 29305 + }, + { + "epoch": 3.91, + "grad_norm": 0.6640625, + "learning_rate": 3.042142363606781e-07, + "loss": 0.3881, + "step": 29306 + }, + { + "epoch": 3.91, + "grad_norm": 0.53125, + "learning_rate": 3.0330727084380006e-07, + "loss": 0.3175, + "step": 29307 + }, + { + "epoch": 3.91, + "grad_norm": 0.4609375, + "learning_rate": 3.024016572780619e-07, + "loss": 0.2577, + "step": 29308 + }, + { + "epoch": 3.91, + "grad_norm": 0.546875, + "learning_rate": 3.0149739567574274e-07, + "loss": 0.195, + "step": 29309 + }, + { + "epoch": 3.91, + "grad_norm": 0.67578125, + "learning_rate": 3.0059448604909947e-07, + "loss": 0.5697, + "step": 29310 + }, + { + "epoch": 3.91, + "grad_norm": 0.49609375, + "learning_rate": 2.996929284103889e-07, + "loss": 0.319, + "step": 29311 + }, + { + "epoch": 3.91, + "grad_norm": 0.51171875, + "learning_rate": 2.987927227718235e-07, + "loss": 0.2783, + "step": 29312 + }, + { + "epoch": 3.91, + "grad_norm": 0.69921875, + "learning_rate": 2.978938691456157e-07, + "loss": 0.3187, + "step": 29313 + }, + { + "epoch": 3.91, + "grad_norm": 0.51171875, + "learning_rate": 2.9699636754396686e-07, + "loss": 0.3805, + "step": 29314 + }, + { + "epoch": 3.91, + "grad_norm": 0.51171875, + "learning_rate": 2.9610021797902287e-07, + "loss": 0.3491, + "step": 29315 + }, + { + "epoch": 3.91, + "grad_norm": 0.6015625, + "learning_rate": 2.9520542046296287e-07, + "loss": 0.3649, + "step": 29316 + }, + { + "epoch": 3.91, + "grad_norm": 0.51171875, + "learning_rate": 2.9431197500789933e-07, + "loss": 0.3123, + "step": 29317 + }, + { + "epoch": 3.91, + "grad_norm": 0.8046875, + "learning_rate": 2.934198816259559e-07, + "loss": 0.2694, + "step": 29318 + }, + { + "epoch": 3.91, + "grad_norm": 0.61328125, + "learning_rate": 2.9252914032923407e-07, + "loss": 0.2348, + "step": 29319 + }, + { + "epoch": 3.91, + "grad_norm": 0.640625, + "learning_rate": 2.916397511298019e-07, + "loss": 0.3366, + "step": 29320 + }, + { + "epoch": 3.91, + "grad_norm": 0.6796875, + "learning_rate": 2.9075171403972754e-07, + "loss": 0.297, + "step": 29321 + }, + { + "epoch": 3.91, + "grad_norm": 0.63671875, + "learning_rate": 2.8986502907105697e-07, + "loss": 0.3791, + "step": 29322 + }, + { + "epoch": 3.91, + "grad_norm": 0.5546875, + "learning_rate": 2.8897969623581375e-07, + "loss": 0.4063, + "step": 29323 + }, + { + "epoch": 3.91, + "grad_norm": 0.71484375, + "learning_rate": 2.8809571554599957e-07, + "loss": 0.4142, + "step": 29324 + }, + { + "epoch": 3.91, + "grad_norm": 0.55859375, + "learning_rate": 2.872130870135936e-07, + "loss": 0.2428, + "step": 29325 + }, + { + "epoch": 3.91, + "grad_norm": 0.49609375, + "learning_rate": 2.8633181065057525e-07, + "loss": 0.1924, + "step": 29326 + }, + { + "epoch": 3.91, + "grad_norm": 0.64453125, + "learning_rate": 2.854518864689015e-07, + "loss": 0.2586, + "step": 29327 + }, + { + "epoch": 3.91, + "grad_norm": 0.73046875, + "learning_rate": 2.845733144804963e-07, + "loss": 0.4192, + "step": 29328 + }, + { + "epoch": 3.91, + "grad_norm": 0.59765625, + "learning_rate": 2.836960946972611e-07, + "loss": 0.183, + "step": 29329 + }, + { + "epoch": 3.91, + "grad_norm": 0.89453125, + "learning_rate": 2.8282022713110865e-07, + "loss": 0.3363, + "step": 29330 + }, + { + "epoch": 3.91, + "grad_norm": 0.734375, + "learning_rate": 2.819457117939184e-07, + "loss": 0.3298, + "step": 29331 + }, + { + "epoch": 3.91, + "grad_norm": 0.640625, + "learning_rate": 2.810725486975474e-07, + "loss": 0.3837, + "step": 29332 + }, + { + "epoch": 3.91, + "grad_norm": 0.62109375, + "learning_rate": 2.802007378538196e-07, + "loss": 0.4394, + "step": 29333 + }, + { + "epoch": 3.91, + "grad_norm": 0.55078125, + "learning_rate": 2.7933027927457e-07, + "loss": 0.2738, + "step": 29334 + }, + { + "epoch": 3.91, + "grad_norm": 0.796875, + "learning_rate": 2.7846117297161136e-07, + "loss": 0.4689, + "step": 29335 + }, + { + "epoch": 3.91, + "grad_norm": 0.5546875, + "learning_rate": 2.7759341895671196e-07, + "loss": 0.2479, + "step": 29336 + }, + { + "epoch": 3.91, + "grad_norm": 0.78125, + "learning_rate": 2.767270172416514e-07, + "loss": 0.7041, + "step": 29337 + }, + { + "epoch": 3.91, + "grad_norm": 0.71484375, + "learning_rate": 2.7586196783817576e-07, + "loss": 0.324, + "step": 29338 + }, + { + "epoch": 3.91, + "grad_norm": 0.7265625, + "learning_rate": 2.74998270758009e-07, + "loss": 0.3782, + "step": 29339 + }, + { + "epoch": 3.92, + "grad_norm": 0.546875, + "learning_rate": 2.741359260128751e-07, + "loss": 0.286, + "step": 29340 + }, + { + "epoch": 3.92, + "grad_norm": 0.90234375, + "learning_rate": 2.7327493361446466e-07, + "loss": 0.3665, + "step": 29341 + }, + { + "epoch": 3.92, + "grad_norm": 0.478515625, + "learning_rate": 2.7241529357443506e-07, + "loss": 0.2109, + "step": 29342 + }, + { + "epoch": 3.92, + "grad_norm": 0.6015625, + "learning_rate": 2.71557005904477e-07, + "loss": 0.4843, + "step": 29343 + }, + { + "epoch": 3.92, + "grad_norm": 0.64453125, + "learning_rate": 2.7070007061619217e-07, + "loss": 0.3979, + "step": 29344 + }, + { + "epoch": 3.92, + "grad_norm": 0.494140625, + "learning_rate": 2.6984448772123803e-07, + "loss": 0.1661, + "step": 29345 + }, + { + "epoch": 3.92, + "grad_norm": 0.6953125, + "learning_rate": 2.689902572311831e-07, + "loss": 0.2769, + "step": 29346 + }, + { + "epoch": 3.92, + "grad_norm": 0.62109375, + "learning_rate": 2.681373791576403e-07, + "loss": 0.3891, + "step": 29347 + }, + { + "epoch": 3.92, + "grad_norm": 0.58984375, + "learning_rate": 2.672858535121558e-07, + "loss": 0.322, + "step": 29348 + }, + { + "epoch": 3.92, + "grad_norm": 0.58984375, + "learning_rate": 2.664356803062873e-07, + "loss": 0.2098, + "step": 29349 + }, + { + "epoch": 3.92, + "grad_norm": 0.6328125, + "learning_rate": 2.655868595515587e-07, + "loss": 0.5419, + "step": 29350 + }, + { + "epoch": 3.92, + "grad_norm": 0.66796875, + "learning_rate": 2.647393912594831e-07, + "loss": 0.1689, + "step": 29351 + }, + { + "epoch": 3.92, + "grad_norm": 0.72265625, + "learning_rate": 2.638932754415402e-07, + "loss": 0.3452, + "step": 29352 + }, + { + "epoch": 3.92, + "grad_norm": 0.5546875, + "learning_rate": 2.630485121092208e-07, + "loss": 0.388, + "step": 29353 + }, + { + "epoch": 3.92, + "grad_norm": 0.515625, + "learning_rate": 2.6220510127398237e-07, + "loss": 0.2053, + "step": 29354 + }, + { + "epoch": 3.92, + "grad_norm": 0.6796875, + "learning_rate": 2.6136304294723803e-07, + "loss": 0.6114, + "step": 29355 + }, + { + "epoch": 3.92, + "grad_norm": 0.66015625, + "learning_rate": 2.6052233714043416e-07, + "loss": 0.3448, + "step": 29356 + }, + { + "epoch": 3.92, + "grad_norm": 0.53515625, + "learning_rate": 2.596829838649617e-07, + "loss": 0.2799, + "step": 29357 + }, + { + "epoch": 3.92, + "grad_norm": 0.55859375, + "learning_rate": 2.588449831321893e-07, + "loss": 0.3067, + "step": 29358 + }, + { + "epoch": 3.92, + "grad_norm": 0.498046875, + "learning_rate": 2.5800833495350786e-07, + "loss": 0.1829, + "step": 29359 + }, + { + "epoch": 3.92, + "grad_norm": 0.6953125, + "learning_rate": 2.571730393402305e-07, + "loss": 0.5085, + "step": 29360 + }, + { + "epoch": 3.92, + "grad_norm": 0.80078125, + "learning_rate": 2.5633909630371487e-07, + "loss": 0.2148, + "step": 29361 + }, + { + "epoch": 3.92, + "grad_norm": 0.5703125, + "learning_rate": 2.5550650585525194e-07, + "loss": 0.1463, + "step": 29362 + }, + { + "epoch": 3.92, + "grad_norm": 0.8046875, + "learning_rate": 2.546752680061326e-07, + "loss": 0.3931, + "step": 29363 + }, + { + "epoch": 3.92, + "grad_norm": 0.69140625, + "learning_rate": 2.5384538276763683e-07, + "loss": 0.3777, + "step": 29364 + }, + { + "epoch": 3.92, + "grad_norm": 0.57421875, + "learning_rate": 2.530168501510111e-07, + "loss": 0.1592, + "step": 29365 + }, + { + "epoch": 3.92, + "grad_norm": 0.6328125, + "learning_rate": 2.5218967016750193e-07, + "loss": 0.1846, + "step": 29366 + }, + { + "epoch": 3.92, + "grad_norm": 0.6796875, + "learning_rate": 2.513638428283116e-07, + "loss": 0.4361, + "step": 29367 + }, + { + "epoch": 3.92, + "grad_norm": 0.41015625, + "learning_rate": 2.505393681446422e-07, + "loss": 0.1804, + "step": 29368 + }, + { + "epoch": 3.92, + "grad_norm": 0.5, + "learning_rate": 2.4971624612768474e-07, + "loss": 0.3393, + "step": 29369 + }, + { + "epoch": 3.92, + "grad_norm": 0.8203125, + "learning_rate": 2.488944767885859e-07, + "loss": 0.3862, + "step": 29370 + }, + { + "epoch": 3.92, + "grad_norm": 0.451171875, + "learning_rate": 2.480740601385034e-07, + "loss": 0.2802, + "step": 29371 + }, + { + "epoch": 3.92, + "grad_norm": 0.4921875, + "learning_rate": 2.472549961885617e-07, + "loss": 0.1331, + "step": 29372 + }, + { + "epoch": 3.92, + "grad_norm": 0.55859375, + "learning_rate": 2.464372849498631e-07, + "loss": 0.2893, + "step": 29373 + }, + { + "epoch": 3.92, + "grad_norm": 0.419921875, + "learning_rate": 2.4562092643348745e-07, + "loss": 0.2146, + "step": 29374 + }, + { + "epoch": 3.92, + "grad_norm": 0.62890625, + "learning_rate": 2.448059206505149e-07, + "loss": 0.3923, + "step": 29375 + }, + { + "epoch": 3.92, + "grad_norm": 0.490234375, + "learning_rate": 2.4399226761200324e-07, + "loss": 0.2138, + "step": 29376 + }, + { + "epoch": 3.92, + "grad_norm": 0.63671875, + "learning_rate": 2.43179967328977e-07, + "loss": 0.1474, + "step": 29377 + }, + { + "epoch": 3.92, + "grad_norm": 0.79296875, + "learning_rate": 2.423690198124606e-07, + "loss": 0.3967, + "step": 29378 + }, + { + "epoch": 3.92, + "grad_norm": 0.65625, + "learning_rate": 2.415594250734343e-07, + "loss": 0.5628, + "step": 29379 + }, + { + "epoch": 3.92, + "grad_norm": 0.7265625, + "learning_rate": 2.407511831228892e-07, + "loss": 0.2895, + "step": 29380 + }, + { + "epoch": 3.92, + "grad_norm": 0.59375, + "learning_rate": 2.399442939717833e-07, + "loss": 0.2018, + "step": 29381 + }, + { + "epoch": 3.92, + "grad_norm": 0.578125, + "learning_rate": 2.391387576310522e-07, + "loss": 0.2142, + "step": 29382 + }, + { + "epoch": 3.92, + "grad_norm": 0.65234375, + "learning_rate": 2.3833457411163162e-07, + "loss": 0.2558, + "step": 29383 + }, + { + "epoch": 3.92, + "grad_norm": 0.5625, + "learning_rate": 2.3753174342442397e-07, + "loss": 0.2229, + "step": 29384 + }, + { + "epoch": 3.92, + "grad_norm": 0.43359375, + "learning_rate": 2.3673026558030943e-07, + "loss": 0.267, + "step": 29385 + }, + { + "epoch": 3.92, + "grad_norm": 0.58984375, + "learning_rate": 2.3593014059015705e-07, + "loss": 0.222, + "step": 29386 + }, + { + "epoch": 3.92, + "grad_norm": 0.65625, + "learning_rate": 2.3513136846482487e-07, + "loss": 0.2765, + "step": 29387 + }, + { + "epoch": 3.92, + "grad_norm": 0.63671875, + "learning_rate": 2.3433394921514863e-07, + "loss": 0.2261, + "step": 29388 + }, + { + "epoch": 3.92, + "grad_norm": 0.6015625, + "learning_rate": 2.335378828519197e-07, + "loss": 0.2968, + "step": 29389 + }, + { + "epoch": 3.92, + "grad_norm": 0.71484375, + "learning_rate": 2.3274316938595163e-07, + "loss": 0.2146, + "step": 29390 + }, + { + "epoch": 3.92, + "grad_norm": 0.64453125, + "learning_rate": 2.3194980882801364e-07, + "loss": 0.2517, + "step": 29391 + }, + { + "epoch": 3.92, + "grad_norm": 0.72265625, + "learning_rate": 2.3115780118886376e-07, + "loss": 0.3824, + "step": 29392 + }, + { + "epoch": 3.92, + "grad_norm": 0.55859375, + "learning_rate": 2.3036714647924895e-07, + "loss": 0.4213, + "step": 29393 + }, + { + "epoch": 3.92, + "grad_norm": 0.53125, + "learning_rate": 2.2957784470989395e-07, + "loss": 0.2483, + "step": 29394 + }, + { + "epoch": 3.92, + "grad_norm": 0.79296875, + "learning_rate": 2.2878989589149025e-07, + "loss": 0.4698, + "step": 29395 + }, + { + "epoch": 3.92, + "grad_norm": 0.53125, + "learning_rate": 2.2800330003472926e-07, + "loss": 0.2476, + "step": 29396 + }, + { + "epoch": 3.92, + "grad_norm": 0.65234375, + "learning_rate": 2.272180571502691e-07, + "loss": 0.3001, + "step": 29397 + }, + { + "epoch": 3.92, + "grad_norm": 0.66796875, + "learning_rate": 2.2643416724877908e-07, + "loss": 0.4395, + "step": 29398 + }, + { + "epoch": 3.92, + "grad_norm": 0.486328125, + "learning_rate": 2.2565163034086178e-07, + "loss": 0.1516, + "step": 29399 + }, + { + "epoch": 3.92, + "grad_norm": 0.55859375, + "learning_rate": 2.2487044643715315e-07, + "loss": 0.2108, + "step": 29400 + }, + { + "epoch": 3.92, + "grad_norm": 0.546875, + "learning_rate": 2.2409061554823363e-07, + "loss": 0.4037, + "step": 29401 + }, + { + "epoch": 3.92, + "grad_norm": 0.76171875, + "learning_rate": 2.2331213768468363e-07, + "loss": 0.2242, + "step": 29402 + }, + { + "epoch": 3.92, + "grad_norm": 0.5546875, + "learning_rate": 2.2253501285705024e-07, + "loss": 0.4112, + "step": 29403 + }, + { + "epoch": 3.92, + "grad_norm": 0.515625, + "learning_rate": 2.2175924107589174e-07, + "loss": 0.2249, + "step": 29404 + }, + { + "epoch": 3.92, + "grad_norm": 0.703125, + "learning_rate": 2.209848223517108e-07, + "loss": 0.4438, + "step": 29405 + }, + { + "epoch": 3.92, + "grad_norm": 0.875, + "learning_rate": 2.2021175669499906e-07, + "loss": 0.3599, + "step": 29406 + }, + { + "epoch": 3.92, + "grad_norm": 0.478515625, + "learning_rate": 2.194400441162703e-07, + "loss": 0.5145, + "step": 29407 + }, + { + "epoch": 3.92, + "grad_norm": 0.5, + "learning_rate": 2.186696846259717e-07, + "loss": 0.3652, + "step": 29408 + }, + { + "epoch": 3.92, + "grad_norm": 0.74609375, + "learning_rate": 2.179006782345394e-07, + "loss": 0.4683, + "step": 29409 + }, + { + "epoch": 3.92, + "grad_norm": 0.7890625, + "learning_rate": 2.171330249524206e-07, + "loss": 0.6944, + "step": 29410 + }, + { + "epoch": 3.92, + "grad_norm": 0.478515625, + "learning_rate": 2.1636672479001807e-07, + "loss": 0.3523, + "step": 29411 + }, + { + "epoch": 3.92, + "grad_norm": 0.64453125, + "learning_rate": 2.156017777577346e-07, + "loss": 0.2687, + "step": 29412 + }, + { + "epoch": 3.92, + "grad_norm": 0.63671875, + "learning_rate": 2.1483818386591748e-07, + "loss": 0.626, + "step": 29413 + }, + { + "epoch": 3.93, + "grad_norm": 0.55078125, + "learning_rate": 2.1407594312494728e-07, + "loss": 0.1552, + "step": 29414 + }, + { + "epoch": 3.93, + "grad_norm": 0.73828125, + "learning_rate": 2.1331505554513798e-07, + "loss": 0.327, + "step": 29415 + }, + { + "epoch": 3.93, + "grad_norm": 0.609375, + "learning_rate": 2.1255552113682576e-07, + "loss": 0.3634, + "step": 29416 + }, + { + "epoch": 3.93, + "grad_norm": 0.734375, + "learning_rate": 2.1179733991030238e-07, + "loss": 0.4532, + "step": 29417 + }, + { + "epoch": 3.93, + "grad_norm": 0.5625, + "learning_rate": 2.110405118758485e-07, + "loss": 0.2398, + "step": 29418 + }, + { + "epoch": 3.93, + "grad_norm": 0.546875, + "learning_rate": 2.102850370437337e-07, + "loss": 0.2948, + "step": 29419 + }, + { + "epoch": 3.93, + "grad_norm": 0.66015625, + "learning_rate": 2.0953091542419424e-07, + "loss": 0.5083, + "step": 29420 + }, + { + "epoch": 3.93, + "grad_norm": 0.578125, + "learning_rate": 2.0877814702746634e-07, + "loss": 0.2676, + "step": 29421 + }, + { + "epoch": 3.93, + "grad_norm": 0.5, + "learning_rate": 2.08026731863753e-07, + "loss": 0.4032, + "step": 29422 + }, + { + "epoch": 3.93, + "grad_norm": 0.57421875, + "learning_rate": 2.072766699432349e-07, + "loss": 0.5086, + "step": 29423 + }, + { + "epoch": 3.93, + "grad_norm": 0.466796875, + "learning_rate": 2.0652796127609285e-07, + "loss": 0.2389, + "step": 29424 + }, + { + "epoch": 3.93, + "grad_norm": 0.80078125, + "learning_rate": 2.0578060587248537e-07, + "loss": 0.1269, + "step": 29425 + }, + { + "epoch": 3.93, + "grad_norm": 0.455078125, + "learning_rate": 2.050346037425377e-07, + "loss": 0.2892, + "step": 29426 + }, + { + "epoch": 3.93, + "grad_norm": 0.70703125, + "learning_rate": 2.0428995489636394e-07, + "loss": 0.3372, + "step": 29427 + }, + { + "epoch": 3.93, + "grad_norm": 0.5234375, + "learning_rate": 2.0354665934407823e-07, + "loss": 0.3257, + "step": 29428 + }, + { + "epoch": 3.93, + "grad_norm": 0.5703125, + "learning_rate": 2.0280471709573924e-07, + "loss": 0.3012, + "step": 29429 + }, + { + "epoch": 3.93, + "grad_norm": 0.62890625, + "learning_rate": 2.0206412816141662e-07, + "loss": 0.3372, + "step": 29430 + }, + { + "epoch": 3.93, + "grad_norm": 0.8203125, + "learning_rate": 2.0132489255115795e-07, + "loss": 0.7376, + "step": 29431 + }, + { + "epoch": 3.93, + "grad_norm": 0.5703125, + "learning_rate": 2.0058701027498848e-07, + "loss": 0.1154, + "step": 29432 + }, + { + "epoch": 3.93, + "grad_norm": 0.5625, + "learning_rate": 1.998504813429003e-07, + "loss": 0.462, + "step": 29433 + }, + { + "epoch": 3.93, + "grad_norm": 0.65234375, + "learning_rate": 1.9911530576489644e-07, + "loss": 0.2492, + "step": 29434 + }, + { + "epoch": 3.93, + "grad_norm": 0.59765625, + "learning_rate": 1.9838148355093565e-07, + "loss": 0.3457, + "step": 29435 + }, + { + "epoch": 3.93, + "grad_norm": 0.6484375, + "learning_rate": 1.9764901471098773e-07, + "loss": 0.2843, + "step": 29436 + }, + { + "epoch": 3.93, + "grad_norm": 0.75390625, + "learning_rate": 1.9691789925495585e-07, + "loss": 0.4362, + "step": 29437 + }, + { + "epoch": 3.93, + "grad_norm": 0.66796875, + "learning_rate": 1.9618813719277652e-07, + "loss": 0.3209, + "step": 29438 + }, + { + "epoch": 3.93, + "grad_norm": 0.57421875, + "learning_rate": 1.954597285343418e-07, + "loss": 0.1937, + "step": 29439 + }, + { + "epoch": 3.93, + "grad_norm": 0.5546875, + "learning_rate": 1.947326732895327e-07, + "loss": 0.2726, + "step": 29440 + }, + { + "epoch": 3.93, + "grad_norm": 0.5234375, + "learning_rate": 1.940069714681969e-07, + "loss": 0.3732, + "step": 29441 + }, + { + "epoch": 3.93, + "grad_norm": 0.62109375, + "learning_rate": 1.93282623080171e-07, + "loss": 0.2363, + "step": 29442 + }, + { + "epoch": 3.93, + "grad_norm": 0.65234375, + "learning_rate": 1.925596281353026e-07, + "loss": 0.3341, + "step": 29443 + }, + { + "epoch": 3.93, + "grad_norm": 0.73046875, + "learning_rate": 1.9183798664337282e-07, + "loss": 0.3693, + "step": 29444 + }, + { + "epoch": 3.93, + "grad_norm": 0.453125, + "learning_rate": 1.9111769861418493e-07, + "loss": 0.1156, + "step": 29445 + }, + { + "epoch": 3.93, + "grad_norm": 0.515625, + "learning_rate": 1.903987640574978e-07, + "loss": 0.2706, + "step": 29446 + }, + { + "epoch": 3.93, + "grad_norm": 0.5859375, + "learning_rate": 1.8968118298304806e-07, + "loss": 0.2589, + "step": 29447 + }, + { + "epoch": 3.93, + "grad_norm": 0.76171875, + "learning_rate": 1.8896495540059456e-07, + "loss": 0.1861, + "step": 29448 + }, + { + "epoch": 3.93, + "grad_norm": 0.65625, + "learning_rate": 1.8825008131982957e-07, + "loss": 0.5782, + "step": 29449 + }, + { + "epoch": 3.93, + "grad_norm": 0.6953125, + "learning_rate": 1.8753656075045646e-07, + "loss": 0.217, + "step": 29450 + }, + { + "epoch": 3.93, + "grad_norm": 0.52734375, + "learning_rate": 1.8682439370213412e-07, + "loss": 0.1997, + "step": 29451 + }, + { + "epoch": 3.93, + "grad_norm": 0.6953125, + "learning_rate": 1.861135801845437e-07, + "loss": 0.4521, + "step": 29452 + }, + { + "epoch": 3.93, + "grad_norm": 0.640625, + "learning_rate": 1.854041202072998e-07, + "loss": 0.5735, + "step": 29453 + }, + { + "epoch": 3.93, + "grad_norm": 0.54296875, + "learning_rate": 1.8469601378005019e-07, + "loss": 0.289, + "step": 29454 + }, + { + "epoch": 3.93, + "grad_norm": 0.72265625, + "learning_rate": 1.8398926091238721e-07, + "loss": 0.5238, + "step": 29455 + }, + { + "epoch": 3.93, + "grad_norm": 0.515625, + "learning_rate": 1.83283861613881e-07, + "loss": 0.3541, + "step": 29456 + }, + { + "epoch": 3.93, + "grad_norm": 0.482421875, + "learning_rate": 1.8257981589410167e-07, + "loss": 0.2436, + "step": 29457 + }, + { + "epoch": 3.93, + "grad_norm": 0.546875, + "learning_rate": 1.8187712376261934e-07, + "loss": 0.1375, + "step": 29458 + }, + { + "epoch": 3.93, + "grad_norm": 0.56640625, + "learning_rate": 1.8117578522893752e-07, + "loss": 0.4862, + "step": 29459 + }, + { + "epoch": 3.93, + "grad_norm": 0.63671875, + "learning_rate": 1.8047580030257083e-07, + "loss": 0.3848, + "step": 29460 + }, + { + "epoch": 3.93, + "grad_norm": 0.59375, + "learning_rate": 1.7977716899302276e-07, + "loss": 0.3794, + "step": 29461 + }, + { + "epoch": 3.93, + "grad_norm": 0.58984375, + "learning_rate": 1.7907989130976354e-07, + "loss": 0.3372, + "step": 29462 + }, + { + "epoch": 3.93, + "grad_norm": 0.5390625, + "learning_rate": 1.7838396726224117e-07, + "loss": 0.1559, + "step": 29463 + }, + { + "epoch": 3.93, + "grad_norm": 0.65625, + "learning_rate": 1.776893968598925e-07, + "loss": 0.4775, + "step": 29464 + }, + { + "epoch": 3.93, + "grad_norm": 0.5546875, + "learning_rate": 1.7699618011214337e-07, + "loss": 0.1181, + "step": 29465 + }, + { + "epoch": 3.93, + "grad_norm": 0.51953125, + "learning_rate": 1.7630431702839734e-07, + "loss": 0.3633, + "step": 29466 + }, + { + "epoch": 3.93, + "grad_norm": 0.51953125, + "learning_rate": 1.7561380761802471e-07, + "loss": 0.2904, + "step": 29467 + }, + { + "epoch": 3.93, + "grad_norm": 0.64453125, + "learning_rate": 1.7492465189039576e-07, + "loss": 0.2549, + "step": 29468 + }, + { + "epoch": 3.93, + "grad_norm": 0.63671875, + "learning_rate": 1.7423684985485856e-07, + "loss": 0.2089, + "step": 29469 + }, + { + "epoch": 3.93, + "grad_norm": 0.58984375, + "learning_rate": 1.7355040152073898e-07, + "loss": 0.4519, + "step": 29470 + }, + { + "epoch": 3.93, + "grad_norm": 0.58984375, + "learning_rate": 1.728653068973407e-07, + "loss": 0.3347, + "step": 29471 + }, + { + "epoch": 3.93, + "grad_norm": 0.74609375, + "learning_rate": 1.721815659939674e-07, + "loss": 0.4032, + "step": 29472 + }, + { + "epoch": 3.93, + "grad_norm": 0.7578125, + "learning_rate": 1.7149917881987832e-07, + "loss": 0.2557, + "step": 29473 + }, + { + "epoch": 3.93, + "grad_norm": 0.625, + "learning_rate": 1.7081814538432162e-07, + "loss": 0.1819, + "step": 29474 + }, + { + "epoch": 3.93, + "grad_norm": 0.5, + "learning_rate": 1.7013846569654545e-07, + "loss": 0.2379, + "step": 29475 + }, + { + "epoch": 3.93, + "grad_norm": 0.5, + "learning_rate": 1.6946013976575358e-07, + "loss": 0.3117, + "step": 29476 + }, + { + "epoch": 3.93, + "grad_norm": 0.64453125, + "learning_rate": 1.6878316760116086e-07, + "loss": 0.2424, + "step": 29477 + }, + { + "epoch": 3.93, + "grad_norm": 0.55078125, + "learning_rate": 1.6810754921193772e-07, + "loss": 0.1924, + "step": 29478 + }, + { + "epoch": 3.93, + "grad_norm": 0.6484375, + "learning_rate": 1.674332846072546e-07, + "loss": 0.4435, + "step": 29479 + }, + { + "epoch": 3.93, + "grad_norm": 0.75, + "learning_rate": 1.6676037379623754e-07, + "loss": 0.4402, + "step": 29480 + }, + { + "epoch": 3.93, + "grad_norm": 0.78515625, + "learning_rate": 1.660888167880348e-07, + "loss": 0.538, + "step": 29481 + }, + { + "epoch": 3.93, + "grad_norm": 0.62890625, + "learning_rate": 1.6541861359172793e-07, + "loss": 0.2899, + "step": 29482 + }, + { + "epoch": 3.93, + "grad_norm": 0.67578125, + "learning_rate": 1.6474976421642084e-07, + "loss": 0.3722, + "step": 29483 + }, + { + "epoch": 3.93, + "grad_norm": 0.5, + "learning_rate": 1.6408226867118403e-07, + "loss": 0.2405, + "step": 29484 + }, + { + "epoch": 3.93, + "grad_norm": 0.671875, + "learning_rate": 1.6341612696506582e-07, + "loss": 0.4823, + "step": 29485 + }, + { + "epoch": 3.93, + "grad_norm": 0.75390625, + "learning_rate": 1.6275133910710338e-07, + "loss": 0.3428, + "step": 29486 + }, + { + "epoch": 3.93, + "grad_norm": 0.54296875, + "learning_rate": 1.6208790510630066e-07, + "loss": 0.301, + "step": 29487 + }, + { + "epoch": 3.93, + "grad_norm": 0.7734375, + "learning_rate": 1.6142582497167268e-07, + "loss": 0.3392, + "step": 29488 + }, + { + "epoch": 3.94, + "grad_norm": 0.66015625, + "learning_rate": 1.6076509871216784e-07, + "loss": 0.3674, + "step": 29489 + }, + { + "epoch": 3.94, + "grad_norm": 0.486328125, + "learning_rate": 1.601057263367789e-07, + "loss": 0.1681, + "step": 29490 + }, + { + "epoch": 3.94, + "grad_norm": 0.5546875, + "learning_rate": 1.5944770785443208e-07, + "loss": 0.2219, + "step": 29491 + }, + { + "epoch": 3.94, + "grad_norm": 0.65234375, + "learning_rate": 1.5879104327404248e-07, + "loss": 0.2253, + "step": 29492 + }, + { + "epoch": 3.94, + "grad_norm": 0.5234375, + "learning_rate": 1.581357326045363e-07, + "loss": 0.2104, + "step": 29493 + }, + { + "epoch": 3.94, + "grad_norm": 0.66015625, + "learning_rate": 1.5748177585478418e-07, + "loss": 0.5584, + "step": 29494 + }, + { + "epoch": 3.94, + "grad_norm": 0.6796875, + "learning_rate": 1.5682917303365685e-07, + "loss": 0.3245, + "step": 29495 + }, + { + "epoch": 3.94, + "grad_norm": 0.65234375, + "learning_rate": 1.5617792415000276e-07, + "loss": 0.673, + "step": 29496 + }, + { + "epoch": 3.94, + "grad_norm": 0.6640625, + "learning_rate": 1.555280292126482e-07, + "loss": 0.4786, + "step": 29497 + }, + { + "epoch": 3.94, + "grad_norm": 0.55078125, + "learning_rate": 1.548794882304305e-07, + "loss": 0.3134, + "step": 29498 + }, + { + "epoch": 3.94, + "grad_norm": 0.765625, + "learning_rate": 1.5423230121210942e-07, + "loss": 0.5643, + "step": 29499 + }, + { + "epoch": 3.94, + "grad_norm": 0.61328125, + "learning_rate": 1.5358646816648892e-07, + "loss": 0.2759, + "step": 29500 + }, + { + "epoch": 3.94, + "grad_norm": 0.421875, + "learning_rate": 1.5294198910231762e-07, + "loss": 0.2472, + "step": 29501 + }, + { + "epoch": 3.94, + "grad_norm": 0.58203125, + "learning_rate": 1.5229886402833292e-07, + "loss": 0.1856, + "step": 29502 + }, + { + "epoch": 3.94, + "grad_norm": 0.57421875, + "learning_rate": 1.5165709295325015e-07, + "loss": 0.3734, + "step": 29503 + }, + { + "epoch": 3.94, + "grad_norm": 0.48828125, + "learning_rate": 1.510166758857845e-07, + "loss": 0.3001, + "step": 29504 + }, + { + "epoch": 3.94, + "grad_norm": 0.427734375, + "learning_rate": 1.5037761283461793e-07, + "loss": 0.1555, + "step": 29505 + }, + { + "epoch": 3.94, + "grad_norm": 0.83203125, + "learning_rate": 1.4973990380841019e-07, + "loss": 0.3309, + "step": 29506 + }, + { + "epoch": 3.94, + "grad_norm": 0.84375, + "learning_rate": 1.49103548815821e-07, + "loss": 0.725, + "step": 29507 + }, + { + "epoch": 3.94, + "grad_norm": 0.46484375, + "learning_rate": 1.484685478654657e-07, + "loss": 0.2505, + "step": 29508 + }, + { + "epoch": 3.94, + "grad_norm": 0.58984375, + "learning_rate": 1.4783490096595964e-07, + "loss": 0.3038, + "step": 29509 + }, + { + "epoch": 3.94, + "grad_norm": 0.4921875, + "learning_rate": 1.4720260812589592e-07, + "loss": 0.2212, + "step": 29510 + }, + { + "epoch": 3.94, + "grad_norm": 0.453125, + "learning_rate": 1.4657166935385658e-07, + "loss": 0.2412, + "step": 29511 + }, + { + "epoch": 3.94, + "grad_norm": 0.4140625, + "learning_rate": 1.4594208465839033e-07, + "loss": 0.1844, + "step": 29512 + }, + { + "epoch": 3.94, + "grad_norm": 0.6171875, + "learning_rate": 1.4531385404803476e-07, + "loss": 0.2927, + "step": 29513 + }, + { + "epoch": 3.94, + "grad_norm": 0.609375, + "learning_rate": 1.4468697753130533e-07, + "loss": 0.5119, + "step": 29514 + }, + { + "epoch": 3.94, + "grad_norm": 0.7421875, + "learning_rate": 1.4406145511670634e-07, + "loss": 0.4714, + "step": 29515 + }, + { + "epoch": 3.94, + "grad_norm": 0.5859375, + "learning_rate": 1.4343728681271985e-07, + "loss": 0.3075, + "step": 29516 + }, + { + "epoch": 3.94, + "grad_norm": 0.53125, + "learning_rate": 1.428144726278169e-07, + "loss": 0.3607, + "step": 29517 + }, + { + "epoch": 3.94, + "grad_norm": 0.75390625, + "learning_rate": 1.4219301257043516e-07, + "loss": 0.6633, + "step": 29518 + }, + { + "epoch": 3.94, + "grad_norm": 0.66796875, + "learning_rate": 1.4157290664900125e-07, + "loss": 0.256, + "step": 29519 + }, + { + "epoch": 3.94, + "grad_norm": 0.7421875, + "learning_rate": 1.4095415487193064e-07, + "loss": 0.2734, + "step": 29520 + }, + { + "epoch": 3.94, + "grad_norm": 0.671875, + "learning_rate": 1.4033675724761662e-07, + "loss": 0.5148, + "step": 29521 + }, + { + "epoch": 3.94, + "grad_norm": 0.67578125, + "learning_rate": 1.397207137844192e-07, + "loss": 0.3323, + "step": 29522 + }, + { + "epoch": 3.94, + "grad_norm": 0.52734375, + "learning_rate": 1.3910602449069832e-07, + "loss": 0.2275, + "step": 29523 + }, + { + "epoch": 3.94, + "grad_norm": 0.5859375, + "learning_rate": 1.3849268937479176e-07, + "loss": 0.5122, + "step": 29524 + }, + { + "epoch": 3.94, + "grad_norm": 0.71875, + "learning_rate": 1.378807084450151e-07, + "loss": 0.3306, + "step": 29525 + }, + { + "epoch": 3.94, + "grad_norm": 0.6328125, + "learning_rate": 1.3727008170966171e-07, + "loss": 0.3749, + "step": 29526 + }, + { + "epoch": 3.94, + "grad_norm": 0.625, + "learning_rate": 1.3666080917702494e-07, + "loss": 0.3814, + "step": 29527 + }, + { + "epoch": 3.94, + "grad_norm": 0.6875, + "learning_rate": 1.3605289085536487e-07, + "loss": 0.4266, + "step": 29528 + }, + { + "epoch": 3.94, + "grad_norm": 0.71875, + "learning_rate": 1.3544632675290824e-07, + "loss": 0.5135, + "step": 29529 + }, + { + "epoch": 3.94, + "grad_norm": 0.64453125, + "learning_rate": 1.348411168778929e-07, + "loss": 0.2964, + "step": 29530 + }, + { + "epoch": 3.94, + "grad_norm": 0.7421875, + "learning_rate": 1.3423726123853452e-07, + "loss": 0.4879, + "step": 29531 + }, + { + "epoch": 3.94, + "grad_norm": 0.5, + "learning_rate": 1.3363475984300434e-07, + "loss": 0.3095, + "step": 29532 + }, + { + "epoch": 3.94, + "grad_norm": 0.72265625, + "learning_rate": 1.3303361269948467e-07, + "loss": 0.2464, + "step": 29533 + }, + { + "epoch": 3.94, + "grad_norm": 0.53125, + "learning_rate": 1.3243381981611348e-07, + "loss": 0.3119, + "step": 29534 + }, + { + "epoch": 3.94, + "grad_norm": 0.54296875, + "learning_rate": 1.3183538120103977e-07, + "loss": 0.1506, + "step": 29535 + }, + { + "epoch": 3.94, + "grad_norm": 0.6796875, + "learning_rate": 1.3123829686237932e-07, + "loss": 0.5473, + "step": 29536 + }, + { + "epoch": 3.94, + "grad_norm": 0.69921875, + "learning_rate": 1.3064256680821451e-07, + "loss": 0.3733, + "step": 29537 + }, + { + "epoch": 3.94, + "grad_norm": 0.75, + "learning_rate": 1.300481910466389e-07, + "loss": 0.3137, + "step": 29538 + }, + { + "epoch": 3.94, + "grad_norm": 0.546875, + "learning_rate": 1.294551695857016e-07, + "loss": 0.3687, + "step": 29539 + }, + { + "epoch": 3.94, + "grad_norm": 0.53125, + "learning_rate": 1.288635024334517e-07, + "loss": 0.2647, + "step": 29540 + }, + { + "epoch": 3.94, + "grad_norm": 0.625, + "learning_rate": 1.2827318959790502e-07, + "loss": 0.382, + "step": 29541 + }, + { + "epoch": 3.94, + "grad_norm": 0.435546875, + "learning_rate": 1.2768423108707738e-07, + "loss": 0.1878, + "step": 29542 + }, + { + "epoch": 3.94, + "grad_norm": 0.61328125, + "learning_rate": 1.270966269089513e-07, + "loss": 0.4226, + "step": 29543 + }, + { + "epoch": 3.94, + "grad_norm": 0.83984375, + "learning_rate": 1.2651037707148704e-07, + "loss": 0.5906, + "step": 29544 + }, + { + "epoch": 3.94, + "grad_norm": 0.58203125, + "learning_rate": 1.2592548158263384e-07, + "loss": 0.2283, + "step": 29545 + }, + { + "epoch": 3.94, + "grad_norm": 0.65625, + "learning_rate": 1.2534194045034087e-07, + "loss": 0.5404, + "step": 29546 + }, + { + "epoch": 3.94, + "grad_norm": 0.52734375, + "learning_rate": 1.247597536825129e-07, + "loss": 0.1953, + "step": 29547 + }, + { + "epoch": 3.94, + "grad_norm": 0.4609375, + "learning_rate": 1.2417892128703257e-07, + "loss": 0.3016, + "step": 29548 + }, + { + "epoch": 3.94, + "grad_norm": 0.58203125, + "learning_rate": 1.2359944327178242e-07, + "loss": 0.1759, + "step": 29549 + }, + { + "epoch": 3.94, + "grad_norm": 0.6953125, + "learning_rate": 1.2302131964463391e-07, + "loss": 0.2243, + "step": 29550 + }, + { + "epoch": 3.94, + "grad_norm": 0.890625, + "learning_rate": 1.224445504134142e-07, + "loss": 0.1975, + "step": 29551 + }, + { + "epoch": 3.94, + "grad_norm": 0.77734375, + "learning_rate": 1.2186913558595027e-07, + "loss": 0.2292, + "step": 29552 + }, + { + "epoch": 3.94, + "grad_norm": 0.71484375, + "learning_rate": 1.2129507517003592e-07, + "loss": 0.3814, + "step": 29553 + }, + { + "epoch": 3.94, + "grad_norm": 0.71484375, + "learning_rate": 1.207223691734649e-07, + "loss": 0.287, + "step": 29554 + }, + { + "epoch": 3.94, + "grad_norm": 0.62890625, + "learning_rate": 1.201510176039977e-07, + "loss": 0.2855, + "step": 29555 + }, + { + "epoch": 3.94, + "grad_norm": 0.4375, + "learning_rate": 1.1958102046939478e-07, + "loss": 0.1852, + "step": 29556 + }, + { + "epoch": 3.94, + "grad_norm": 0.8984375, + "learning_rate": 1.1901237777737218e-07, + "loss": 0.6577, + "step": 29557 + }, + { + "epoch": 3.94, + "grad_norm": 0.71484375, + "learning_rate": 1.1844508953563483e-07, + "loss": 0.3673, + "step": 29558 + }, + { + "epoch": 3.94, + "grad_norm": 0.703125, + "learning_rate": 1.1787915575189879e-07, + "loss": 0.5512, + "step": 29559 + }, + { + "epoch": 3.94, + "grad_norm": 0.64453125, + "learning_rate": 1.173145764338246e-07, + "loss": 0.2814, + "step": 29560 + }, + { + "epoch": 3.94, + "grad_norm": 0.46875, + "learning_rate": 1.1675135158906169e-07, + "loss": 0.2268, + "step": 29561 + }, + { + "epoch": 3.94, + "grad_norm": 0.63671875, + "learning_rate": 1.1618948122525953e-07, + "loss": 0.2868, + "step": 29562 + }, + { + "epoch": 3.94, + "grad_norm": 0.6484375, + "learning_rate": 1.156289653500231e-07, + "loss": 0.6058, + "step": 29563 + }, + { + "epoch": 3.95, + "grad_norm": 0.470703125, + "learning_rate": 1.1506980397097966e-07, + "loss": 0.2768, + "step": 29564 + }, + { + "epoch": 3.95, + "grad_norm": 0.75, + "learning_rate": 1.1451199709567873e-07, + "loss": 0.3226, + "step": 29565 + }, + { + "epoch": 3.95, + "grad_norm": 0.474609375, + "learning_rate": 1.1395554473171422e-07, + "loss": 0.1689, + "step": 29566 + }, + { + "epoch": 3.95, + "grad_norm": 0.53125, + "learning_rate": 1.1340044688661345e-07, + "loss": 0.2825, + "step": 29567 + }, + { + "epoch": 3.95, + "grad_norm": 0.6796875, + "learning_rate": 1.1284670356790372e-07, + "loss": 0.2482, + "step": 29568 + }, + { + "epoch": 3.95, + "grad_norm": 0.4375, + "learning_rate": 1.1229431478311236e-07, + "loss": 0.1485, + "step": 29569 + }, + { + "epoch": 3.95, + "grad_norm": 0.81640625, + "learning_rate": 1.1174328053971118e-07, + "loss": 0.7024, + "step": 29570 + }, + { + "epoch": 3.95, + "grad_norm": 0.6171875, + "learning_rate": 1.1119360084518304e-07, + "loss": 0.4568, + "step": 29571 + }, + { + "epoch": 3.95, + "grad_norm": 0.63671875, + "learning_rate": 1.1064527570696648e-07, + "loss": 0.3311, + "step": 29572 + }, + { + "epoch": 3.95, + "grad_norm": 0.625, + "learning_rate": 1.1009830513252217e-07, + "loss": 0.4082, + "step": 29573 + }, + { + "epoch": 3.95, + "grad_norm": 0.59765625, + "learning_rate": 1.0955268912924422e-07, + "loss": 0.3704, + "step": 29574 + }, + { + "epoch": 3.95, + "grad_norm": 0.55859375, + "learning_rate": 1.0900842770454888e-07, + "loss": 0.2293, + "step": 29575 + }, + { + "epoch": 3.95, + "grad_norm": 0.578125, + "learning_rate": 1.0846552086579698e-07, + "loss": 0.4495, + "step": 29576 + }, + { + "epoch": 3.95, + "grad_norm": 0.66796875, + "learning_rate": 1.0792396862037147e-07, + "loss": 0.2317, + "step": 29577 + }, + { + "epoch": 3.95, + "grad_norm": 0.57421875, + "learning_rate": 1.0738377097561092e-07, + "loss": 0.1718, + "step": 29578 + }, + { + "epoch": 3.95, + "grad_norm": 0.640625, + "learning_rate": 1.0684492793883172e-07, + "loss": 0.2168, + "step": 29579 + }, + { + "epoch": 3.95, + "grad_norm": 0.5703125, + "learning_rate": 1.0630743951733913e-07, + "loss": 0.2805, + "step": 29580 + }, + { + "epoch": 3.95, + "grad_norm": 0.64453125, + "learning_rate": 1.0577130571843841e-07, + "loss": 0.2767, + "step": 29581 + }, + { + "epoch": 3.95, + "grad_norm": 0.79296875, + "learning_rate": 1.0523652654939042e-07, + "loss": 0.3372, + "step": 29582 + }, + { + "epoch": 3.95, + "grad_norm": 0.48046875, + "learning_rate": 1.0470310201744493e-07, + "loss": 0.162, + "step": 29583 + }, + { + "epoch": 3.95, + "grad_norm": 0.6484375, + "learning_rate": 1.0417103212984058e-07, + "loss": 0.3229, + "step": 29584 + }, + { + "epoch": 3.95, + "grad_norm": 0.71484375, + "learning_rate": 1.0364031689378274e-07, + "loss": 0.4239, + "step": 29585 + }, + { + "epoch": 3.95, + "grad_norm": 0.55078125, + "learning_rate": 1.0311095631648781e-07, + "loss": 0.1975, + "step": 29586 + }, + { + "epoch": 3.95, + "grad_norm": 0.546875, + "learning_rate": 1.0258295040510569e-07, + "loss": 0.4182, + "step": 29587 + }, + { + "epoch": 3.95, + "grad_norm": 0.66015625, + "learning_rate": 1.0205629916681947e-07, + "loss": 0.256, + "step": 29588 + }, + { + "epoch": 3.95, + "grad_norm": 0.5, + "learning_rate": 1.015310026087679e-07, + "loss": 0.2757, + "step": 29589 + }, + { + "epoch": 3.95, + "grad_norm": 0.470703125, + "learning_rate": 1.0100706073806754e-07, + "loss": 0.1194, + "step": 29590 + }, + { + "epoch": 3.95, + "grad_norm": 0.65625, + "learning_rate": 1.0048447356182378e-07, + "loss": 0.4026, + "step": 29591 + }, + { + "epoch": 3.95, + "grad_norm": 0.7734375, + "learning_rate": 9.996324108713095e-08, + "loss": 0.3347, + "step": 29592 + }, + { + "epoch": 3.95, + "grad_norm": 0.494140625, + "learning_rate": 9.944336332105008e-08, + "loss": 0.2676, + "step": 29593 + }, + { + "epoch": 3.95, + "grad_norm": 0.71484375, + "learning_rate": 9.892484027063109e-08, + "loss": 0.2111, + "step": 29594 + }, + { + "epoch": 3.95, + "grad_norm": 0.5546875, + "learning_rate": 9.840767194291278e-08, + "loss": 0.5988, + "step": 29595 + }, + { + "epoch": 3.95, + "grad_norm": 0.70703125, + "learning_rate": 9.789185834490066e-08, + "loss": 0.442, + "step": 29596 + }, + { + "epoch": 3.95, + "grad_norm": 0.53515625, + "learning_rate": 9.737739948357805e-08, + "loss": 0.2241, + "step": 29597 + }, + { + "epoch": 3.95, + "grad_norm": 0.7890625, + "learning_rate": 9.686429536595043e-08, + "loss": 0.3638, + "step": 29598 + }, + { + "epoch": 3.95, + "grad_norm": 0.6796875, + "learning_rate": 9.635254599895672e-08, + "loss": 0.3438, + "step": 29599 + }, + { + "epoch": 3.95, + "grad_norm": 0.63671875, + "learning_rate": 9.584215138953579e-08, + "loss": 0.475, + "step": 29600 + }, + { + "epoch": 3.95, + "grad_norm": 0.6015625, + "learning_rate": 9.533311154461544e-08, + "loss": 0.2596, + "step": 29601 + }, + { + "epoch": 3.95, + "grad_norm": 0.625, + "learning_rate": 9.482542647110126e-08, + "loss": 0.2909, + "step": 29602 + }, + { + "epoch": 3.95, + "grad_norm": 0.5625, + "learning_rate": 9.431909617586554e-08, + "loss": 0.2821, + "step": 29603 + }, + { + "epoch": 3.95, + "grad_norm": 0.9453125, + "learning_rate": 9.381412066579165e-08, + "loss": 0.2161, + "step": 29604 + }, + { + "epoch": 3.95, + "grad_norm": 0.53125, + "learning_rate": 9.331049994770746e-08, + "loss": 0.3106, + "step": 29605 + }, + { + "epoch": 3.95, + "grad_norm": 0.93359375, + "learning_rate": 9.280823402846305e-08, + "loss": 0.4527, + "step": 29606 + }, + { + "epoch": 3.95, + "grad_norm": 0.6640625, + "learning_rate": 9.230732291485301e-08, + "loss": 0.4669, + "step": 29607 + }, + { + "epoch": 3.95, + "grad_norm": 0.71484375, + "learning_rate": 9.180776661367186e-08, + "loss": 0.4046, + "step": 29608 + }, + { + "epoch": 3.95, + "grad_norm": 0.59765625, + "learning_rate": 9.130956513170307e-08, + "loss": 0.3295, + "step": 29609 + }, + { + "epoch": 3.95, + "grad_norm": 0.6953125, + "learning_rate": 9.081271847569683e-08, + "loss": 0.4248, + "step": 29610 + }, + { + "epoch": 3.95, + "grad_norm": 0.60546875, + "learning_rate": 9.031722665239217e-08, + "loss": 0.3839, + "step": 29611 + }, + { + "epoch": 3.95, + "grad_norm": 0.73828125, + "learning_rate": 8.982308966850595e-08, + "loss": 0.2491, + "step": 29612 + }, + { + "epoch": 3.95, + "grad_norm": 0.6015625, + "learning_rate": 8.933030753074389e-08, + "loss": 0.3382, + "step": 29613 + }, + { + "epoch": 3.95, + "grad_norm": 0.89453125, + "learning_rate": 8.883888024577846e-08, + "loss": 0.3191, + "step": 29614 + }, + { + "epoch": 3.95, + "grad_norm": 0.84765625, + "learning_rate": 8.83488078202932e-08, + "loss": 0.4308, + "step": 29615 + }, + { + "epoch": 3.95, + "grad_norm": 0.76171875, + "learning_rate": 8.786009026090502e-08, + "loss": 0.4039, + "step": 29616 + }, + { + "epoch": 3.95, + "grad_norm": 0.6328125, + "learning_rate": 8.737272757426418e-08, + "loss": 0.483, + "step": 29617 + }, + { + "epoch": 3.95, + "grad_norm": 0.52734375, + "learning_rate": 8.688671976697649e-08, + "loss": 0.1775, + "step": 29618 + }, + { + "epoch": 3.95, + "grad_norm": 0.5078125, + "learning_rate": 8.640206684562557e-08, + "loss": 0.2737, + "step": 29619 + }, + { + "epoch": 3.95, + "grad_norm": 0.57421875, + "learning_rate": 8.591876881678395e-08, + "loss": 0.2437, + "step": 29620 + }, + { + "epoch": 3.95, + "grad_norm": 0.51171875, + "learning_rate": 8.543682568701306e-08, + "loss": 0.2559, + "step": 29621 + }, + { + "epoch": 3.95, + "grad_norm": 0.62890625, + "learning_rate": 8.495623746284098e-08, + "loss": 0.5838, + "step": 29622 + }, + { + "epoch": 3.95, + "grad_norm": 0.6015625, + "learning_rate": 8.447700415078474e-08, + "loss": 0.2418, + "step": 29623 + }, + { + "epoch": 3.95, + "grad_norm": 0.671875, + "learning_rate": 8.399912575735025e-08, + "loss": 0.5089, + "step": 29624 + }, + { + "epoch": 3.95, + "grad_norm": 0.5546875, + "learning_rate": 8.35226022890101e-08, + "loss": 0.3322, + "step": 29625 + }, + { + "epoch": 3.95, + "grad_norm": 0.734375, + "learning_rate": 8.304743375222579e-08, + "loss": 0.2437, + "step": 29626 + }, + { + "epoch": 3.95, + "grad_norm": 0.58203125, + "learning_rate": 8.257362015345882e-08, + "loss": 0.2996, + "step": 29627 + }, + { + "epoch": 3.95, + "grad_norm": 0.80859375, + "learning_rate": 8.210116149910407e-08, + "loss": 0.358, + "step": 29628 + }, + { + "epoch": 3.95, + "grad_norm": 0.68359375, + "learning_rate": 8.163005779558974e-08, + "loss": 0.2654, + "step": 29629 + }, + { + "epoch": 3.95, + "grad_norm": 0.578125, + "learning_rate": 8.116030904931071e-08, + "loss": 0.3047, + "step": 29630 + }, + { + "epoch": 3.95, + "grad_norm": 0.5546875, + "learning_rate": 8.069191526660636e-08, + "loss": 0.1887, + "step": 29631 + }, + { + "epoch": 3.95, + "grad_norm": 0.51171875, + "learning_rate": 8.022487645386046e-08, + "loss": 0.1344, + "step": 29632 + }, + { + "epoch": 3.95, + "grad_norm": 0.57421875, + "learning_rate": 7.975919261739018e-08, + "loss": 0.521, + "step": 29633 + }, + { + "epoch": 3.95, + "grad_norm": 0.7421875, + "learning_rate": 7.92948637635127e-08, + "loss": 0.3384, + "step": 29634 + }, + { + "epoch": 3.95, + "grad_norm": 0.5859375, + "learning_rate": 7.883188989852298e-08, + "loss": 0.2646, + "step": 29635 + }, + { + "epoch": 3.95, + "grad_norm": 0.6328125, + "learning_rate": 7.837027102870487e-08, + "loss": 0.3407, + "step": 29636 + }, + { + "epoch": 3.95, + "grad_norm": 0.5546875, + "learning_rate": 7.791000716030894e-08, + "loss": 0.269, + "step": 29637 + }, + { + "epoch": 3.95, + "grad_norm": 0.6640625, + "learning_rate": 7.745109829959685e-08, + "loss": 0.4491, + "step": 29638 + }, + { + "epoch": 3.96, + "grad_norm": 0.64453125, + "learning_rate": 7.699354445276363e-08, + "loss": 0.2709, + "step": 29639 + }, + { + "epoch": 3.96, + "grad_norm": 0.52734375, + "learning_rate": 7.653734562603764e-08, + "loss": 0.2303, + "step": 29640 + }, + { + "epoch": 3.96, + "grad_norm": 0.73046875, + "learning_rate": 7.608250182559173e-08, + "loss": 0.3335, + "step": 29641 + }, + { + "epoch": 3.96, + "grad_norm": 0.51171875, + "learning_rate": 7.562901305759873e-08, + "loss": 0.1531, + "step": 29642 + }, + { + "epoch": 3.96, + "grad_norm": 0.71484375, + "learning_rate": 7.517687932819817e-08, + "loss": 0.4878, + "step": 29643 + }, + { + "epoch": 3.96, + "grad_norm": 0.54296875, + "learning_rate": 7.472610064354069e-08, + "loss": 0.3325, + "step": 29644 + }, + { + "epoch": 3.96, + "grad_norm": 0.5703125, + "learning_rate": 7.42766770097325e-08, + "loss": 0.3029, + "step": 29645 + }, + { + "epoch": 3.96, + "grad_norm": 0.6875, + "learning_rate": 7.382860843285766e-08, + "loss": 0.4432, + "step": 29646 + }, + { + "epoch": 3.96, + "grad_norm": 0.578125, + "learning_rate": 7.338189491900016e-08, + "loss": 0.5357, + "step": 29647 + }, + { + "epoch": 3.96, + "grad_norm": 0.60546875, + "learning_rate": 7.293653647421073e-08, + "loss": 0.3868, + "step": 29648 + }, + { + "epoch": 3.96, + "grad_norm": 0.63671875, + "learning_rate": 7.249253310454008e-08, + "loss": 0.3494, + "step": 29649 + }, + { + "epoch": 3.96, + "grad_norm": 0.62109375, + "learning_rate": 7.20498848160056e-08, + "loss": 0.5507, + "step": 29650 + }, + { + "epoch": 3.96, + "grad_norm": 0.57421875, + "learning_rate": 7.160859161461365e-08, + "loss": 0.2647, + "step": 29651 + }, + { + "epoch": 3.96, + "grad_norm": 0.455078125, + "learning_rate": 7.116865350633716e-08, + "loss": 0.261, + "step": 29652 + }, + { + "epoch": 3.96, + "grad_norm": 0.609375, + "learning_rate": 7.073007049713809e-08, + "loss": 0.3212, + "step": 29653 + }, + { + "epoch": 3.96, + "grad_norm": 0.6640625, + "learning_rate": 7.029284259298941e-08, + "loss": 0.3666, + "step": 29654 + }, + { + "epoch": 3.96, + "grad_norm": 0.78125, + "learning_rate": 6.98569697997975e-08, + "loss": 0.4758, + "step": 29655 + }, + { + "epoch": 3.96, + "grad_norm": 0.84765625, + "learning_rate": 6.942245212346876e-08, + "loss": 0.3986, + "step": 29656 + }, + { + "epoch": 3.96, + "grad_norm": 0.5390625, + "learning_rate": 6.898928956992068e-08, + "loss": 0.2412, + "step": 29657 + }, + { + "epoch": 3.96, + "grad_norm": 0.46484375, + "learning_rate": 6.855748214500412e-08, + "loss": 0.2599, + "step": 29658 + }, + { + "epoch": 3.96, + "grad_norm": 0.71484375, + "learning_rate": 6.812702985459218e-08, + "loss": 0.3639, + "step": 29659 + }, + { + "epoch": 3.96, + "grad_norm": 0.70703125, + "learning_rate": 6.769793270451352e-08, + "loss": 0.3061, + "step": 29660 + }, + { + "epoch": 3.96, + "grad_norm": 0.62109375, + "learning_rate": 6.727019070058571e-08, + "loss": 0.3234, + "step": 29661 + }, + { + "epoch": 3.96, + "grad_norm": 0.455078125, + "learning_rate": 6.684380384860411e-08, + "loss": 0.2098, + "step": 29662 + }, + { + "epoch": 3.96, + "grad_norm": 0.59375, + "learning_rate": 6.641877215436409e-08, + "loss": 0.2665, + "step": 29663 + }, + { + "epoch": 3.96, + "grad_norm": 0.6328125, + "learning_rate": 6.599509562361662e-08, + "loss": 0.3126, + "step": 29664 + }, + { + "epoch": 3.96, + "grad_norm": 0.57421875, + "learning_rate": 6.557277426211261e-08, + "loss": 0.3172, + "step": 29665 + }, + { + "epoch": 3.96, + "grad_norm": 0.62890625, + "learning_rate": 6.515180807558086e-08, + "loss": 0.1932, + "step": 29666 + }, + { + "epoch": 3.96, + "grad_norm": 0.5625, + "learning_rate": 6.473219706973899e-08, + "loss": 0.168, + "step": 29667 + }, + { + "epoch": 3.96, + "grad_norm": 0.5390625, + "learning_rate": 6.431394125024915e-08, + "loss": 0.1756, + "step": 29668 + }, + { + "epoch": 3.96, + "grad_norm": 0.451171875, + "learning_rate": 6.389704062281787e-08, + "loss": 0.3017, + "step": 29669 + }, + { + "epoch": 3.96, + "grad_norm": 0.6484375, + "learning_rate": 6.348149519307401e-08, + "loss": 0.3116, + "step": 29670 + }, + { + "epoch": 3.96, + "grad_norm": 0.765625, + "learning_rate": 6.306730496665747e-08, + "loss": 0.3791, + "step": 29671 + }, + { + "epoch": 3.96, + "grad_norm": 0.69140625, + "learning_rate": 6.2654469949186e-08, + "loss": 0.2895, + "step": 29672 + }, + { + "epoch": 3.96, + "grad_norm": 0.6953125, + "learning_rate": 6.224299014625511e-08, + "loss": 0.3062, + "step": 29673 + }, + { + "epoch": 3.96, + "grad_norm": 0.69921875, + "learning_rate": 6.183286556346035e-08, + "loss": 0.2601, + "step": 29674 + }, + { + "epoch": 3.96, + "grad_norm": 0.52734375, + "learning_rate": 6.142409620634171e-08, + "loss": 0.2371, + "step": 29675 + }, + { + "epoch": 3.96, + "grad_norm": 0.77734375, + "learning_rate": 6.10166820804614e-08, + "loss": 0.3065, + "step": 29676 + }, + { + "epoch": 3.96, + "grad_norm": 0.53125, + "learning_rate": 6.061062319133726e-08, + "loss": 0.1112, + "step": 29677 + }, + { + "epoch": 3.96, + "grad_norm": 0.94140625, + "learning_rate": 6.020591954446486e-08, + "loss": 0.5208, + "step": 29678 + }, + { + "epoch": 3.96, + "grad_norm": 0.455078125, + "learning_rate": 5.980257114533983e-08, + "loss": 0.2858, + "step": 29679 + }, + { + "epoch": 3.96, + "grad_norm": 0.8203125, + "learning_rate": 5.9400577999435547e-08, + "loss": 0.3101, + "step": 29680 + }, + { + "epoch": 3.96, + "grad_norm": 0.66796875, + "learning_rate": 5.8999940112203225e-08, + "loss": 0.2531, + "step": 29681 + }, + { + "epoch": 3.96, + "grad_norm": 0.58984375, + "learning_rate": 5.860065748908294e-08, + "loss": 0.2432, + "step": 29682 + }, + { + "epoch": 3.96, + "grad_norm": 0.5234375, + "learning_rate": 5.820273013545929e-08, + "loss": 0.2468, + "step": 29683 + }, + { + "epoch": 3.96, + "grad_norm": 0.5390625, + "learning_rate": 5.780615805676126e-08, + "loss": 0.1741, + "step": 29684 + }, + { + "epoch": 3.96, + "grad_norm": 0.91015625, + "learning_rate": 5.7410941258351224e-08, + "loss": 0.3977, + "step": 29685 + }, + { + "epoch": 3.96, + "grad_norm": 0.52734375, + "learning_rate": 5.701707974559156e-08, + "loss": 0.1032, + "step": 29686 + }, + { + "epoch": 3.96, + "grad_norm": 0.49609375, + "learning_rate": 5.662457352383355e-08, + "loss": 0.3408, + "step": 29687 + }, + { + "epoch": 3.96, + "grad_norm": 0.65625, + "learning_rate": 5.623342259837294e-08, + "loss": 0.3208, + "step": 29688 + }, + { + "epoch": 3.96, + "grad_norm": 0.54296875, + "learning_rate": 5.584362697453882e-08, + "loss": 0.2684, + "step": 29689 + }, + { + "epoch": 3.96, + "grad_norm": 0.7734375, + "learning_rate": 5.545518665760474e-08, + "loss": 0.6114, + "step": 29690 + }, + { + "epoch": 3.96, + "grad_norm": 0.6484375, + "learning_rate": 5.506810165284426e-08, + "loss": 0.3013, + "step": 29691 + }, + { + "epoch": 3.96, + "grad_norm": 0.6796875, + "learning_rate": 5.468237196550874e-08, + "loss": 0.5106, + "step": 29692 + }, + { + "epoch": 3.96, + "grad_norm": 0.6875, + "learning_rate": 5.429799760082732e-08, + "loss": 0.3027, + "step": 29693 + }, + { + "epoch": 3.96, + "grad_norm": 0.671875, + "learning_rate": 5.391497856399585e-08, + "loss": 0.3049, + "step": 29694 + }, + { + "epoch": 3.96, + "grad_norm": 0.51953125, + "learning_rate": 5.353331486023239e-08, + "loss": 0.1706, + "step": 29695 + }, + { + "epoch": 3.96, + "grad_norm": 0.5546875, + "learning_rate": 5.3153006494699456e-08, + "loss": 0.3218, + "step": 29696 + }, + { + "epoch": 3.96, + "grad_norm": 0.59375, + "learning_rate": 5.2774053472559596e-08, + "loss": 0.3124, + "step": 29697 + }, + { + "epoch": 3.96, + "grad_norm": 0.7421875, + "learning_rate": 5.239645579895314e-08, + "loss": 0.5213, + "step": 29698 + }, + { + "epoch": 3.96, + "grad_norm": 0.61328125, + "learning_rate": 5.202021347898711e-08, + "loss": 0.3225, + "step": 29699 + }, + { + "epoch": 3.96, + "grad_norm": 0.466796875, + "learning_rate": 5.164532651777965e-08, + "loss": 0.3222, + "step": 29700 + }, + { + "epoch": 3.96, + "grad_norm": 0.8125, + "learning_rate": 5.127179492040446e-08, + "loss": 0.3959, + "step": 29701 + }, + { + "epoch": 3.96, + "grad_norm": 0.65234375, + "learning_rate": 5.089961869193527e-08, + "loss": 0.291, + "step": 29702 + }, + { + "epoch": 3.96, + "grad_norm": 0.6953125, + "learning_rate": 5.0528797837412486e-08, + "loss": 0.2184, + "step": 29703 + }, + { + "epoch": 3.96, + "grad_norm": 0.9375, + "learning_rate": 5.015933236186543e-08, + "loss": 0.4279, + "step": 29704 + }, + { + "epoch": 3.96, + "grad_norm": 0.61328125, + "learning_rate": 4.9791222270301194e-08, + "loss": 0.2121, + "step": 29705 + }, + { + "epoch": 3.96, + "grad_norm": 0.8203125, + "learning_rate": 4.942446756771579e-08, + "loss": 0.3479, + "step": 29706 + }, + { + "epoch": 3.96, + "grad_norm": 0.52734375, + "learning_rate": 4.905906825908302e-08, + "loss": 0.2412, + "step": 29707 + }, + { + "epoch": 3.96, + "grad_norm": 0.52734375, + "learning_rate": 4.869502434936557e-08, + "loss": 0.3584, + "step": 29708 + }, + { + "epoch": 3.96, + "grad_norm": 0.59375, + "learning_rate": 4.833233584348174e-08, + "loss": 0.2912, + "step": 29709 + }, + { + "epoch": 3.96, + "grad_norm": 0.51171875, + "learning_rate": 4.797100274637201e-08, + "loss": 0.4694, + "step": 29710 + }, + { + "epoch": 3.96, + "grad_norm": 0.63671875, + "learning_rate": 4.761102506291026e-08, + "loss": 0.5375, + "step": 29711 + }, + { + "epoch": 3.96, + "grad_norm": 0.54296875, + "learning_rate": 4.725240279800369e-08, + "loss": 0.4253, + "step": 29712 + }, + { + "epoch": 3.96, + "grad_norm": 0.453125, + "learning_rate": 4.689513595649286e-08, + "loss": 0.1976, + "step": 29713 + }, + { + "epoch": 3.97, + "grad_norm": 0.64453125, + "learning_rate": 4.6539224543240553e-08, + "loss": 0.384, + "step": 29714 + }, + { + "epoch": 3.97, + "grad_norm": 0.765625, + "learning_rate": 4.618466856306514e-08, + "loss": 0.3013, + "step": 29715 + }, + { + "epoch": 3.97, + "grad_norm": 0.6484375, + "learning_rate": 4.583146802077387e-08, + "loss": 0.5278, + "step": 29716 + }, + { + "epoch": 3.97, + "grad_norm": 0.87890625, + "learning_rate": 4.547962292116292e-08, + "loss": 0.4273, + "step": 29717 + }, + { + "epoch": 3.97, + "grad_norm": 0.51171875, + "learning_rate": 4.512913326899515e-08, + "loss": 0.3084, + "step": 29718 + }, + { + "epoch": 3.97, + "grad_norm": 0.462890625, + "learning_rate": 4.4779999069022304e-08, + "loss": 0.1753, + "step": 29719 + }, + { + "epoch": 3.97, + "grad_norm": 0.63671875, + "learning_rate": 4.443222032598504e-08, + "loss": 0.2143, + "step": 29720 + }, + { + "epoch": 3.97, + "grad_norm": 0.52734375, + "learning_rate": 4.4085797044601804e-08, + "loss": 0.1665, + "step": 29721 + }, + { + "epoch": 3.97, + "grad_norm": 0.68359375, + "learning_rate": 4.374072922956884e-08, + "loss": 0.3078, + "step": 29722 + }, + { + "epoch": 3.97, + "grad_norm": 0.578125, + "learning_rate": 4.3397016885560194e-08, + "loss": 0.2492, + "step": 29723 + }, + { + "epoch": 3.97, + "grad_norm": 0.5703125, + "learning_rate": 4.305466001723879e-08, + "loss": 0.3408, + "step": 29724 + }, + { + "epoch": 3.97, + "grad_norm": 0.83984375, + "learning_rate": 4.271365862924537e-08, + "loss": 0.4053, + "step": 29725 + }, + { + "epoch": 3.97, + "grad_norm": 0.48046875, + "learning_rate": 4.237401272620956e-08, + "loss": 0.1622, + "step": 29726 + }, + { + "epoch": 3.97, + "grad_norm": 0.5625, + "learning_rate": 4.2035722312738776e-08, + "loss": 0.3209, + "step": 29727 + }, + { + "epoch": 3.97, + "grad_norm": 0.671875, + "learning_rate": 4.1698787393407156e-08, + "loss": 0.2511, + "step": 29728 + }, + { + "epoch": 3.97, + "grad_norm": 0.57421875, + "learning_rate": 4.136320797279991e-08, + "loss": 0.3988, + "step": 29729 + }, + { + "epoch": 3.97, + "grad_norm": 0.59375, + "learning_rate": 4.102898405545785e-08, + "loss": 0.2231, + "step": 29730 + }, + { + "epoch": 3.97, + "grad_norm": 0.5625, + "learning_rate": 4.06961156459218e-08, + "loss": 0.2644, + "step": 29731 + }, + { + "epoch": 3.97, + "grad_norm": 0.72265625, + "learning_rate": 4.0364602748688144e-08, + "loss": 0.5, + "step": 29732 + }, + { + "epoch": 3.97, + "grad_norm": 0.640625, + "learning_rate": 4.00344453682755e-08, + "loss": 0.2192, + "step": 29733 + }, + { + "epoch": 3.97, + "grad_norm": 0.3515625, + "learning_rate": 3.970564350914696e-08, + "loss": 0.0728, + "step": 29734 + }, + { + "epoch": 3.97, + "grad_norm": 0.73828125, + "learning_rate": 3.937819717575453e-08, + "loss": 0.3435, + "step": 29735 + }, + { + "epoch": 3.97, + "grad_norm": 0.734375, + "learning_rate": 3.905210637256129e-08, + "loss": 0.3782, + "step": 29736 + }, + { + "epoch": 3.97, + "grad_norm": 0.65625, + "learning_rate": 3.872737110397484e-08, + "loss": 0.2779, + "step": 29737 + }, + { + "epoch": 3.97, + "grad_norm": 0.62109375, + "learning_rate": 3.840399137439166e-08, + "loss": 0.4512, + "step": 29738 + }, + { + "epoch": 3.97, + "grad_norm": 0.66796875, + "learning_rate": 3.808196718820822e-08, + "loss": 0.3047, + "step": 29739 + }, + { + "epoch": 3.97, + "grad_norm": 0.65625, + "learning_rate": 3.776129854978772e-08, + "loss": 0.451, + "step": 29740 + }, + { + "epoch": 3.97, + "grad_norm": 0.51953125, + "learning_rate": 3.744198546348221e-08, + "loss": 0.2252, + "step": 29741 + }, + { + "epoch": 3.97, + "grad_norm": 0.640625, + "learning_rate": 3.712402793362157e-08, + "loss": 0.2009, + "step": 29742 + }, + { + "epoch": 3.97, + "grad_norm": 0.451171875, + "learning_rate": 3.6807425964502375e-08, + "loss": 0.2697, + "step": 29743 + }, + { + "epoch": 3.97, + "grad_norm": 0.84375, + "learning_rate": 3.6492179560443374e-08, + "loss": 0.4732, + "step": 29744 + }, + { + "epoch": 3.97, + "grad_norm": 0.515625, + "learning_rate": 3.617828872569673e-08, + "loss": 0.3885, + "step": 29745 + }, + { + "epoch": 3.97, + "grad_norm": 0.76171875, + "learning_rate": 3.58657534645368e-08, + "loss": 0.6094, + "step": 29746 + }, + { + "epoch": 3.97, + "grad_norm": 0.515625, + "learning_rate": 3.5554573781193535e-08, + "loss": 0.2884, + "step": 29747 + }, + { + "epoch": 3.97, + "grad_norm": 0.55859375, + "learning_rate": 3.524474967988578e-08, + "loss": 0.333, + "step": 29748 + }, + { + "epoch": 3.97, + "grad_norm": 0.470703125, + "learning_rate": 3.493628116481018e-08, + "loss": 0.2859, + "step": 29749 + }, + { + "epoch": 3.97, + "grad_norm": 0.734375, + "learning_rate": 3.462916824016338e-08, + "loss": 0.4271, + "step": 29750 + }, + { + "epoch": 3.97, + "grad_norm": 0.6171875, + "learning_rate": 3.432341091009761e-08, + "loss": 0.2863, + "step": 29751 + }, + { + "epoch": 3.97, + "grad_norm": 0.55078125, + "learning_rate": 3.401900917876511e-08, + "loss": 0.3143, + "step": 29752 + }, + { + "epoch": 3.97, + "grad_norm": 0.41796875, + "learning_rate": 3.37159630502959e-08, + "loss": 0.1082, + "step": 29753 + }, + { + "epoch": 3.97, + "grad_norm": 0.53125, + "learning_rate": 3.341427252879781e-08, + "loss": 0.2523, + "step": 29754 + }, + { + "epoch": 3.97, + "grad_norm": 0.6171875, + "learning_rate": 3.311393761835646e-08, + "loss": 0.1343, + "step": 29755 + }, + { + "epoch": 3.97, + "grad_norm": 0.67578125, + "learning_rate": 3.2814958323046375e-08, + "loss": 0.3045, + "step": 29756 + }, + { + "epoch": 3.97, + "grad_norm": 0.6171875, + "learning_rate": 3.251733464691986e-08, + "loss": 0.4019, + "step": 29757 + }, + { + "epoch": 3.97, + "grad_norm": 0.482421875, + "learning_rate": 3.222106659402924e-08, + "loss": 0.2253, + "step": 29758 + }, + { + "epoch": 3.97, + "grad_norm": 0.458984375, + "learning_rate": 3.1926154168371304e-08, + "loss": 0.1373, + "step": 29759 + }, + { + "epoch": 3.97, + "grad_norm": 0.6484375, + "learning_rate": 3.1632597373953965e-08, + "loss": 0.3145, + "step": 29760 + }, + { + "epoch": 3.97, + "grad_norm": 0.859375, + "learning_rate": 3.134039621476292e-08, + "loss": 0.6405, + "step": 29761 + }, + { + "epoch": 3.97, + "grad_norm": 0.68359375, + "learning_rate": 3.104955069475057e-08, + "loss": 0.18, + "step": 29762 + }, + { + "epoch": 3.97, + "grad_norm": 0.60546875, + "learning_rate": 3.076006081786931e-08, + "loss": 0.3947, + "step": 29763 + }, + { + "epoch": 3.97, + "grad_norm": 0.578125, + "learning_rate": 3.047192658803821e-08, + "loss": 0.3347, + "step": 29764 + }, + { + "epoch": 3.97, + "grad_norm": 0.490234375, + "learning_rate": 3.0185148009165276e-08, + "loss": 0.1684, + "step": 29765 + }, + { + "epoch": 3.97, + "grad_norm": 0.6328125, + "learning_rate": 2.9899725085147376e-08, + "loss": 0.2957, + "step": 29766 + }, + { + "epoch": 3.97, + "grad_norm": 0.52734375, + "learning_rate": 2.9615657819848098e-08, + "loss": 0.3118, + "step": 29767 + }, + { + "epoch": 3.97, + "grad_norm": 0.80078125, + "learning_rate": 2.9332946217119907e-08, + "loss": 0.3903, + "step": 29768 + }, + { + "epoch": 3.97, + "grad_norm": 0.671875, + "learning_rate": 2.9051590280804176e-08, + "loss": 0.2135, + "step": 29769 + }, + { + "epoch": 3.97, + "grad_norm": 0.66796875, + "learning_rate": 2.877159001470897e-08, + "loss": 0.2813, + "step": 29770 + }, + { + "epoch": 3.97, + "grad_norm": 0.62109375, + "learning_rate": 2.8492945422620155e-08, + "loss": 0.236, + "step": 29771 + }, + { + "epoch": 3.97, + "grad_norm": 0.5859375, + "learning_rate": 2.8215656508334686e-08, + "loss": 0.2112, + "step": 29772 + }, + { + "epoch": 3.97, + "grad_norm": 0.546875, + "learning_rate": 2.7939723275616225e-08, + "loss": 0.2502, + "step": 29773 + }, + { + "epoch": 3.97, + "grad_norm": 0.6484375, + "learning_rate": 2.7665145728184017e-08, + "loss": 0.3115, + "step": 29774 + }, + { + "epoch": 3.97, + "grad_norm": 0.7109375, + "learning_rate": 2.739192386977951e-08, + "loss": 0.3066, + "step": 29775 + }, + { + "epoch": 3.97, + "grad_norm": 0.609375, + "learning_rate": 2.7120057704099756e-08, + "loss": 0.4249, + "step": 29776 + }, + { + "epoch": 3.97, + "grad_norm": 0.640625, + "learning_rate": 2.6849547234830686e-08, + "loss": 0.4232, + "step": 29777 + }, + { + "epoch": 3.97, + "grad_norm": 0.765625, + "learning_rate": 2.6580392465658245e-08, + "loss": 0.4657, + "step": 29778 + }, + { + "epoch": 3.97, + "grad_norm": 0.5546875, + "learning_rate": 2.6312593400201757e-08, + "loss": 0.1625, + "step": 29779 + }, + { + "epoch": 3.97, + "grad_norm": 0.5703125, + "learning_rate": 2.6046150042124962e-08, + "loss": 0.3891, + "step": 29780 + }, + { + "epoch": 3.97, + "grad_norm": 0.59765625, + "learning_rate": 2.578106239501388e-08, + "loss": 0.3645, + "step": 29781 + }, + { + "epoch": 3.97, + "grad_norm": 0.65234375, + "learning_rate": 2.551733046247673e-08, + "loss": 0.3042, + "step": 29782 + }, + { + "epoch": 3.97, + "grad_norm": 0.66796875, + "learning_rate": 2.5254954248088435e-08, + "loss": 0.4109, + "step": 29783 + }, + { + "epoch": 3.97, + "grad_norm": 0.5, + "learning_rate": 2.499393375540171e-08, + "loss": 0.1909, + "step": 29784 + }, + { + "epoch": 3.97, + "grad_norm": 0.55859375, + "learning_rate": 2.4734268987969268e-08, + "loss": 0.325, + "step": 29785 + }, + { + "epoch": 3.97, + "grad_norm": 0.50390625, + "learning_rate": 2.447595994929941e-08, + "loss": 0.2512, + "step": 29786 + }, + { + "epoch": 3.97, + "grad_norm": 0.6796875, + "learning_rate": 2.4219006642900444e-08, + "loss": 0.338, + "step": 29787 + }, + { + "epoch": 3.97, + "grad_norm": 0.5703125, + "learning_rate": 2.396340907225847e-08, + "loss": 0.152, + "step": 29788 + }, + { + "epoch": 3.98, + "grad_norm": 0.53515625, + "learning_rate": 2.3709167240837382e-08, + "loss": 0.2932, + "step": 29789 + }, + { + "epoch": 3.98, + "grad_norm": 0.53515625, + "learning_rate": 2.3456281152078875e-08, + "loss": 0.5433, + "step": 29790 + }, + { + "epoch": 3.98, + "grad_norm": 0.703125, + "learning_rate": 2.3204750809413532e-08, + "loss": 0.5318, + "step": 29791 + }, + { + "epoch": 3.98, + "grad_norm": 0.58984375, + "learning_rate": 2.295457621626085e-08, + "loss": 0.5037, + "step": 29792 + }, + { + "epoch": 3.98, + "grad_norm": 0.8203125, + "learning_rate": 2.27057573760181e-08, + "loss": 0.4271, + "step": 29793 + }, + { + "epoch": 3.98, + "grad_norm": 0.80859375, + "learning_rate": 2.2458294292038163e-08, + "loss": 0.4318, + "step": 29794 + }, + { + "epoch": 3.98, + "grad_norm": 0.734375, + "learning_rate": 2.2212186967696113e-08, + "loss": 0.4805, + "step": 29795 + }, + { + "epoch": 3.98, + "grad_norm": 0.59375, + "learning_rate": 2.1967435406322622e-08, + "loss": 0.2589, + "step": 29796 + }, + { + "epoch": 3.98, + "grad_norm": 0.52734375, + "learning_rate": 2.1724039611226154e-08, + "loss": 0.1453, + "step": 29797 + }, + { + "epoch": 3.98, + "grad_norm": 0.62109375, + "learning_rate": 2.1481999585726277e-08, + "loss": 0.4716, + "step": 29798 + }, + { + "epoch": 3.98, + "grad_norm": 0.82421875, + "learning_rate": 2.1241315333098144e-08, + "loss": 0.2789, + "step": 29799 + }, + { + "epoch": 3.98, + "grad_norm": 0.4609375, + "learning_rate": 2.1001986856594712e-08, + "loss": 0.183, + "step": 29800 + }, + { + "epoch": 3.98, + "grad_norm": 0.57421875, + "learning_rate": 2.076401415948004e-08, + "loss": 0.2627, + "step": 29801 + }, + { + "epoch": 3.98, + "grad_norm": 0.4921875, + "learning_rate": 2.0527397244962666e-08, + "loss": 0.2514, + "step": 29802 + }, + { + "epoch": 3.98, + "grad_norm": 0.7734375, + "learning_rate": 2.0292136116262238e-08, + "loss": 0.5315, + "step": 29803 + }, + { + "epoch": 3.98, + "grad_norm": 0.63671875, + "learning_rate": 2.0058230776576205e-08, + "loss": 0.3347, + "step": 29804 + }, + { + "epoch": 3.98, + "grad_norm": 0.6171875, + "learning_rate": 1.982568122905759e-08, + "loss": 0.3442, + "step": 29805 + }, + { + "epoch": 3.98, + "grad_norm": 0.69140625, + "learning_rate": 1.959448747687054e-08, + "loss": 0.3941, + "step": 29806 + }, + { + "epoch": 3.98, + "grad_norm": 0.412109375, + "learning_rate": 1.9364649523156975e-08, + "loss": 0.1094, + "step": 29807 + }, + { + "epoch": 3.98, + "grad_norm": 0.4921875, + "learning_rate": 1.9136167371014424e-08, + "loss": 0.2329, + "step": 29808 + }, + { + "epoch": 3.98, + "grad_norm": 0.69921875, + "learning_rate": 1.890904102356261e-08, + "loss": 0.4165, + "step": 29809 + }, + { + "epoch": 3.98, + "grad_norm": 0.6484375, + "learning_rate": 1.8683270483865757e-08, + "loss": 0.3433, + "step": 29810 + }, + { + "epoch": 3.98, + "grad_norm": 0.37890625, + "learning_rate": 1.845885575498807e-08, + "loss": 0.1245, + "step": 29811 + }, + { + "epoch": 3.98, + "grad_norm": 0.80859375, + "learning_rate": 1.8235796839982665e-08, + "loss": 0.3725, + "step": 29812 + }, + { + "epoch": 3.98, + "grad_norm": 0.71875, + "learning_rate": 1.8014093741869355e-08, + "loss": 0.3223, + "step": 29813 + }, + { + "epoch": 3.98, + "grad_norm": 0.65625, + "learning_rate": 1.7793746463645734e-08, + "loss": 0.2066, + "step": 29814 + }, + { + "epoch": 3.98, + "grad_norm": 0.4765625, + "learning_rate": 1.7574755008309408e-08, + "loss": 0.1745, + "step": 29815 + }, + { + "epoch": 3.98, + "grad_norm": 0.41796875, + "learning_rate": 1.7357119378835773e-08, + "loss": 0.1697, + "step": 29816 + }, + { + "epoch": 3.98, + "grad_norm": 0.73046875, + "learning_rate": 1.714083957815582e-08, + "loss": 0.58, + "step": 29817 + }, + { + "epoch": 3.98, + "grad_norm": 0.640625, + "learning_rate": 1.692591560922274e-08, + "loss": 0.3396, + "step": 29818 + }, + { + "epoch": 3.98, + "grad_norm": 0.60546875, + "learning_rate": 1.6712347474945322e-08, + "loss": 0.2415, + "step": 29819 + }, + { + "epoch": 3.98, + "grad_norm": 0.8359375, + "learning_rate": 1.6500135178210143e-08, + "loss": 0.3357, + "step": 29820 + }, + { + "epoch": 3.98, + "grad_norm": 0.66796875, + "learning_rate": 1.6289278721903778e-08, + "loss": 0.2358, + "step": 29821 + }, + { + "epoch": 3.98, + "grad_norm": 0.62109375, + "learning_rate": 1.6079778108879505e-08, + "loss": 0.2099, + "step": 29822 + }, + { + "epoch": 3.98, + "grad_norm": 0.79296875, + "learning_rate": 1.5871633341990598e-08, + "loss": 0.3891, + "step": 29823 + }, + { + "epoch": 3.98, + "grad_norm": 0.65625, + "learning_rate": 1.566484442404592e-08, + "loss": 0.4575, + "step": 29824 + }, + { + "epoch": 3.98, + "grad_norm": 0.6484375, + "learning_rate": 1.545941135786544e-08, + "loss": 0.4411, + "step": 29825 + }, + { + "epoch": 3.98, + "grad_norm": 0.6328125, + "learning_rate": 1.5255334146213607e-08, + "loss": 0.3394, + "step": 29826 + }, + { + "epoch": 3.98, + "grad_norm": 0.58984375, + "learning_rate": 1.5052612791877085e-08, + "loss": 0.3778, + "step": 29827 + }, + { + "epoch": 3.98, + "grad_norm": 0.5703125, + "learning_rate": 1.4851247297587023e-08, + "loss": 0.3434, + "step": 29828 + }, + { + "epoch": 3.98, + "grad_norm": 0.71875, + "learning_rate": 1.4651237666096773e-08, + "loss": 0.4803, + "step": 29829 + }, + { + "epoch": 3.98, + "grad_norm": 0.62109375, + "learning_rate": 1.4452583900093075e-08, + "loss": 0.2637, + "step": 29830 + }, + { + "epoch": 3.98, + "grad_norm": 0.7109375, + "learning_rate": 1.4255286002295975e-08, + "loss": 0.2242, + "step": 29831 + }, + { + "epoch": 3.98, + "grad_norm": 0.490234375, + "learning_rate": 1.4059343975358907e-08, + "loss": 0.2131, + "step": 29832 + }, + { + "epoch": 3.98, + "grad_norm": 0.66015625, + "learning_rate": 1.3864757821946406e-08, + "loss": 0.3254, + "step": 29833 + }, + { + "epoch": 3.98, + "grad_norm": 0.6875, + "learning_rate": 1.3671527544700802e-08, + "loss": 0.3059, + "step": 29834 + }, + { + "epoch": 3.98, + "grad_norm": 0.6328125, + "learning_rate": 1.3479653146242221e-08, + "loss": 0.2348, + "step": 29835 + }, + { + "epoch": 3.98, + "grad_norm": 0.83984375, + "learning_rate": 1.3289134629168588e-08, + "loss": 0.642, + "step": 29836 + }, + { + "epoch": 3.98, + "grad_norm": 0.546875, + "learning_rate": 1.3099971996066717e-08, + "loss": 0.2769, + "step": 29837 + }, + { + "epoch": 3.98, + "grad_norm": 0.52734375, + "learning_rate": 1.2912165249501229e-08, + "loss": 0.2607, + "step": 29838 + }, + { + "epoch": 3.98, + "grad_norm": 0.609375, + "learning_rate": 1.2725714392014532e-08, + "loss": 0.1584, + "step": 29839 + }, + { + "epoch": 3.98, + "grad_norm": 0.73828125, + "learning_rate": 1.2540619426137934e-08, + "loss": 0.2016, + "step": 29840 + }, + { + "epoch": 3.98, + "grad_norm": 0.5625, + "learning_rate": 1.2356880354391643e-08, + "loss": 0.2419, + "step": 29841 + }, + { + "epoch": 3.98, + "grad_norm": 0.69140625, + "learning_rate": 1.2174497179251453e-08, + "loss": 0.3952, + "step": 29842 + }, + { + "epoch": 3.98, + "grad_norm": 0.62109375, + "learning_rate": 1.1993469903193166e-08, + "loss": 0.4071, + "step": 29843 + }, + { + "epoch": 3.98, + "grad_norm": 0.52734375, + "learning_rate": 1.1813798528681475e-08, + "loss": 0.3202, + "step": 29844 + }, + { + "epoch": 3.98, + "grad_norm": 0.6015625, + "learning_rate": 1.1635483058136665e-08, + "loss": 0.2886, + "step": 29845 + }, + { + "epoch": 3.98, + "grad_norm": 0.6015625, + "learning_rate": 1.1458523494001227e-08, + "loss": 0.2258, + "step": 29846 + }, + { + "epoch": 3.98, + "grad_norm": 0.59375, + "learning_rate": 1.128291983865104e-08, + "loss": 0.4987, + "step": 29847 + }, + { + "epoch": 3.98, + "grad_norm": 0.390625, + "learning_rate": 1.1108672094473082e-08, + "loss": 0.1827, + "step": 29848 + }, + { + "epoch": 3.98, + "grad_norm": 0.68359375, + "learning_rate": 1.0935780263843231e-08, + "loss": 0.4662, + "step": 29849 + }, + { + "epoch": 3.98, + "grad_norm": 0.73046875, + "learning_rate": 1.0764244349092956e-08, + "loss": 0.3569, + "step": 29850 + }, + { + "epoch": 3.98, + "grad_norm": 0.5859375, + "learning_rate": 1.0594064352553724e-08, + "loss": 0.4443, + "step": 29851 + }, + { + "epoch": 3.98, + "grad_norm": 0.57421875, + "learning_rate": 1.0425240276523696e-08, + "loss": 0.3014, + "step": 29852 + }, + { + "epoch": 3.98, + "grad_norm": 0.6640625, + "learning_rate": 1.0257772123312137e-08, + "loss": 0.4339, + "step": 29853 + }, + { + "epoch": 3.98, + "grad_norm": 0.6484375, + "learning_rate": 1.00916598951728e-08, + "loss": 0.3246, + "step": 29854 + }, + { + "epoch": 3.98, + "grad_norm": 0.625, + "learning_rate": 9.926903594359438e-09, + "loss": 0.1906, + "step": 29855 + }, + { + "epoch": 3.98, + "grad_norm": 0.6328125, + "learning_rate": 9.763503223114701e-09, + "loss": 0.4378, + "step": 29856 + }, + { + "epoch": 3.98, + "grad_norm": 0.74609375, + "learning_rate": 9.601458783647931e-09, + "loss": 0.2921, + "step": 29857 + }, + { + "epoch": 3.98, + "grad_norm": 0.6328125, + "learning_rate": 9.440770278168476e-09, + "loss": 0.2063, + "step": 29858 + }, + { + "epoch": 3.98, + "grad_norm": 0.734375, + "learning_rate": 9.281437708830166e-09, + "loss": 0.6916, + "step": 29859 + }, + { + "epoch": 3.98, + "grad_norm": 0.69921875, + "learning_rate": 9.12346107782014e-09, + "loss": 0.319, + "step": 29860 + }, + { + "epoch": 3.98, + "grad_norm": 0.59375, + "learning_rate": 8.966840387270025e-09, + "loss": 0.4432, + "step": 29861 + }, + { + "epoch": 3.98, + "grad_norm": 0.55859375, + "learning_rate": 8.811575639300352e-09, + "loss": 0.4589, + "step": 29862 + }, + { + "epoch": 3.98, + "grad_norm": 0.70703125, + "learning_rate": 8.65766683602054e-09, + "loss": 0.2812, + "step": 29863 + }, + { + "epoch": 3.99, + "grad_norm": 0.6640625, + "learning_rate": 8.505113979506707e-09, + "loss": 0.4386, + "step": 29864 + }, + { + "epoch": 3.99, + "grad_norm": 0.53515625, + "learning_rate": 8.353917071846074e-09, + "loss": 0.3231, + "step": 29865 + }, + { + "epoch": 3.99, + "grad_norm": 0.765625, + "learning_rate": 8.20407611508145e-09, + "loss": 0.4994, + "step": 29866 + }, + { + "epoch": 3.99, + "grad_norm": 0.63671875, + "learning_rate": 8.055591111244543e-09, + "loss": 0.4218, + "step": 29867 + }, + { + "epoch": 3.99, + "grad_norm": 0.5234375, + "learning_rate": 7.908462062344856e-09, + "loss": 0.2516, + "step": 29868 + }, + { + "epoch": 3.99, + "grad_norm": 0.765625, + "learning_rate": 7.762688970391895e-09, + "loss": 0.2903, + "step": 29869 + }, + { + "epoch": 3.99, + "grad_norm": 0.68359375, + "learning_rate": 7.618271837339652e-09, + "loss": 0.3351, + "step": 29870 + }, + { + "epoch": 3.99, + "grad_norm": 0.56640625, + "learning_rate": 7.475210665164323e-09, + "loss": 0.3155, + "step": 29871 + }, + { + "epoch": 3.99, + "grad_norm": 0.451171875, + "learning_rate": 7.333505455786593e-09, + "loss": 0.1802, + "step": 29872 + }, + { + "epoch": 3.99, + "grad_norm": 0.57421875, + "learning_rate": 7.193156211160457e-09, + "loss": 0.3259, + "step": 29873 + }, + { + "epoch": 3.99, + "grad_norm": 0.66796875, + "learning_rate": 7.054162933162189e-09, + "loss": 0.6745, + "step": 29874 + }, + { + "epoch": 3.99, + "grad_norm": 0.84375, + "learning_rate": 6.916525623679171e-09, + "loss": 0.7957, + "step": 29875 + }, + { + "epoch": 3.99, + "grad_norm": 0.6171875, + "learning_rate": 6.780244284587678e-09, + "loss": 0.6361, + "step": 29876 + }, + { + "epoch": 3.99, + "grad_norm": 0.6875, + "learning_rate": 6.645318917730681e-09, + "loss": 0.3022, + "step": 29877 + }, + { + "epoch": 3.99, + "grad_norm": 0.6328125, + "learning_rate": 6.5117495249400474e-09, + "loss": 0.4714, + "step": 29878 + }, + { + "epoch": 3.99, + "grad_norm": 0.5234375, + "learning_rate": 6.379536108025441e-09, + "loss": 0.2813, + "step": 29879 + }, + { + "epoch": 3.99, + "grad_norm": 0.69140625, + "learning_rate": 6.248678668774322e-09, + "loss": 0.3604, + "step": 29880 + }, + { + "epoch": 3.99, + "grad_norm": 0.6015625, + "learning_rate": 6.119177208974147e-09, + "loss": 0.2125, + "step": 29881 + }, + { + "epoch": 3.99, + "grad_norm": 0.51171875, + "learning_rate": 5.991031730367969e-09, + "loss": 0.2527, + "step": 29882 + }, + { + "epoch": 3.99, + "grad_norm": 0.73046875, + "learning_rate": 5.864242234698836e-09, + "loss": 0.4763, + "step": 29883 + }, + { + "epoch": 3.99, + "grad_norm": 0.58984375, + "learning_rate": 5.738808723698697e-09, + "loss": 0.2947, + "step": 29884 + }, + { + "epoch": 3.99, + "grad_norm": 0.484375, + "learning_rate": 5.614731199043987e-09, + "loss": 0.3005, + "step": 29885 + }, + { + "epoch": 3.99, + "grad_norm": 0.6328125, + "learning_rate": 5.49200966243335e-09, + "loss": 0.2067, + "step": 29886 + }, + { + "epoch": 3.99, + "grad_norm": 0.79296875, + "learning_rate": 5.370644115521017e-09, + "loss": 0.4932, + "step": 29887 + }, + { + "epoch": 3.99, + "grad_norm": 0.68359375, + "learning_rate": 5.25063455996122e-09, + "loss": 0.5902, + "step": 29888 + }, + { + "epoch": 3.99, + "grad_norm": 0.60546875, + "learning_rate": 5.1319809973859875e-09, + "loss": 0.4355, + "step": 29889 + }, + { + "epoch": 3.99, + "grad_norm": 0.75390625, + "learning_rate": 5.014683429394041e-09, + "loss": 0.3123, + "step": 29890 + }, + { + "epoch": 3.99, + "grad_norm": 0.53125, + "learning_rate": 4.8987418575729975e-09, + "loss": 0.3882, + "step": 29891 + }, + { + "epoch": 3.99, + "grad_norm": 0.87109375, + "learning_rate": 4.784156283510477e-09, + "loss": 0.4377, + "step": 29892 + }, + { + "epoch": 3.99, + "grad_norm": 0.51953125, + "learning_rate": 4.67092670874969e-09, + "loss": 0.4269, + "step": 29893 + }, + { + "epoch": 3.99, + "grad_norm": 0.6953125, + "learning_rate": 4.559053134822744e-09, + "loss": 0.2666, + "step": 29894 + }, + { + "epoch": 3.99, + "grad_norm": 0.578125, + "learning_rate": 4.448535563250644e-09, + "loss": 0.2723, + "step": 29895 + }, + { + "epoch": 3.99, + "grad_norm": 0.4140625, + "learning_rate": 4.339373995543294e-09, + "loss": 0.2054, + "step": 29896 + }, + { + "epoch": 3.99, + "grad_norm": 0.6640625, + "learning_rate": 4.231568433166189e-09, + "loss": 0.2202, + "step": 29897 + }, + { + "epoch": 3.99, + "grad_norm": 0.6796875, + "learning_rate": 4.125118877584822e-09, + "loss": 0.3739, + "step": 29898 + }, + { + "epoch": 3.99, + "grad_norm": 0.51953125, + "learning_rate": 4.020025330242483e-09, + "loss": 0.2454, + "step": 29899 + }, + { + "epoch": 3.99, + "grad_norm": 0.70703125, + "learning_rate": 3.916287792560258e-09, + "loss": 0.189, + "step": 29900 + }, + { + "epoch": 3.99, + "grad_norm": 0.625, + "learning_rate": 3.8139062659592325e-09, + "loss": 0.2461, + "step": 29901 + }, + { + "epoch": 3.99, + "grad_norm": 0.6640625, + "learning_rate": 3.7128807518160837e-09, + "loss": 0.1493, + "step": 29902 + }, + { + "epoch": 3.99, + "grad_norm": 0.72265625, + "learning_rate": 3.6132112515074868e-09, + "loss": 0.2602, + "step": 29903 + }, + { + "epoch": 3.99, + "grad_norm": 0.609375, + "learning_rate": 3.51489776636571e-09, + "loss": 0.3744, + "step": 29904 + }, + { + "epoch": 3.99, + "grad_norm": 0.6328125, + "learning_rate": 3.417940297756328e-09, + "loss": 0.3404, + "step": 29905 + }, + { + "epoch": 3.99, + "grad_norm": 0.39453125, + "learning_rate": 3.322338846967199e-09, + "loss": 0.1402, + "step": 29906 + }, + { + "epoch": 3.99, + "grad_norm": 0.65234375, + "learning_rate": 3.2280934153083864e-09, + "loss": 0.2557, + "step": 29907 + }, + { + "epoch": 3.99, + "grad_norm": 0.59375, + "learning_rate": 3.1352040040566465e-09, + "loss": 0.1959, + "step": 29908 + }, + { + "epoch": 3.99, + "grad_norm": 0.734375, + "learning_rate": 3.043670614466532e-09, + "loss": 0.609, + "step": 29909 + }, + { + "epoch": 3.99, + "grad_norm": 0.56640625, + "learning_rate": 2.9534932477814914e-09, + "loss": 0.302, + "step": 29910 + }, + { + "epoch": 3.99, + "grad_norm": 0.578125, + "learning_rate": 2.8646719052227712e-09, + "loss": 0.3374, + "step": 29911 + }, + { + "epoch": 3.99, + "grad_norm": 0.62890625, + "learning_rate": 2.777206588000514e-09, + "loss": 0.3852, + "step": 29912 + }, + { + "epoch": 3.99, + "grad_norm": 0.671875, + "learning_rate": 2.6910972972915562e-09, + "loss": 0.3856, + "step": 29913 + }, + { + "epoch": 3.99, + "grad_norm": 0.6171875, + "learning_rate": 2.606344034272734e-09, + "loss": 0.1742, + "step": 29914 + }, + { + "epoch": 3.99, + "grad_norm": 0.87890625, + "learning_rate": 2.52294680009868e-09, + "loss": 0.3725, + "step": 29915 + }, + { + "epoch": 3.99, + "grad_norm": 0.59765625, + "learning_rate": 2.4409055958796168e-09, + "loss": 0.2516, + "step": 29916 + }, + { + "epoch": 3.99, + "grad_norm": 0.55078125, + "learning_rate": 2.360220422747972e-09, + "loss": 0.1925, + "step": 29917 + }, + { + "epoch": 3.99, + "grad_norm": 0.671875, + "learning_rate": 2.2808912817806617e-09, + "loss": 0.4086, + "step": 29918 + }, + { + "epoch": 3.99, + "grad_norm": 0.578125, + "learning_rate": 2.2029181740657044e-09, + "loss": 0.4896, + "step": 29919 + }, + { + "epoch": 3.99, + "grad_norm": 0.72265625, + "learning_rate": 2.126301100657813e-09, + "loss": 0.2178, + "step": 29920 + }, + { + "epoch": 3.99, + "grad_norm": 0.73828125, + "learning_rate": 2.0510400626005953e-09, + "loss": 0.4318, + "step": 29921 + }, + { + "epoch": 3.99, + "grad_norm": 0.640625, + "learning_rate": 1.977135060904356e-09, + "loss": 0.7081, + "step": 29922 + }, + { + "epoch": 3.99, + "grad_norm": 0.56640625, + "learning_rate": 1.904586096579397e-09, + "loss": 0.1932, + "step": 29923 + }, + { + "epoch": 3.99, + "grad_norm": 0.578125, + "learning_rate": 1.8333931706027153e-09, + "loss": 0.2932, + "step": 29924 + }, + { + "epoch": 3.99, + "grad_norm": 0.62109375, + "learning_rate": 1.7635562839402042e-09, + "loss": 0.4517, + "step": 29925 + }, + { + "epoch": 3.99, + "grad_norm": 0.376953125, + "learning_rate": 1.695075437546656e-09, + "loss": 0.2187, + "step": 29926 + }, + { + "epoch": 3.99, + "grad_norm": 0.8359375, + "learning_rate": 1.6279506323546578e-09, + "loss": 0.4372, + "step": 29927 + }, + { + "epoch": 3.99, + "grad_norm": 0.330078125, + "learning_rate": 1.562181869252388e-09, + "loss": 0.1142, + "step": 29928 + }, + { + "epoch": 3.99, + "grad_norm": 0.60546875, + "learning_rate": 1.49776914915023e-09, + "loss": 0.2221, + "step": 29929 + }, + { + "epoch": 3.99, + "grad_norm": 0.61328125, + "learning_rate": 1.4347124729252592e-09, + "loss": 0.3616, + "step": 29930 + }, + { + "epoch": 3.99, + "grad_norm": 0.91796875, + "learning_rate": 1.3730118414101433e-09, + "loss": 0.2624, + "step": 29931 + }, + { + "epoch": 3.99, + "grad_norm": 0.69140625, + "learning_rate": 1.3126672554708563e-09, + "loss": 0.1562, + "step": 29932 + }, + { + "epoch": 3.99, + "grad_norm": 0.52734375, + "learning_rate": 1.2536787158956564e-09, + "loss": 0.1624, + "step": 29933 + }, + { + "epoch": 3.99, + "grad_norm": 0.61328125, + "learning_rate": 1.1960462235061088e-09, + "loss": 0.2133, + "step": 29934 + }, + { + "epoch": 3.99, + "grad_norm": 0.6796875, + "learning_rate": 1.1397697790793694e-09, + "loss": 0.2741, + "step": 29935 + }, + { + "epoch": 3.99, + "grad_norm": 0.4765625, + "learning_rate": 1.0848493833814922e-09, + "loss": 0.2096, + "step": 29936 + }, + { + "epoch": 3.99, + "grad_norm": 0.55859375, + "learning_rate": 1.031285037134122e-09, + "loss": 0.2769, + "step": 29937 + }, + { + "epoch": 3.99, + "grad_norm": 0.73828125, + "learning_rate": 9.790767410922108e-10, + "loss": 0.4763, + "step": 29938 + }, + { + "epoch": 4.0, + "grad_norm": 0.60546875, + "learning_rate": 9.282244959551989e-10, + "loss": 0.145, + "step": 29939 + }, + { + "epoch": 4.0, + "grad_norm": 0.64453125, + "learning_rate": 8.787283024114245e-10, + "loss": 0.2454, + "step": 29940 + }, + { + "epoch": 4.0, + "grad_norm": 0.73828125, + "learning_rate": 8.305881611270217e-10, + "loss": 0.2796, + "step": 29941 + }, + { + "epoch": 4.0, + "grad_norm": 0.73828125, + "learning_rate": 7.838040727570217e-10, + "loss": 0.4834, + "step": 29942 + }, + { + "epoch": 4.0, + "grad_norm": 0.6796875, + "learning_rate": 7.383760379453541e-10, + "loss": 0.2499, + "step": 29943 + }, + { + "epoch": 4.0, + "grad_norm": 0.5234375, + "learning_rate": 6.943040572915393e-10, + "loss": 0.1519, + "step": 29944 + }, + { + "epoch": 4.0, + "grad_norm": 0.6953125, + "learning_rate": 6.515881314062e-10, + "loss": 0.4239, + "step": 29945 + }, + { + "epoch": 4.0, + "grad_norm": 0.50390625, + "learning_rate": 6.102282608666521e-10, + "loss": 0.3565, + "step": 29946 + }, + { + "epoch": 4.0, + "grad_norm": 0.828125, + "learning_rate": 5.702244462280071e-10, + "loss": 0.334, + "step": 29947 + }, + { + "epoch": 4.0, + "grad_norm": 0.62109375, + "learning_rate": 5.315766880342743e-10, + "loss": 0.3547, + "step": 29948 + }, + { + "epoch": 4.0, + "grad_norm": 0.640625, + "learning_rate": 4.942849868183608e-10, + "loss": 0.4015, + "step": 29949 + }, + { + "epoch": 4.0, + "grad_norm": 0.55078125, + "learning_rate": 4.583493430687646e-10, + "loss": 0.3348, + "step": 29950 + }, + { + "epoch": 4.0, + "grad_norm": 0.86328125, + "learning_rate": 4.237697572961885e-10, + "loss": 0.3243, + "step": 29951 + }, + { + "epoch": 4.0, + "grad_norm": 0.546875, + "learning_rate": 3.9054622994472155e-10, + "loss": 0.2224, + "step": 29952 + }, + { + "epoch": 4.0, + "grad_norm": 0.66015625, + "learning_rate": 3.586787614806575e-10, + "loss": 0.3229, + "step": 29953 + }, + { + "epoch": 4.0, + "grad_norm": 0.75390625, + "learning_rate": 3.2816735233698323e-10, + "loss": 0.6249, + "step": 29954 + }, + { + "epoch": 4.0, + "grad_norm": 0.5546875, + "learning_rate": 2.990120029133792e-10, + "loss": 0.1994, + "step": 29955 + }, + { + "epoch": 4.0, + "grad_norm": 0.8125, + "learning_rate": 2.712127136206277e-10, + "loss": 0.3658, + "step": 29956 + }, + { + "epoch": 4.0, + "grad_norm": 0.65625, + "learning_rate": 2.4476948483620475e-10, + "loss": 0.2183, + "step": 29957 + }, + { + "epoch": 4.0, + "grad_norm": 0.7109375, + "learning_rate": 2.1968231690427943e-10, + "loss": 0.3472, + "step": 29958 + }, + { + "epoch": 4.0, + "grad_norm": 0.5859375, + "learning_rate": 1.9595121016902086e-10, + "loss": 0.1571, + "step": 29959 + }, + { + "epoch": 4.0, + "grad_norm": 0.75390625, + "learning_rate": 1.7357616496349594e-10, + "loss": 0.3877, + "step": 29960 + }, + { + "epoch": 4.0, + "grad_norm": 0.67578125, + "learning_rate": 1.525571815874649e-10, + "loss": 0.3188, + "step": 29961 + }, + { + "epoch": 4.0, + "grad_norm": 0.625, + "learning_rate": 1.3289426030738127e-10, + "loss": 0.2264, + "step": 29962 + }, + { + "epoch": 4.0, + "grad_norm": 0.78125, + "learning_rate": 1.1458740142300528e-10, + "loss": 0.3699, + "step": 29963 + }, + { + "epoch": 4.0, + "grad_norm": 0.62890625, + "learning_rate": 9.763660515638152e-11, + "loss": 0.3797, + "step": 29964 + }, + { + "epoch": 4.0, + "grad_norm": 0.74609375, + "learning_rate": 8.20418717406568e-11, + "loss": 0.45, + "step": 29965 + }, + { + "epoch": 4.0, + "grad_norm": 0.671875, + "learning_rate": 6.780320139787577e-11, + "loss": 0.6981, + "step": 29966 + }, + { + "epoch": 4.0, + "grad_norm": 0.625, + "learning_rate": 5.492059430567409e-11, + "loss": 0.5554, + "step": 29967 + }, + { + "epoch": 4.0, + "grad_norm": 0.68359375, + "learning_rate": 4.3394050652789675e-11, + "loss": 0.3251, + "step": 29968 + }, + { + "epoch": 4.0, + "grad_norm": 0.51953125, + "learning_rate": 3.322357059465375e-11, + "loss": 0.3538, + "step": 29969 + }, + { + "epoch": 4.0, + "grad_norm": 0.52734375, + "learning_rate": 2.4409154264493084e-11, + "loss": 0.3013, + "step": 29970 + }, + { + "epoch": 4.0, + "grad_norm": 0.734375, + "learning_rate": 1.6950801784432203e-11, + "loss": 0.3233, + "step": 29971 + }, + { + "epoch": 4.0, + "grad_norm": 0.5, + "learning_rate": 1.084851325439118e-11, + "loss": 0.3697, + "step": 29972 + }, + { + "epoch": 4.0, + "grad_norm": 0.60546875, + "learning_rate": 6.102288752085628e-12, + "loss": 0.273, + "step": 29973 + }, + { + "epoch": 4.0, + "grad_norm": 0.6640625, + "learning_rate": 2.7121283552311584e-12, + "loss": 0.6039, + "step": 29974 + }, + { + "epoch": 4.0, + "grad_norm": 0.65625, + "learning_rate": 6.78032086032232e-13, + "loss": 0.3915, + "step": 29975 + }, + { + "epoch": 4.0, + "grad_norm": 0.6953125, + "learning_rate": 0.0, + "loss": 0.3812, + "step": 29976 + }, + { + "epoch": 4.0, + "step": 29976, + "total_flos": 2.62992394781406e+18, + "train_loss": 0.4291328646524476, + "train_runtime": 44628.6614, + "train_samples_per_second": 2.687, + "train_steps_per_second": 0.672 + } + ], + "logging_steps": 1, + "max_steps": 29976, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 50, + "total_flos": 2.62992394781406e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}