{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01975, "eval_steps": 500, "global_step": 1975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 1.7129806011276105, "learning_rate": 3e-06, "loss": 10.8348, "step": 1 }, { "epoch": 2e-05, "grad_norm": 1.6872753303603527, "learning_rate": 6e-06, "loss": 10.8356, "step": 2 }, { "epoch": 3e-05, "grad_norm": 1.6970020903903387, "learning_rate": 9e-06, "loss": 10.834, "step": 3 }, { "epoch": 4e-05, "grad_norm": 1.690199421583159, "learning_rate": 1.2e-05, "loss": 10.8334, "step": 4 }, { "epoch": 5e-05, "grad_norm": 1.6936208883930068, "learning_rate": 1.5e-05, "loss": 10.8294, "step": 5 }, { "epoch": 6e-05, "grad_norm": 1.6935154610161474, "learning_rate": 1.8e-05, "loss": 10.8281, "step": 6 }, { "epoch": 7e-05, "grad_norm": 1.6333694173725648, "learning_rate": 2.1000000000000002e-05, "loss": 10.8133, "step": 7 }, { "epoch": 8e-05, "grad_norm": 1.4463755011186001, "learning_rate": 2.4e-05, "loss": 10.783, "step": 8 }, { "epoch": 9e-05, "grad_norm": 1.3815123169360315, "learning_rate": 2.7e-05, "loss": 10.7779, "step": 9 }, { "epoch": 0.0001, "grad_norm": 1.3507621465484316, "learning_rate": 3e-05, "loss": 10.7629, "step": 10 }, { "epoch": 0.00011, "grad_norm": 1.257508561634155, "learning_rate": 3.2999999999999996e-05, "loss": 10.7454, "step": 11 }, { "epoch": 0.00012, "grad_norm": 1.224298046820689, "learning_rate": 3.6e-05, "loss": 10.7321, "step": 12 }, { "epoch": 0.00013, "grad_norm": 1.1609107458726389, "learning_rate": 3.9e-05, "loss": 10.7098, "step": 13 }, { "epoch": 0.00014, "grad_norm": 1.1251765756585856, "learning_rate": 4.2000000000000004e-05, "loss": 10.6986, "step": 14 }, { "epoch": 0.00015, "grad_norm": 1.1021031797679595, "learning_rate": 4.4999999999999996e-05, "loss": 10.6882, "step": 15 }, { "epoch": 0.00016, "grad_norm": 1.05231707077907, "learning_rate": 4.8e-05, "loss": 10.6681, "step": 16 }, { "epoch": 0.00017, "grad_norm": 1.0082613504118885, "learning_rate": 5.1000000000000006e-05, "loss": 10.6513, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.9840324393168475, "learning_rate": 5.4e-05, "loss": 10.6344, "step": 18 }, { "epoch": 0.00019, "grad_norm": 0.953923239589578, "learning_rate": 5.7e-05, "loss": 10.6196, "step": 19 }, { "epoch": 0.0002, "grad_norm": 0.9458057853193742, "learning_rate": 6e-05, "loss": 10.6069, "step": 20 }, { "epoch": 0.00021, "grad_norm": 0.9177457999897578, "learning_rate": 6.3e-05, "loss": 10.5922, "step": 21 }, { "epoch": 0.00022, "grad_norm": 0.9134874433162486, "learning_rate": 6.599999999999999e-05, "loss": 10.576, "step": 22 }, { "epoch": 0.00023, "grad_norm": 0.908696989628468, "learning_rate": 6.9e-05, "loss": 10.5635, "step": 23 }, { "epoch": 0.00024, "grad_norm": 0.9051143007426985, "learning_rate": 7.2e-05, "loss": 10.5499, "step": 24 }, { "epoch": 0.00025, "grad_norm": 0.9082451576693834, "learning_rate": 7.500000000000001e-05, "loss": 10.5361, "step": 25 }, { "epoch": 0.00026, "grad_norm": 0.9099344486243927, "learning_rate": 7.8e-05, "loss": 10.521, "step": 26 }, { "epoch": 0.00027, "grad_norm": 0.9053293550746107, "learning_rate": 8.1e-05, "loss": 10.5103, "step": 27 }, { "epoch": 0.00028, "grad_norm": 0.9002471153364864, "learning_rate": 8.400000000000001e-05, "loss": 10.4955, "step": 28 }, { "epoch": 0.00029, "grad_norm": 0.9068699186733776, "learning_rate": 8.7e-05, "loss": 10.4811, "step": 29 }, { "epoch": 0.0003, "grad_norm": 0.9095271030063902, "learning_rate": 8.999999999999999e-05, "loss": 10.4648, "step": 30 }, { "epoch": 0.00031, "grad_norm": 0.9097010936405139, "learning_rate": 9.3e-05, "loss": 10.4503, "step": 31 }, { "epoch": 0.00032, "grad_norm": 0.9047462396891427, "learning_rate": 9.6e-05, "loss": 10.4348, "step": 32 }, { "epoch": 0.00033, "grad_norm": 0.9068703333942145, "learning_rate": 9.900000000000001e-05, "loss": 10.418, "step": 33 }, { "epoch": 0.00034, "grad_norm": 0.9072965837486595, "learning_rate": 0.00010200000000000001, "loss": 10.4, "step": 34 }, { "epoch": 0.00035, "grad_norm": 0.9110003633890357, "learning_rate": 0.00010500000000000002, "loss": 10.3835, "step": 35 }, { "epoch": 0.00036, "grad_norm": 0.9049119959927198, "learning_rate": 0.000108, "loss": 10.3652, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.8970709544624084, "learning_rate": 0.000111, "loss": 10.3479, "step": 37 }, { "epoch": 0.00038, "grad_norm": 0.8959068278842482, "learning_rate": 0.000114, "loss": 10.3275, "step": 38 }, { "epoch": 0.00039, "grad_norm": 0.9005947927478184, "learning_rate": 0.000117, "loss": 10.3069, "step": 39 }, { "epoch": 0.0004, "grad_norm": 0.9014442598894896, "learning_rate": 0.00012, "loss": 10.2842, "step": 40 }, { "epoch": 0.00041, "grad_norm": 0.8992939718171602, "learning_rate": 0.000123, "loss": 10.2657, "step": 41 }, { "epoch": 0.00042, "grad_norm": 0.8994818536906172, "learning_rate": 0.000126, "loss": 10.2444, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.9062946670458473, "learning_rate": 0.000129, "loss": 10.2208, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.9072550424345267, "learning_rate": 0.00013199999999999998, "loss": 10.1985, "step": 44 }, { "epoch": 0.00045, "grad_norm": 0.908308760029939, "learning_rate": 0.000135, "loss": 10.1758, "step": 45 }, { "epoch": 0.00046, "grad_norm": 0.8994605508976834, "learning_rate": 0.000138, "loss": 10.1528, "step": 46 }, { "epoch": 0.00047, "grad_norm": 0.904955141258698, "learning_rate": 0.000141, "loss": 10.1274, "step": 47 }, { "epoch": 0.00048, "grad_norm": 0.9044693581157806, "learning_rate": 0.000144, "loss": 10.1031, "step": 48 }, { "epoch": 0.00049, "grad_norm": 0.8992120995192336, "learning_rate": 0.000147, "loss": 10.0777, "step": 49 }, { "epoch": 0.0005, "grad_norm": 0.905676588399281, "learning_rate": 0.00015000000000000001, "loss": 10.0519, "step": 50 }, { "epoch": 0.00051, "grad_norm": 0.9066841497261428, "learning_rate": 0.000153, "loss": 10.0251, "step": 51 }, { "epoch": 0.00052, "grad_norm": 0.9046656683417261, "learning_rate": 0.000156, "loss": 9.9981, "step": 52 }, { "epoch": 0.00053, "grad_norm": 0.8943714853313668, "learning_rate": 0.000159, "loss": 9.974, "step": 53 }, { "epoch": 0.00054, "grad_norm": 0.9141658233846578, "learning_rate": 0.000162, "loss": 9.9419, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.9035944774643171, "learning_rate": 0.000165, "loss": 9.9169, "step": 55 }, { "epoch": 0.00056, "grad_norm": 0.895407870582166, "learning_rate": 0.00016800000000000002, "loss": 9.8872, "step": 56 }, { "epoch": 0.00057, "grad_norm": 0.9021731997760362, "learning_rate": 0.000171, "loss": 9.8601, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.8980871554912008, "learning_rate": 0.000174, "loss": 9.8343, "step": 58 }, { "epoch": 0.00059, "grad_norm": 0.9073832283363998, "learning_rate": 0.000177, "loss": 9.8028, "step": 59 }, { "epoch": 0.0006, "grad_norm": 0.8921071838486323, "learning_rate": 0.00017999999999999998, "loss": 9.777, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.8918001641348363, "learning_rate": 0.000183, "loss": 9.7484, "step": 61 }, { "epoch": 0.00062, "grad_norm": 0.897401330332219, "learning_rate": 0.000186, "loss": 9.717, "step": 62 }, { "epoch": 0.00063, "grad_norm": 0.8914516241190131, "learning_rate": 0.000189, "loss": 9.6894, "step": 63 }, { "epoch": 0.00064, "grad_norm": 0.8896652156254973, "learning_rate": 0.000192, "loss": 9.6587, "step": 64 }, { "epoch": 0.00065, "grad_norm": 0.8995447585153489, "learning_rate": 0.00019500000000000002, "loss": 9.6261, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.8896939041293862, "learning_rate": 0.00019800000000000002, "loss": 9.6026, "step": 66 }, { "epoch": 0.00067, "grad_norm": 0.8935314234316469, "learning_rate": 0.000201, "loss": 9.5723, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.8971584182008717, "learning_rate": 0.00020400000000000003, "loss": 9.5393, "step": 68 }, { "epoch": 0.00069, "grad_norm": 0.8806455604370673, "learning_rate": 0.00020700000000000002, "loss": 9.5119, "step": 69 }, { "epoch": 0.0007, "grad_norm": 0.892956094531968, "learning_rate": 0.00021000000000000004, "loss": 9.4751, "step": 70 }, { "epoch": 0.00071, "grad_norm": 0.8848452972865632, "learning_rate": 0.00021299999999999997, "loss": 9.4495, "step": 71 }, { "epoch": 0.00072, "grad_norm": 0.8831148731992822, "learning_rate": 0.000216, "loss": 9.4223, "step": 72 }, { "epoch": 0.00073, "grad_norm": 0.887150899449638, "learning_rate": 0.00021899999999999998, "loss": 9.3879, "step": 73 }, { "epoch": 0.00074, "grad_norm": 0.8878619769377328, "learning_rate": 0.000222, "loss": 9.3616, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.8808154408936898, "learning_rate": 0.000225, "loss": 9.3275, "step": 75 }, { "epoch": 0.00076, "grad_norm": 0.8908035269749474, "learning_rate": 0.000228, "loss": 9.3009, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.884531047332737, "learning_rate": 0.000231, "loss": 9.2727, "step": 77 }, { "epoch": 0.00078, "grad_norm": 0.8838664917591654, "learning_rate": 0.000234, "loss": 9.2422, "step": 78 }, { "epoch": 0.00079, "grad_norm": 0.8858668201182466, "learning_rate": 0.00023700000000000001, "loss": 9.2056, "step": 79 }, { "epoch": 0.0008, "grad_norm": 0.8856967305037482, "learning_rate": 0.00024, "loss": 9.1711, "step": 80 }, { "epoch": 0.00081, "grad_norm": 0.8942846826675519, "learning_rate": 0.00024300000000000002, "loss": 9.1382, "step": 81 }, { "epoch": 0.00082, "grad_norm": 0.897767651472895, "learning_rate": 0.000246, "loss": 9.1142, "step": 82 }, { "epoch": 0.00083, "grad_norm": 0.8951752702012633, "learning_rate": 0.00024900000000000004, "loss": 9.0778, "step": 83 }, { "epoch": 0.00084, "grad_norm": 0.8980395289622467, "learning_rate": 0.000252, "loss": 9.0469, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.8894006576183595, "learning_rate": 0.000255, "loss": 9.0242, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.8907945566480024, "learning_rate": 0.000258, "loss": 8.9886, "step": 86 }, { "epoch": 0.00087, "grad_norm": 0.8869170795764568, "learning_rate": 0.000261, "loss": 8.9664, "step": 87 }, { "epoch": 0.00088, "grad_norm": 0.8860541210154026, "learning_rate": 0.00026399999999999997, "loss": 8.9293, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.8822605600899943, "learning_rate": 0.000267, "loss": 8.9037, "step": 89 }, { "epoch": 0.0009, "grad_norm": 0.8817151929172502, "learning_rate": 0.00027, "loss": 8.8766, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.877617615465877, "learning_rate": 0.000273, "loss": 8.8478, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.8822716293479064, "learning_rate": 0.000276, "loss": 8.8156, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.8823661552266111, "learning_rate": 0.000279, "loss": 8.7863, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.8830384482321385, "learning_rate": 0.000282, "loss": 8.7609, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.8735042737334501, "learning_rate": 0.000285, "loss": 8.7321, "step": 95 }, { "epoch": 0.00096, "grad_norm": 0.8799214796836804, "learning_rate": 0.000288, "loss": 8.7028, "step": 96 }, { "epoch": 0.00097, "grad_norm": 0.8704594748643596, "learning_rate": 0.000291, "loss": 8.6791, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.8706415983834461, "learning_rate": 0.000294, "loss": 8.642, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.8683426041650804, "learning_rate": 0.000297, "loss": 8.62, "step": 99 }, { "epoch": 0.001, "grad_norm": 0.8690594926543161, "learning_rate": 0.00030000000000000003, "loss": 8.5941, "step": 100 }, { "epoch": 0.00101, "grad_norm": 0.8744725161423202, "learning_rate": 0.00030300000000000005, "loss": 8.5597, "step": 101 }, { "epoch": 0.00102, "grad_norm": 0.8626538117869429, "learning_rate": 0.000306, "loss": 8.5407, "step": 102 }, { "epoch": 0.00103, "grad_norm": 0.8630292491448714, "learning_rate": 0.000309, "loss": 8.5165, "step": 103 }, { "epoch": 0.00104, "grad_norm": 0.8566580756350954, "learning_rate": 0.000312, "loss": 8.4948, "step": 104 }, { "epoch": 0.00105, "grad_norm": 0.8588931967033124, "learning_rate": 0.000315, "loss": 8.4689, "step": 105 }, { "epoch": 0.00106, "grad_norm": 0.8531867230327145, "learning_rate": 0.000318, "loss": 8.4397, "step": 106 }, { "epoch": 0.00107, "grad_norm": 0.8474808010000593, "learning_rate": 0.000321, "loss": 8.4149, "step": 107 }, { "epoch": 0.00108, "grad_norm": 0.858890949163445, "learning_rate": 0.000324, "loss": 8.3866, "step": 108 }, { "epoch": 0.00109, "grad_norm": 0.862504115159085, "learning_rate": 0.000327, "loss": 8.3673, "step": 109 }, { "epoch": 0.0011, "grad_norm": 0.8797254817902618, "learning_rate": 0.00033, "loss": 8.3453, "step": 110 }, { "epoch": 0.00111, "grad_norm": 0.8938450574121157, "learning_rate": 0.000333, "loss": 8.3162, "step": 111 }, { "epoch": 0.00112, "grad_norm": 0.8984693362159062, "learning_rate": 0.00033600000000000004, "loss": 8.2961, "step": 112 }, { "epoch": 0.00113, "grad_norm": 0.8741969532880793, "learning_rate": 0.000339, "loss": 8.2543, "step": 113 }, { "epoch": 0.00114, "grad_norm": 0.8263135137442741, "learning_rate": 0.000342, "loss": 8.2446, "step": 114 }, { "epoch": 0.00115, "grad_norm": 0.8311105019716521, "learning_rate": 0.00034500000000000004, "loss": 8.2164, "step": 115 }, { "epoch": 0.00116, "grad_norm": 0.8585271561560018, "learning_rate": 0.000348, "loss": 8.1938, "step": 116 }, { "epoch": 0.00117, "grad_norm": 0.8687047969468357, "learning_rate": 0.000351, "loss": 8.1623, "step": 117 }, { "epoch": 0.00118, "grad_norm": 0.8460500876754268, "learning_rate": 0.000354, "loss": 8.1456, "step": 118 }, { "epoch": 0.00119, "grad_norm": 0.80734714043103, "learning_rate": 0.000357, "loss": 8.131, "step": 119 }, { "epoch": 0.0012, "grad_norm": 0.7912470130916918, "learning_rate": 0.00035999999999999997, "loss": 8.103, "step": 120 }, { "epoch": 0.00121, "grad_norm": 0.811181199244069, "learning_rate": 0.000363, "loss": 8.0751, "step": 121 }, { "epoch": 0.00122, "grad_norm": 0.8641427784894331, "learning_rate": 0.000366, "loss": 8.0581, "step": 122 }, { "epoch": 0.00123, "grad_norm": 1.059706847686213, "learning_rate": 0.000369, "loss": 8.038, "step": 123 }, { "epoch": 0.00124, "grad_norm": 1.1727027216994725, "learning_rate": 0.000372, "loss": 8.021, "step": 124 }, { "epoch": 0.00125, "grad_norm": 0.8130002892684417, "learning_rate": 0.000375, "loss": 7.9874, "step": 125 }, { "epoch": 0.00126, "grad_norm": 0.9195848585380069, "learning_rate": 0.000378, "loss": 7.9767, "step": 126 }, { "epoch": 0.00127, "grad_norm": 1.0843433185909894, "learning_rate": 0.000381, "loss": 7.9572, "step": 127 }, { "epoch": 0.00128, "grad_norm": 0.7822919696307823, "learning_rate": 0.000384, "loss": 7.9285, "step": 128 }, { "epoch": 0.00129, "grad_norm": 0.8822428605457112, "learning_rate": 0.00038700000000000003, "loss": 7.9179, "step": 129 }, { "epoch": 0.0013, "grad_norm": 0.844355538937723, "learning_rate": 0.00039000000000000005, "loss": 7.8895, "step": 130 }, { "epoch": 0.00131, "grad_norm": 0.7902535444057679, "learning_rate": 0.000393, "loss": 7.8528, "step": 131 }, { "epoch": 0.00132, "grad_norm": 0.8777082094723105, "learning_rate": 0.00039600000000000003, "loss": 7.8441, "step": 132 }, { "epoch": 0.00133, "grad_norm": 0.7973277807473979, "learning_rate": 0.00039900000000000005, "loss": 7.8195, "step": 133 }, { "epoch": 0.00134, "grad_norm": 0.7889088832890946, "learning_rate": 0.000402, "loss": 7.8056, "step": 134 }, { "epoch": 0.00135, "grad_norm": 0.7461125825498439, "learning_rate": 0.00040500000000000003, "loss": 7.7735, "step": 135 }, { "epoch": 0.00136, "grad_norm": 0.7727896835908762, "learning_rate": 0.00040800000000000005, "loss": 7.7579, "step": 136 }, { "epoch": 0.00137, "grad_norm": 0.6932995987295251, "learning_rate": 0.000411, "loss": 7.7341, "step": 137 }, { "epoch": 0.00138, "grad_norm": 0.758084762416224, "learning_rate": 0.00041400000000000003, "loss": 7.7117, "step": 138 }, { "epoch": 0.00139, "grad_norm": 0.7171019453691133, "learning_rate": 0.00041700000000000005, "loss": 7.6963, "step": 139 }, { "epoch": 0.0014, "grad_norm": 0.6814920611933867, "learning_rate": 0.00042000000000000007, "loss": 7.6775, "step": 140 }, { "epoch": 0.00141, "grad_norm": 0.7091532995122851, "learning_rate": 0.000423, "loss": 7.6638, "step": 141 }, { "epoch": 0.00142, "grad_norm": 0.6928279523561562, "learning_rate": 0.00042599999999999995, "loss": 7.6501, "step": 142 }, { "epoch": 0.00143, "grad_norm": 0.6614572727332786, "learning_rate": 0.00042899999999999997, "loss": 7.6195, "step": 143 }, { "epoch": 0.00144, "grad_norm": 0.6903462553659518, "learning_rate": 0.000432, "loss": 7.6015, "step": 144 }, { "epoch": 0.00145, "grad_norm": 0.690019772183536, "learning_rate": 0.000435, "loss": 7.5953, "step": 145 }, { "epoch": 0.00146, "grad_norm": 0.6908198257220046, "learning_rate": 0.00043799999999999997, "loss": 7.5557, "step": 146 }, { "epoch": 0.00147, "grad_norm": 0.7009866965495668, "learning_rate": 0.000441, "loss": 7.5482, "step": 147 }, { "epoch": 0.00148, "grad_norm": 0.6832764187147686, "learning_rate": 0.000444, "loss": 7.5366, "step": 148 }, { "epoch": 0.00149, "grad_norm": 0.59797192318343, "learning_rate": 0.00044699999999999997, "loss": 7.5272, "step": 149 }, { "epoch": 0.0015, "grad_norm": 0.6655702435683013, "learning_rate": 0.00045, "loss": 7.4963, "step": 150 }, { "epoch": 0.00151, "grad_norm": 0.732396941583091, "learning_rate": 0.000453, "loss": 7.48, "step": 151 }, { "epoch": 0.00152, "grad_norm": 0.5836278900992692, "learning_rate": 0.000456, "loss": 7.4694, "step": 152 }, { "epoch": 0.00153, "grad_norm": 0.6777912087785298, "learning_rate": 0.000459, "loss": 7.4588, "step": 153 }, { "epoch": 0.00154, "grad_norm": 0.727978180952039, "learning_rate": 0.000462, "loss": 7.442, "step": 154 }, { "epoch": 0.00155, "grad_norm": 0.7368922682622268, "learning_rate": 0.000465, "loss": 7.4241, "step": 155 }, { "epoch": 0.00156, "grad_norm": 0.8391325557731037, "learning_rate": 0.000468, "loss": 7.4013, "step": 156 }, { "epoch": 0.00157, "grad_norm": 0.8289929528374833, "learning_rate": 0.000471, "loss": 7.3995, "step": 157 }, { "epoch": 0.00158, "grad_norm": 0.5070337070851558, "learning_rate": 0.00047400000000000003, "loss": 7.3713, "step": 158 }, { "epoch": 0.00159, "grad_norm": 0.783946493417518, "learning_rate": 0.000477, "loss": 7.3668, "step": 159 }, { "epoch": 0.0016, "grad_norm": 0.6957053326984224, "learning_rate": 0.00048, "loss": 7.3475, "step": 160 }, { "epoch": 0.00161, "grad_norm": 0.547833885334286, "learning_rate": 0.00048300000000000003, "loss": 7.3204, "step": 161 }, { "epoch": 0.00162, "grad_norm": 0.8547649122041628, "learning_rate": 0.00048600000000000005, "loss": 7.3325, "step": 162 }, { "epoch": 0.00163, "grad_norm": 0.8673949773728752, "learning_rate": 0.0004890000000000001, "loss": 7.316, "step": 163 }, { "epoch": 0.00164, "grad_norm": 1.0619539506126108, "learning_rate": 0.000492, "loss": 7.3191, "step": 164 }, { "epoch": 0.00165, "grad_norm": 0.6324744711420325, "learning_rate": 0.000495, "loss": 7.2864, "step": 165 }, { "epoch": 0.00166, "grad_norm": 0.5662260261966024, "learning_rate": 0.0004980000000000001, "loss": 7.26, "step": 166 }, { "epoch": 0.00167, "grad_norm": 0.7262900850309921, "learning_rate": 0.000501, "loss": 7.2554, "step": 167 }, { "epoch": 0.00168, "grad_norm": 0.6121691436587496, "learning_rate": 0.000504, "loss": 7.2353, "step": 168 }, { "epoch": 0.00169, "grad_norm": 0.5390794603769147, "learning_rate": 0.0005070000000000001, "loss": 7.2263, "step": 169 }, { "epoch": 0.0017, "grad_norm": 0.5999036994585554, "learning_rate": 0.00051, "loss": 7.213, "step": 170 }, { "epoch": 0.00171, "grad_norm": 0.4637320512013434, "learning_rate": 0.000513, "loss": 7.1933, "step": 171 }, { "epoch": 0.00172, "grad_norm": 0.5250975302523401, "learning_rate": 0.000516, "loss": 7.1953, "step": 172 }, { "epoch": 0.00173, "grad_norm": 0.40559164125903624, "learning_rate": 0.0005189999999999999, "loss": 7.1764, "step": 173 }, { "epoch": 0.00174, "grad_norm": 0.4505921111310584, "learning_rate": 0.000522, "loss": 7.1953, "step": 174 }, { "epoch": 0.00175, "grad_norm": 0.4234331150208657, "learning_rate": 0.000525, "loss": 7.1572, "step": 175 }, { "epoch": 0.00176, "grad_norm": 0.3852967422981744, "learning_rate": 0.0005279999999999999, "loss": 7.1322, "step": 176 }, { "epoch": 0.00177, "grad_norm": 0.3685443025565043, "learning_rate": 0.000531, "loss": 7.1378, "step": 177 }, { "epoch": 0.00178, "grad_norm": 0.44280593644992733, "learning_rate": 0.000534, "loss": 7.1301, "step": 178 }, { "epoch": 0.00179, "grad_norm": 0.3638226120256115, "learning_rate": 0.000537, "loss": 7.1191, "step": 179 }, { "epoch": 0.0018, "grad_norm": 0.37841703582661185, "learning_rate": 0.00054, "loss": 7.0921, "step": 180 }, { "epoch": 0.00181, "grad_norm": 0.3275750999054276, "learning_rate": 0.000543, "loss": 7.0801, "step": 181 }, { "epoch": 0.00182, "grad_norm": 0.3469517461565544, "learning_rate": 0.000546, "loss": 7.0774, "step": 182 }, { "epoch": 0.00183, "grad_norm": 0.3965623823212328, "learning_rate": 0.000549, "loss": 7.0674, "step": 183 }, { "epoch": 0.00184, "grad_norm": 0.47244712960577356, "learning_rate": 0.000552, "loss": 7.0582, "step": 184 }, { "epoch": 0.00185, "grad_norm": 0.7068086356604425, "learning_rate": 0.000555, "loss": 7.0369, "step": 185 }, { "epoch": 0.00186, "grad_norm": 0.9665650200874053, "learning_rate": 0.000558, "loss": 7.0604, "step": 186 }, { "epoch": 0.00187, "grad_norm": 1.1379887499575514, "learning_rate": 0.000561, "loss": 7.0366, "step": 187 }, { "epoch": 0.00188, "grad_norm": 0.5005933831438132, "learning_rate": 0.000564, "loss": 7.0008, "step": 188 }, { "epoch": 0.00189, "grad_norm": 0.4490325126563288, "learning_rate": 0.000567, "loss": 6.997, "step": 189 }, { "epoch": 0.0019, "grad_norm": 0.6949112483193859, "learning_rate": 0.00057, "loss": 6.9846, "step": 190 }, { "epoch": 0.00191, "grad_norm": 0.4887612962658467, "learning_rate": 0.000573, "loss": 6.9724, "step": 191 }, { "epoch": 0.00192, "grad_norm": 0.5374763602633008, "learning_rate": 0.000576, "loss": 6.9655, "step": 192 }, { "epoch": 0.00193, "grad_norm": 0.4491815623326969, "learning_rate": 0.000579, "loss": 6.9637, "step": 193 }, { "epoch": 0.00194, "grad_norm": 0.4044031823800156, "learning_rate": 0.000582, "loss": 6.9565, "step": 194 }, { "epoch": 0.00195, "grad_norm": 0.5115147380417242, "learning_rate": 0.000585, "loss": 6.9386, "step": 195 }, { "epoch": 0.00196, "grad_norm": 0.45947827433809557, "learning_rate": 0.000588, "loss": 6.9258, "step": 196 }, { "epoch": 0.00197, "grad_norm": 0.5289316721591154, "learning_rate": 0.000591, "loss": 6.9226, "step": 197 }, { "epoch": 0.00198, "grad_norm": 0.4416511613975406, "learning_rate": 0.000594, "loss": 6.9132, "step": 198 }, { "epoch": 0.00199, "grad_norm": 0.36314866008934127, "learning_rate": 0.0005970000000000001, "loss": 6.8916, "step": 199 }, { "epoch": 0.002, "grad_norm": 0.4299454881914127, "learning_rate": 0.0006000000000000001, "loss": 6.8932, "step": 200 }, { "epoch": 0.00201, "grad_norm": 0.2786890859011363, "learning_rate": 0.000603, "loss": 6.8645, "step": 201 }, { "epoch": 0.00202, "grad_norm": 0.4553990879307791, "learning_rate": 0.0006060000000000001, "loss": 6.855, "step": 202 }, { "epoch": 0.00203, "grad_norm": 0.49491513980041124, "learning_rate": 0.0006090000000000001, "loss": 6.8592, "step": 203 }, { "epoch": 0.00204, "grad_norm": 0.5750090076580947, "learning_rate": 0.000612, "loss": 6.8457, "step": 204 }, { "epoch": 0.00205, "grad_norm": 0.6904749130436038, "learning_rate": 0.000615, "loss": 6.8381, "step": 205 }, { "epoch": 0.00206, "grad_norm": 0.7582947936777445, "learning_rate": 0.000618, "loss": 6.83, "step": 206 }, { "epoch": 0.00207, "grad_norm": 0.728748472942146, "learning_rate": 0.000621, "loss": 6.8214, "step": 207 }, { "epoch": 0.00208, "grad_norm": 0.5163586812963157, "learning_rate": 0.000624, "loss": 6.8116, "step": 208 }, { "epoch": 0.00209, "grad_norm": 0.5726761174567752, "learning_rate": 0.000627, "loss": 6.7933, "step": 209 }, { "epoch": 0.0021, "grad_norm": 0.6890311463252623, "learning_rate": 0.00063, "loss": 6.7854, "step": 210 }, { "epoch": 0.00211, "grad_norm": 0.9174002778722206, "learning_rate": 0.000633, "loss": 6.7849, "step": 211 }, { "epoch": 0.00212, "grad_norm": 0.8086617968740898, "learning_rate": 0.000636, "loss": 6.7808, "step": 212 }, { "epoch": 0.00213, "grad_norm": 0.6685717599500662, "learning_rate": 0.000639, "loss": 6.7542, "step": 213 }, { "epoch": 0.00214, "grad_norm": 0.511917016650173, "learning_rate": 0.000642, "loss": 6.7483, "step": 214 }, { "epoch": 0.00215, "grad_norm": 0.5132261185164837, "learning_rate": 0.000645, "loss": 6.7465, "step": 215 }, { "epoch": 0.00216, "grad_norm": 0.3896647006337605, "learning_rate": 0.000648, "loss": 6.7354, "step": 216 }, { "epoch": 0.00217, "grad_norm": 0.53153444147609, "learning_rate": 0.000651, "loss": 6.7114, "step": 217 }, { "epoch": 0.00218, "grad_norm": 0.4560253950102483, "learning_rate": 0.000654, "loss": 6.7136, "step": 218 }, { "epoch": 0.00219, "grad_norm": 0.38246603109839156, "learning_rate": 0.000657, "loss": 6.6847, "step": 219 }, { "epoch": 0.0022, "grad_norm": 0.502249830770979, "learning_rate": 0.00066, "loss": 6.7061, "step": 220 }, { "epoch": 0.00221, "grad_norm": 0.555840042257826, "learning_rate": 0.0006630000000000001, "loss": 6.6817, "step": 221 }, { "epoch": 0.00222, "grad_norm": 0.7008290795132841, "learning_rate": 0.000666, "loss": 6.6751, "step": 222 }, { "epoch": 0.00223, "grad_norm": 0.9665649898158697, "learning_rate": 0.000669, "loss": 6.6759, "step": 223 }, { "epoch": 0.00224, "grad_norm": 1.0460190685952617, "learning_rate": 0.0006720000000000001, "loss": 6.6821, "step": 224 }, { "epoch": 0.00225, "grad_norm": 0.9709238336565439, "learning_rate": 0.000675, "loss": 6.6643, "step": 225 }, { "epoch": 0.00226, "grad_norm": 0.9675609159629996, "learning_rate": 0.000678, "loss": 6.6602, "step": 226 }, { "epoch": 0.00227, "grad_norm": 0.6069059963900193, "learning_rate": 0.0006810000000000001, "loss": 6.6251, "step": 227 }, { "epoch": 0.00228, "grad_norm": 0.6661980684886607, "learning_rate": 0.000684, "loss": 6.6314, "step": 228 }, { "epoch": 0.00229, "grad_norm": 0.5755819115869941, "learning_rate": 0.000687, "loss": 6.6231, "step": 229 }, { "epoch": 0.0023, "grad_norm": 0.48612508126201437, "learning_rate": 0.0006900000000000001, "loss": 6.6015, "step": 230 }, { "epoch": 0.00231, "grad_norm": 0.42902206098374773, "learning_rate": 0.000693, "loss": 6.5844, "step": 231 }, { "epoch": 0.00232, "grad_norm": 0.4617696239896233, "learning_rate": 0.000696, "loss": 6.5964, "step": 232 }, { "epoch": 0.00233, "grad_norm": 0.397560103207551, "learning_rate": 0.0006990000000000001, "loss": 6.5819, "step": 233 }, { "epoch": 0.00234, "grad_norm": 0.39436477469071923, "learning_rate": 0.000702, "loss": 6.5732, "step": 234 }, { "epoch": 0.00235, "grad_norm": 0.37818254545129, "learning_rate": 0.000705, "loss": 6.5584, "step": 235 }, { "epoch": 0.00236, "grad_norm": 0.39793300295732814, "learning_rate": 0.000708, "loss": 6.539, "step": 236 }, { "epoch": 0.00237, "grad_norm": 0.32880148477167265, "learning_rate": 0.0007109999999999999, "loss": 6.5486, "step": 237 }, { "epoch": 0.00238, "grad_norm": 0.33186649759843, "learning_rate": 0.000714, "loss": 6.5374, "step": 238 }, { "epoch": 0.00239, "grad_norm": 0.3861082150171924, "learning_rate": 0.000717, "loss": 6.5195, "step": 239 }, { "epoch": 0.0024, "grad_norm": 0.3818382545466509, "learning_rate": 0.0007199999999999999, "loss": 6.5368, "step": 240 }, { "epoch": 0.00241, "grad_norm": 0.26517003068907236, "learning_rate": 0.000723, "loss": 6.5167, "step": 241 }, { "epoch": 0.00242, "grad_norm": 0.30652105574179844, "learning_rate": 0.000726, "loss": 6.4934, "step": 242 }, { "epoch": 0.00243, "grad_norm": 0.3382942246099826, "learning_rate": 0.000729, "loss": 6.4799, "step": 243 }, { "epoch": 0.00244, "grad_norm": 0.4164388540502762, "learning_rate": 0.000732, "loss": 6.4843, "step": 244 }, { "epoch": 0.00245, "grad_norm": 0.4035007765909141, "learning_rate": 0.000735, "loss": 6.4741, "step": 245 }, { "epoch": 0.00246, "grad_norm": 0.4484502106991885, "learning_rate": 0.000738, "loss": 6.458, "step": 246 }, { "epoch": 0.00247, "grad_norm": 0.6057401118193197, "learning_rate": 0.000741, "loss": 6.4543, "step": 247 }, { "epoch": 0.00248, "grad_norm": 1.1775332556723501, "learning_rate": 0.000744, "loss": 6.4781, "step": 248 }, { "epoch": 0.00249, "grad_norm": 1.0888595785598245, "learning_rate": 0.000747, "loss": 6.4635, "step": 249 }, { "epoch": 0.0025, "grad_norm": 0.536872031808477, "learning_rate": 0.00075, "loss": 6.4479, "step": 250 }, { "epoch": 0.00251, "grad_norm": 0.7926645181932281, "learning_rate": 0.000753, "loss": 6.4169, "step": 251 }, { "epoch": 0.00252, "grad_norm": 0.6863348407685264, "learning_rate": 0.000756, "loss": 6.4273, "step": 252 }, { "epoch": 0.00253, "grad_norm": 0.7123800606299509, "learning_rate": 0.000759, "loss": 6.4195, "step": 253 }, { "epoch": 0.00254, "grad_norm": 0.839405849029746, "learning_rate": 0.000762, "loss": 6.4177, "step": 254 }, { "epoch": 0.00255, "grad_norm": 0.76501143998226, "learning_rate": 0.0007650000000000001, "loss": 6.4159, "step": 255 }, { "epoch": 0.00256, "grad_norm": 0.597608858095952, "learning_rate": 0.000768, "loss": 6.3889, "step": 256 }, { "epoch": 0.00257, "grad_norm": 0.6526176951631347, "learning_rate": 0.000771, "loss": 6.3981, "step": 257 }, { "epoch": 0.00258, "grad_norm": 0.651228257980475, "learning_rate": 0.0007740000000000001, "loss": 6.3725, "step": 258 }, { "epoch": 0.00259, "grad_norm": 0.5603901273931662, "learning_rate": 0.000777, "loss": 6.3719, "step": 259 }, { "epoch": 0.0026, "grad_norm": 0.41845020316479425, "learning_rate": 0.0007800000000000001, "loss": 6.3536, "step": 260 }, { "epoch": 0.00261, "grad_norm": 0.5144884019867095, "learning_rate": 0.0007830000000000001, "loss": 6.3665, "step": 261 }, { "epoch": 0.00262, "grad_norm": 0.5548811083770797, "learning_rate": 0.000786, "loss": 6.3412, "step": 262 }, { "epoch": 0.00263, "grad_norm": 0.702068573310266, "learning_rate": 0.0007890000000000001, "loss": 6.353, "step": 263 }, { "epoch": 0.00264, "grad_norm": 0.9481897048028406, "learning_rate": 0.0007920000000000001, "loss": 6.3404, "step": 264 }, { "epoch": 0.00265, "grad_norm": 1.2297805755386195, "learning_rate": 0.000795, "loss": 6.3478, "step": 265 }, { "epoch": 0.00266, "grad_norm": 0.5034998722006886, "learning_rate": 0.0007980000000000001, "loss": 6.3233, "step": 266 }, { "epoch": 0.00267, "grad_norm": 0.8457797300321087, "learning_rate": 0.0008010000000000001, "loss": 6.3268, "step": 267 }, { "epoch": 0.00268, "grad_norm": 0.7624901894608749, "learning_rate": 0.000804, "loss": 6.3226, "step": 268 }, { "epoch": 0.00269, "grad_norm": 0.6803898428780553, "learning_rate": 0.0008070000000000001, "loss": 6.3045, "step": 269 }, { "epoch": 0.0027, "grad_norm": 0.5891673657315365, "learning_rate": 0.0008100000000000001, "loss": 6.3, "step": 270 }, { "epoch": 0.00271, "grad_norm": 0.6321969571669588, "learning_rate": 0.000813, "loss": 6.3081, "step": 271 }, { "epoch": 0.00272, "grad_norm": 0.6080230974854919, "learning_rate": 0.0008160000000000001, "loss": 6.2911, "step": 272 }, { "epoch": 0.00273, "grad_norm": 0.577176950863229, "learning_rate": 0.0008190000000000001, "loss": 6.2786, "step": 273 }, { "epoch": 0.00274, "grad_norm": 0.46970800663022055, "learning_rate": 0.000822, "loss": 6.2573, "step": 274 }, { "epoch": 0.00275, "grad_norm": 0.5095773122618286, "learning_rate": 0.0008250000000000001, "loss": 6.2676, "step": 275 }, { "epoch": 0.00276, "grad_norm": 0.421367493059458, "learning_rate": 0.0008280000000000001, "loss": 6.2547, "step": 276 }, { "epoch": 0.00277, "grad_norm": 0.4229723742956301, "learning_rate": 0.0008310000000000001, "loss": 6.2503, "step": 277 }, { "epoch": 0.00278, "grad_norm": 0.4631977178825306, "learning_rate": 0.0008340000000000001, "loss": 6.2346, "step": 278 }, { "epoch": 0.00279, "grad_norm": 0.41870110969580987, "learning_rate": 0.0008370000000000001, "loss": 6.2332, "step": 279 }, { "epoch": 0.0028, "grad_norm": 0.4083314739680453, "learning_rate": 0.0008400000000000001, "loss": 6.2161, "step": 280 }, { "epoch": 0.00281, "grad_norm": 0.42451645247510567, "learning_rate": 0.0008430000000000001, "loss": 6.2058, "step": 281 }, { "epoch": 0.00282, "grad_norm": 0.4811013283391871, "learning_rate": 0.000846, "loss": 6.206, "step": 282 }, { "epoch": 0.00283, "grad_norm": 0.6798083705841664, "learning_rate": 0.0008489999999999999, "loss": 6.2015, "step": 283 }, { "epoch": 0.00284, "grad_norm": 1.0382201143248402, "learning_rate": 0.0008519999999999999, "loss": 6.2055, "step": 284 }, { "epoch": 0.00285, "grad_norm": 1.115942818342409, "learning_rate": 0.000855, "loss": 6.2129, "step": 285 }, { "epoch": 0.00286, "grad_norm": 0.8889955339821247, "learning_rate": 0.0008579999999999999, "loss": 6.187, "step": 286 }, { "epoch": 0.00287, "grad_norm": 1.2422801265585652, "learning_rate": 0.000861, "loss": 6.209, "step": 287 }, { "epoch": 0.00288, "grad_norm": 0.8315932342234975, "learning_rate": 0.000864, "loss": 6.174, "step": 288 }, { "epoch": 0.00289, "grad_norm": 1.2914759013339998, "learning_rate": 0.0008669999999999999, "loss": 6.2078, "step": 289 }, { "epoch": 0.0029, "grad_norm": 0.8376507056381004, "learning_rate": 0.00087, "loss": 6.1757, "step": 290 }, { "epoch": 0.00291, "grad_norm": 0.8412780961911104, "learning_rate": 0.000873, "loss": 6.1658, "step": 291 }, { "epoch": 0.00292, "grad_norm": 1.047021583757866, "learning_rate": 0.0008759999999999999, "loss": 6.1758, "step": 292 }, { "epoch": 0.00293, "grad_norm": 0.8919470282886952, "learning_rate": 0.000879, "loss": 6.151, "step": 293 }, { "epoch": 0.00294, "grad_norm": 0.665529628519212, "learning_rate": 0.000882, "loss": 6.159, "step": 294 }, { "epoch": 0.00295, "grad_norm": 0.5169660787787601, "learning_rate": 0.0008849999999999999, "loss": 6.1239, "step": 295 }, { "epoch": 0.00296, "grad_norm": 0.5611538425989948, "learning_rate": 0.000888, "loss": 6.1363, "step": 296 }, { "epoch": 0.00297, "grad_norm": 0.46398604023920087, "learning_rate": 0.000891, "loss": 6.1045, "step": 297 }, { "epoch": 0.00298, "grad_norm": 0.4361556326298739, "learning_rate": 0.0008939999999999999, "loss": 6.1198, "step": 298 }, { "epoch": 0.00299, "grad_norm": 0.4319584905904094, "learning_rate": 0.000897, "loss": 6.0941, "step": 299 }, { "epoch": 0.003, "grad_norm": 0.4255386299160817, "learning_rate": 0.0009, "loss": 6.0936, "step": 300 }, { "epoch": 0.00301, "grad_norm": 0.3316584659066082, "learning_rate": 0.0009029999999999999, "loss": 6.0857, "step": 301 }, { "epoch": 0.00302, "grad_norm": 0.37299869635167304, "learning_rate": 0.000906, "loss": 6.0685, "step": 302 }, { "epoch": 0.00303, "grad_norm": 0.40148217950038195, "learning_rate": 0.000909, "loss": 6.0805, "step": 303 }, { "epoch": 0.00304, "grad_norm": 0.420191340163935, "learning_rate": 0.000912, "loss": 6.0758, "step": 304 }, { "epoch": 0.00305, "grad_norm": 0.45307668264044143, "learning_rate": 0.000915, "loss": 6.0736, "step": 305 }, { "epoch": 0.00306, "grad_norm": 0.6122731249830943, "learning_rate": 0.000918, "loss": 6.0651, "step": 306 }, { "epoch": 0.00307, "grad_norm": 0.851197326038436, "learning_rate": 0.000921, "loss": 6.0633, "step": 307 }, { "epoch": 0.00308, "grad_norm": 1.1284591769922636, "learning_rate": 0.000924, "loss": 6.0582, "step": 308 }, { "epoch": 0.00309, "grad_norm": 0.9596545216263644, "learning_rate": 0.000927, "loss": 6.0709, "step": 309 }, { "epoch": 0.0031, "grad_norm": 1.039007932956353, "learning_rate": 0.00093, "loss": 6.0624, "step": 310 }, { "epoch": 0.00311, "grad_norm": 0.9855401820369791, "learning_rate": 0.000933, "loss": 6.0524, "step": 311 }, { "epoch": 0.00312, "grad_norm": 1.0163701418335827, "learning_rate": 0.000936, "loss": 6.041, "step": 312 }, { "epoch": 0.00313, "grad_norm": 1.0223663613867633, "learning_rate": 0.0009390000000000001, "loss": 6.0491, "step": 313 }, { "epoch": 0.00314, "grad_norm": 1.0997292958340223, "learning_rate": 0.000942, "loss": 6.0641, "step": 314 }, { "epoch": 0.00315, "grad_norm": 0.932276773939602, "learning_rate": 0.000945, "loss": 6.0354, "step": 315 }, { "epoch": 0.00316, "grad_norm": 0.8624268848533463, "learning_rate": 0.0009480000000000001, "loss": 6.0096, "step": 316 }, { "epoch": 0.00317, "grad_norm": 0.6867359398602113, "learning_rate": 0.000951, "loss": 6.0237, "step": 317 }, { "epoch": 0.00318, "grad_norm": 0.5777711812516898, "learning_rate": 0.000954, "loss": 6.014, "step": 318 }, { "epoch": 0.00319, "grad_norm": 0.5907201170091796, "learning_rate": 0.0009570000000000001, "loss": 6.0042, "step": 319 }, { "epoch": 0.0032, "grad_norm": 0.5929910498481646, "learning_rate": 0.00096, "loss": 6.0021, "step": 320 }, { "epoch": 0.00321, "grad_norm": 0.6559636432249029, "learning_rate": 0.000963, "loss": 5.9891, "step": 321 }, { "epoch": 0.00322, "grad_norm": 0.5844133161497509, "learning_rate": 0.0009660000000000001, "loss": 5.9766, "step": 322 }, { "epoch": 0.00323, "grad_norm": 0.5466606066369618, "learning_rate": 0.000969, "loss": 5.9736, "step": 323 }, { "epoch": 0.00324, "grad_norm": 0.563270781105711, "learning_rate": 0.0009720000000000001, "loss": 5.9778, "step": 324 }, { "epoch": 0.00325, "grad_norm": 0.5312990845923178, "learning_rate": 0.0009750000000000001, "loss": 5.9405, "step": 325 }, { "epoch": 0.00326, "grad_norm": 0.5118566622058196, "learning_rate": 0.0009780000000000001, "loss": 5.9566, "step": 326 }, { "epoch": 0.00327, "grad_norm": 0.5259315695578027, "learning_rate": 0.000981, "loss": 5.9568, "step": 327 }, { "epoch": 0.00328, "grad_norm": 0.5410551164101933, "learning_rate": 0.000984, "loss": 5.9324, "step": 328 }, { "epoch": 0.00329, "grad_norm": 0.48301414107035934, "learning_rate": 0.000987, "loss": 5.931, "step": 329 }, { "epoch": 0.0033, "grad_norm": 0.5975532498257996, "learning_rate": 0.00099, "loss": 5.9265, "step": 330 }, { "epoch": 0.00331, "grad_norm": 0.9746373555768076, "learning_rate": 0.0009930000000000002, "loss": 5.9381, "step": 331 }, { "epoch": 0.00332, "grad_norm": 1.1994973147641799, "learning_rate": 0.0009960000000000001, "loss": 5.9385, "step": 332 }, { "epoch": 0.00333, "grad_norm": 0.5144992648158865, "learning_rate": 0.000999, "loss": 5.8989, "step": 333 }, { "epoch": 0.00334, "grad_norm": 0.9073672400240862, "learning_rate": 0.001002, "loss": 5.9262, "step": 334 }, { "epoch": 0.00335, "grad_norm": 0.7561451103694465, "learning_rate": 0.001005, "loss": 5.9186, "step": 335 }, { "epoch": 0.00336, "grad_norm": 0.7872757919528415, "learning_rate": 0.001008, "loss": 5.9134, "step": 336 }, { "epoch": 0.00337, "grad_norm": 0.7536020827923614, "learning_rate": 0.0010110000000000002, "loss": 5.8884, "step": 337 }, { "epoch": 0.00338, "grad_norm": 1.0523353883962923, "learning_rate": 0.0010140000000000001, "loss": 5.9132, "step": 338 }, { "epoch": 0.00339, "grad_norm": 1.2857238182949966, "learning_rate": 0.0010170000000000001, "loss": 5.9334, "step": 339 }, { "epoch": 0.0034, "grad_norm": 0.7361708327689132, "learning_rate": 0.00102, "loss": 5.885, "step": 340 }, { "epoch": 0.00341, "grad_norm": 0.6901997441262301, "learning_rate": 0.001023, "loss": 5.869, "step": 341 }, { "epoch": 0.00342, "grad_norm": 0.6381033550571615, "learning_rate": 0.001026, "loss": 5.8702, "step": 342 }, { "epoch": 0.00343, "grad_norm": 0.6071718978500397, "learning_rate": 0.0010290000000000002, "loss": 5.8743, "step": 343 }, { "epoch": 0.00344, "grad_norm": 0.5857796625429044, "learning_rate": 0.001032, "loss": 5.861, "step": 344 }, { "epoch": 0.00345, "grad_norm": 0.626640702848716, "learning_rate": 0.001035, "loss": 5.8537, "step": 345 }, { "epoch": 0.00346, "grad_norm": 0.6755670022907736, "learning_rate": 0.0010379999999999999, "loss": 5.8603, "step": 346 }, { "epoch": 0.00347, "grad_norm": 0.9144508249400731, "learning_rate": 0.001041, "loss": 5.8338, "step": 347 }, { "epoch": 0.00348, "grad_norm": 1.2125876856754099, "learning_rate": 0.001044, "loss": 5.8634, "step": 348 }, { "epoch": 0.00349, "grad_norm": 0.6928695941460523, "learning_rate": 0.001047, "loss": 5.8236, "step": 349 }, { "epoch": 0.0035, "grad_norm": 0.7654262923967496, "learning_rate": 0.00105, "loss": 5.8502, "step": 350 }, { "epoch": 0.00351, "grad_norm": 0.8300223804260752, "learning_rate": 0.001053, "loss": 5.8507, "step": 351 }, { "epoch": 0.00352, "grad_norm": 1.1393832643973667, "learning_rate": 0.0010559999999999999, "loss": 5.841, "step": 352 }, { "epoch": 0.00353, "grad_norm": 0.7670875434573843, "learning_rate": 0.001059, "loss": 5.8302, "step": 353 }, { "epoch": 0.00354, "grad_norm": 0.8617169684849714, "learning_rate": 0.001062, "loss": 5.8072, "step": 354 }, { "epoch": 0.00355, "grad_norm": 0.8787230305505044, "learning_rate": 0.001065, "loss": 5.8221, "step": 355 }, { "epoch": 0.00356, "grad_norm": 0.9037602020080988, "learning_rate": 0.001068, "loss": 5.8164, "step": 356 }, { "epoch": 0.00357, "grad_norm": 0.785887699185926, "learning_rate": 0.001071, "loss": 5.8055, "step": 357 }, { "epoch": 0.00358, "grad_norm": 0.6152742029666318, "learning_rate": 0.001074, "loss": 5.7887, "step": 358 }, { "epoch": 0.00359, "grad_norm": 0.5372063086433791, "learning_rate": 0.001077, "loss": 5.78, "step": 359 }, { "epoch": 0.0036, "grad_norm": 0.5078257678271803, "learning_rate": 0.00108, "loss": 5.7825, "step": 360 }, { "epoch": 0.00361, "grad_norm": 0.4885651334266738, "learning_rate": 0.001083, "loss": 5.7748, "step": 361 }, { "epoch": 0.00362, "grad_norm": 0.5495429650143561, "learning_rate": 0.001086, "loss": 5.7596, "step": 362 }, { "epoch": 0.00363, "grad_norm": 0.5626950540152672, "learning_rate": 0.001089, "loss": 5.7515, "step": 363 }, { "epoch": 0.00364, "grad_norm": 0.6199658617744055, "learning_rate": 0.001092, "loss": 5.766, "step": 364 }, { "epoch": 0.00365, "grad_norm": 0.7392438146286566, "learning_rate": 0.001095, "loss": 5.7655, "step": 365 }, { "epoch": 0.00366, "grad_norm": 0.9864875055616179, "learning_rate": 0.001098, "loss": 5.7524, "step": 366 }, { "epoch": 0.00367, "grad_norm": 1.1449768044417052, "learning_rate": 0.001101, "loss": 5.7648, "step": 367 }, { "epoch": 0.00368, "grad_norm": 0.9013400729864322, "learning_rate": 0.001104, "loss": 5.755, "step": 368 }, { "epoch": 0.00369, "grad_norm": 0.9093377711089596, "learning_rate": 0.001107, "loss": 5.7659, "step": 369 }, { "epoch": 0.0037, "grad_norm": 0.575923493278037, "learning_rate": 0.00111, "loss": 5.7328, "step": 370 }, { "epoch": 0.00371, "grad_norm": 0.6737016310188764, "learning_rate": 0.001113, "loss": 5.7102, "step": 371 }, { "epoch": 0.00372, "grad_norm": 0.4833347808689347, "learning_rate": 0.001116, "loss": 5.7236, "step": 372 }, { "epoch": 0.00373, "grad_norm": 0.6361357392920576, "learning_rate": 0.001119, "loss": 5.7181, "step": 373 }, { "epoch": 0.00374, "grad_norm": 0.6286357986456886, "learning_rate": 0.001122, "loss": 5.7192, "step": 374 }, { "epoch": 0.00375, "grad_norm": 0.7140127832546589, "learning_rate": 0.0011250000000000001, "loss": 5.7248, "step": 375 }, { "epoch": 0.00376, "grad_norm": 0.728891228424708, "learning_rate": 0.001128, "loss": 5.7207, "step": 376 }, { "epoch": 0.00377, "grad_norm": 0.7251122752592066, "learning_rate": 0.001131, "loss": 5.7141, "step": 377 }, { "epoch": 0.00378, "grad_norm": 0.8109517942362439, "learning_rate": 0.001134, "loss": 5.7121, "step": 378 }, { "epoch": 0.00379, "grad_norm": 0.7548765882892476, "learning_rate": 0.001137, "loss": 5.6981, "step": 379 }, { "epoch": 0.0038, "grad_norm": 0.5982490555873449, "learning_rate": 0.00114, "loss": 5.7128, "step": 380 }, { "epoch": 0.00381, "grad_norm": 0.5479723067602768, "learning_rate": 0.0011430000000000001, "loss": 5.6793, "step": 381 }, { "epoch": 0.00382, "grad_norm": 0.5400365110175976, "learning_rate": 0.001146, "loss": 5.6631, "step": 382 }, { "epoch": 0.00383, "grad_norm": 0.4406698702316126, "learning_rate": 0.001149, "loss": 5.673, "step": 383 }, { "epoch": 0.00384, "grad_norm": 0.5495584711003424, "learning_rate": 0.001152, "loss": 5.6782, "step": 384 }, { "epoch": 0.00385, "grad_norm": 0.7047837665038742, "learning_rate": 0.001155, "loss": 5.6686, "step": 385 }, { "epoch": 0.00386, "grad_norm": 1.0039450355838517, "learning_rate": 0.001158, "loss": 5.6846, "step": 386 }, { "epoch": 0.00387, "grad_norm": 1.2552299335364856, "learning_rate": 0.0011610000000000001, "loss": 5.6713, "step": 387 }, { "epoch": 0.00388, "grad_norm": 1.0630057286422998, "learning_rate": 0.001164, "loss": 5.6615, "step": 388 }, { "epoch": 0.00389, "grad_norm": 1.4085777330550793, "learning_rate": 0.001167, "loss": 5.6963, "step": 389 }, { "epoch": 0.0039, "grad_norm": 0.7893066659624004, "learning_rate": 0.00117, "loss": 5.651, "step": 390 }, { "epoch": 0.00391, "grad_norm": 0.8891270576556106, "learning_rate": 0.001173, "loss": 5.666, "step": 391 }, { "epoch": 0.00392, "grad_norm": 1.1058013192110903, "learning_rate": 0.001176, "loss": 5.6796, "step": 392 }, { "epoch": 0.00393, "grad_norm": 1.170614508410806, "learning_rate": 0.0011790000000000001, "loss": 5.6646, "step": 393 }, { "epoch": 0.00394, "grad_norm": 0.8391276502601887, "learning_rate": 0.001182, "loss": 5.6402, "step": 394 }, { "epoch": 0.00395, "grad_norm": 0.9435882620236007, "learning_rate": 0.001185, "loss": 5.6277, "step": 395 }, { "epoch": 0.00396, "grad_norm": 0.7925001626557522, "learning_rate": 0.001188, "loss": 5.6404, "step": 396 }, { "epoch": 0.00397, "grad_norm": 0.8633162203152536, "learning_rate": 0.001191, "loss": 5.6366, "step": 397 }, { "epoch": 0.00398, "grad_norm": 0.9359127674730449, "learning_rate": 0.0011940000000000002, "loss": 5.6437, "step": 398 }, { "epoch": 0.00399, "grad_norm": 1.0926478209626875, "learning_rate": 0.0011970000000000001, "loss": 5.6494, "step": 399 }, { "epoch": 0.004, "grad_norm": 0.8943926064407558, "learning_rate": 0.0012000000000000001, "loss": 5.6306, "step": 400 }, { "epoch": 0.00401, "grad_norm": 1.211025202532141, "learning_rate": 0.001203, "loss": 5.6241, "step": 401 }, { "epoch": 0.00402, "grad_norm": 0.8585006093020132, "learning_rate": 0.001206, "loss": 5.6196, "step": 402 }, { "epoch": 0.00403, "grad_norm": 0.8708424012246267, "learning_rate": 0.001209, "loss": 5.618, "step": 403 }, { "epoch": 0.00404, "grad_norm": 0.6771235232466043, "learning_rate": 0.0012120000000000002, "loss": 5.6057, "step": 404 }, { "epoch": 0.00405, "grad_norm": 0.5089480196948696, "learning_rate": 0.0012150000000000002, "loss": 5.5986, "step": 405 }, { "epoch": 0.00406, "grad_norm": 0.5324784457955185, "learning_rate": 0.0012180000000000001, "loss": 5.583, "step": 406 }, { "epoch": 0.00407, "grad_norm": 0.4806328504890235, "learning_rate": 0.0012209999999999999, "loss": 5.575, "step": 407 }, { "epoch": 0.00408, "grad_norm": 0.5340674298116082, "learning_rate": 0.001224, "loss": 5.5941, "step": 408 }, { "epoch": 0.00409, "grad_norm": 0.6817510563164704, "learning_rate": 0.001227, "loss": 5.5739, "step": 409 }, { "epoch": 0.0041, "grad_norm": 0.8230982603577015, "learning_rate": 0.00123, "loss": 5.5739, "step": 410 }, { "epoch": 0.00411, "grad_norm": 0.7982833366881755, "learning_rate": 0.001233, "loss": 5.5886, "step": 411 }, { "epoch": 0.00412, "grad_norm": 0.7882120301866252, "learning_rate": 0.001236, "loss": 5.5767, "step": 412 }, { "epoch": 0.00413, "grad_norm": 0.9078160072473371, "learning_rate": 0.0012389999999999999, "loss": 5.5798, "step": 413 }, { "epoch": 0.00414, "grad_norm": 0.8046291024914881, "learning_rate": 0.001242, "loss": 5.5582, "step": 414 }, { "epoch": 0.00415, "grad_norm": 0.9449024826812693, "learning_rate": 0.001245, "loss": 5.5633, "step": 415 }, { "epoch": 0.00416, "grad_norm": 0.9578358959807691, "learning_rate": 0.001248, "loss": 5.5489, "step": 416 }, { "epoch": 0.00417, "grad_norm": 0.7364680005190741, "learning_rate": 0.001251, "loss": 5.575, "step": 417 }, { "epoch": 0.00418, "grad_norm": 0.5600093653837771, "learning_rate": 0.001254, "loss": 5.5419, "step": 418 }, { "epoch": 0.00419, "grad_norm": 0.7369458002937045, "learning_rate": 0.0012569999999999999, "loss": 5.535, "step": 419 }, { "epoch": 0.0042, "grad_norm": 0.7566412883958042, "learning_rate": 0.00126, "loss": 5.5478, "step": 420 }, { "epoch": 0.00421, "grad_norm": 0.9341471688658377, "learning_rate": 0.001263, "loss": 5.5468, "step": 421 }, { "epoch": 0.00422, "grad_norm": 0.9387048270351058, "learning_rate": 0.001266, "loss": 5.5395, "step": 422 }, { "epoch": 0.00423, "grad_norm": 0.738543672170714, "learning_rate": 0.001269, "loss": 5.5309, "step": 423 }, { "epoch": 0.00424, "grad_norm": 0.879163854006119, "learning_rate": 0.001272, "loss": 5.5379, "step": 424 }, { "epoch": 0.00425, "grad_norm": 0.878245832078137, "learning_rate": 0.001275, "loss": 5.5393, "step": 425 }, { "epoch": 0.00426, "grad_norm": 0.8393572375675296, "learning_rate": 0.001278, "loss": 5.5388, "step": 426 }, { "epoch": 0.00427, "grad_norm": 0.8175993205302655, "learning_rate": 0.001281, "loss": 5.5188, "step": 427 }, { "epoch": 0.00428, "grad_norm": 0.8492227718152501, "learning_rate": 0.001284, "loss": 5.5011, "step": 428 }, { "epoch": 0.00429, "grad_norm": 0.8455500448937461, "learning_rate": 0.001287, "loss": 5.5167, "step": 429 }, { "epoch": 0.0043, "grad_norm": 0.9588196540360735, "learning_rate": 0.00129, "loss": 5.5126, "step": 430 }, { "epoch": 0.00431, "grad_norm": 1.0358439149859766, "learning_rate": 0.001293, "loss": 5.5121, "step": 431 }, { "epoch": 0.00432, "grad_norm": 0.853137595287236, "learning_rate": 0.001296, "loss": 5.5152, "step": 432 }, { "epoch": 0.00433, "grad_norm": 0.9144896540159448, "learning_rate": 0.001299, "loss": 5.5075, "step": 433 }, { "epoch": 0.00434, "grad_norm": 1.0340397416077374, "learning_rate": 0.001302, "loss": 5.5131, "step": 434 }, { "epoch": 0.00435, "grad_norm": 1.1136200661191735, "learning_rate": 0.001305, "loss": 5.5153, "step": 435 }, { "epoch": 0.00436, "grad_norm": 0.7998503424321469, "learning_rate": 0.001308, "loss": 5.4814, "step": 436 }, { "epoch": 0.00437, "grad_norm": 0.8862208467810537, "learning_rate": 0.001311, "loss": 5.5052, "step": 437 }, { "epoch": 0.00438, "grad_norm": 0.85557749799579, "learning_rate": 0.001314, "loss": 5.4855, "step": 438 }, { "epoch": 0.00439, "grad_norm": 0.6596001138977952, "learning_rate": 0.001317, "loss": 5.5056, "step": 439 }, { "epoch": 0.0044, "grad_norm": 0.5461926920380444, "learning_rate": 0.00132, "loss": 5.4734, "step": 440 }, { "epoch": 0.00441, "grad_norm": 0.5325344576484976, "learning_rate": 0.001323, "loss": 5.4692, "step": 441 }, { "epoch": 0.00442, "grad_norm": 0.46029396349038315, "learning_rate": 0.0013260000000000001, "loss": 5.4603, "step": 442 }, { "epoch": 0.00443, "grad_norm": 0.5200620875251907, "learning_rate": 0.001329, "loss": 5.4641, "step": 443 }, { "epoch": 0.00444, "grad_norm": 0.511034817927936, "learning_rate": 0.001332, "loss": 5.4632, "step": 444 }, { "epoch": 0.00445, "grad_norm": 0.61375364791033, "learning_rate": 0.001335, "loss": 5.4483, "step": 445 }, { "epoch": 0.00446, "grad_norm": 0.7540282970336214, "learning_rate": 0.001338, "loss": 5.4549, "step": 446 }, { "epoch": 0.00447, "grad_norm": 0.7743861790351634, "learning_rate": 0.001341, "loss": 5.456, "step": 447 }, { "epoch": 0.00448, "grad_norm": 0.6949785247448689, "learning_rate": 0.0013440000000000001, "loss": 5.4375, "step": 448 }, { "epoch": 0.00449, "grad_norm": 0.8972954522362333, "learning_rate": 0.001347, "loss": 5.4453, "step": 449 }, { "epoch": 0.0045, "grad_norm": 1.0136292885280909, "learning_rate": 0.00135, "loss": 5.4524, "step": 450 }, { "epoch": 0.00451, "grad_norm": 0.7959348815359711, "learning_rate": 0.001353, "loss": 5.4372, "step": 451 }, { "epoch": 0.00452, "grad_norm": 0.750530581913797, "learning_rate": 0.001356, "loss": 5.4212, "step": 452 }, { "epoch": 0.00453, "grad_norm": 0.718332283553841, "learning_rate": 0.001359, "loss": 5.4094, "step": 453 }, { "epoch": 0.00454, "grad_norm": 0.8243339574967999, "learning_rate": 0.0013620000000000001, "loss": 5.4327, "step": 454 }, { "epoch": 0.00455, "grad_norm": 0.8060545663764288, "learning_rate": 0.0013650000000000001, "loss": 5.4278, "step": 455 }, { "epoch": 0.00456, "grad_norm": 0.9387057405661987, "learning_rate": 0.001368, "loss": 5.4287, "step": 456 }, { "epoch": 0.00457, "grad_norm": 1.110172512819111, "learning_rate": 0.001371, "loss": 5.4304, "step": 457 }, { "epoch": 0.00458, "grad_norm": 0.7485802071411273, "learning_rate": 0.001374, "loss": 5.4279, "step": 458 }, { "epoch": 0.00459, "grad_norm": 0.846395295484429, "learning_rate": 0.0013770000000000002, "loss": 5.4177, "step": 459 }, { "epoch": 0.0046, "grad_norm": 1.2095188964594632, "learning_rate": 0.0013800000000000002, "loss": 5.4166, "step": 460 }, { "epoch": 0.00461, "grad_norm": 1.1548058188436976, "learning_rate": 0.0013830000000000001, "loss": 5.417, "step": 461 }, { "epoch": 0.00462, "grad_norm": 0.9626057997692408, "learning_rate": 0.001386, "loss": 5.4177, "step": 462 }, { "epoch": 0.00463, "grad_norm": 1.1365427244526745, "learning_rate": 0.001389, "loss": 5.4083, "step": 463 }, { "epoch": 0.00464, "grad_norm": 0.7154214826672701, "learning_rate": 0.001392, "loss": 5.4147, "step": 464 }, { "epoch": 0.00465, "grad_norm": 0.5933778225768791, "learning_rate": 0.0013950000000000002, "loss": 5.3806, "step": 465 }, { "epoch": 0.00466, "grad_norm": 0.6213055581786315, "learning_rate": 0.0013980000000000002, "loss": 5.3973, "step": 466 }, { "epoch": 0.00467, "grad_norm": 0.5608640811659587, "learning_rate": 0.0014010000000000001, "loss": 5.3871, "step": 467 }, { "epoch": 0.00468, "grad_norm": 0.4459725756410885, "learning_rate": 0.001404, "loss": 5.3713, "step": 468 }, { "epoch": 0.00469, "grad_norm": 0.46857319789964524, "learning_rate": 0.001407, "loss": 5.3733, "step": 469 }, { "epoch": 0.0047, "grad_norm": 0.4864537455422831, "learning_rate": 0.00141, "loss": 5.3823, "step": 470 }, { "epoch": 0.00471, "grad_norm": 0.5233417273033707, "learning_rate": 0.001413, "loss": 5.3595, "step": 471 }, { "epoch": 0.00472, "grad_norm": 0.7276814872840428, "learning_rate": 0.001416, "loss": 5.376, "step": 472 }, { "epoch": 0.00473, "grad_norm": 0.9313958457119089, "learning_rate": 0.001419, "loss": 5.3908, "step": 473 }, { "epoch": 0.00474, "grad_norm": 0.9969581851520253, "learning_rate": 0.0014219999999999999, "loss": 5.3782, "step": 474 }, { "epoch": 0.00475, "grad_norm": 0.7172709684261298, "learning_rate": 0.001425, "loss": 5.3626, "step": 475 }, { "epoch": 0.00476, "grad_norm": 0.749267541315669, "learning_rate": 0.001428, "loss": 5.395, "step": 476 }, { "epoch": 0.00477, "grad_norm": 0.7586220730764037, "learning_rate": 0.001431, "loss": 5.35, "step": 477 }, { "epoch": 0.00478, "grad_norm": 0.8456343691773762, "learning_rate": 0.001434, "loss": 5.378, "step": 478 }, { "epoch": 0.00479, "grad_norm": 1.0643326937248148, "learning_rate": 0.001437, "loss": 5.3628, "step": 479 }, { "epoch": 0.0048, "grad_norm": 0.9414664669888669, "learning_rate": 0.0014399999999999999, "loss": 5.3552, "step": 480 }, { "epoch": 0.00481, "grad_norm": 0.9003358375425434, "learning_rate": 0.001443, "loss": 5.3593, "step": 481 }, { "epoch": 0.00482, "grad_norm": 0.8978331557974801, "learning_rate": 0.001446, "loss": 5.3398, "step": 482 }, { "epoch": 0.00483, "grad_norm": 1.1289938883763697, "learning_rate": 0.001449, "loss": 5.3578, "step": 483 }, { "epoch": 0.00484, "grad_norm": 1.1013965257300222, "learning_rate": 0.001452, "loss": 5.3611, "step": 484 }, { "epoch": 0.00485, "grad_norm": 0.9590590729365723, "learning_rate": 0.001455, "loss": 5.3653, "step": 485 }, { "epoch": 0.00486, "grad_norm": 0.8987758447866343, "learning_rate": 0.001458, "loss": 5.3363, "step": 486 }, { "epoch": 0.00487, "grad_norm": 1.1924974582473045, "learning_rate": 0.001461, "loss": 5.3676, "step": 487 }, { "epoch": 0.00488, "grad_norm": 0.945268328759551, "learning_rate": 0.001464, "loss": 5.3393, "step": 488 }, { "epoch": 0.00489, "grad_norm": 0.9295239913179795, "learning_rate": 0.001467, "loss": 5.3304, "step": 489 }, { "epoch": 0.0049, "grad_norm": 0.8440135575742712, "learning_rate": 0.00147, "loss": 5.3401, "step": 490 }, { "epoch": 0.00491, "grad_norm": 0.8518595255895461, "learning_rate": 0.001473, "loss": 5.3387, "step": 491 }, { "epoch": 0.00492, "grad_norm": 0.8521564820940866, "learning_rate": 0.001476, "loss": 5.3306, "step": 492 }, { "epoch": 0.00493, "grad_norm": 1.037786125041147, "learning_rate": 0.001479, "loss": 5.337, "step": 493 }, { "epoch": 0.00494, "grad_norm": 0.8602402216970677, "learning_rate": 0.001482, "loss": 5.3365, "step": 494 }, { "epoch": 0.00495, "grad_norm": 0.833325441881814, "learning_rate": 0.001485, "loss": 5.3402, "step": 495 }, { "epoch": 0.00496, "grad_norm": 0.7919536178225737, "learning_rate": 0.001488, "loss": 5.324, "step": 496 }, { "epoch": 0.00497, "grad_norm": 0.873050532284563, "learning_rate": 0.001491, "loss": 5.3076, "step": 497 }, { "epoch": 0.00498, "grad_norm": 0.8401174504098841, "learning_rate": 0.001494, "loss": 5.3331, "step": 498 }, { "epoch": 0.00499, "grad_norm": 0.7077904347820982, "learning_rate": 0.001497, "loss": 5.3023, "step": 499 }, { "epoch": 0.005, "grad_norm": 0.722193970534152, "learning_rate": 0.0015, "loss": 5.3084, "step": 500 }, { "epoch": 0.00501, "grad_norm": 0.6788524699363854, "learning_rate": 0.001503, "loss": 5.3068, "step": 501 }, { "epoch": 0.00502, "grad_norm": 0.4688637134987383, "learning_rate": 0.001506, "loss": 5.3027, "step": 502 }, { "epoch": 0.00503, "grad_norm": 0.5347885266184398, "learning_rate": 0.0015090000000000001, "loss": 5.3107, "step": 503 }, { "epoch": 0.00504, "grad_norm": 0.4568589587316421, "learning_rate": 0.001512, "loss": 5.2883, "step": 504 }, { "epoch": 0.00505, "grad_norm": 0.6059364079243326, "learning_rate": 0.001515, "loss": 5.2843, "step": 505 }, { "epoch": 0.00506, "grad_norm": 0.5690871559871206, "learning_rate": 0.001518, "loss": 5.2813, "step": 506 }, { "epoch": 0.00507, "grad_norm": 0.6408278920334604, "learning_rate": 0.001521, "loss": 5.279, "step": 507 }, { "epoch": 0.00508, "grad_norm": 0.6374798421453328, "learning_rate": 0.001524, "loss": 5.2759, "step": 508 }, { "epoch": 0.00509, "grad_norm": 0.5508180980448808, "learning_rate": 0.0015270000000000001, "loss": 5.2603, "step": 509 }, { "epoch": 0.0051, "grad_norm": 0.4678264081626914, "learning_rate": 0.0015300000000000001, "loss": 5.2734, "step": 510 }, { "epoch": 0.00511, "grad_norm": 0.4947375080326457, "learning_rate": 0.001533, "loss": 5.252, "step": 511 }, { "epoch": 0.00512, "grad_norm": 0.4646141343190718, "learning_rate": 0.001536, "loss": 5.2454, "step": 512 }, { "epoch": 0.00513, "grad_norm": 0.4614608859817223, "learning_rate": 0.001539, "loss": 5.2616, "step": 513 }, { "epoch": 0.00514, "grad_norm": 0.47514560040822135, "learning_rate": 0.001542, "loss": 5.2462, "step": 514 }, { "epoch": 0.00515, "grad_norm": 0.5516172292908152, "learning_rate": 0.0015450000000000001, "loss": 5.2495, "step": 515 }, { "epoch": 0.00516, "grad_norm": 0.6762883843976203, "learning_rate": 0.0015480000000000001, "loss": 5.2514, "step": 516 }, { "epoch": 0.00517, "grad_norm": 0.8220059659316783, "learning_rate": 0.001551, "loss": 5.2491, "step": 517 }, { "epoch": 0.00518, "grad_norm": 0.8472303407674783, "learning_rate": 0.001554, "loss": 5.2384, "step": 518 }, { "epoch": 0.00519, "grad_norm": 0.8352554081583162, "learning_rate": 0.001557, "loss": 5.2403, "step": 519 }, { "epoch": 0.0052, "grad_norm": 0.9289527635802941, "learning_rate": 0.0015600000000000002, "loss": 5.2527, "step": 520 }, { "epoch": 0.00521, "grad_norm": 1.112613217213008, "learning_rate": 0.0015630000000000002, "loss": 5.242, "step": 521 }, { "epoch": 0.00522, "grad_norm": 0.9402831205824588, "learning_rate": 0.0015660000000000001, "loss": 5.2388, "step": 522 }, { "epoch": 0.00523, "grad_norm": 1.103453237567065, "learning_rate": 0.001569, "loss": 5.2627, "step": 523 }, { "epoch": 0.00524, "grad_norm": 1.0118007660209627, "learning_rate": 0.001572, "loss": 5.268, "step": 524 }, { "epoch": 0.00525, "grad_norm": 0.9435955195760587, "learning_rate": 0.001575, "loss": 5.2606, "step": 525 }, { "epoch": 0.00526, "grad_norm": 0.8731456366832249, "learning_rate": 0.0015780000000000002, "loss": 5.2355, "step": 526 }, { "epoch": 0.00527, "grad_norm": 0.7556141048030679, "learning_rate": 0.0015810000000000002, "loss": 5.244, "step": 527 }, { "epoch": 0.00528, "grad_norm": 0.8959181971092488, "learning_rate": 0.0015840000000000001, "loss": 5.2287, "step": 528 }, { "epoch": 0.00529, "grad_norm": 1.10956087267792, "learning_rate": 0.001587, "loss": 5.2485, "step": 529 }, { "epoch": 0.0053, "grad_norm": 0.7373688170406403, "learning_rate": 0.00159, "loss": 5.2473, "step": 530 }, { "epoch": 0.00531, "grad_norm": 0.9440929198745536, "learning_rate": 0.001593, "loss": 5.2364, "step": 531 }, { "epoch": 0.00532, "grad_norm": 1.2761807884581948, "learning_rate": 0.0015960000000000002, "loss": 5.2627, "step": 532 }, { "epoch": 0.00533, "grad_norm": 0.7254918561906474, "learning_rate": 0.0015990000000000002, "loss": 5.2411, "step": 533 }, { "epoch": 0.00534, "grad_norm": 0.9458095799105631, "learning_rate": 0.0016020000000000001, "loss": 5.2364, "step": 534 }, { "epoch": 0.00535, "grad_norm": 1.1518877149230409, "learning_rate": 0.001605, "loss": 5.2538, "step": 535 }, { "epoch": 0.00536, "grad_norm": 1.0003795894462209, "learning_rate": 0.001608, "loss": 5.2516, "step": 536 }, { "epoch": 0.00537, "grad_norm": 1.0837750602504435, "learning_rate": 0.0016110000000000002, "loss": 5.245, "step": 537 }, { "epoch": 0.00538, "grad_norm": 0.8341741073056014, "learning_rate": 0.0016140000000000002, "loss": 5.2215, "step": 538 }, { "epoch": 0.00539, "grad_norm": 0.8405248729905289, "learning_rate": 0.0016170000000000002, "loss": 5.218, "step": 539 }, { "epoch": 0.0054, "grad_norm": 1.0652634270364731, "learning_rate": 0.0016200000000000001, "loss": 5.2348, "step": 540 }, { "epoch": 0.00541, "grad_norm": 1.02504417487003, "learning_rate": 0.001623, "loss": 5.2275, "step": 541 }, { "epoch": 0.00542, "grad_norm": 1.0410836109463524, "learning_rate": 0.001626, "loss": 5.2461, "step": 542 }, { "epoch": 0.00543, "grad_norm": 1.1820143116937272, "learning_rate": 0.0016290000000000002, "loss": 5.2251, "step": 543 }, { "epoch": 0.00544, "grad_norm": 0.7766425096978492, "learning_rate": 0.0016320000000000002, "loss": 5.227, "step": 544 }, { "epoch": 0.00545, "grad_norm": 0.7162190734488068, "learning_rate": 0.0016350000000000002, "loss": 5.2344, "step": 545 }, { "epoch": 0.00546, "grad_norm": 0.6138609629035112, "learning_rate": 0.0016380000000000001, "loss": 5.1997, "step": 546 }, { "epoch": 0.00547, "grad_norm": 0.5845964576550613, "learning_rate": 0.001641, "loss": 5.199, "step": 547 }, { "epoch": 0.00548, "grad_norm": 0.5000045746748227, "learning_rate": 0.001644, "loss": 5.1903, "step": 548 }, { "epoch": 0.00549, "grad_norm": 0.45001097530426093, "learning_rate": 0.0016470000000000002, "loss": 5.1848, "step": 549 }, { "epoch": 0.0055, "grad_norm": 0.48852089918970865, "learning_rate": 0.0016500000000000002, "loss": 5.1966, "step": 550 }, { "epoch": 0.00551, "grad_norm": 0.4504277526474317, "learning_rate": 0.0016530000000000002, "loss": 5.1901, "step": 551 }, { "epoch": 0.00552, "grad_norm": 0.3849177997836439, "learning_rate": 0.0016560000000000001, "loss": 5.1829, "step": 552 }, { "epoch": 0.00553, "grad_norm": 0.3648459807382568, "learning_rate": 0.001659, "loss": 5.1708, "step": 553 }, { "epoch": 0.00554, "grad_norm": 0.36806425815097327, "learning_rate": 0.0016620000000000003, "loss": 5.1683, "step": 554 }, { "epoch": 0.00555, "grad_norm": 0.399857849170916, "learning_rate": 0.0016650000000000002, "loss": 5.174, "step": 555 }, { "epoch": 0.00556, "grad_norm": 0.5317181922092629, "learning_rate": 0.0016680000000000002, "loss": 5.1608, "step": 556 }, { "epoch": 0.00557, "grad_norm": 0.7831777956168835, "learning_rate": 0.0016710000000000002, "loss": 5.1496, "step": 557 }, { "epoch": 0.00558, "grad_norm": 1.0341253030675983, "learning_rate": 0.0016740000000000001, "loss": 5.1915, "step": 558 }, { "epoch": 0.00559, "grad_norm": 0.9310990839840096, "learning_rate": 0.001677, "loss": 5.1724, "step": 559 }, { "epoch": 0.0056, "grad_norm": 1.1320536331440925, "learning_rate": 0.0016800000000000003, "loss": 5.1553, "step": 560 }, { "epoch": 0.00561, "grad_norm": 1.0798635762834115, "learning_rate": 0.0016830000000000003, "loss": 5.1888, "step": 561 }, { "epoch": 0.00562, "grad_norm": 0.9955409946448898, "learning_rate": 0.0016860000000000002, "loss": 5.1785, "step": 562 }, { "epoch": 0.00563, "grad_norm": 0.8046398619546209, "learning_rate": 0.001689, "loss": 5.1727, "step": 563 }, { "epoch": 0.00564, "grad_norm": 0.8640806614647399, "learning_rate": 0.001692, "loss": 5.1564, "step": 564 }, { "epoch": 0.00565, "grad_norm": 0.9094590673653183, "learning_rate": 0.001695, "loss": 5.1591, "step": 565 }, { "epoch": 0.00566, "grad_norm": 1.0763832222921514, "learning_rate": 0.0016979999999999999, "loss": 5.1686, "step": 566 }, { "epoch": 0.00567, "grad_norm": 0.9106969464088174, "learning_rate": 0.0017009999999999998, "loss": 5.1536, "step": 567 }, { "epoch": 0.00568, "grad_norm": 1.0612496613421336, "learning_rate": 0.0017039999999999998, "loss": 5.175, "step": 568 }, { "epoch": 0.00569, "grad_norm": 1.030428797640198, "learning_rate": 0.001707, "loss": 5.1588, "step": 569 }, { "epoch": 0.0057, "grad_norm": 1.0168109110540526, "learning_rate": 0.00171, "loss": 5.1716, "step": 570 }, { "epoch": 0.00571, "grad_norm": 1.1348044931505399, "learning_rate": 0.001713, "loss": 5.1856, "step": 571 }, { "epoch": 0.00572, "grad_norm": 0.761146802779985, "learning_rate": 0.0017159999999999999, "loss": 5.1497, "step": 572 }, { "epoch": 0.00573, "grad_norm": 0.7662650183514703, "learning_rate": 0.0017189999999999998, "loss": 5.1536, "step": 573 }, { "epoch": 0.00574, "grad_norm": 0.8733185334950944, "learning_rate": 0.001722, "loss": 5.1641, "step": 574 }, { "epoch": 0.00575, "grad_norm": 0.8629871927891676, "learning_rate": 0.001725, "loss": 5.142, "step": 575 }, { "epoch": 0.00576, "grad_norm": 0.8467410686880783, "learning_rate": 0.001728, "loss": 5.137, "step": 576 }, { "epoch": 0.00577, "grad_norm": 0.8007452144545169, "learning_rate": 0.001731, "loss": 5.1615, "step": 577 }, { "epoch": 0.00578, "grad_norm": 0.6405300314411674, "learning_rate": 0.0017339999999999999, "loss": 5.1325, "step": 578 }, { "epoch": 0.00579, "grad_norm": 0.70587507977218, "learning_rate": 0.0017369999999999998, "loss": 5.1208, "step": 579 }, { "epoch": 0.0058, "grad_norm": 0.6985854652221088, "learning_rate": 0.00174, "loss": 5.1378, "step": 580 }, { "epoch": 0.00581, "grad_norm": 0.6137529113435678, "learning_rate": 0.001743, "loss": 5.1348, "step": 581 }, { "epoch": 0.00582, "grad_norm": 0.48264514730128444, "learning_rate": 0.001746, "loss": 5.1045, "step": 582 }, { "epoch": 0.00583, "grad_norm": 0.5771367675923135, "learning_rate": 0.001749, "loss": 5.1293, "step": 583 }, { "epoch": 0.00584, "grad_norm": 0.6936037332792736, "learning_rate": 0.0017519999999999999, "loss": 5.1229, "step": 584 }, { "epoch": 0.00585, "grad_norm": 0.7595444862659105, "learning_rate": 0.0017549999999999998, "loss": 5.1095, "step": 585 }, { "epoch": 0.00586, "grad_norm": 0.8605796861767604, "learning_rate": 0.001758, "loss": 5.1202, "step": 586 }, { "epoch": 0.00587, "grad_norm": 0.907920304070859, "learning_rate": 0.001761, "loss": 5.1079, "step": 587 }, { "epoch": 0.00588, "grad_norm": 0.9246177820411534, "learning_rate": 0.001764, "loss": 5.1163, "step": 588 }, { "epoch": 0.00589, "grad_norm": 0.8704696950275989, "learning_rate": 0.001767, "loss": 5.1211, "step": 589 }, { "epoch": 0.0059, "grad_norm": 0.7193563853719346, "learning_rate": 0.0017699999999999999, "loss": 5.101, "step": 590 }, { "epoch": 0.00591, "grad_norm": 0.6949293332158505, "learning_rate": 0.001773, "loss": 5.0928, "step": 591 }, { "epoch": 0.00592, "grad_norm": 0.654055845037553, "learning_rate": 0.001776, "loss": 5.1147, "step": 592 }, { "epoch": 0.00593, "grad_norm": 0.7140132542710956, "learning_rate": 0.001779, "loss": 5.1161, "step": 593 }, { "epoch": 0.00594, "grad_norm": 0.6270820258277058, "learning_rate": 0.001782, "loss": 5.107, "step": 594 }, { "epoch": 0.00595, "grad_norm": 0.5627466921921568, "learning_rate": 0.001785, "loss": 5.0962, "step": 595 }, { "epoch": 0.00596, "grad_norm": 0.6408060930916085, "learning_rate": 0.0017879999999999999, "loss": 5.0966, "step": 596 }, { "epoch": 0.00597, "grad_norm": 0.7315483645065906, "learning_rate": 0.001791, "loss": 5.077, "step": 597 }, { "epoch": 0.00598, "grad_norm": 0.7860917802127037, "learning_rate": 0.001794, "loss": 5.0917, "step": 598 }, { "epoch": 0.00599, "grad_norm": 0.7396694082174577, "learning_rate": 0.001797, "loss": 5.0901, "step": 599 }, { "epoch": 0.006, "grad_norm": 0.678011019966431, "learning_rate": 0.0018, "loss": 5.0859, "step": 600 }, { "epoch": 0.00601, "grad_norm": 0.8039567544190375, "learning_rate": 0.001803, "loss": 5.0817, "step": 601 }, { "epoch": 0.00602, "grad_norm": 0.8362811130198314, "learning_rate": 0.0018059999999999999, "loss": 5.0736, "step": 602 }, { "epoch": 0.00603, "grad_norm": 0.7437701841167941, "learning_rate": 0.001809, "loss": 5.0897, "step": 603 }, { "epoch": 0.00604, "grad_norm": 0.7865788629191941, "learning_rate": 0.001812, "loss": 5.0711, "step": 604 }, { "epoch": 0.00605, "grad_norm": 0.7065527487675252, "learning_rate": 0.001815, "loss": 5.0798, "step": 605 }, { "epoch": 0.00606, "grad_norm": 0.6576721805135337, "learning_rate": 0.001818, "loss": 5.0782, "step": 606 }, { "epoch": 0.00607, "grad_norm": 0.6528064751937639, "learning_rate": 0.001821, "loss": 5.0831, "step": 607 }, { "epoch": 0.00608, "grad_norm": 0.6182798778507802, "learning_rate": 0.001824, "loss": 5.0615, "step": 608 }, { "epoch": 0.00609, "grad_norm": 0.8163285355074863, "learning_rate": 0.001827, "loss": 5.079, "step": 609 }, { "epoch": 0.0061, "grad_norm": 0.795800426894062, "learning_rate": 0.00183, "loss": 5.053, "step": 610 }, { "epoch": 0.00611, "grad_norm": 0.7510077277929101, "learning_rate": 0.001833, "loss": 5.0545, "step": 611 }, { "epoch": 0.00612, "grad_norm": 0.7863529213037952, "learning_rate": 0.001836, "loss": 5.0625, "step": 612 }, { "epoch": 0.00613, "grad_norm": 0.718541559578873, "learning_rate": 0.001839, "loss": 5.0526, "step": 613 }, { "epoch": 0.00614, "grad_norm": 0.648240484004962, "learning_rate": 0.001842, "loss": 5.0729, "step": 614 }, { "epoch": 0.00615, "grad_norm": 0.6572100797643513, "learning_rate": 0.001845, "loss": 5.0607, "step": 615 }, { "epoch": 0.00616, "grad_norm": 0.598653088729871, "learning_rate": 0.001848, "loss": 5.0431, "step": 616 }, { "epoch": 0.00617, "grad_norm": 0.6867319038647199, "learning_rate": 0.001851, "loss": 5.058, "step": 617 }, { "epoch": 0.00618, "grad_norm": 0.8065427766951909, "learning_rate": 0.001854, "loss": 5.0471, "step": 618 }, { "epoch": 0.00619, "grad_norm": 0.952336607614539, "learning_rate": 0.001857, "loss": 5.0614, "step": 619 }, { "epoch": 0.0062, "grad_norm": 0.937144329881144, "learning_rate": 0.00186, "loss": 5.0503, "step": 620 }, { "epoch": 0.00621, "grad_norm": 0.861992409216581, "learning_rate": 0.001863, "loss": 5.0519, "step": 621 }, { "epoch": 0.00622, "grad_norm": 0.7937303039626423, "learning_rate": 0.001866, "loss": 5.046, "step": 622 }, { "epoch": 0.00623, "grad_norm": 0.7729108287648654, "learning_rate": 0.001869, "loss": 5.0315, "step": 623 }, { "epoch": 0.00624, "grad_norm": 0.7644499866680247, "learning_rate": 0.001872, "loss": 5.0449, "step": 624 }, { "epoch": 0.00625, "grad_norm": 0.833886296843224, "learning_rate": 0.001875, "loss": 5.0598, "step": 625 }, { "epoch": 0.00626, "grad_norm": 0.8372671439007264, "learning_rate": 0.0018780000000000001, "loss": 5.0351, "step": 626 }, { "epoch": 0.00627, "grad_norm": 0.8603722936304884, "learning_rate": 0.001881, "loss": 5.0306, "step": 627 }, { "epoch": 0.00628, "grad_norm": 0.7718433014027412, "learning_rate": 0.001884, "loss": 5.0548, "step": 628 }, { "epoch": 0.00629, "grad_norm": 0.6787638881219394, "learning_rate": 0.001887, "loss": 5.0263, "step": 629 }, { "epoch": 0.0063, "grad_norm": 0.6841857077673691, "learning_rate": 0.00189, "loss": 5.0201, "step": 630 }, { "epoch": 0.00631, "grad_norm": 0.7297958952205305, "learning_rate": 0.0018930000000000002, "loss": 5.0163, "step": 631 }, { "epoch": 0.00632, "grad_norm": 0.8611511166687711, "learning_rate": 0.0018960000000000001, "loss": 5.0388, "step": 632 }, { "epoch": 0.00633, "grad_norm": 0.9514927337678543, "learning_rate": 0.001899, "loss": 5.0438, "step": 633 }, { "epoch": 0.00634, "grad_norm": 0.8956137982105657, "learning_rate": 0.001902, "loss": 5.031, "step": 634 }, { "epoch": 0.00635, "grad_norm": 0.952460918530188, "learning_rate": 0.001905, "loss": 5.0218, "step": 635 }, { "epoch": 0.00636, "grad_norm": 1.0747774477703274, "learning_rate": 0.001908, "loss": 5.0402, "step": 636 }, { "epoch": 0.00637, "grad_norm": 0.8336825219104492, "learning_rate": 0.0019110000000000002, "loss": 5.0466, "step": 637 }, { "epoch": 0.00638, "grad_norm": 0.8986808700713379, "learning_rate": 0.0019140000000000001, "loss": 5.0353, "step": 638 }, { "epoch": 0.00639, "grad_norm": 1.0369332378334974, "learning_rate": 0.001917, "loss": 5.0213, "step": 639 }, { "epoch": 0.0064, "grad_norm": 1.0511699002679498, "learning_rate": 0.00192, "loss": 5.0287, "step": 640 }, { "epoch": 0.00641, "grad_norm": 1.0967175650217074, "learning_rate": 0.001923, "loss": 5.0426, "step": 641 }, { "epoch": 0.00642, "grad_norm": 0.9933027226251017, "learning_rate": 0.001926, "loss": 5.0352, "step": 642 }, { "epoch": 0.00643, "grad_norm": 0.9768777735178511, "learning_rate": 0.0019290000000000002, "loss": 5.0523, "step": 643 }, { "epoch": 0.00644, "grad_norm": 1.001060805060802, "learning_rate": 0.0019320000000000001, "loss": 5.0363, "step": 644 }, { "epoch": 0.00645, "grad_norm": 1.0724455418770003, "learning_rate": 0.001935, "loss": 5.0522, "step": 645 }, { "epoch": 0.00646, "grad_norm": 1.1074452374796593, "learning_rate": 0.001938, "loss": 5.0611, "step": 646 }, { "epoch": 0.00647, "grad_norm": 0.9675974316517354, "learning_rate": 0.001941, "loss": 5.062, "step": 647 }, { "epoch": 0.00648, "grad_norm": 0.7269750089664031, "learning_rate": 0.0019440000000000002, "loss": 5.0382, "step": 648 }, { "epoch": 0.00649, "grad_norm": 0.6773093371497811, "learning_rate": 0.0019470000000000002, "loss": 5.0209, "step": 649 }, { "epoch": 0.0065, "grad_norm": 0.6721153823123059, "learning_rate": 0.0019500000000000001, "loss": 5.0458, "step": 650 }, { "epoch": 0.00651, "grad_norm": 0.6257734701697928, "learning_rate": 0.001953, "loss": 5.022, "step": 651 }, { "epoch": 0.00652, "grad_norm": 0.6412471052264174, "learning_rate": 0.0019560000000000003, "loss": 5.0157, "step": 652 }, { "epoch": 0.00653, "grad_norm": 0.7030358108381808, "learning_rate": 0.0019590000000000002, "loss": 5.0108, "step": 653 }, { "epoch": 0.00654, "grad_norm": 0.6417272703555769, "learning_rate": 0.001962, "loss": 4.994, "step": 654 }, { "epoch": 0.00655, "grad_norm": 0.6123939930365004, "learning_rate": 0.001965, "loss": 5.0066, "step": 655 }, { "epoch": 0.00656, "grad_norm": 0.6761646659152818, "learning_rate": 0.001968, "loss": 4.9975, "step": 656 }, { "epoch": 0.00657, "grad_norm": 0.9015512681716132, "learning_rate": 0.001971, "loss": 4.9965, "step": 657 }, { "epoch": 0.00658, "grad_norm": 1.1070883554163469, "learning_rate": 0.001974, "loss": 5.032, "step": 658 }, { "epoch": 0.00659, "grad_norm": 0.8166390486888542, "learning_rate": 0.001977, "loss": 5.0022, "step": 659 }, { "epoch": 0.0066, "grad_norm": 0.7886404692657704, "learning_rate": 0.00198, "loss": 4.9952, "step": 660 }, { "epoch": 0.00661, "grad_norm": 0.6930157053874706, "learning_rate": 0.001983, "loss": 4.9751, "step": 661 }, { "epoch": 0.00662, "grad_norm": 0.6021137246671374, "learning_rate": 0.0019860000000000004, "loss": 4.9738, "step": 662 }, { "epoch": 0.00663, "grad_norm": 0.5342549595827694, "learning_rate": 0.0019890000000000003, "loss": 4.9858, "step": 663 }, { "epoch": 0.00664, "grad_norm": 0.5549032292488858, "learning_rate": 0.0019920000000000003, "loss": 4.9668, "step": 664 }, { "epoch": 0.00665, "grad_norm": 0.5630527450969944, "learning_rate": 0.0019950000000000002, "loss": 4.9437, "step": 665 }, { "epoch": 0.00666, "grad_norm": 0.5436432724284425, "learning_rate": 0.001998, "loss": 4.9653, "step": 666 }, { "epoch": 0.00667, "grad_norm": 0.5455501294317395, "learning_rate": 0.002001, "loss": 4.9611, "step": 667 }, { "epoch": 0.00668, "grad_norm": 0.5771340900782089, "learning_rate": 0.002004, "loss": 4.9815, "step": 668 }, { "epoch": 0.00669, "grad_norm": 0.5741027689717983, "learning_rate": 0.002007, "loss": 4.9757, "step": 669 }, { "epoch": 0.0067, "grad_norm": 0.6058348461614704, "learning_rate": 0.00201, "loss": 4.969, "step": 670 }, { "epoch": 0.00671, "grad_norm": 0.566759025441364, "learning_rate": 0.002013, "loss": 4.9434, "step": 671 }, { "epoch": 0.00672, "grad_norm": 0.5204858890697072, "learning_rate": 0.002016, "loss": 4.9491, "step": 672 }, { "epoch": 0.00673, "grad_norm": 0.5776654990466389, "learning_rate": 0.002019, "loss": 4.9473, "step": 673 }, { "epoch": 0.00674, "grad_norm": 0.5899189939484214, "learning_rate": 0.0020220000000000004, "loss": 4.9512, "step": 674 }, { "epoch": 0.00675, "grad_norm": 0.73017970913878, "learning_rate": 0.0020250000000000003, "loss": 4.954, "step": 675 }, { "epoch": 0.00676, "grad_norm": 0.8714534217285366, "learning_rate": 0.0020280000000000003, "loss": 4.9527, "step": 676 }, { "epoch": 0.00677, "grad_norm": 0.9455529231086871, "learning_rate": 0.0020310000000000003, "loss": 4.966, "step": 677 }, { "epoch": 0.00678, "grad_norm": 0.7654589233287328, "learning_rate": 0.0020340000000000002, "loss": 4.9721, "step": 678 }, { "epoch": 0.00679, "grad_norm": 0.7453542715563302, "learning_rate": 0.002037, "loss": 4.9514, "step": 679 }, { "epoch": 0.0068, "grad_norm": 0.8198542095591255, "learning_rate": 0.00204, "loss": 4.9599, "step": 680 }, { "epoch": 0.00681, "grad_norm": 1.0035327070670854, "learning_rate": 0.002043, "loss": 4.9608, "step": 681 }, { "epoch": 0.00682, "grad_norm": 1.1272493939475496, "learning_rate": 0.002046, "loss": 4.9829, "step": 682 }, { "epoch": 0.00683, "grad_norm": 0.9375943776902779, "learning_rate": 0.002049, "loss": 4.9573, "step": 683 }, { "epoch": 0.00684, "grad_norm": 0.988255846210588, "learning_rate": 0.002052, "loss": 4.9739, "step": 684 }, { "epoch": 0.00685, "grad_norm": 1.2166901935378913, "learning_rate": 0.0020550000000000004, "loss": 4.9877, "step": 685 }, { "epoch": 0.00686, "grad_norm": 0.8834239063706651, "learning_rate": 0.0020580000000000004, "loss": 4.9625, "step": 686 }, { "epoch": 0.00687, "grad_norm": 0.9980240769781185, "learning_rate": 0.0020610000000000003, "loss": 4.9587, "step": 687 }, { "epoch": 0.00688, "grad_norm": 1.1254414916617237, "learning_rate": 0.002064, "loss": 4.9804, "step": 688 }, { "epoch": 0.00689, "grad_norm": 1.0306142304378458, "learning_rate": 0.002067, "loss": 4.9703, "step": 689 }, { "epoch": 0.0069, "grad_norm": 1.1841729949788287, "learning_rate": 0.00207, "loss": 4.9832, "step": 690 }, { "epoch": 0.00691, "grad_norm": 0.94703321985356, "learning_rate": 0.0020729999999999998, "loss": 4.946, "step": 691 }, { "epoch": 0.00692, "grad_norm": 1.0500733994835558, "learning_rate": 0.0020759999999999997, "loss": 4.9769, "step": 692 }, { "epoch": 0.00693, "grad_norm": 0.8703789610160195, "learning_rate": 0.0020789999999999997, "loss": 4.9707, "step": 693 }, { "epoch": 0.00694, "grad_norm": 0.8616077913099822, "learning_rate": 0.002082, "loss": 4.9464, "step": 694 }, { "epoch": 0.00695, "grad_norm": 1.1391889561194677, "learning_rate": 0.002085, "loss": 4.9626, "step": 695 }, { "epoch": 0.00696, "grad_norm": 0.9584412207038412, "learning_rate": 0.002088, "loss": 4.9606, "step": 696 }, { "epoch": 0.00697, "grad_norm": 1.0011041706204022, "learning_rate": 0.002091, "loss": 4.9624, "step": 697 }, { "epoch": 0.00698, "grad_norm": 1.3127609957868622, "learning_rate": 0.002094, "loss": 4.9728, "step": 698 }, { "epoch": 0.00699, "grad_norm": 0.8537920758308664, "learning_rate": 0.002097, "loss": 4.9491, "step": 699 }, { "epoch": 0.007, "grad_norm": 1.0018321540367086, "learning_rate": 0.0021, "loss": 4.9652, "step": 700 }, { "epoch": 0.00701, "grad_norm": 0.9481035749179906, "learning_rate": 0.002103, "loss": 4.9422, "step": 701 }, { "epoch": 0.00702, "grad_norm": 0.8258463386850557, "learning_rate": 0.002106, "loss": 4.945, "step": 702 }, { "epoch": 0.00703, "grad_norm": 0.6305963851301584, "learning_rate": 0.0021089999999999998, "loss": 4.9167, "step": 703 }, { "epoch": 0.00704, "grad_norm": 0.6428932601365419, "learning_rate": 0.0021119999999999997, "loss": 4.9232, "step": 704 }, { "epoch": 0.00705, "grad_norm": 0.6965384868722897, "learning_rate": 0.002115, "loss": 4.9266, "step": 705 }, { "epoch": 0.00706, "grad_norm": 0.7410784964260257, "learning_rate": 0.002118, "loss": 4.9344, "step": 706 }, { "epoch": 0.00707, "grad_norm": 0.721339398040246, "learning_rate": 0.002121, "loss": 4.9083, "step": 707 }, { "epoch": 0.00708, "grad_norm": 0.675102725576719, "learning_rate": 0.002124, "loss": 4.8875, "step": 708 }, { "epoch": 0.00709, "grad_norm": 0.6428565761724286, "learning_rate": 0.002127, "loss": 4.9151, "step": 709 }, { "epoch": 0.0071, "grad_norm": 0.5795546215215135, "learning_rate": 0.00213, "loss": 4.8959, "step": 710 }, { "epoch": 0.00711, "grad_norm": 0.6667417841414063, "learning_rate": 0.002133, "loss": 4.8739, "step": 711 }, { "epoch": 0.00712, "grad_norm": 0.7094343142489271, "learning_rate": 0.002136, "loss": 4.8945, "step": 712 }, { "epoch": 0.00713, "grad_norm": 0.6503826772007358, "learning_rate": 0.002139, "loss": 4.9061, "step": 713 }, { "epoch": 0.00714, "grad_norm": 0.5916220730293257, "learning_rate": 0.002142, "loss": 4.8928, "step": 714 }, { "epoch": 0.00715, "grad_norm": 0.6248268778602033, "learning_rate": 0.0021449999999999998, "loss": 4.8651, "step": 715 }, { "epoch": 0.00716, "grad_norm": 0.7056512941727522, "learning_rate": 0.002148, "loss": 4.8619, "step": 716 }, { "epoch": 0.00717, "grad_norm": 0.7609530945312164, "learning_rate": 0.002151, "loss": 4.8823, "step": 717 }, { "epoch": 0.00718, "grad_norm": 0.7494152535556156, "learning_rate": 0.002154, "loss": 4.8579, "step": 718 }, { "epoch": 0.00719, "grad_norm": 0.7503709346262388, "learning_rate": 0.002157, "loss": 4.8753, "step": 719 }, { "epoch": 0.0072, "grad_norm": 0.7326012456404516, "learning_rate": 0.00216, "loss": 4.8631, "step": 720 }, { "epoch": 0.00721, "grad_norm": 0.7261591024591523, "learning_rate": 0.002163, "loss": 4.867, "step": 721 }, { "epoch": 0.00722, "grad_norm": 0.782783835861699, "learning_rate": 0.002166, "loss": 4.843, "step": 722 }, { "epoch": 0.00723, "grad_norm": 0.9405574246546834, "learning_rate": 0.002169, "loss": 4.859, "step": 723 }, { "epoch": 0.00724, "grad_norm": 0.9635066655828881, "learning_rate": 0.002172, "loss": 4.8537, "step": 724 }, { "epoch": 0.00725, "grad_norm": 1.040794527733107, "learning_rate": 0.002175, "loss": 4.8644, "step": 725 }, { "epoch": 0.00726, "grad_norm": 0.8259351691267115, "learning_rate": 0.002178, "loss": 4.8549, "step": 726 }, { "epoch": 0.00727, "grad_norm": 0.9246523356382073, "learning_rate": 0.0021809999999999998, "loss": 4.8481, "step": 727 }, { "epoch": 0.00728, "grad_norm": 0.8492007448518646, "learning_rate": 0.002184, "loss": 4.8551, "step": 728 }, { "epoch": 0.00729, "grad_norm": 0.8802732367664233, "learning_rate": 0.002187, "loss": 4.8519, "step": 729 }, { "epoch": 0.0073, "grad_norm": 0.8143690086226398, "learning_rate": 0.00219, "loss": 4.8583, "step": 730 }, { "epoch": 0.00731, "grad_norm": 0.7978098404882568, "learning_rate": 0.002193, "loss": 4.846, "step": 731 }, { "epoch": 0.00732, "grad_norm": 0.8726405483808616, "learning_rate": 0.002196, "loss": 4.8538, "step": 732 }, { "epoch": 0.00733, "grad_norm": 0.852624212596407, "learning_rate": 0.002199, "loss": 4.8348, "step": 733 }, { "epoch": 0.00734, "grad_norm": 0.8516024625713395, "learning_rate": 0.002202, "loss": 4.8503, "step": 734 }, { "epoch": 0.00735, "grad_norm": 1.0296852244953179, "learning_rate": 0.002205, "loss": 4.8752, "step": 735 }, { "epoch": 0.00736, "grad_norm": 0.8907291733705317, "learning_rate": 0.002208, "loss": 4.8478, "step": 736 }, { "epoch": 0.00737, "grad_norm": 0.847923173569621, "learning_rate": 0.002211, "loss": 4.8138, "step": 737 }, { "epoch": 0.00738, "grad_norm": 1.0201562870165168, "learning_rate": 0.002214, "loss": 4.8304, "step": 738 }, { "epoch": 0.00739, "grad_norm": 0.8864771113941665, "learning_rate": 0.0022170000000000002, "loss": 4.8243, "step": 739 }, { "epoch": 0.0074, "grad_norm": 0.8107829600567034, "learning_rate": 0.00222, "loss": 4.8172, "step": 740 }, { "epoch": 0.00741, "grad_norm": 0.9926562607182313, "learning_rate": 0.002223, "loss": 4.7981, "step": 741 }, { "epoch": 0.00742, "grad_norm": 1.1169095501625559, "learning_rate": 0.002226, "loss": 4.8325, "step": 742 }, { "epoch": 0.00743, "grad_norm": 1.0154903575297356, "learning_rate": 0.002229, "loss": 4.8033, "step": 743 }, { "epoch": 0.00744, "grad_norm": 0.8796310645612629, "learning_rate": 0.002232, "loss": 4.8021, "step": 744 }, { "epoch": 0.00745, "grad_norm": 0.8347673276949072, "learning_rate": 0.002235, "loss": 4.8034, "step": 745 }, { "epoch": 0.00746, "grad_norm": 1.0163090629032976, "learning_rate": 0.002238, "loss": 4.8122, "step": 746 }, { "epoch": 0.00747, "grad_norm": 0.9716623005872416, "learning_rate": 0.002241, "loss": 4.7984, "step": 747 }, { "epoch": 0.00748, "grad_norm": 1.0278655828642824, "learning_rate": 0.002244, "loss": 4.8089, "step": 748 }, { "epoch": 0.00749, "grad_norm": 1.0480541445842007, "learning_rate": 0.002247, "loss": 4.797, "step": 749 }, { "epoch": 0.0075, "grad_norm": 0.8547105592298554, "learning_rate": 0.0022500000000000003, "loss": 4.8161, "step": 750 }, { "epoch": 0.00751, "grad_norm": 1.124867997489978, "learning_rate": 0.0022530000000000002, "loss": 4.8181, "step": 751 }, { "epoch": 0.00752, "grad_norm": 0.9791690369641056, "learning_rate": 0.002256, "loss": 4.777, "step": 752 }, { "epoch": 0.00753, "grad_norm": 1.076442658365417, "learning_rate": 0.002259, "loss": 4.8205, "step": 753 }, { "epoch": 0.00754, "grad_norm": 1.2763721488952993, "learning_rate": 0.002262, "loss": 4.8438, "step": 754 }, { "epoch": 0.00755, "grad_norm": 0.956893666746658, "learning_rate": 0.002265, "loss": 4.7969, "step": 755 }, { "epoch": 0.00756, "grad_norm": 0.9171753465968743, "learning_rate": 0.002268, "loss": 4.7762, "step": 756 }, { "epoch": 0.00757, "grad_norm": 1.0908406750664337, "learning_rate": 0.002271, "loss": 4.7959, "step": 757 }, { "epoch": 0.00758, "grad_norm": 0.9176733613061218, "learning_rate": 0.002274, "loss": 4.7709, "step": 758 }, { "epoch": 0.00759, "grad_norm": 0.8908245026099332, "learning_rate": 0.002277, "loss": 4.7862, "step": 759 }, { "epoch": 0.0076, "grad_norm": 0.9024826649233609, "learning_rate": 0.00228, "loss": 4.7716, "step": 760 }, { "epoch": 0.00761, "grad_norm": 0.8163892030302853, "learning_rate": 0.002283, "loss": 4.7408, "step": 761 }, { "epoch": 0.00762, "grad_norm": 0.8370737395441165, "learning_rate": 0.0022860000000000003, "loss": 4.7247, "step": 762 }, { "epoch": 0.00763, "grad_norm": 0.8480960736158745, "learning_rate": 0.0022890000000000002, "loss": 4.7641, "step": 763 }, { "epoch": 0.00764, "grad_norm": 0.7323677069229925, "learning_rate": 0.002292, "loss": 4.7635, "step": 764 }, { "epoch": 0.00765, "grad_norm": 0.517428476031845, "learning_rate": 0.002295, "loss": 4.7402, "step": 765 }, { "epoch": 0.00766, "grad_norm": 0.5449748434883583, "learning_rate": 0.002298, "loss": 4.7349, "step": 766 }, { "epoch": 0.00767, "grad_norm": 0.519905380789997, "learning_rate": 0.002301, "loss": 4.7297, "step": 767 }, { "epoch": 0.00768, "grad_norm": 0.5353014095205901, "learning_rate": 0.002304, "loss": 4.7326, "step": 768 }, { "epoch": 0.00769, "grad_norm": 0.6021476092106454, "learning_rate": 0.002307, "loss": 4.7369, "step": 769 }, { "epoch": 0.0077, "grad_norm": 0.6291707943467315, "learning_rate": 0.00231, "loss": 4.6951, "step": 770 }, { "epoch": 0.00771, "grad_norm": 0.6259748544734598, "learning_rate": 0.002313, "loss": 4.726, "step": 771 }, { "epoch": 0.00772, "grad_norm": 0.6411937087975925, "learning_rate": 0.002316, "loss": 4.71, "step": 772 }, { "epoch": 0.00773, "grad_norm": 0.6087209583882061, "learning_rate": 0.0023190000000000003, "loss": 4.7089, "step": 773 }, { "epoch": 0.00774, "grad_norm": 0.6187878154543281, "learning_rate": 0.0023220000000000003, "loss": 4.7216, "step": 774 }, { "epoch": 0.00775, "grad_norm": 0.6698244521960122, "learning_rate": 0.0023250000000000002, "loss": 4.7125, "step": 775 }, { "epoch": 0.00776, "grad_norm": 0.681313335099601, "learning_rate": 0.002328, "loss": 4.716, "step": 776 }, { "epoch": 0.00777, "grad_norm": 0.564623702064127, "learning_rate": 0.002331, "loss": 4.6866, "step": 777 }, { "epoch": 0.00778, "grad_norm": 0.5709713255684672, "learning_rate": 0.002334, "loss": 4.6954, "step": 778 }, { "epoch": 0.00779, "grad_norm": 0.6149821587836622, "learning_rate": 0.002337, "loss": 4.6565, "step": 779 }, { "epoch": 0.0078, "grad_norm": 0.632842020953678, "learning_rate": 0.00234, "loss": 4.6725, "step": 780 }, { "epoch": 0.00781, "grad_norm": 0.7467970568022195, "learning_rate": 0.002343, "loss": 4.678, "step": 781 }, { "epoch": 0.00782, "grad_norm": 0.9047857384333676, "learning_rate": 0.002346, "loss": 4.6801, "step": 782 }, { "epoch": 0.00783, "grad_norm": 1.0576276659724262, "learning_rate": 0.002349, "loss": 4.6678, "step": 783 }, { "epoch": 0.00784, "grad_norm": 1.1751809929780095, "learning_rate": 0.002352, "loss": 4.6921, "step": 784 }, { "epoch": 0.00785, "grad_norm": 0.8865561583369859, "learning_rate": 0.0023550000000000003, "loss": 4.6774, "step": 785 }, { "epoch": 0.00786, "grad_norm": 1.1372694124786462, "learning_rate": 0.0023580000000000003, "loss": 4.7016, "step": 786 }, { "epoch": 0.00787, "grad_norm": 1.0092158733190273, "learning_rate": 0.0023610000000000003, "loss": 4.6918, "step": 787 }, { "epoch": 0.00788, "grad_norm": 1.1115636173510615, "learning_rate": 0.002364, "loss": 4.7036, "step": 788 }, { "epoch": 0.00789, "grad_norm": 0.9627507497820444, "learning_rate": 0.002367, "loss": 4.7196, "step": 789 }, { "epoch": 0.0079, "grad_norm": 1.0391564883926698, "learning_rate": 0.00237, "loss": 4.706, "step": 790 }, { "epoch": 0.00791, "grad_norm": 0.9857967273862618, "learning_rate": 0.002373, "loss": 4.6613, "step": 791 }, { "epoch": 0.00792, "grad_norm": 1.07807376567954, "learning_rate": 0.002376, "loss": 4.6969, "step": 792 }, { "epoch": 0.00793, "grad_norm": 1.1762161150555115, "learning_rate": 0.002379, "loss": 4.7002, "step": 793 }, { "epoch": 0.00794, "grad_norm": 0.8247319634032979, "learning_rate": 0.002382, "loss": 4.6948, "step": 794 }, { "epoch": 0.00795, "grad_norm": 0.8498192334201926, "learning_rate": 0.002385, "loss": 4.676, "step": 795 }, { "epoch": 0.00796, "grad_norm": 0.7871292688998256, "learning_rate": 0.0023880000000000004, "loss": 4.6606, "step": 796 }, { "epoch": 0.00797, "grad_norm": 0.7600244620970749, "learning_rate": 0.0023910000000000003, "loss": 4.7021, "step": 797 }, { "epoch": 0.00798, "grad_norm": 0.7960964641914068, "learning_rate": 0.0023940000000000003, "loss": 4.6698, "step": 798 }, { "epoch": 0.00799, "grad_norm": 0.740030757660218, "learning_rate": 0.0023970000000000003, "loss": 4.6941, "step": 799 }, { "epoch": 0.008, "grad_norm": 0.7620304314170098, "learning_rate": 0.0024000000000000002, "loss": 4.6524, "step": 800 }, { "epoch": 0.00801, "grad_norm": 0.7109641363313225, "learning_rate": 0.002403, "loss": 4.6746, "step": 801 }, { "epoch": 0.00802, "grad_norm": 0.6915734125547375, "learning_rate": 0.002406, "loss": 4.6614, "step": 802 }, { "epoch": 0.00803, "grad_norm": 0.6786740334805601, "learning_rate": 0.002409, "loss": 4.6408, "step": 803 }, { "epoch": 0.00804, "grad_norm": 0.6631767318006158, "learning_rate": 0.002412, "loss": 4.6361, "step": 804 }, { "epoch": 0.00805, "grad_norm": 0.6460763714070807, "learning_rate": 0.002415, "loss": 4.631, "step": 805 }, { "epoch": 0.00806, "grad_norm": 0.5689446567265103, "learning_rate": 0.002418, "loss": 4.6413, "step": 806 }, { "epoch": 0.00807, "grad_norm": 0.7294155919129667, "learning_rate": 0.0024210000000000004, "loss": 4.6407, "step": 807 }, { "epoch": 0.00808, "grad_norm": 0.9713010260235329, "learning_rate": 0.0024240000000000004, "loss": 4.6372, "step": 808 }, { "epoch": 0.00809, "grad_norm": 1.039651350431828, "learning_rate": 0.0024270000000000003, "loss": 4.6452, "step": 809 }, { "epoch": 0.0081, "grad_norm": 0.7987971948237619, "learning_rate": 0.0024300000000000003, "loss": 4.6558, "step": 810 }, { "epoch": 0.00811, "grad_norm": 0.866506786468452, "learning_rate": 0.0024330000000000003, "loss": 4.6332, "step": 811 }, { "epoch": 0.00812, "grad_norm": 0.8940070034144846, "learning_rate": 0.0024360000000000002, "loss": 4.6162, "step": 812 }, { "epoch": 0.00813, "grad_norm": 0.7601015395596301, "learning_rate": 0.0024389999999999998, "loss": 4.6418, "step": 813 }, { "epoch": 0.00814, "grad_norm": 0.959371802874838, "learning_rate": 0.0024419999999999997, "loss": 4.6561, "step": 814 }, { "epoch": 0.00815, "grad_norm": 1.0501943727451555, "learning_rate": 0.0024449999999999997, "loss": 4.6758, "step": 815 }, { "epoch": 0.00816, "grad_norm": 1.0657571457398118, "learning_rate": 0.002448, "loss": 4.6479, "step": 816 }, { "epoch": 0.00817, "grad_norm": 0.9466339757774411, "learning_rate": 0.002451, "loss": 4.6581, "step": 817 }, { "epoch": 0.00818, "grad_norm": 1.0180120345579544, "learning_rate": 0.002454, "loss": 4.6439, "step": 818 }, { "epoch": 0.00819, "grad_norm": 0.9541848583096257, "learning_rate": 0.002457, "loss": 4.6558, "step": 819 }, { "epoch": 0.0082, "grad_norm": 0.8369907334148766, "learning_rate": 0.00246, "loss": 4.6334, "step": 820 }, { "epoch": 0.00821, "grad_norm": 0.9636511535948165, "learning_rate": 0.002463, "loss": 4.6201, "step": 821 }, { "epoch": 0.00822, "grad_norm": 0.9781201110522014, "learning_rate": 0.002466, "loss": 4.6234, "step": 822 }, { "epoch": 0.00823, "grad_norm": 0.7544698826164155, "learning_rate": 0.002469, "loss": 4.6177, "step": 823 }, { "epoch": 0.00824, "grad_norm": 0.8351046210351873, "learning_rate": 0.002472, "loss": 4.6395, "step": 824 }, { "epoch": 0.00825, "grad_norm": 0.9579203449788568, "learning_rate": 0.0024749999999999998, "loss": 4.6552, "step": 825 }, { "epoch": 0.00826, "grad_norm": 0.6855903524790949, "learning_rate": 0.0024779999999999997, "loss": 4.6015, "step": 826 }, { "epoch": 0.00827, "grad_norm": 0.5831382107833026, "learning_rate": 0.002481, "loss": 4.6271, "step": 827 }, { "epoch": 0.00828, "grad_norm": 0.6353397555552095, "learning_rate": 0.002484, "loss": 4.6166, "step": 828 }, { "epoch": 0.00829, "grad_norm": 0.7710602439494972, "learning_rate": 0.002487, "loss": 4.619, "step": 829 }, { "epoch": 0.0083, "grad_norm": 0.8708544431374072, "learning_rate": 0.00249, "loss": 4.6284, "step": 830 }, { "epoch": 0.00831, "grad_norm": 1.0453965047267408, "learning_rate": 0.002493, "loss": 4.6196, "step": 831 }, { "epoch": 0.00832, "grad_norm": 1.1447270695034462, "learning_rate": 0.002496, "loss": 4.6327, "step": 832 }, { "epoch": 0.00833, "grad_norm": 1.0246810397019857, "learning_rate": 0.002499, "loss": 4.6176, "step": 833 }, { "epoch": 0.00834, "grad_norm": 0.9810039542594958, "learning_rate": 0.002502, "loss": 4.6274, "step": 834 }, { "epoch": 0.00835, "grad_norm": 0.8876541499902995, "learning_rate": 0.002505, "loss": 4.6197, "step": 835 }, { "epoch": 0.00836, "grad_norm": 0.903016402602156, "learning_rate": 0.002508, "loss": 4.6457, "step": 836 }, { "epoch": 0.00837, "grad_norm": 0.9089070943446178, "learning_rate": 0.0025109999999999998, "loss": 4.6001, "step": 837 }, { "epoch": 0.00838, "grad_norm": 1.0056159571167094, "learning_rate": 0.0025139999999999997, "loss": 4.6276, "step": 838 }, { "epoch": 0.00839, "grad_norm": 0.9338401740679543, "learning_rate": 0.002517, "loss": 4.6258, "step": 839 }, { "epoch": 0.0084, "grad_norm": 0.8595710991386702, "learning_rate": 0.00252, "loss": 4.6076, "step": 840 }, { "epoch": 0.00841, "grad_norm": 0.8408169361023179, "learning_rate": 0.002523, "loss": 4.5981, "step": 841 }, { "epoch": 0.00842, "grad_norm": 0.9223461051136856, "learning_rate": 0.002526, "loss": 4.6018, "step": 842 }, { "epoch": 0.00843, "grad_norm": 0.8967298269444864, "learning_rate": 0.002529, "loss": 4.5992, "step": 843 }, { "epoch": 0.00844, "grad_norm": 1.0054950179651305, "learning_rate": 0.002532, "loss": 4.6384, "step": 844 }, { "epoch": 0.00845, "grad_norm": 0.9421873102957943, "learning_rate": 0.002535, "loss": 4.597, "step": 845 }, { "epoch": 0.00846, "grad_norm": 0.7813468730903476, "learning_rate": 0.002538, "loss": 4.612, "step": 846 }, { "epoch": 0.00847, "grad_norm": 0.7336538825841323, "learning_rate": 0.002541, "loss": 4.5922, "step": 847 }, { "epoch": 0.00848, "grad_norm": 0.7490910216536115, "learning_rate": 0.002544, "loss": 4.5881, "step": 848 }, { "epoch": 0.00849, "grad_norm": 0.7281767237417216, "learning_rate": 0.002547, "loss": 4.5791, "step": 849 }, { "epoch": 0.0085, "grad_norm": 0.6108432594850526, "learning_rate": 0.00255, "loss": 4.5847, "step": 850 }, { "epoch": 0.00851, "grad_norm": 0.6024151823798927, "learning_rate": 0.002553, "loss": 4.5988, "step": 851 }, { "epoch": 0.00852, "grad_norm": 0.4780185378158163, "learning_rate": 0.002556, "loss": 4.5828, "step": 852 }, { "epoch": 0.00853, "grad_norm": 0.5435830639563773, "learning_rate": 0.002559, "loss": 4.5614, "step": 853 }, { "epoch": 0.00854, "grad_norm": 0.6028384776402124, "learning_rate": 0.002562, "loss": 4.5542, "step": 854 }, { "epoch": 0.00855, "grad_norm": 0.5885480983638159, "learning_rate": 0.002565, "loss": 4.5306, "step": 855 }, { "epoch": 0.00856, "grad_norm": 0.6060410896547319, "learning_rate": 0.002568, "loss": 4.5634, "step": 856 }, { "epoch": 0.00857, "grad_norm": 0.5609885323605243, "learning_rate": 0.002571, "loss": 4.5495, "step": 857 }, { "epoch": 0.00858, "grad_norm": 0.5199675893089152, "learning_rate": 0.002574, "loss": 4.541, "step": 858 }, { "epoch": 0.00859, "grad_norm": 0.5200210947429836, "learning_rate": 0.002577, "loss": 4.5318, "step": 859 }, { "epoch": 0.0086, "grad_norm": 0.5623426902483657, "learning_rate": 0.00258, "loss": 4.5597, "step": 860 }, { "epoch": 0.00861, "grad_norm": 0.6385070114593622, "learning_rate": 0.0025830000000000002, "loss": 4.5763, "step": 861 }, { "epoch": 0.00862, "grad_norm": 0.7215967762933786, "learning_rate": 0.002586, "loss": 4.5292, "step": 862 }, { "epoch": 0.00863, "grad_norm": 0.8778870896923412, "learning_rate": 0.002589, "loss": 4.559, "step": 863 }, { "epoch": 0.00864, "grad_norm": 1.1440074056398795, "learning_rate": 0.002592, "loss": 4.547, "step": 864 }, { "epoch": 0.00865, "grad_norm": 0.9219223071258862, "learning_rate": 0.002595, "loss": 4.5491, "step": 865 }, { "epoch": 0.00866, "grad_norm": 1.065047382603863, "learning_rate": 0.002598, "loss": 4.5845, "step": 866 }, { "epoch": 0.00867, "grad_norm": 0.9527212983309642, "learning_rate": 0.002601, "loss": 4.5736, "step": 867 }, { "epoch": 0.00868, "grad_norm": 0.7943624191664302, "learning_rate": 0.002604, "loss": 4.5474, "step": 868 }, { "epoch": 0.00869, "grad_norm": 0.671885521142916, "learning_rate": 0.002607, "loss": 4.5536, "step": 869 }, { "epoch": 0.0087, "grad_norm": 0.6527559883208905, "learning_rate": 0.00261, "loss": 4.54, "step": 870 }, { "epoch": 0.00871, "grad_norm": 0.6266922867904314, "learning_rate": 0.002613, "loss": 4.5522, "step": 871 }, { "epoch": 0.00872, "grad_norm": 0.5879044949928981, "learning_rate": 0.002616, "loss": 4.5345, "step": 872 }, { "epoch": 0.00873, "grad_norm": 0.5844885657438569, "learning_rate": 0.0026190000000000002, "loss": 4.5441, "step": 873 }, { "epoch": 0.00874, "grad_norm": 0.5286751889514939, "learning_rate": 0.002622, "loss": 4.5573, "step": 874 }, { "epoch": 0.00875, "grad_norm": 0.5494502975686567, "learning_rate": 0.002625, "loss": 4.5051, "step": 875 }, { "epoch": 0.00876, "grad_norm": 0.6269645893334619, "learning_rate": 0.002628, "loss": 4.5356, "step": 876 }, { "epoch": 0.00877, "grad_norm": 0.6901692969476347, "learning_rate": 0.002631, "loss": 4.5537, "step": 877 }, { "epoch": 0.00878, "grad_norm": 0.7433571199864201, "learning_rate": 0.002634, "loss": 4.4894, "step": 878 }, { "epoch": 0.00879, "grad_norm": 0.7806263902694818, "learning_rate": 0.002637, "loss": 4.5303, "step": 879 }, { "epoch": 0.0088, "grad_norm": 0.7694352427135146, "learning_rate": 0.00264, "loss": 4.5353, "step": 880 }, { "epoch": 0.00881, "grad_norm": 0.6938035372615263, "learning_rate": 0.002643, "loss": 4.5351, "step": 881 }, { "epoch": 0.00882, "grad_norm": 0.672108824408367, "learning_rate": 0.002646, "loss": 4.5331, "step": 882 }, { "epoch": 0.00883, "grad_norm": 0.6847246415873477, "learning_rate": 0.002649, "loss": 4.524, "step": 883 }, { "epoch": 0.00884, "grad_norm": 0.8034466162851082, "learning_rate": 0.0026520000000000003, "loss": 4.5123, "step": 884 }, { "epoch": 0.00885, "grad_norm": 0.8504740488359851, "learning_rate": 0.0026550000000000002, "loss": 4.5174, "step": 885 }, { "epoch": 0.00886, "grad_norm": 0.6814828631263485, "learning_rate": 0.002658, "loss": 4.526, "step": 886 }, { "epoch": 0.00887, "grad_norm": 0.7764171535679266, "learning_rate": 0.002661, "loss": 4.5254, "step": 887 }, { "epoch": 0.00888, "grad_norm": 0.864178556132455, "learning_rate": 0.002664, "loss": 4.5323, "step": 888 }, { "epoch": 0.00889, "grad_norm": 0.9963366133988169, "learning_rate": 0.002667, "loss": 4.4915, "step": 889 }, { "epoch": 0.0089, "grad_norm": 0.9684646116139818, "learning_rate": 0.00267, "loss": 4.5454, "step": 890 }, { "epoch": 0.00891, "grad_norm": 0.8019607803647243, "learning_rate": 0.002673, "loss": 4.523, "step": 891 }, { "epoch": 0.00892, "grad_norm": 0.9082109544296421, "learning_rate": 0.002676, "loss": 4.5431, "step": 892 }, { "epoch": 0.00893, "grad_norm": 0.9728985136087724, "learning_rate": 0.002679, "loss": 4.5212, "step": 893 }, { "epoch": 0.00894, "grad_norm": 1.0580229080074564, "learning_rate": 0.002682, "loss": 4.5156, "step": 894 }, { "epoch": 0.00895, "grad_norm": 0.8166896686472195, "learning_rate": 0.0026850000000000003, "loss": 4.5188, "step": 895 }, { "epoch": 0.00896, "grad_norm": 0.8942454442625324, "learning_rate": 0.0026880000000000003, "loss": 4.5121, "step": 896 }, { "epoch": 0.00897, "grad_norm": 0.9627689980406442, "learning_rate": 0.0026910000000000002, "loss": 4.5406, "step": 897 }, { "epoch": 0.00898, "grad_norm": 0.915209672072645, "learning_rate": 0.002694, "loss": 4.5291, "step": 898 }, { "epoch": 0.00899, "grad_norm": 0.8008505248098504, "learning_rate": 0.002697, "loss": 4.5313, "step": 899 }, { "epoch": 0.009, "grad_norm": 0.974662986907947, "learning_rate": 0.0027, "loss": 4.5627, "step": 900 }, { "epoch": 0.00901, "grad_norm": 1.1878109077704273, "learning_rate": 0.002703, "loss": 4.5336, "step": 901 }, { "epoch": 0.00902, "grad_norm": 0.7391945290133592, "learning_rate": 0.002706, "loss": 4.5248, "step": 902 }, { "epoch": 0.00903, "grad_norm": 0.7310904041760188, "learning_rate": 0.002709, "loss": 4.5195, "step": 903 }, { "epoch": 0.00904, "grad_norm": 0.763540341773903, "learning_rate": 0.002712, "loss": 4.4985, "step": 904 }, { "epoch": 0.00905, "grad_norm": 0.922476362328922, "learning_rate": 0.002715, "loss": 4.5209, "step": 905 }, { "epoch": 0.00906, "grad_norm": 0.9574009618515481, "learning_rate": 0.002718, "loss": 4.5379, "step": 906 }, { "epoch": 0.00907, "grad_norm": 1.035774941473643, "learning_rate": 0.0027210000000000003, "loss": 4.5538, "step": 907 }, { "epoch": 0.00908, "grad_norm": 1.0191319400722418, "learning_rate": 0.0027240000000000003, "loss": 4.5279, "step": 908 }, { "epoch": 0.00909, "grad_norm": 1.1103950353838743, "learning_rate": 0.0027270000000000003, "loss": 4.554, "step": 909 }, { "epoch": 0.0091, "grad_norm": 0.8691030580510016, "learning_rate": 0.0027300000000000002, "loss": 4.5543, "step": 910 }, { "epoch": 0.00911, "grad_norm": 0.7723878752960539, "learning_rate": 0.002733, "loss": 4.5035, "step": 911 }, { "epoch": 0.00912, "grad_norm": 0.8180906874107354, "learning_rate": 0.002736, "loss": 4.5117, "step": 912 }, { "epoch": 0.00913, "grad_norm": 0.8148069350423363, "learning_rate": 0.002739, "loss": 4.505, "step": 913 }, { "epoch": 0.00914, "grad_norm": 0.7118677869163097, "learning_rate": 0.002742, "loss": 4.5326, "step": 914 }, { "epoch": 0.00915, "grad_norm": 0.67221021637466, "learning_rate": 0.002745, "loss": 4.4896, "step": 915 }, { "epoch": 0.00916, "grad_norm": 0.7521734182214744, "learning_rate": 0.002748, "loss": 4.4742, "step": 916 }, { "epoch": 0.00917, "grad_norm": 0.6660072639776646, "learning_rate": 0.002751, "loss": 4.4809, "step": 917 }, { "epoch": 0.00918, "grad_norm": 0.5671697873930601, "learning_rate": 0.0027540000000000004, "loss": 4.4963, "step": 918 }, { "epoch": 0.00919, "grad_norm": 0.5594822914263968, "learning_rate": 0.0027570000000000003, "loss": 4.4754, "step": 919 }, { "epoch": 0.0092, "grad_norm": 0.6080421327199805, "learning_rate": 0.0027600000000000003, "loss": 4.4903, "step": 920 }, { "epoch": 0.00921, "grad_norm": 0.728784609109995, "learning_rate": 0.0027630000000000003, "loss": 4.4759, "step": 921 }, { "epoch": 0.00922, "grad_norm": 0.8477951683682543, "learning_rate": 0.0027660000000000002, "loss": 4.5076, "step": 922 }, { "epoch": 0.00923, "grad_norm": 0.6016579194588629, "learning_rate": 0.002769, "loss": 4.4637, "step": 923 }, { "epoch": 0.00924, "grad_norm": 0.6726777284073512, "learning_rate": 0.002772, "loss": 4.4919, "step": 924 }, { "epoch": 0.00925, "grad_norm": 0.7061399279754877, "learning_rate": 0.002775, "loss": 4.4751, "step": 925 }, { "epoch": 0.00926, "grad_norm": 0.6525850527724611, "learning_rate": 0.002778, "loss": 4.4844, "step": 926 }, { "epoch": 0.00927, "grad_norm": 0.6775355167555962, "learning_rate": 0.002781, "loss": 4.4493, "step": 927 }, { "epoch": 0.00928, "grad_norm": 0.660488843824027, "learning_rate": 0.002784, "loss": 4.493, "step": 928 }, { "epoch": 0.00929, "grad_norm": 0.6375906920183253, "learning_rate": 0.0027870000000000004, "loss": 4.4714, "step": 929 }, { "epoch": 0.0093, "grad_norm": 0.7170906127689122, "learning_rate": 0.0027900000000000004, "loss": 4.4557, "step": 930 }, { "epoch": 0.00931, "grad_norm": 0.7727209257871167, "learning_rate": 0.0027930000000000003, "loss": 4.4706, "step": 931 }, { "epoch": 0.00932, "grad_norm": 0.759902395005458, "learning_rate": 0.0027960000000000003, "loss": 4.4795, "step": 932 }, { "epoch": 0.00933, "grad_norm": 0.9089287828754257, "learning_rate": 0.0027990000000000003, "loss": 4.4536, "step": 933 }, { "epoch": 0.00934, "grad_norm": 0.9561562356933814, "learning_rate": 0.0028020000000000002, "loss": 4.4859, "step": 934 }, { "epoch": 0.00935, "grad_norm": 1.010681147518413, "learning_rate": 0.002805, "loss": 4.5044, "step": 935 }, { "epoch": 0.00936, "grad_norm": 0.960270788501537, "learning_rate": 0.002808, "loss": 4.5076, "step": 936 }, { "epoch": 0.00937, "grad_norm": 0.8170703531029062, "learning_rate": 0.002811, "loss": 4.5151, "step": 937 }, { "epoch": 0.00938, "grad_norm": 0.6779931914379209, "learning_rate": 0.002814, "loss": 4.4854, "step": 938 }, { "epoch": 0.00939, "grad_norm": 0.6509733323350428, "learning_rate": 0.002817, "loss": 4.4771, "step": 939 }, { "epoch": 0.0094, "grad_norm": 0.6633333111873998, "learning_rate": 0.00282, "loss": 4.4796, "step": 940 }, { "epoch": 0.00941, "grad_norm": 0.7449615790040228, "learning_rate": 0.002823, "loss": 4.4576, "step": 941 }, { "epoch": 0.00942, "grad_norm": 0.7696922222349031, "learning_rate": 0.002826, "loss": 4.4823, "step": 942 }, { "epoch": 0.00943, "grad_norm": 0.6087986992050594, "learning_rate": 0.002829, "loss": 4.4688, "step": 943 }, { "epoch": 0.00944, "grad_norm": 0.5228736659630974, "learning_rate": 0.002832, "loss": 4.4678, "step": 944 }, { "epoch": 0.00945, "grad_norm": 0.5754533269053475, "learning_rate": 0.002835, "loss": 4.4713, "step": 945 }, { "epoch": 0.00946, "grad_norm": 0.6164648444392735, "learning_rate": 0.002838, "loss": 4.4832, "step": 946 }, { "epoch": 0.00947, "grad_norm": 0.6419505093155794, "learning_rate": 0.0028409999999999998, "loss": 4.4556, "step": 947 }, { "epoch": 0.00948, "grad_norm": 0.556707209906786, "learning_rate": 0.0028439999999999997, "loss": 4.4351, "step": 948 }, { "epoch": 0.00949, "grad_norm": 0.5031407625923785, "learning_rate": 0.002847, "loss": 4.4737, "step": 949 }, { "epoch": 0.0095, "grad_norm": 0.5881138889925357, "learning_rate": 0.00285, "loss": 4.4655, "step": 950 }, { "epoch": 0.00951, "grad_norm": 0.6194945571715194, "learning_rate": 0.002853, "loss": 4.4418, "step": 951 }, { "epoch": 0.00952, "grad_norm": 0.6965416314433459, "learning_rate": 0.002856, "loss": 4.4704, "step": 952 }, { "epoch": 0.00953, "grad_norm": 0.7506696864658969, "learning_rate": 0.002859, "loss": 4.4161, "step": 953 }, { "epoch": 0.00954, "grad_norm": 0.6440899731036704, "learning_rate": 0.002862, "loss": 4.4703, "step": 954 }, { "epoch": 0.00955, "grad_norm": 0.7716821398038454, "learning_rate": 0.002865, "loss": 4.4601, "step": 955 }, { "epoch": 0.00956, "grad_norm": 0.9111507025485583, "learning_rate": 0.002868, "loss": 4.4578, "step": 956 }, { "epoch": 0.00957, "grad_norm": 0.8443539201487298, "learning_rate": 0.002871, "loss": 4.4727, "step": 957 }, { "epoch": 0.00958, "grad_norm": 0.7989694313181581, "learning_rate": 0.002874, "loss": 4.4582, "step": 958 }, { "epoch": 0.00959, "grad_norm": 0.7547037856292312, "learning_rate": 0.002877, "loss": 4.4482, "step": 959 }, { "epoch": 0.0096, "grad_norm": 0.7121103430025651, "learning_rate": 0.0028799999999999997, "loss": 4.4466, "step": 960 }, { "epoch": 0.00961, "grad_norm": 0.6758285086831927, "learning_rate": 0.002883, "loss": 4.4209, "step": 961 }, { "epoch": 0.00962, "grad_norm": 0.5336350485211147, "learning_rate": 0.002886, "loss": 4.4436, "step": 962 }, { "epoch": 0.00963, "grad_norm": 0.5703472269796636, "learning_rate": 0.002889, "loss": 4.4467, "step": 963 }, { "epoch": 0.00964, "grad_norm": 0.6524118600131552, "learning_rate": 0.002892, "loss": 4.4355, "step": 964 }, { "epoch": 0.00965, "grad_norm": 0.7342343756321769, "learning_rate": 0.002895, "loss": 4.4144, "step": 965 }, { "epoch": 0.00966, "grad_norm": 0.9544017629329996, "learning_rate": 0.002898, "loss": 4.4408, "step": 966 }, { "epoch": 0.00967, "grad_norm": 1.1065817167809169, "learning_rate": 0.002901, "loss": 4.4927, "step": 967 }, { "epoch": 0.00968, "grad_norm": 0.9829826223932284, "learning_rate": 0.002904, "loss": 4.4568, "step": 968 }, { "epoch": 0.00969, "grad_norm": 0.8112557218556605, "learning_rate": 0.002907, "loss": 4.4805, "step": 969 }, { "epoch": 0.0097, "grad_norm": 0.983264710703698, "learning_rate": 0.00291, "loss": 4.4488, "step": 970 }, { "epoch": 0.00971, "grad_norm": 1.010591995610007, "learning_rate": 0.002913, "loss": 4.4796, "step": 971 }, { "epoch": 0.00972, "grad_norm": 0.9930743768968635, "learning_rate": 0.002916, "loss": 4.4798, "step": 972 }, { "epoch": 0.00973, "grad_norm": 1.1817662624154759, "learning_rate": 0.002919, "loss": 4.4931, "step": 973 }, { "epoch": 0.00974, "grad_norm": 1.1915796929226945, "learning_rate": 0.002922, "loss": 4.4839, "step": 974 }, { "epoch": 0.00975, "grad_norm": 0.9880254500237188, "learning_rate": 0.002925, "loss": 4.4969, "step": 975 }, { "epoch": 0.00976, "grad_norm": 1.0394868289507098, "learning_rate": 0.002928, "loss": 4.4906, "step": 976 }, { "epoch": 0.00977, "grad_norm": 1.1242485803004214, "learning_rate": 0.002931, "loss": 4.5346, "step": 977 }, { "epoch": 0.00978, "grad_norm": 1.1287291198302327, "learning_rate": 0.002934, "loss": 4.5134, "step": 978 }, { "epoch": 0.00979, "grad_norm": 0.937365777135907, "learning_rate": 0.002937, "loss": 4.4945, "step": 979 }, { "epoch": 0.0098, "grad_norm": 0.9999316178288561, "learning_rate": 0.00294, "loss": 4.4906, "step": 980 }, { "epoch": 0.00981, "grad_norm": 1.0692695747943242, "learning_rate": 0.002943, "loss": 4.5166, "step": 981 }, { "epoch": 0.00982, "grad_norm": 0.855671043658802, "learning_rate": 0.002946, "loss": 4.4983, "step": 982 }, { "epoch": 0.00983, "grad_norm": 0.8819131773225922, "learning_rate": 0.0029490000000000002, "loss": 4.4932, "step": 983 }, { "epoch": 0.00984, "grad_norm": 0.8474323035176099, "learning_rate": 0.002952, "loss": 4.4794, "step": 984 }, { "epoch": 0.00985, "grad_norm": 0.8010021292921615, "learning_rate": 0.002955, "loss": 4.4955, "step": 985 }, { "epoch": 0.00986, "grad_norm": 0.6778548937329826, "learning_rate": 0.002958, "loss": 4.4543, "step": 986 }, { "epoch": 0.00987, "grad_norm": 0.6335901643582099, "learning_rate": 0.002961, "loss": 4.4618, "step": 987 }, { "epoch": 0.00988, "grad_norm": 0.6898288520048222, "learning_rate": 0.002964, "loss": 4.4934, "step": 988 }, { "epoch": 0.00989, "grad_norm": 0.6968481266470886, "learning_rate": 0.002967, "loss": 4.4338, "step": 989 }, { "epoch": 0.0099, "grad_norm": 0.7462093807424477, "learning_rate": 0.00297, "loss": 4.4706, "step": 990 }, { "epoch": 0.00991, "grad_norm": 0.7437971839370325, "learning_rate": 0.002973, "loss": 4.4697, "step": 991 }, { "epoch": 0.00992, "grad_norm": 0.6722774287362101, "learning_rate": 0.002976, "loss": 4.4576, "step": 992 }, { "epoch": 0.00993, "grad_norm": 0.6583210228081078, "learning_rate": 0.002979, "loss": 4.4522, "step": 993 }, { "epoch": 0.00994, "grad_norm": 0.6827391668775007, "learning_rate": 0.002982, "loss": 4.4553, "step": 994 }, { "epoch": 0.00995, "grad_norm": 0.7054299500075728, "learning_rate": 0.0029850000000000002, "loss": 4.4538, "step": 995 }, { "epoch": 0.00996, "grad_norm": 0.7382372248626952, "learning_rate": 0.002988, "loss": 4.4592, "step": 996 }, { "epoch": 0.00997, "grad_norm": 0.6621181102222602, "learning_rate": 0.002991, "loss": 4.4437, "step": 997 }, { "epoch": 0.00998, "grad_norm": 0.6532026522918166, "learning_rate": 0.002994, "loss": 4.4467, "step": 998 }, { "epoch": 0.00999, "grad_norm": 0.6863970198472243, "learning_rate": 0.002997, "loss": 4.4317, "step": 999 }, { "epoch": 0.01, "grad_norm": 0.6643394709769835, "learning_rate": 0.003, "loss": 4.4529, "step": 1000 }, { "epoch": 0.01001, "grad_norm": 0.5664327532419113, "learning_rate": 0.003, "loss": 4.4443, "step": 1001 }, { "epoch": 0.01002, "grad_norm": 0.5614282124245983, "learning_rate": 0.003, "loss": 4.4107, "step": 1002 }, { "epoch": 0.01003, "grad_norm": 0.5395347922472447, "learning_rate": 0.003, "loss": 4.4319, "step": 1003 }, { "epoch": 0.01004, "grad_norm": 0.5638853817459814, "learning_rate": 0.003, "loss": 4.4194, "step": 1004 }, { "epoch": 0.01005, "grad_norm": 0.5916080142589283, "learning_rate": 0.003, "loss": 4.3932, "step": 1005 }, { "epoch": 0.01006, "grad_norm": 0.5700688347090367, "learning_rate": 0.003, "loss": 4.3913, "step": 1006 }, { "epoch": 0.01007, "grad_norm": 0.5618557938751672, "learning_rate": 0.003, "loss": 4.4343, "step": 1007 }, { "epoch": 0.01008, "grad_norm": 0.6176294799951538, "learning_rate": 0.003, "loss": 4.4449, "step": 1008 }, { "epoch": 0.01009, "grad_norm": 0.6917308148051521, "learning_rate": 0.003, "loss": 4.4152, "step": 1009 }, { "epoch": 0.0101, "grad_norm": 0.7355219940735465, "learning_rate": 0.003, "loss": 4.4242, "step": 1010 }, { "epoch": 0.01011, "grad_norm": 0.8258393023556594, "learning_rate": 0.003, "loss": 4.437, "step": 1011 }, { "epoch": 0.01012, "grad_norm": 0.7883368033913232, "learning_rate": 0.003, "loss": 4.4273, "step": 1012 }, { "epoch": 0.01013, "grad_norm": 0.8421670780561094, "learning_rate": 0.003, "loss": 4.453, "step": 1013 }, { "epoch": 0.01014, "grad_norm": 0.7611364620814401, "learning_rate": 0.003, "loss": 4.3906, "step": 1014 }, { "epoch": 0.01015, "grad_norm": 0.8948224171004916, "learning_rate": 0.003, "loss": 4.4422, "step": 1015 }, { "epoch": 0.01016, "grad_norm": 0.9330968648547454, "learning_rate": 0.003, "loss": 4.4537, "step": 1016 }, { "epoch": 0.01017, "grad_norm": 0.9071988294298332, "learning_rate": 0.003, "loss": 4.4046, "step": 1017 }, { "epoch": 0.01018, "grad_norm": 0.9028445315923417, "learning_rate": 0.003, "loss": 4.4536, "step": 1018 }, { "epoch": 0.01019, "grad_norm": 0.701398424995911, "learning_rate": 0.003, "loss": 4.4297, "step": 1019 }, { "epoch": 0.0102, "grad_norm": 0.7422831527736846, "learning_rate": 0.003, "loss": 4.428, "step": 1020 }, { "epoch": 0.01021, "grad_norm": 0.7468650100074169, "learning_rate": 0.003, "loss": 4.4136, "step": 1021 }, { "epoch": 0.01022, "grad_norm": 0.9325291879496833, "learning_rate": 0.003, "loss": 4.4294, "step": 1022 }, { "epoch": 0.01023, "grad_norm": 0.8401676161943546, "learning_rate": 0.003, "loss": 4.4261, "step": 1023 }, { "epoch": 0.01024, "grad_norm": 0.8270343729935282, "learning_rate": 0.003, "loss": 4.3898, "step": 1024 }, { "epoch": 0.01025, "grad_norm": 0.7180913593704098, "learning_rate": 0.003, "loss": 4.4147, "step": 1025 }, { "epoch": 0.01026, "grad_norm": 0.6277034329789435, "learning_rate": 0.003, "loss": 4.4039, "step": 1026 }, { "epoch": 0.01027, "grad_norm": 0.5318737503801085, "learning_rate": 0.003, "loss": 4.4219, "step": 1027 }, { "epoch": 0.01028, "grad_norm": 0.4999785226446009, "learning_rate": 0.003, "loss": 4.4015, "step": 1028 }, { "epoch": 0.01029, "grad_norm": 0.48973961084120876, "learning_rate": 0.003, "loss": 4.3769, "step": 1029 }, { "epoch": 0.0103, "grad_norm": 0.5942733082980836, "learning_rate": 0.003, "loss": 4.4125, "step": 1030 }, { "epoch": 0.01031, "grad_norm": 0.7229752342980397, "learning_rate": 0.003, "loss": 4.4274, "step": 1031 }, { "epoch": 0.01032, "grad_norm": 0.7373514084562369, "learning_rate": 0.003, "loss": 4.4145, "step": 1032 }, { "epoch": 0.01033, "grad_norm": 0.6482800157489234, "learning_rate": 0.003, "loss": 4.4346, "step": 1033 }, { "epoch": 0.01034, "grad_norm": 0.6524184922218951, "learning_rate": 0.003, "loss": 4.4051, "step": 1034 }, { "epoch": 0.01035, "grad_norm": 0.705389079509784, "learning_rate": 0.003, "loss": 4.4073, "step": 1035 }, { "epoch": 0.01036, "grad_norm": 0.7235541204605349, "learning_rate": 0.003, "loss": 4.3939, "step": 1036 }, { "epoch": 0.01037, "grad_norm": 0.6796343657160102, "learning_rate": 0.003, "loss": 4.4161, "step": 1037 }, { "epoch": 0.01038, "grad_norm": 0.5773141410598708, "learning_rate": 0.003, "loss": 4.4121, "step": 1038 }, { "epoch": 0.01039, "grad_norm": 0.5565285106963757, "learning_rate": 0.003, "loss": 4.4014, "step": 1039 }, { "epoch": 0.0104, "grad_norm": 0.526665138631767, "learning_rate": 0.003, "loss": 4.4124, "step": 1040 }, { "epoch": 0.01041, "grad_norm": 0.5117793067919421, "learning_rate": 0.003, "loss": 4.3909, "step": 1041 }, { "epoch": 0.01042, "grad_norm": 0.5092553549098504, "learning_rate": 0.003, "loss": 4.4167, "step": 1042 }, { "epoch": 0.01043, "grad_norm": 0.5543198083705748, "learning_rate": 0.003, "loss": 4.3593, "step": 1043 }, { "epoch": 0.01044, "grad_norm": 0.600481111898673, "learning_rate": 0.003, "loss": 4.3987, "step": 1044 }, { "epoch": 0.01045, "grad_norm": 0.5965980004717368, "learning_rate": 0.003, "loss": 4.4162, "step": 1045 }, { "epoch": 0.01046, "grad_norm": 0.6740252530686676, "learning_rate": 0.003, "loss": 4.3928, "step": 1046 }, { "epoch": 0.01047, "grad_norm": 0.7361053030528937, "learning_rate": 0.003, "loss": 4.3997, "step": 1047 }, { "epoch": 0.01048, "grad_norm": 0.767513361829787, "learning_rate": 0.003, "loss": 4.3936, "step": 1048 }, { "epoch": 0.01049, "grad_norm": 0.6855190682446736, "learning_rate": 0.003, "loss": 4.3699, "step": 1049 }, { "epoch": 0.0105, "grad_norm": 0.5741691502464937, "learning_rate": 0.003, "loss": 4.4062, "step": 1050 }, { "epoch": 0.01051, "grad_norm": 0.6230464159511063, "learning_rate": 0.003, "loss": 4.3827, "step": 1051 }, { "epoch": 0.01052, "grad_norm": 0.6366737529257143, "learning_rate": 0.003, "loss": 4.3967, "step": 1052 }, { "epoch": 0.01053, "grad_norm": 0.6755427193415695, "learning_rate": 0.003, "loss": 4.4056, "step": 1053 }, { "epoch": 0.01054, "grad_norm": 0.6231015147472246, "learning_rate": 0.003, "loss": 4.3965, "step": 1054 }, { "epoch": 0.01055, "grad_norm": 0.5394645878748071, "learning_rate": 0.003, "loss": 4.3715, "step": 1055 }, { "epoch": 0.01056, "grad_norm": 0.5755128064540977, "learning_rate": 0.003, "loss": 4.3719, "step": 1056 }, { "epoch": 0.01057, "grad_norm": 0.5491694969265289, "learning_rate": 0.003, "loss": 4.3754, "step": 1057 }, { "epoch": 0.01058, "grad_norm": 0.7343919109861163, "learning_rate": 0.003, "loss": 4.375, "step": 1058 }, { "epoch": 0.01059, "grad_norm": 0.9361910412332192, "learning_rate": 0.003, "loss": 4.3892, "step": 1059 }, { "epoch": 0.0106, "grad_norm": 0.9065515100292879, "learning_rate": 0.003, "loss": 4.3901, "step": 1060 }, { "epoch": 0.01061, "grad_norm": 1.1211758916188983, "learning_rate": 0.003, "loss": 4.3972, "step": 1061 }, { "epoch": 0.01062, "grad_norm": 0.7834596971762835, "learning_rate": 0.003, "loss": 4.3907, "step": 1062 }, { "epoch": 0.01063, "grad_norm": 0.6963669031971059, "learning_rate": 0.003, "loss": 4.3832, "step": 1063 }, { "epoch": 0.01064, "grad_norm": 0.6238185439704477, "learning_rate": 0.003, "loss": 4.3945, "step": 1064 }, { "epoch": 0.01065, "grad_norm": 0.6958753605701185, "learning_rate": 0.003, "loss": 4.3993, "step": 1065 }, { "epoch": 0.01066, "grad_norm": 0.7129893176363267, "learning_rate": 0.003, "loss": 4.4025, "step": 1066 }, { "epoch": 0.01067, "grad_norm": 0.7862199002009355, "learning_rate": 0.003, "loss": 4.4178, "step": 1067 }, { "epoch": 0.01068, "grad_norm": 0.7422673244500091, "learning_rate": 0.003, "loss": 4.3984, "step": 1068 }, { "epoch": 0.01069, "grad_norm": 0.7997129592129516, "learning_rate": 0.003, "loss": 4.4072, "step": 1069 }, { "epoch": 0.0107, "grad_norm": 0.945099613944759, "learning_rate": 0.003, "loss": 4.4674, "step": 1070 }, { "epoch": 0.01071, "grad_norm": 0.8590095455876905, "learning_rate": 0.003, "loss": 4.387, "step": 1071 }, { "epoch": 0.01072, "grad_norm": 0.8651521698594221, "learning_rate": 0.003, "loss": 4.4227, "step": 1072 }, { "epoch": 0.01073, "grad_norm": 0.80387311045371, "learning_rate": 0.003, "loss": 4.4158, "step": 1073 }, { "epoch": 0.01074, "grad_norm": 0.8137862583554676, "learning_rate": 0.003, "loss": 4.3801, "step": 1074 }, { "epoch": 0.01075, "grad_norm": 0.8055768924194764, "learning_rate": 0.003, "loss": 4.4185, "step": 1075 }, { "epoch": 0.01076, "grad_norm": 0.8990845414650448, "learning_rate": 0.003, "loss": 4.4196, "step": 1076 }, { "epoch": 0.01077, "grad_norm": 0.9178768409478193, "learning_rate": 0.003, "loss": 4.4207, "step": 1077 }, { "epoch": 0.01078, "grad_norm": 0.9083931280693812, "learning_rate": 0.003, "loss": 4.3748, "step": 1078 }, { "epoch": 0.01079, "grad_norm": 0.861335694801563, "learning_rate": 0.003, "loss": 4.4104, "step": 1079 }, { "epoch": 0.0108, "grad_norm": 0.6640978224934152, "learning_rate": 0.003, "loss": 4.3777, "step": 1080 }, { "epoch": 0.01081, "grad_norm": 0.6684756830200824, "learning_rate": 0.003, "loss": 4.3836, "step": 1081 }, { "epoch": 0.01082, "grad_norm": 0.7577090594132565, "learning_rate": 0.003, "loss": 4.4054, "step": 1082 }, { "epoch": 0.01083, "grad_norm": 0.8160609336744046, "learning_rate": 0.003, "loss": 4.3924, "step": 1083 }, { "epoch": 0.01084, "grad_norm": 0.7897303142858512, "learning_rate": 0.003, "loss": 4.4011, "step": 1084 }, { "epoch": 0.01085, "grad_norm": 0.7064792552725174, "learning_rate": 0.003, "loss": 4.3862, "step": 1085 }, { "epoch": 0.01086, "grad_norm": 0.6262724475441191, "learning_rate": 0.003, "loss": 4.3571, "step": 1086 }, { "epoch": 0.01087, "grad_norm": 0.6391781173229293, "learning_rate": 0.003, "loss": 4.3729, "step": 1087 }, { "epoch": 0.01088, "grad_norm": 0.6632808145329404, "learning_rate": 0.003, "loss": 4.4092, "step": 1088 }, { "epoch": 0.01089, "grad_norm": 0.6686792714887265, "learning_rate": 0.003, "loss": 4.3783, "step": 1089 }, { "epoch": 0.0109, "grad_norm": 0.5882408044346112, "learning_rate": 0.003, "loss": 4.3992, "step": 1090 }, { "epoch": 0.01091, "grad_norm": 0.5565191786424966, "learning_rate": 0.003, "loss": 4.3662, "step": 1091 }, { "epoch": 0.01092, "grad_norm": 0.571165259700944, "learning_rate": 0.003, "loss": 4.3627, "step": 1092 }, { "epoch": 0.01093, "grad_norm": 0.5483152737763334, "learning_rate": 0.003, "loss": 4.379, "step": 1093 }, { "epoch": 0.01094, "grad_norm": 0.4816093735859256, "learning_rate": 0.003, "loss": 4.3664, "step": 1094 }, { "epoch": 0.01095, "grad_norm": 0.4607091969369753, "learning_rate": 0.003, "loss": 4.3913, "step": 1095 }, { "epoch": 0.01096, "grad_norm": 0.5948450571982326, "learning_rate": 0.003, "loss": 4.3662, "step": 1096 }, { "epoch": 0.01097, "grad_norm": 0.8991911041937081, "learning_rate": 0.003, "loss": 4.3648, "step": 1097 }, { "epoch": 0.01098, "grad_norm": 1.00898451325596, "learning_rate": 0.003, "loss": 4.4173, "step": 1098 }, { "epoch": 0.01099, "grad_norm": 0.837940726063793, "learning_rate": 0.003, "loss": 4.3702, "step": 1099 }, { "epoch": 0.011, "grad_norm": 1.0135265311826804, "learning_rate": 0.003, "loss": 4.3911, "step": 1100 }, { "epoch": 0.01101, "grad_norm": 1.0865706640064363, "learning_rate": 0.003, "loss": 4.4129, "step": 1101 }, { "epoch": 0.01102, "grad_norm": 0.9407674508321733, "learning_rate": 0.003, "loss": 4.4141, "step": 1102 }, { "epoch": 0.01103, "grad_norm": 0.7826637713653176, "learning_rate": 0.003, "loss": 4.4004, "step": 1103 }, { "epoch": 0.01104, "grad_norm": 0.6792138170427816, "learning_rate": 0.003, "loss": 4.3864, "step": 1104 }, { "epoch": 0.01105, "grad_norm": 0.7494779633947588, "learning_rate": 0.003, "loss": 4.4098, "step": 1105 }, { "epoch": 0.01106, "grad_norm": 0.7074202272250578, "learning_rate": 0.003, "loss": 4.3837, "step": 1106 }, { "epoch": 0.01107, "grad_norm": 0.6111631455372859, "learning_rate": 0.003, "loss": 4.3903, "step": 1107 }, { "epoch": 0.01108, "grad_norm": 0.5958647482211449, "learning_rate": 0.003, "loss": 4.3565, "step": 1108 }, { "epoch": 0.01109, "grad_norm": 0.615214127583215, "learning_rate": 0.003, "loss": 4.3726, "step": 1109 }, { "epoch": 0.0111, "grad_norm": 0.5327988413049348, "learning_rate": 0.003, "loss": 4.3878, "step": 1110 }, { "epoch": 0.01111, "grad_norm": 0.47023927407071675, "learning_rate": 0.003, "loss": 4.3622, "step": 1111 }, { "epoch": 0.01112, "grad_norm": 0.3776559700056012, "learning_rate": 0.003, "loss": 4.3759, "step": 1112 }, { "epoch": 0.01113, "grad_norm": 0.3763967476164199, "learning_rate": 0.003, "loss": 4.3414, "step": 1113 }, { "epoch": 0.01114, "grad_norm": 0.3564238802994098, "learning_rate": 0.003, "loss": 4.354, "step": 1114 }, { "epoch": 0.01115, "grad_norm": 0.30679188146183123, "learning_rate": 0.003, "loss": 4.3753, "step": 1115 }, { "epoch": 0.01116, "grad_norm": 0.33853412768388663, "learning_rate": 0.003, "loss": 4.3633, "step": 1116 }, { "epoch": 0.01117, "grad_norm": 0.3107462307737106, "learning_rate": 0.003, "loss": 4.3492, "step": 1117 }, { "epoch": 0.01118, "grad_norm": 0.32797709961339483, "learning_rate": 0.003, "loss": 4.352, "step": 1118 }, { "epoch": 0.01119, "grad_norm": 0.3013823503547338, "learning_rate": 0.003, "loss": 4.3543, "step": 1119 }, { "epoch": 0.0112, "grad_norm": 0.3198307315980734, "learning_rate": 0.003, "loss": 4.3378, "step": 1120 }, { "epoch": 0.01121, "grad_norm": 0.33595869744145906, "learning_rate": 0.003, "loss": 4.3478, "step": 1121 }, { "epoch": 0.01122, "grad_norm": 0.4236485736167927, "learning_rate": 0.003, "loss": 4.3259, "step": 1122 }, { "epoch": 0.01123, "grad_norm": 0.5207438928810102, "learning_rate": 0.003, "loss": 4.3165, "step": 1123 }, { "epoch": 0.01124, "grad_norm": 0.5897496309635056, "learning_rate": 0.003, "loss": 4.3639, "step": 1124 }, { "epoch": 0.01125, "grad_norm": 0.6251226930864733, "learning_rate": 0.003, "loss": 4.3351, "step": 1125 }, { "epoch": 0.01126, "grad_norm": 0.7904920259399445, "learning_rate": 0.003, "loss": 4.3485, "step": 1126 }, { "epoch": 0.01127, "grad_norm": 0.8706338403024567, "learning_rate": 0.003, "loss": 4.3878, "step": 1127 }, { "epoch": 0.01128, "grad_norm": 0.7520394017234492, "learning_rate": 0.003, "loss": 4.3618, "step": 1128 }, { "epoch": 0.01129, "grad_norm": 0.9434772506575799, "learning_rate": 0.003, "loss": 4.3617, "step": 1129 }, { "epoch": 0.0113, "grad_norm": 0.9515853773144796, "learning_rate": 0.003, "loss": 4.3566, "step": 1130 }, { "epoch": 0.01131, "grad_norm": 1.1049329865545139, "learning_rate": 0.003, "loss": 4.3775, "step": 1131 }, { "epoch": 0.01132, "grad_norm": 0.869968096155467, "learning_rate": 0.003, "loss": 4.3991, "step": 1132 }, { "epoch": 0.01133, "grad_norm": 0.7539769321914868, "learning_rate": 0.003, "loss": 4.3532, "step": 1133 }, { "epoch": 0.01134, "grad_norm": 0.8275263694532653, "learning_rate": 0.003, "loss": 4.3686, "step": 1134 }, { "epoch": 0.01135, "grad_norm": 0.8617681055772174, "learning_rate": 0.003, "loss": 4.402, "step": 1135 }, { "epoch": 0.01136, "grad_norm": 0.7602421394000872, "learning_rate": 0.003, "loss": 4.3721, "step": 1136 }, { "epoch": 0.01137, "grad_norm": 0.7778258812220352, "learning_rate": 0.003, "loss": 4.3963, "step": 1137 }, { "epoch": 0.01138, "grad_norm": 0.7594339489071814, "learning_rate": 0.003, "loss": 4.3534, "step": 1138 }, { "epoch": 0.01139, "grad_norm": 0.7671596268779747, "learning_rate": 0.003, "loss": 4.3643, "step": 1139 }, { "epoch": 0.0114, "grad_norm": 0.8353688690035959, "learning_rate": 0.003, "loss": 4.3455, "step": 1140 }, { "epoch": 0.01141, "grad_norm": 0.9712894348927099, "learning_rate": 0.003, "loss": 4.3818, "step": 1141 }, { "epoch": 0.01142, "grad_norm": 0.8605012156156574, "learning_rate": 0.003, "loss": 4.355, "step": 1142 }, { "epoch": 0.01143, "grad_norm": 0.7447870609491655, "learning_rate": 0.003, "loss": 4.3674, "step": 1143 }, { "epoch": 0.01144, "grad_norm": 0.8620443864386994, "learning_rate": 0.003, "loss": 4.339, "step": 1144 }, { "epoch": 0.01145, "grad_norm": 0.9495119843583, "learning_rate": 0.003, "loss": 4.3837, "step": 1145 }, { "epoch": 0.01146, "grad_norm": 0.8719913828537735, "learning_rate": 0.003, "loss": 4.3682, "step": 1146 }, { "epoch": 0.01147, "grad_norm": 0.8406491643071615, "learning_rate": 0.003, "loss": 4.3599, "step": 1147 }, { "epoch": 0.01148, "grad_norm": 0.7299655375485881, "learning_rate": 0.003, "loss": 4.3549, "step": 1148 }, { "epoch": 0.01149, "grad_norm": 0.7470410614101604, "learning_rate": 0.003, "loss": 4.3727, "step": 1149 }, { "epoch": 0.0115, "grad_norm": 0.7232429634523785, "learning_rate": 0.003, "loss": 4.364, "step": 1150 }, { "epoch": 0.01151, "grad_norm": 0.6295150712699281, "learning_rate": 0.003, "loss": 4.3586, "step": 1151 }, { "epoch": 0.01152, "grad_norm": 0.652544707149974, "learning_rate": 0.003, "loss": 4.3661, "step": 1152 }, { "epoch": 0.01153, "grad_norm": 0.5683271703897544, "learning_rate": 0.003, "loss": 4.3775, "step": 1153 }, { "epoch": 0.01154, "grad_norm": 0.535300431807245, "learning_rate": 0.003, "loss": 4.3515, "step": 1154 }, { "epoch": 0.01155, "grad_norm": 0.4766589820931375, "learning_rate": 0.003, "loss": 4.3377, "step": 1155 }, { "epoch": 0.01156, "grad_norm": 0.5530856703722951, "learning_rate": 0.003, "loss": 4.342, "step": 1156 }, { "epoch": 0.01157, "grad_norm": 0.6154599631908627, "learning_rate": 0.003, "loss": 4.374, "step": 1157 }, { "epoch": 0.01158, "grad_norm": 0.693370036269422, "learning_rate": 0.003, "loss": 4.3417, "step": 1158 }, { "epoch": 0.01159, "grad_norm": 0.7413452595288457, "learning_rate": 0.003, "loss": 4.3455, "step": 1159 }, { "epoch": 0.0116, "grad_norm": 0.7301771780442605, "learning_rate": 0.003, "loss": 4.3545, "step": 1160 }, { "epoch": 0.01161, "grad_norm": 0.8159054162369931, "learning_rate": 0.003, "loss": 4.3334, "step": 1161 }, { "epoch": 0.01162, "grad_norm": 0.7760523716092679, "learning_rate": 0.003, "loss": 4.3486, "step": 1162 }, { "epoch": 0.01163, "grad_norm": 0.6309863142169387, "learning_rate": 0.003, "loss": 4.36, "step": 1163 }, { "epoch": 0.01164, "grad_norm": 0.559610446270928, "learning_rate": 0.003, "loss": 4.3468, "step": 1164 }, { "epoch": 0.01165, "grad_norm": 0.5932728337340286, "learning_rate": 0.003, "loss": 4.3443, "step": 1165 }, { "epoch": 0.01166, "grad_norm": 0.47536439541847164, "learning_rate": 0.003, "loss": 4.3373, "step": 1166 }, { "epoch": 0.01167, "grad_norm": 0.4935599339579411, "learning_rate": 0.003, "loss": 4.3054, "step": 1167 }, { "epoch": 0.01168, "grad_norm": 0.530157986351205, "learning_rate": 0.003, "loss": 4.3136, "step": 1168 }, { "epoch": 0.01169, "grad_norm": 0.4039160553718301, "learning_rate": 0.003, "loss": 4.3497, "step": 1169 }, { "epoch": 0.0117, "grad_norm": 0.3887795730112425, "learning_rate": 0.003, "loss": 4.3331, "step": 1170 }, { "epoch": 0.01171, "grad_norm": 0.44146129518485994, "learning_rate": 0.003, "loss": 4.3186, "step": 1171 }, { "epoch": 0.01172, "grad_norm": 0.3842074037767514, "learning_rate": 0.003, "loss": 4.3032, "step": 1172 }, { "epoch": 0.01173, "grad_norm": 0.3741626855737148, "learning_rate": 0.003, "loss": 4.3368, "step": 1173 }, { "epoch": 0.01174, "grad_norm": 0.41728859284602393, "learning_rate": 0.003, "loss": 4.3131, "step": 1174 }, { "epoch": 0.01175, "grad_norm": 0.41981830032719913, "learning_rate": 0.003, "loss": 4.2965, "step": 1175 }, { "epoch": 0.01176, "grad_norm": 0.37957919503111737, "learning_rate": 0.003, "loss": 4.3296, "step": 1176 }, { "epoch": 0.01177, "grad_norm": 0.3992995566049025, "learning_rate": 0.003, "loss": 4.2992, "step": 1177 }, { "epoch": 0.01178, "grad_norm": 0.4902741144335086, "learning_rate": 0.003, "loss": 4.3211, "step": 1178 }, { "epoch": 0.01179, "grad_norm": 0.6458507017424667, "learning_rate": 0.003, "loss": 4.3023, "step": 1179 }, { "epoch": 0.0118, "grad_norm": 0.7925849610797445, "learning_rate": 0.003, "loss": 4.3757, "step": 1180 }, { "epoch": 0.01181, "grad_norm": 0.8606498974404746, "learning_rate": 0.003, "loss": 4.3669, "step": 1181 }, { "epoch": 0.01182, "grad_norm": 0.7553862661379784, "learning_rate": 0.003, "loss": 4.3259, "step": 1182 }, { "epoch": 0.01183, "grad_norm": 0.7241455767877882, "learning_rate": 0.003, "loss": 4.3159, "step": 1183 }, { "epoch": 0.01184, "grad_norm": 0.6945084653960217, "learning_rate": 0.003, "loss": 4.3593, "step": 1184 }, { "epoch": 0.01185, "grad_norm": 0.5809073742888989, "learning_rate": 0.003, "loss": 4.3238, "step": 1185 }, { "epoch": 0.01186, "grad_norm": 0.556060401057652, "learning_rate": 0.003, "loss": 4.3461, "step": 1186 }, { "epoch": 0.01187, "grad_norm": 0.6174052114657497, "learning_rate": 0.003, "loss": 4.3108, "step": 1187 }, { "epoch": 0.01188, "grad_norm": 0.721410101268809, "learning_rate": 0.003, "loss": 4.3288, "step": 1188 }, { "epoch": 0.01189, "grad_norm": 0.7947776232413513, "learning_rate": 0.003, "loss": 4.3281, "step": 1189 }, { "epoch": 0.0119, "grad_norm": 0.882572116743252, "learning_rate": 0.003, "loss": 4.3193, "step": 1190 }, { "epoch": 0.01191, "grad_norm": 1.0146227461981676, "learning_rate": 0.003, "loss": 4.307, "step": 1191 }, { "epoch": 0.01192, "grad_norm": 1.112972484482218, "learning_rate": 0.003, "loss": 4.3512, "step": 1192 }, { "epoch": 0.01193, "grad_norm": 0.9615449743933697, "learning_rate": 0.003, "loss": 4.3541, "step": 1193 }, { "epoch": 0.01194, "grad_norm": 0.8644326468862364, "learning_rate": 0.003, "loss": 4.3507, "step": 1194 }, { "epoch": 0.01195, "grad_norm": 0.8480336292760144, "learning_rate": 0.003, "loss": 4.3317, "step": 1195 }, { "epoch": 0.01196, "grad_norm": 0.8784381650624298, "learning_rate": 0.003, "loss": 4.3413, "step": 1196 }, { "epoch": 0.01197, "grad_norm": 0.9498997917047696, "learning_rate": 0.003, "loss": 4.3186, "step": 1197 }, { "epoch": 0.01198, "grad_norm": 0.8920952388458802, "learning_rate": 0.003, "loss": 4.388, "step": 1198 }, { "epoch": 0.01199, "grad_norm": 1.0871047324767358, "learning_rate": 0.003, "loss": 4.3742, "step": 1199 }, { "epoch": 0.012, "grad_norm": 0.9019122174295907, "learning_rate": 0.003, "loss": 4.3579, "step": 1200 }, { "epoch": 0.01201, "grad_norm": 0.85321834247841, "learning_rate": 0.003, "loss": 4.3858, "step": 1201 }, { "epoch": 0.01202, "grad_norm": 0.7349368268324239, "learning_rate": 0.003, "loss": 4.3505, "step": 1202 }, { "epoch": 0.01203, "grad_norm": 0.6774471285416843, "learning_rate": 0.003, "loss": 4.357, "step": 1203 }, { "epoch": 0.01204, "grad_norm": 0.6937988439444178, "learning_rate": 0.003, "loss": 4.3513, "step": 1204 }, { "epoch": 0.01205, "grad_norm": 0.6364064699328508, "learning_rate": 0.003, "loss": 4.3406, "step": 1205 }, { "epoch": 0.01206, "grad_norm": 0.522276421270424, "learning_rate": 0.003, "loss": 4.3798, "step": 1206 }, { "epoch": 0.01207, "grad_norm": 0.5076942128127889, "learning_rate": 0.003, "loss": 4.2994, "step": 1207 }, { "epoch": 0.01208, "grad_norm": 0.47287636588909837, "learning_rate": 0.003, "loss": 4.3565, "step": 1208 }, { "epoch": 0.01209, "grad_norm": 0.5041956058683869, "learning_rate": 0.003, "loss": 4.3379, "step": 1209 }, { "epoch": 0.0121, "grad_norm": 0.6167670848439076, "learning_rate": 0.003, "loss": 4.341, "step": 1210 }, { "epoch": 0.01211, "grad_norm": 0.7242996454774496, "learning_rate": 0.003, "loss": 4.3379, "step": 1211 }, { "epoch": 0.01212, "grad_norm": 0.8213908511974465, "learning_rate": 0.003, "loss": 4.3624, "step": 1212 }, { "epoch": 0.01213, "grad_norm": 0.6880698765717139, "learning_rate": 0.003, "loss": 4.3342, "step": 1213 }, { "epoch": 0.01214, "grad_norm": 0.5407616002767914, "learning_rate": 0.003, "loss": 4.311, "step": 1214 }, { "epoch": 0.01215, "grad_norm": 0.6707389680302347, "learning_rate": 0.003, "loss": 4.3237, "step": 1215 }, { "epoch": 0.01216, "grad_norm": 0.715155480014794, "learning_rate": 0.003, "loss": 4.3206, "step": 1216 }, { "epoch": 0.01217, "grad_norm": 0.6669611805526361, "learning_rate": 0.003, "loss": 4.3242, "step": 1217 }, { "epoch": 0.01218, "grad_norm": 0.6194536332440173, "learning_rate": 0.003, "loss": 4.3183, "step": 1218 }, { "epoch": 0.01219, "grad_norm": 0.5624325800012369, "learning_rate": 0.003, "loss": 4.3151, "step": 1219 }, { "epoch": 0.0122, "grad_norm": 0.5994752597130036, "learning_rate": 0.003, "loss": 4.3082, "step": 1220 }, { "epoch": 0.01221, "grad_norm": 0.5929964237485116, "learning_rate": 0.003, "loss": 4.3217, "step": 1221 }, { "epoch": 0.01222, "grad_norm": 0.5364789869662485, "learning_rate": 0.003, "loss": 4.3034, "step": 1222 }, { "epoch": 0.01223, "grad_norm": 0.4883935817063237, "learning_rate": 0.003, "loss": 4.2989, "step": 1223 }, { "epoch": 0.01224, "grad_norm": 0.5365327424367556, "learning_rate": 0.003, "loss": 4.2844, "step": 1224 }, { "epoch": 0.01225, "grad_norm": 0.5060580384449018, "learning_rate": 0.003, "loss": 4.3078, "step": 1225 }, { "epoch": 0.01226, "grad_norm": 0.4316062800492973, "learning_rate": 0.003, "loss": 4.297, "step": 1226 }, { "epoch": 0.01227, "grad_norm": 0.41324487305946883, "learning_rate": 0.003, "loss": 4.3243, "step": 1227 }, { "epoch": 0.01228, "grad_norm": 0.4686302520601058, "learning_rate": 0.003, "loss": 4.3085, "step": 1228 }, { "epoch": 0.01229, "grad_norm": 0.5830158408620507, "learning_rate": 0.003, "loss": 4.2989, "step": 1229 }, { "epoch": 0.0123, "grad_norm": 0.7143816453247004, "learning_rate": 0.003, "loss": 4.3393, "step": 1230 }, { "epoch": 0.01231, "grad_norm": 0.8518087843200909, "learning_rate": 0.003, "loss": 4.3134, "step": 1231 }, { "epoch": 0.01232, "grad_norm": 0.8467846529950723, "learning_rate": 0.003, "loss": 4.3028, "step": 1232 }, { "epoch": 0.01233, "grad_norm": 0.7843462879273329, "learning_rate": 0.003, "loss": 4.3052, "step": 1233 }, { "epoch": 0.01234, "grad_norm": 0.8956533482473612, "learning_rate": 0.003, "loss": 4.3298, "step": 1234 }, { "epoch": 0.01235, "grad_norm": 0.9162978759093423, "learning_rate": 0.003, "loss": 4.3266, "step": 1235 }, { "epoch": 0.01236, "grad_norm": 0.860682215697324, "learning_rate": 0.003, "loss": 4.3299, "step": 1236 }, { "epoch": 0.01237, "grad_norm": 0.851315880486755, "learning_rate": 0.003, "loss": 4.3393, "step": 1237 }, { "epoch": 0.01238, "grad_norm": 0.8294971616591855, "learning_rate": 0.003, "loss": 4.3358, "step": 1238 }, { "epoch": 0.01239, "grad_norm": 0.9437199934028814, "learning_rate": 0.003, "loss": 4.3282, "step": 1239 }, { "epoch": 0.0124, "grad_norm": 0.9106084651998833, "learning_rate": 0.003, "loss": 4.3137, "step": 1240 }, { "epoch": 0.01241, "grad_norm": 0.8351496076806352, "learning_rate": 0.003, "loss": 4.3273, "step": 1241 }, { "epoch": 0.01242, "grad_norm": 0.6918670619381783, "learning_rate": 0.003, "loss": 4.3257, "step": 1242 }, { "epoch": 0.01243, "grad_norm": 0.732924073441867, "learning_rate": 0.003, "loss": 4.3077, "step": 1243 }, { "epoch": 0.01244, "grad_norm": 0.7515551346375118, "learning_rate": 0.003, "loss": 4.3219, "step": 1244 }, { "epoch": 0.01245, "grad_norm": 0.635919850996669, "learning_rate": 0.003, "loss": 4.3108, "step": 1245 }, { "epoch": 0.01246, "grad_norm": 0.5245627080366145, "learning_rate": 0.003, "loss": 4.2806, "step": 1246 }, { "epoch": 0.01247, "grad_norm": 0.5964323822303592, "learning_rate": 0.003, "loss": 4.3378, "step": 1247 }, { "epoch": 0.01248, "grad_norm": 0.7016619320460757, "learning_rate": 0.003, "loss": 4.3217, "step": 1248 }, { "epoch": 0.01249, "grad_norm": 0.7231849999092693, "learning_rate": 0.003, "loss": 4.3154, "step": 1249 }, { "epoch": 0.0125, "grad_norm": 0.6036388669727554, "learning_rate": 0.003, "loss": 4.2869, "step": 1250 }, { "epoch": 0.01251, "grad_norm": 0.5253366712272899, "learning_rate": 0.003, "loss": 4.316, "step": 1251 }, { "epoch": 0.01252, "grad_norm": 0.46320485886810614, "learning_rate": 0.003, "loss": 4.2912, "step": 1252 }, { "epoch": 0.01253, "grad_norm": 0.4963582371471178, "learning_rate": 0.003, "loss": 4.2694, "step": 1253 }, { "epoch": 0.01254, "grad_norm": 0.5081054207596929, "learning_rate": 0.003, "loss": 4.3157, "step": 1254 }, { "epoch": 0.01255, "grad_norm": 0.4761071055102706, "learning_rate": 0.003, "loss": 4.2641, "step": 1255 }, { "epoch": 0.01256, "grad_norm": 0.5232678892632563, "learning_rate": 0.003, "loss": 4.2916, "step": 1256 }, { "epoch": 0.01257, "grad_norm": 0.5344373546996498, "learning_rate": 0.003, "loss": 4.2768, "step": 1257 }, { "epoch": 0.01258, "grad_norm": 0.5773429526145795, "learning_rate": 0.003, "loss": 4.2867, "step": 1258 }, { "epoch": 0.01259, "grad_norm": 0.6584062985843472, "learning_rate": 0.003, "loss": 4.2918, "step": 1259 }, { "epoch": 0.0126, "grad_norm": 0.6804145752695209, "learning_rate": 0.003, "loss": 4.3214, "step": 1260 }, { "epoch": 0.01261, "grad_norm": 0.6981998021120679, "learning_rate": 0.003, "loss": 4.3161, "step": 1261 }, { "epoch": 0.01262, "grad_norm": 0.6112143338587285, "learning_rate": 0.003, "loss": 4.304, "step": 1262 }, { "epoch": 0.01263, "grad_norm": 0.5927786151181257, "learning_rate": 0.003, "loss": 4.298, "step": 1263 }, { "epoch": 0.01264, "grad_norm": 0.6346760589695591, "learning_rate": 0.003, "loss": 4.2851, "step": 1264 }, { "epoch": 0.01265, "grad_norm": 0.6927824976704505, "learning_rate": 0.003, "loss": 4.3011, "step": 1265 }, { "epoch": 0.01266, "grad_norm": 0.7328319392289776, "learning_rate": 0.003, "loss": 4.3026, "step": 1266 }, { "epoch": 0.01267, "grad_norm": 0.8489062737553393, "learning_rate": 0.003, "loss": 4.325, "step": 1267 }, { "epoch": 0.01268, "grad_norm": 0.8961579939511539, "learning_rate": 0.003, "loss": 4.3257, "step": 1268 }, { "epoch": 0.01269, "grad_norm": 0.821810765966473, "learning_rate": 0.003, "loss": 4.2839, "step": 1269 }, { "epoch": 0.0127, "grad_norm": 0.8357784457532415, "learning_rate": 0.003, "loss": 4.3228, "step": 1270 }, { "epoch": 0.01271, "grad_norm": 0.7922720998753914, "learning_rate": 0.003, "loss": 4.3037, "step": 1271 }, { "epoch": 0.01272, "grad_norm": 0.7964991232532974, "learning_rate": 0.003, "loss": 4.3039, "step": 1272 }, { "epoch": 0.01273, "grad_norm": 0.7778280400168709, "learning_rate": 0.003, "loss": 4.3248, "step": 1273 }, { "epoch": 0.01274, "grad_norm": 0.9153345412853071, "learning_rate": 0.003, "loss": 4.3241, "step": 1274 }, { "epoch": 0.01275, "grad_norm": 0.8587735334056731, "learning_rate": 0.003, "loss": 4.294, "step": 1275 }, { "epoch": 0.01276, "grad_norm": 0.7198180114679126, "learning_rate": 0.003, "loss": 4.2973, "step": 1276 }, { "epoch": 0.01277, "grad_norm": 0.7510923187630398, "learning_rate": 0.003, "loss": 4.3066, "step": 1277 }, { "epoch": 0.01278, "grad_norm": 0.8054501129326099, "learning_rate": 0.003, "loss": 4.3179, "step": 1278 }, { "epoch": 0.01279, "grad_norm": 0.7824154610465377, "learning_rate": 0.003, "loss": 4.2997, "step": 1279 }, { "epoch": 0.0128, "grad_norm": 0.7320824999573285, "learning_rate": 0.003, "loss": 4.2956, "step": 1280 }, { "epoch": 0.01281, "grad_norm": 0.6960943615110375, "learning_rate": 0.003, "loss": 4.2806, "step": 1281 }, { "epoch": 0.01282, "grad_norm": 0.5992318655887974, "learning_rate": 0.003, "loss": 4.3205, "step": 1282 }, { "epoch": 0.01283, "grad_norm": 0.6115671413316082, "learning_rate": 0.003, "loss": 4.2898, "step": 1283 }, { "epoch": 0.01284, "grad_norm": 0.5685168810152473, "learning_rate": 0.003, "loss": 4.2893, "step": 1284 }, { "epoch": 0.01285, "grad_norm": 0.6350368869509793, "learning_rate": 0.003, "loss": 4.2923, "step": 1285 }, { "epoch": 0.01286, "grad_norm": 0.6769385854863653, "learning_rate": 0.003, "loss": 4.3189, "step": 1286 }, { "epoch": 0.01287, "grad_norm": 0.5997038647975563, "learning_rate": 0.003, "loss": 4.2792, "step": 1287 }, { "epoch": 0.01288, "grad_norm": 0.6318526861304625, "learning_rate": 0.003, "loss": 4.2923, "step": 1288 }, { "epoch": 0.01289, "grad_norm": 0.6619003980657734, "learning_rate": 0.003, "loss": 4.2877, "step": 1289 }, { "epoch": 0.0129, "grad_norm": 0.7141266768182228, "learning_rate": 0.003, "loss": 4.3104, "step": 1290 }, { "epoch": 0.01291, "grad_norm": 0.7063834987249501, "learning_rate": 0.003, "loss": 4.2921, "step": 1291 }, { "epoch": 0.01292, "grad_norm": 0.6423787800334868, "learning_rate": 0.003, "loss": 4.3242, "step": 1292 }, { "epoch": 0.01293, "grad_norm": 0.5344471091426154, "learning_rate": 0.003, "loss": 4.3062, "step": 1293 }, { "epoch": 0.01294, "grad_norm": 0.4523910877254715, "learning_rate": 0.003, "loss": 4.2707, "step": 1294 }, { "epoch": 0.01295, "grad_norm": 0.5040604805545252, "learning_rate": 0.003, "loss": 4.2993, "step": 1295 }, { "epoch": 0.01296, "grad_norm": 0.5193740138388458, "learning_rate": 0.003, "loss": 4.2656, "step": 1296 }, { "epoch": 0.01297, "grad_norm": 0.5346893098464323, "learning_rate": 0.003, "loss": 4.2788, "step": 1297 }, { "epoch": 0.01298, "grad_norm": 0.5373039682823991, "learning_rate": 0.003, "loss": 4.2592, "step": 1298 }, { "epoch": 0.01299, "grad_norm": 0.5661548725913075, "learning_rate": 0.003, "loss": 4.2709, "step": 1299 }, { "epoch": 0.013, "grad_norm": 0.7446298448290246, "learning_rate": 0.003, "loss": 4.2678, "step": 1300 }, { "epoch": 0.01301, "grad_norm": 0.8230150822772082, "learning_rate": 0.003, "loss": 4.2955, "step": 1301 }, { "epoch": 0.01302, "grad_norm": 0.7744841619477817, "learning_rate": 0.003, "loss": 4.3361, "step": 1302 }, { "epoch": 0.01303, "grad_norm": 0.6454504186361478, "learning_rate": 0.003, "loss": 4.2765, "step": 1303 }, { "epoch": 0.01304, "grad_norm": 0.7188710255689805, "learning_rate": 0.003, "loss": 4.3063, "step": 1304 }, { "epoch": 0.01305, "grad_norm": 0.757402061055745, "learning_rate": 0.003, "loss": 4.2938, "step": 1305 }, { "epoch": 0.01306, "grad_norm": 0.691272304301314, "learning_rate": 0.003, "loss": 4.2864, "step": 1306 }, { "epoch": 0.01307, "grad_norm": 0.706053416610055, "learning_rate": 0.003, "loss": 4.3074, "step": 1307 }, { "epoch": 0.01308, "grad_norm": 0.6221880747565551, "learning_rate": 0.003, "loss": 4.284, "step": 1308 }, { "epoch": 0.01309, "grad_norm": 0.5791770340268083, "learning_rate": 0.003, "loss": 4.3174, "step": 1309 }, { "epoch": 0.0131, "grad_norm": 0.6695730992498613, "learning_rate": 0.003, "loss": 4.2672, "step": 1310 }, { "epoch": 0.01311, "grad_norm": 0.6922028781625468, "learning_rate": 0.003, "loss": 4.3073, "step": 1311 }, { "epoch": 0.01312, "grad_norm": 0.8287735129364245, "learning_rate": 0.003, "loss": 4.2832, "step": 1312 }, { "epoch": 0.01313, "grad_norm": 0.7976431888385418, "learning_rate": 0.003, "loss": 4.2757, "step": 1313 }, { "epoch": 0.01314, "grad_norm": 0.7220518823683293, "learning_rate": 0.003, "loss": 4.2709, "step": 1314 }, { "epoch": 0.01315, "grad_norm": 0.6095265673664224, "learning_rate": 0.003, "loss": 4.2914, "step": 1315 }, { "epoch": 0.01316, "grad_norm": 0.7674466841738673, "learning_rate": 0.003, "loss": 4.2855, "step": 1316 }, { "epoch": 0.01317, "grad_norm": 0.9059745922778419, "learning_rate": 0.003, "loss": 4.3151, "step": 1317 }, { "epoch": 0.01318, "grad_norm": 0.9427302613153309, "learning_rate": 0.003, "loss": 4.3056, "step": 1318 }, { "epoch": 0.01319, "grad_norm": 0.73054171411267, "learning_rate": 0.003, "loss": 4.3015, "step": 1319 }, { "epoch": 0.0132, "grad_norm": 0.7111720819817223, "learning_rate": 0.003, "loss": 4.3007, "step": 1320 }, { "epoch": 0.01321, "grad_norm": 0.7456349593949091, "learning_rate": 0.003, "loss": 4.2768, "step": 1321 }, { "epoch": 0.01322, "grad_norm": 0.7807986643070355, "learning_rate": 0.003, "loss": 4.3113, "step": 1322 }, { "epoch": 0.01323, "grad_norm": 0.8224389080614981, "learning_rate": 0.003, "loss": 4.3093, "step": 1323 }, { "epoch": 0.01324, "grad_norm": 1.035353941682105, "learning_rate": 0.003, "loss": 4.3086, "step": 1324 }, { "epoch": 0.01325, "grad_norm": 0.8946051092509474, "learning_rate": 0.003, "loss": 4.3063, "step": 1325 }, { "epoch": 0.01326, "grad_norm": 1.0201089618117705, "learning_rate": 0.003, "loss": 4.2973, "step": 1326 }, { "epoch": 0.01327, "grad_norm": 0.864867336973391, "learning_rate": 0.003, "loss": 4.3115, "step": 1327 }, { "epoch": 0.01328, "grad_norm": 0.8025779319028616, "learning_rate": 0.003, "loss": 4.3034, "step": 1328 }, { "epoch": 0.01329, "grad_norm": 0.7807799111244325, "learning_rate": 0.003, "loss": 4.2681, "step": 1329 }, { "epoch": 0.0133, "grad_norm": 0.8681894612953762, "learning_rate": 0.003, "loss": 4.3154, "step": 1330 }, { "epoch": 0.01331, "grad_norm": 1.0494856392086225, "learning_rate": 0.003, "loss": 4.3161, "step": 1331 }, { "epoch": 0.01332, "grad_norm": 0.9504785674947578, "learning_rate": 0.003, "loss": 4.3364, "step": 1332 }, { "epoch": 0.01333, "grad_norm": 0.9898834831271559, "learning_rate": 0.003, "loss": 4.3084, "step": 1333 }, { "epoch": 0.01334, "grad_norm": 1.0019143105022317, "learning_rate": 0.003, "loss": 4.3012, "step": 1334 }, { "epoch": 0.01335, "grad_norm": 0.8650334140122408, "learning_rate": 0.003, "loss": 4.3214, "step": 1335 }, { "epoch": 0.01336, "grad_norm": 0.8920064299561147, "learning_rate": 0.003, "loss": 4.317, "step": 1336 }, { "epoch": 0.01337, "grad_norm": 0.7707840908187834, "learning_rate": 0.003, "loss": 4.3286, "step": 1337 }, { "epoch": 0.01338, "grad_norm": 0.7632598332017204, "learning_rate": 0.003, "loss": 4.2919, "step": 1338 }, { "epoch": 0.01339, "grad_norm": 0.7360297385886754, "learning_rate": 0.003, "loss": 4.3155, "step": 1339 }, { "epoch": 0.0134, "grad_norm": 0.6003622832803963, "learning_rate": 0.003, "loss": 4.2916, "step": 1340 }, { "epoch": 0.01341, "grad_norm": 0.6955896552188154, "learning_rate": 0.003, "loss": 4.3105, "step": 1341 }, { "epoch": 0.01342, "grad_norm": 0.6883046873976916, "learning_rate": 0.003, "loss": 4.2818, "step": 1342 }, { "epoch": 0.01343, "grad_norm": 0.7284558823650665, "learning_rate": 0.003, "loss": 4.2883, "step": 1343 }, { "epoch": 0.01344, "grad_norm": 0.6360126374822531, "learning_rate": 0.003, "loss": 4.2837, "step": 1344 }, { "epoch": 0.01345, "grad_norm": 0.5334484786655853, "learning_rate": 0.003, "loss": 4.3176, "step": 1345 }, { "epoch": 0.01346, "grad_norm": 0.5681507381379014, "learning_rate": 0.003, "loss": 4.2885, "step": 1346 }, { "epoch": 0.01347, "grad_norm": 0.5788404528903132, "learning_rate": 0.003, "loss": 4.2816, "step": 1347 }, { "epoch": 0.01348, "grad_norm": 0.5418128304616573, "learning_rate": 0.003, "loss": 4.2755, "step": 1348 }, { "epoch": 0.01349, "grad_norm": 0.525198484250559, "learning_rate": 0.003, "loss": 4.2954, "step": 1349 }, { "epoch": 0.0135, "grad_norm": 0.46326210720927236, "learning_rate": 0.003, "loss": 4.2853, "step": 1350 }, { "epoch": 0.01351, "grad_norm": 0.4585121858352072, "learning_rate": 0.003, "loss": 4.313, "step": 1351 }, { "epoch": 0.01352, "grad_norm": 0.4283890963121618, "learning_rate": 0.003, "loss": 4.286, "step": 1352 }, { "epoch": 0.01353, "grad_norm": 0.42199420882026434, "learning_rate": 0.003, "loss": 4.2702, "step": 1353 }, { "epoch": 0.01354, "grad_norm": 0.4262744973588792, "learning_rate": 0.003, "loss": 4.2568, "step": 1354 }, { "epoch": 0.01355, "grad_norm": 0.40850743293980507, "learning_rate": 0.003, "loss": 4.268, "step": 1355 }, { "epoch": 0.01356, "grad_norm": 0.41818294724190413, "learning_rate": 0.003, "loss": 4.2623, "step": 1356 }, { "epoch": 0.01357, "grad_norm": 0.43655941768902196, "learning_rate": 0.003, "loss": 4.2503, "step": 1357 }, { "epoch": 0.01358, "grad_norm": 0.49395890148457533, "learning_rate": 0.003, "loss": 4.2463, "step": 1358 }, { "epoch": 0.01359, "grad_norm": 0.616978373290081, "learning_rate": 0.003, "loss": 4.2894, "step": 1359 }, { "epoch": 0.0136, "grad_norm": 0.735374833430507, "learning_rate": 0.003, "loss": 4.2639, "step": 1360 }, { "epoch": 0.01361, "grad_norm": 0.7389266502027623, "learning_rate": 0.003, "loss": 4.2792, "step": 1361 }, { "epoch": 0.01362, "grad_norm": 0.65177426874564, "learning_rate": 0.003, "loss": 4.2865, "step": 1362 }, { "epoch": 0.01363, "grad_norm": 0.6753488054632895, "learning_rate": 0.003, "loss": 4.2893, "step": 1363 }, { "epoch": 0.01364, "grad_norm": 0.8240837082437891, "learning_rate": 0.003, "loss": 4.2868, "step": 1364 }, { "epoch": 0.01365, "grad_norm": 0.7993265568553571, "learning_rate": 0.003, "loss": 4.306, "step": 1365 }, { "epoch": 0.01366, "grad_norm": 0.6564545276995603, "learning_rate": 0.003, "loss": 4.2558, "step": 1366 }, { "epoch": 0.01367, "grad_norm": 0.5493737530398193, "learning_rate": 0.003, "loss": 4.3003, "step": 1367 }, { "epoch": 0.01368, "grad_norm": 0.5650653386363494, "learning_rate": 0.003, "loss": 4.2739, "step": 1368 }, { "epoch": 0.01369, "grad_norm": 0.46328750272215335, "learning_rate": 0.003, "loss": 4.262, "step": 1369 }, { "epoch": 0.0137, "grad_norm": 0.5045609042462222, "learning_rate": 0.003, "loss": 4.2894, "step": 1370 }, { "epoch": 0.01371, "grad_norm": 0.5910891998788811, "learning_rate": 0.003, "loss": 4.2822, "step": 1371 }, { "epoch": 0.01372, "grad_norm": 0.6815189005820225, "learning_rate": 0.003, "loss": 4.2381, "step": 1372 }, { "epoch": 0.01373, "grad_norm": 0.842580155535958, "learning_rate": 0.003, "loss": 4.2808, "step": 1373 }, { "epoch": 0.01374, "grad_norm": 0.907805736475784, "learning_rate": 0.003, "loss": 4.2961, "step": 1374 }, { "epoch": 0.01375, "grad_norm": 0.8312986426917952, "learning_rate": 0.003, "loss": 4.2534, "step": 1375 }, { "epoch": 0.01376, "grad_norm": 0.8401650089095936, "learning_rate": 0.003, "loss": 4.2829, "step": 1376 }, { "epoch": 0.01377, "grad_norm": 0.7706178388137035, "learning_rate": 0.003, "loss": 4.2816, "step": 1377 }, { "epoch": 0.01378, "grad_norm": 0.6957781157082459, "learning_rate": 0.003, "loss": 4.3087, "step": 1378 }, { "epoch": 0.01379, "grad_norm": 0.6831406156786277, "learning_rate": 0.003, "loss": 4.2792, "step": 1379 }, { "epoch": 0.0138, "grad_norm": 0.6334610013963772, "learning_rate": 0.003, "loss": 4.2739, "step": 1380 }, { "epoch": 0.01381, "grad_norm": 0.5316939383075773, "learning_rate": 0.003, "loss": 4.2776, "step": 1381 }, { "epoch": 0.01382, "grad_norm": 0.516573615018121, "learning_rate": 0.003, "loss": 4.2735, "step": 1382 }, { "epoch": 0.01383, "grad_norm": 0.5599212453399529, "learning_rate": 0.003, "loss": 4.2692, "step": 1383 }, { "epoch": 0.01384, "grad_norm": 0.6188520650392525, "learning_rate": 0.003, "loss": 4.3, "step": 1384 }, { "epoch": 0.01385, "grad_norm": 0.6353409140192112, "learning_rate": 0.003, "loss": 4.2877, "step": 1385 }, { "epoch": 0.01386, "grad_norm": 0.6356880916095538, "learning_rate": 0.003, "loss": 4.2638, "step": 1386 }, { "epoch": 0.01387, "grad_norm": 0.7257450023455765, "learning_rate": 0.003, "loss": 4.2537, "step": 1387 }, { "epoch": 0.01388, "grad_norm": 0.7014765592615619, "learning_rate": 0.003, "loss": 4.2914, "step": 1388 }, { "epoch": 0.01389, "grad_norm": 0.6616359972565842, "learning_rate": 0.003, "loss": 4.2702, "step": 1389 }, { "epoch": 0.0139, "grad_norm": 0.6527699070789852, "learning_rate": 0.003, "loss": 4.2735, "step": 1390 }, { "epoch": 0.01391, "grad_norm": 0.7095964540484884, "learning_rate": 0.003, "loss": 4.2741, "step": 1391 }, { "epoch": 0.01392, "grad_norm": 0.7501817269895354, "learning_rate": 0.003, "loss": 4.2866, "step": 1392 }, { "epoch": 0.01393, "grad_norm": 0.6714529168989051, "learning_rate": 0.003, "loss": 4.2555, "step": 1393 }, { "epoch": 0.01394, "grad_norm": 0.5871737080135582, "learning_rate": 0.003, "loss": 4.2392, "step": 1394 }, { "epoch": 0.01395, "grad_norm": 0.5621623507740485, "learning_rate": 0.003, "loss": 4.2747, "step": 1395 }, { "epoch": 0.01396, "grad_norm": 0.664320619088776, "learning_rate": 0.003, "loss": 4.2655, "step": 1396 }, { "epoch": 0.01397, "grad_norm": 0.7274104246733604, "learning_rate": 0.003, "loss": 4.2817, "step": 1397 }, { "epoch": 0.01398, "grad_norm": 0.6424657996436192, "learning_rate": 0.003, "loss": 4.2594, "step": 1398 }, { "epoch": 0.01399, "grad_norm": 0.6842824738024023, "learning_rate": 0.003, "loss": 4.2829, "step": 1399 }, { "epoch": 0.014, "grad_norm": 0.8261095933984154, "learning_rate": 0.003, "loss": 4.2493, "step": 1400 }, { "epoch": 0.01401, "grad_norm": 0.7965575457737929, "learning_rate": 0.003, "loss": 4.2708, "step": 1401 }, { "epoch": 0.01402, "grad_norm": 0.8026102237656736, "learning_rate": 0.003, "loss": 4.2842, "step": 1402 }, { "epoch": 0.01403, "grad_norm": 0.816192334350639, "learning_rate": 0.003, "loss": 4.2628, "step": 1403 }, { "epoch": 0.01404, "grad_norm": 0.9312442614599608, "learning_rate": 0.003, "loss": 4.2897, "step": 1404 }, { "epoch": 0.01405, "grad_norm": 1.0392313893683864, "learning_rate": 0.003, "loss": 4.319, "step": 1405 }, { "epoch": 0.01406, "grad_norm": 0.9393883719208338, "learning_rate": 0.003, "loss": 4.3, "step": 1406 }, { "epoch": 0.01407, "grad_norm": 0.9978258942739143, "learning_rate": 0.003, "loss": 4.3193, "step": 1407 }, { "epoch": 0.01408, "grad_norm": 1.1500652805383735, "learning_rate": 0.003, "loss": 4.2914, "step": 1408 }, { "epoch": 0.01409, "grad_norm": 1.0900231626852614, "learning_rate": 0.003, "loss": 4.3253, "step": 1409 }, { "epoch": 0.0141, "grad_norm": 1.0202054584780536, "learning_rate": 0.003, "loss": 4.3009, "step": 1410 }, { "epoch": 0.01411, "grad_norm": 0.898714028166491, "learning_rate": 0.003, "loss": 4.2744, "step": 1411 }, { "epoch": 0.01412, "grad_norm": 0.7435025676246113, "learning_rate": 0.003, "loss": 4.3061, "step": 1412 }, { "epoch": 0.01413, "grad_norm": 0.5873930770772416, "learning_rate": 0.003, "loss": 4.2941, "step": 1413 }, { "epoch": 0.01414, "grad_norm": 0.6016255593464158, "learning_rate": 0.003, "loss": 4.2747, "step": 1414 }, { "epoch": 0.01415, "grad_norm": 0.6099426868013017, "learning_rate": 0.003, "loss": 4.2659, "step": 1415 }, { "epoch": 0.01416, "grad_norm": 0.5575813775095378, "learning_rate": 0.003, "loss": 4.2911, "step": 1416 }, { "epoch": 0.01417, "grad_norm": 0.5917623365800833, "learning_rate": 0.003, "loss": 4.2803, "step": 1417 }, { "epoch": 0.01418, "grad_norm": 0.6068864838347083, "learning_rate": 0.003, "loss": 4.2958, "step": 1418 }, { "epoch": 0.01419, "grad_norm": 0.6824165624145411, "learning_rate": 0.003, "loss": 4.2858, "step": 1419 }, { "epoch": 0.0142, "grad_norm": 0.6199413471532862, "learning_rate": 0.003, "loss": 4.291, "step": 1420 }, { "epoch": 0.01421, "grad_norm": 0.5408211469998263, "learning_rate": 0.003, "loss": 4.2544, "step": 1421 }, { "epoch": 0.01422, "grad_norm": 0.5533737134579111, "learning_rate": 0.003, "loss": 4.2598, "step": 1422 }, { "epoch": 0.01423, "grad_norm": 0.5685842699001438, "learning_rate": 0.003, "loss": 4.2645, "step": 1423 }, { "epoch": 0.01424, "grad_norm": 0.6304522666735822, "learning_rate": 0.003, "loss": 4.2833, "step": 1424 }, { "epoch": 0.01425, "grad_norm": 0.6515865244102063, "learning_rate": 0.003, "loss": 4.2865, "step": 1425 }, { "epoch": 0.01426, "grad_norm": 0.7444754270308325, "learning_rate": 0.003, "loss": 4.2655, "step": 1426 }, { "epoch": 0.01427, "grad_norm": 0.7497304563870452, "learning_rate": 0.003, "loss": 4.263, "step": 1427 }, { "epoch": 0.01428, "grad_norm": 0.6556010515324793, "learning_rate": 0.003, "loss": 4.2594, "step": 1428 }, { "epoch": 0.01429, "grad_norm": 0.6636248907749417, "learning_rate": 0.003, "loss": 4.265, "step": 1429 }, { "epoch": 0.0143, "grad_norm": 0.7211105035251077, "learning_rate": 0.003, "loss": 4.28, "step": 1430 }, { "epoch": 0.01431, "grad_norm": 0.7061472665079888, "learning_rate": 0.003, "loss": 4.2684, "step": 1431 }, { "epoch": 0.01432, "grad_norm": 0.6456431394575474, "learning_rate": 0.003, "loss": 4.2556, "step": 1432 }, { "epoch": 0.01433, "grad_norm": 0.7570512378286692, "learning_rate": 0.003, "loss": 4.2568, "step": 1433 }, { "epoch": 0.01434, "grad_norm": 0.7648500642290317, "learning_rate": 0.003, "loss": 4.2443, "step": 1434 }, { "epoch": 0.01435, "grad_norm": 0.7689156199684593, "learning_rate": 0.003, "loss": 4.2531, "step": 1435 }, { "epoch": 0.01436, "grad_norm": 0.7360234649948437, "learning_rate": 0.003, "loss": 4.274, "step": 1436 }, { "epoch": 0.01437, "grad_norm": 0.7697472519317102, "learning_rate": 0.003, "loss": 4.2879, "step": 1437 }, { "epoch": 0.01438, "grad_norm": 0.7889422694170278, "learning_rate": 0.003, "loss": 4.2805, "step": 1438 }, { "epoch": 0.01439, "grad_norm": 0.6891331546216416, "learning_rate": 0.003, "loss": 4.2417, "step": 1439 }, { "epoch": 0.0144, "grad_norm": 0.560460778999846, "learning_rate": 0.003, "loss": 4.2508, "step": 1440 }, { "epoch": 0.01441, "grad_norm": 0.5400270352929513, "learning_rate": 0.003, "loss": 4.2578, "step": 1441 }, { "epoch": 0.01442, "grad_norm": 0.4911140953080599, "learning_rate": 0.003, "loss": 4.2391, "step": 1442 }, { "epoch": 0.01443, "grad_norm": 0.45294215274078276, "learning_rate": 0.003, "loss": 4.2993, "step": 1443 }, { "epoch": 0.01444, "grad_norm": 0.4224101633170518, "learning_rate": 0.003, "loss": 4.2618, "step": 1444 }, { "epoch": 0.01445, "grad_norm": 0.3961185894381269, "learning_rate": 0.003, "loss": 4.2206, "step": 1445 }, { "epoch": 0.01446, "grad_norm": 0.33118667900370186, "learning_rate": 0.003, "loss": 4.2343, "step": 1446 }, { "epoch": 0.01447, "grad_norm": 0.3556733761768107, "learning_rate": 0.003, "loss": 4.2433, "step": 1447 }, { "epoch": 0.01448, "grad_norm": 0.36561455130036685, "learning_rate": 0.003, "loss": 4.2519, "step": 1448 }, { "epoch": 0.01449, "grad_norm": 0.46874354720560246, "learning_rate": 0.003, "loss": 4.2461, "step": 1449 }, { "epoch": 0.0145, "grad_norm": 0.60149549500153, "learning_rate": 0.003, "loss": 4.266, "step": 1450 }, { "epoch": 0.01451, "grad_norm": 0.6696836890558183, "learning_rate": 0.003, "loss": 4.2469, "step": 1451 }, { "epoch": 0.01452, "grad_norm": 0.6515350232296636, "learning_rate": 0.003, "loss": 4.2476, "step": 1452 }, { "epoch": 0.01453, "grad_norm": 0.662277322649583, "learning_rate": 0.003, "loss": 4.2608, "step": 1453 }, { "epoch": 0.01454, "grad_norm": 0.8972969416580211, "learning_rate": 0.003, "loss": 4.2374, "step": 1454 }, { "epoch": 0.01455, "grad_norm": 1.030023068993243, "learning_rate": 0.003, "loss": 4.2974, "step": 1455 }, { "epoch": 0.01456, "grad_norm": 1.0186167234807815, "learning_rate": 0.003, "loss": 4.2685, "step": 1456 }, { "epoch": 0.01457, "grad_norm": 0.9067819891416041, "learning_rate": 0.003, "loss": 4.2556, "step": 1457 }, { "epoch": 0.01458, "grad_norm": 0.8145250303169672, "learning_rate": 0.003, "loss": 4.2507, "step": 1458 }, { "epoch": 0.01459, "grad_norm": 0.8089455723824179, "learning_rate": 0.003, "loss": 4.2807, "step": 1459 }, { "epoch": 0.0146, "grad_norm": 0.7818105608289873, "learning_rate": 0.003, "loss": 4.2825, "step": 1460 }, { "epoch": 0.01461, "grad_norm": 0.8037592863895617, "learning_rate": 0.003, "loss": 4.2628, "step": 1461 }, { "epoch": 0.01462, "grad_norm": 0.7476671417509403, "learning_rate": 0.003, "loss": 4.2637, "step": 1462 }, { "epoch": 0.01463, "grad_norm": 0.7531868665794315, "learning_rate": 0.003, "loss": 4.2649, "step": 1463 }, { "epoch": 0.01464, "grad_norm": 0.7893765528220285, "learning_rate": 0.003, "loss": 4.2417, "step": 1464 }, { "epoch": 0.01465, "grad_norm": 0.6572421930862311, "learning_rate": 0.003, "loss": 4.2823, "step": 1465 }, { "epoch": 0.01466, "grad_norm": 0.5506048327632588, "learning_rate": 0.003, "loss": 4.2484, "step": 1466 }, { "epoch": 0.01467, "grad_norm": 0.611053596724818, "learning_rate": 0.003, "loss": 4.2828, "step": 1467 }, { "epoch": 0.01468, "grad_norm": 0.68776369659849, "learning_rate": 0.003, "loss": 4.2597, "step": 1468 }, { "epoch": 0.01469, "grad_norm": 0.6702695663961474, "learning_rate": 0.003, "loss": 4.2631, "step": 1469 }, { "epoch": 0.0147, "grad_norm": 0.6961169309212991, "learning_rate": 0.003, "loss": 4.2544, "step": 1470 }, { "epoch": 0.01471, "grad_norm": 0.760155282734985, "learning_rate": 0.003, "loss": 4.2691, "step": 1471 }, { "epoch": 0.01472, "grad_norm": 0.7721176095708484, "learning_rate": 0.003, "loss": 4.2542, "step": 1472 }, { "epoch": 0.01473, "grad_norm": 0.7877033148700755, "learning_rate": 0.003, "loss": 4.2676, "step": 1473 }, { "epoch": 0.01474, "grad_norm": 0.7464730946193943, "learning_rate": 0.003, "loss": 4.2539, "step": 1474 }, { "epoch": 0.01475, "grad_norm": 0.6862863699060794, "learning_rate": 0.003, "loss": 4.2711, "step": 1475 }, { "epoch": 0.01476, "grad_norm": 0.6949444506676095, "learning_rate": 0.003, "loss": 4.2569, "step": 1476 }, { "epoch": 0.01477, "grad_norm": 0.6485108315252704, "learning_rate": 0.003, "loss": 4.2593, "step": 1477 }, { "epoch": 0.01478, "grad_norm": 0.7249395862424638, "learning_rate": 0.003, "loss": 4.2474, "step": 1478 }, { "epoch": 0.01479, "grad_norm": 0.6746802016444003, "learning_rate": 0.003, "loss": 4.28, "step": 1479 }, { "epoch": 0.0148, "grad_norm": 0.6561363565990774, "learning_rate": 0.003, "loss": 4.2473, "step": 1480 }, { "epoch": 0.01481, "grad_norm": 0.7984072714276804, "learning_rate": 0.003, "loss": 4.2564, "step": 1481 }, { "epoch": 0.01482, "grad_norm": 0.8535873443126762, "learning_rate": 0.003, "loss": 4.2436, "step": 1482 }, { "epoch": 0.01483, "grad_norm": 0.8309335851279978, "learning_rate": 0.003, "loss": 4.2469, "step": 1483 }, { "epoch": 0.01484, "grad_norm": 0.7153917621202344, "learning_rate": 0.003, "loss": 4.2896, "step": 1484 }, { "epoch": 0.01485, "grad_norm": 0.7644465422516641, "learning_rate": 0.003, "loss": 4.2586, "step": 1485 }, { "epoch": 0.01486, "grad_norm": 0.6575547181769861, "learning_rate": 0.003, "loss": 4.2505, "step": 1486 }, { "epoch": 0.01487, "grad_norm": 0.616480892995353, "learning_rate": 0.003, "loss": 4.2525, "step": 1487 }, { "epoch": 0.01488, "grad_norm": 0.5676164891655556, "learning_rate": 0.003, "loss": 4.259, "step": 1488 }, { "epoch": 0.01489, "grad_norm": 0.5087336805317257, "learning_rate": 0.003, "loss": 4.252, "step": 1489 }, { "epoch": 0.0149, "grad_norm": 0.5711204937411694, "learning_rate": 0.003, "loss": 4.2449, "step": 1490 }, { "epoch": 0.01491, "grad_norm": 0.5555383511180694, "learning_rate": 0.003, "loss": 4.2594, "step": 1491 }, { "epoch": 0.01492, "grad_norm": 0.5910867309514242, "learning_rate": 0.003, "loss": 4.2249, "step": 1492 }, { "epoch": 0.01493, "grad_norm": 0.5552788367316016, "learning_rate": 0.003, "loss": 4.2566, "step": 1493 }, { "epoch": 0.01494, "grad_norm": 0.597504731138664, "learning_rate": 0.003, "loss": 4.236, "step": 1494 }, { "epoch": 0.01495, "grad_norm": 0.7040905961754581, "learning_rate": 0.003, "loss": 4.2494, "step": 1495 }, { "epoch": 0.01496, "grad_norm": 0.6504979761973873, "learning_rate": 0.003, "loss": 4.2253, "step": 1496 }, { "epoch": 0.01497, "grad_norm": 0.7479904857429174, "learning_rate": 0.003, "loss": 4.2013, "step": 1497 }, { "epoch": 0.01498, "grad_norm": 0.8354013055859276, "learning_rate": 0.003, "loss": 4.2649, "step": 1498 }, { "epoch": 0.01499, "grad_norm": 0.9228916776476623, "learning_rate": 0.003, "loss": 4.2868, "step": 1499 }, { "epoch": 0.015, "grad_norm": 0.9718172707744857, "learning_rate": 0.003, "loss": 4.25, "step": 1500 }, { "epoch": 0.01501, "grad_norm": 1.0136419160882046, "learning_rate": 0.003, "loss": 4.2532, "step": 1501 }, { "epoch": 0.01502, "grad_norm": 0.9325478476141171, "learning_rate": 0.003, "loss": 4.2999, "step": 1502 }, { "epoch": 0.01503, "grad_norm": 1.045999048375591, "learning_rate": 0.003, "loss": 4.2898, "step": 1503 }, { "epoch": 0.01504, "grad_norm": 0.9914485824003936, "learning_rate": 0.003, "loss": 4.2936, "step": 1504 }, { "epoch": 0.01505, "grad_norm": 0.915373742082242, "learning_rate": 0.003, "loss": 4.2776, "step": 1505 }, { "epoch": 0.01506, "grad_norm": 0.8272684821900249, "learning_rate": 0.003, "loss": 4.287, "step": 1506 }, { "epoch": 0.01507, "grad_norm": 0.8618015345967726, "learning_rate": 0.003, "loss": 4.305, "step": 1507 }, { "epoch": 0.01508, "grad_norm": 0.9316744896757736, "learning_rate": 0.003, "loss": 4.2861, "step": 1508 }, { "epoch": 0.01509, "grad_norm": 0.8440466174421261, "learning_rate": 0.003, "loss": 4.2981, "step": 1509 }, { "epoch": 0.0151, "grad_norm": 0.7063003162706939, "learning_rate": 0.003, "loss": 4.2679, "step": 1510 }, { "epoch": 0.01511, "grad_norm": 0.6428638550974116, "learning_rate": 0.003, "loss": 4.2518, "step": 1511 }, { "epoch": 0.01512, "grad_norm": 0.6112249954297037, "learning_rate": 0.003, "loss": 4.2613, "step": 1512 }, { "epoch": 0.01513, "grad_norm": 0.5280577640396528, "learning_rate": 0.003, "loss": 4.2622, "step": 1513 }, { "epoch": 0.01514, "grad_norm": 0.5269385985194992, "learning_rate": 0.003, "loss": 4.2672, "step": 1514 }, { "epoch": 0.01515, "grad_norm": 0.5487987330042349, "learning_rate": 0.003, "loss": 4.2562, "step": 1515 }, { "epoch": 0.01516, "grad_norm": 0.5680391134156543, "learning_rate": 0.003, "loss": 4.2622, "step": 1516 }, { "epoch": 0.01517, "grad_norm": 0.6062107860159427, "learning_rate": 0.003, "loss": 4.2129, "step": 1517 }, { "epoch": 0.01518, "grad_norm": 0.6323154755891632, "learning_rate": 0.003, "loss": 4.2497, "step": 1518 }, { "epoch": 0.01519, "grad_norm": 0.5798448883069626, "learning_rate": 0.003, "loss": 4.2808, "step": 1519 }, { "epoch": 0.0152, "grad_norm": 0.5997731039690386, "learning_rate": 0.003, "loss": 4.2326, "step": 1520 }, { "epoch": 0.01521, "grad_norm": 0.6222254423875409, "learning_rate": 0.003, "loss": 4.248, "step": 1521 }, { "epoch": 0.01522, "grad_norm": 0.5290334858261236, "learning_rate": 0.003, "loss": 4.2398, "step": 1522 }, { "epoch": 0.01523, "grad_norm": 0.5493394171973806, "learning_rate": 0.003, "loss": 4.2321, "step": 1523 }, { "epoch": 0.01524, "grad_norm": 0.6415152277388406, "learning_rate": 0.003, "loss": 4.269, "step": 1524 }, { "epoch": 0.01525, "grad_norm": 0.7027827268185114, "learning_rate": 0.003, "loss": 4.2694, "step": 1525 }, { "epoch": 0.01526, "grad_norm": 0.6962686558421027, "learning_rate": 0.003, "loss": 4.259, "step": 1526 }, { "epoch": 0.01527, "grad_norm": 0.6968664170247966, "learning_rate": 0.003, "loss": 4.2293, "step": 1527 }, { "epoch": 0.01528, "grad_norm": 0.6258396233188714, "learning_rate": 0.003, "loss": 4.2456, "step": 1528 }, { "epoch": 0.01529, "grad_norm": 0.6831052588682119, "learning_rate": 0.003, "loss": 4.2426, "step": 1529 }, { "epoch": 0.0153, "grad_norm": 0.5524942784192245, "learning_rate": 0.003, "loss": 4.2472, "step": 1530 }, { "epoch": 0.01531, "grad_norm": 0.5976116227060053, "learning_rate": 0.003, "loss": 4.241, "step": 1531 }, { "epoch": 0.01532, "grad_norm": 0.5212214454213059, "learning_rate": 0.003, "loss": 4.2166, "step": 1532 }, { "epoch": 0.01533, "grad_norm": 0.5878087168177306, "learning_rate": 0.003, "loss": 4.2439, "step": 1533 }, { "epoch": 0.01534, "grad_norm": 0.8062347592746759, "learning_rate": 0.003, "loss": 4.2507, "step": 1534 }, { "epoch": 0.01535, "grad_norm": 1.1825381040517529, "learning_rate": 0.003, "loss": 4.2824, "step": 1535 }, { "epoch": 0.01536, "grad_norm": 0.7819176506012738, "learning_rate": 0.003, "loss": 4.2554, "step": 1536 }, { "epoch": 0.01537, "grad_norm": 0.7933291638944598, "learning_rate": 0.003, "loss": 4.2465, "step": 1537 }, { "epoch": 0.01538, "grad_norm": 0.7224294373390021, "learning_rate": 0.003, "loss": 4.2237, "step": 1538 }, { "epoch": 0.01539, "grad_norm": 0.7725279995753968, "learning_rate": 0.003, "loss": 4.2589, "step": 1539 }, { "epoch": 0.0154, "grad_norm": 0.6639445396789071, "learning_rate": 0.003, "loss": 4.2578, "step": 1540 }, { "epoch": 0.01541, "grad_norm": 0.5655107354833333, "learning_rate": 0.003, "loss": 4.2359, "step": 1541 }, { "epoch": 0.01542, "grad_norm": 0.4939945119276493, "learning_rate": 0.003, "loss": 4.2606, "step": 1542 }, { "epoch": 0.01543, "grad_norm": 0.6098938664838743, "learning_rate": 0.003, "loss": 4.2469, "step": 1543 }, { "epoch": 0.01544, "grad_norm": 0.58488864006147, "learning_rate": 0.003, "loss": 4.2344, "step": 1544 }, { "epoch": 0.01545, "grad_norm": 0.5313644010442865, "learning_rate": 0.003, "loss": 4.2981, "step": 1545 }, { "epoch": 0.01546, "grad_norm": 0.45959833989927285, "learning_rate": 0.003, "loss": 4.263, "step": 1546 }, { "epoch": 0.01547, "grad_norm": 0.41872682800093836, "learning_rate": 0.003, "loss": 4.2626, "step": 1547 }, { "epoch": 0.01548, "grad_norm": 0.4070005827014792, "learning_rate": 0.003, "loss": 4.2237, "step": 1548 }, { "epoch": 0.01549, "grad_norm": 0.40332574830782875, "learning_rate": 0.003, "loss": 4.2125, "step": 1549 }, { "epoch": 0.0155, "grad_norm": 0.4273425707216713, "learning_rate": 0.003, "loss": 4.2323, "step": 1550 }, { "epoch": 0.01551, "grad_norm": 0.44340108675114914, "learning_rate": 0.003, "loss": 4.2095, "step": 1551 }, { "epoch": 0.01552, "grad_norm": 0.4283402844721516, "learning_rate": 0.003, "loss": 4.2297, "step": 1552 }, { "epoch": 0.01553, "grad_norm": 0.39909368113710497, "learning_rate": 0.003, "loss": 4.2548, "step": 1553 }, { "epoch": 0.01554, "grad_norm": 0.4303761621068326, "learning_rate": 0.003, "loss": 4.2356, "step": 1554 }, { "epoch": 0.01555, "grad_norm": 0.5073116437720543, "learning_rate": 0.003, "loss": 4.2089, "step": 1555 }, { "epoch": 0.01556, "grad_norm": 0.6171177000269511, "learning_rate": 0.003, "loss": 4.2523, "step": 1556 }, { "epoch": 0.01557, "grad_norm": 0.678464738142453, "learning_rate": 0.003, "loss": 4.229, "step": 1557 }, { "epoch": 0.01558, "grad_norm": 0.6779041786729552, "learning_rate": 0.003, "loss": 4.22, "step": 1558 }, { "epoch": 0.01559, "grad_norm": 0.7160170404134826, "learning_rate": 0.003, "loss": 4.2601, "step": 1559 }, { "epoch": 0.0156, "grad_norm": 0.646776348851303, "learning_rate": 0.003, "loss": 4.2484, "step": 1560 }, { "epoch": 0.01561, "grad_norm": 0.7362963973677504, "learning_rate": 0.003, "loss": 4.232, "step": 1561 }, { "epoch": 0.01562, "grad_norm": 0.8173738140090338, "learning_rate": 0.003, "loss": 4.2318, "step": 1562 }, { "epoch": 0.01563, "grad_norm": 0.6949580272753917, "learning_rate": 0.003, "loss": 4.2334, "step": 1563 }, { "epoch": 0.01564, "grad_norm": 0.8704354124122435, "learning_rate": 0.003, "loss": 4.2705, "step": 1564 }, { "epoch": 0.01565, "grad_norm": 1.0516715274291202, "learning_rate": 0.003, "loss": 4.2561, "step": 1565 }, { "epoch": 0.01566, "grad_norm": 0.9138795915356578, "learning_rate": 0.003, "loss": 4.2729, "step": 1566 }, { "epoch": 0.01567, "grad_norm": 0.7933215729025597, "learning_rate": 0.003, "loss": 4.2643, "step": 1567 }, { "epoch": 0.01568, "grad_norm": 0.7524451299585192, "learning_rate": 0.003, "loss": 4.2455, "step": 1568 }, { "epoch": 0.01569, "grad_norm": 0.8688441972266158, "learning_rate": 0.003, "loss": 4.2557, "step": 1569 }, { "epoch": 0.0157, "grad_norm": 1.0213964087576262, "learning_rate": 0.003, "loss": 4.2839, "step": 1570 }, { "epoch": 0.01571, "grad_norm": 0.9658714210834742, "learning_rate": 0.003, "loss": 4.2787, "step": 1571 }, { "epoch": 0.01572, "grad_norm": 0.8962942943425795, "learning_rate": 0.003, "loss": 4.28, "step": 1572 }, { "epoch": 0.01573, "grad_norm": 0.936508363757706, "learning_rate": 0.003, "loss": 4.2727, "step": 1573 }, { "epoch": 0.01574, "grad_norm": 0.8583102869770243, "learning_rate": 0.003, "loss": 4.279, "step": 1574 }, { "epoch": 0.01575, "grad_norm": 0.8543483448302793, "learning_rate": 0.003, "loss": 4.2483, "step": 1575 }, { "epoch": 0.01576, "grad_norm": 0.6740643949431316, "learning_rate": 0.003, "loss": 4.2592, "step": 1576 }, { "epoch": 0.01577, "grad_norm": 0.7068574162099283, "learning_rate": 0.003, "loss": 4.2234, "step": 1577 }, { "epoch": 0.01578, "grad_norm": 0.679751091133263, "learning_rate": 0.003, "loss": 4.2501, "step": 1578 }, { "epoch": 0.01579, "grad_norm": 0.6207998089836335, "learning_rate": 0.003, "loss": 4.2428, "step": 1579 }, { "epoch": 0.0158, "grad_norm": 0.546999417238052, "learning_rate": 0.003, "loss": 4.2499, "step": 1580 }, { "epoch": 0.01581, "grad_norm": 0.5890586432360596, "learning_rate": 0.003, "loss": 4.2581, "step": 1581 }, { "epoch": 0.01582, "grad_norm": 0.6738822960276017, "learning_rate": 0.003, "loss": 4.2412, "step": 1582 }, { "epoch": 0.01583, "grad_norm": 0.8449645723590827, "learning_rate": 0.003, "loss": 4.235, "step": 1583 }, { "epoch": 0.01584, "grad_norm": 0.8760719049873085, "learning_rate": 0.003, "loss": 4.2744, "step": 1584 }, { "epoch": 0.01585, "grad_norm": 0.8749576730978259, "learning_rate": 0.003, "loss": 4.2653, "step": 1585 }, { "epoch": 0.01586, "grad_norm": 0.9817764014825603, "learning_rate": 0.003, "loss": 4.2423, "step": 1586 }, { "epoch": 0.01587, "grad_norm": 0.916565337571196, "learning_rate": 0.003, "loss": 4.2657, "step": 1587 }, { "epoch": 0.01588, "grad_norm": 0.5958323489289675, "learning_rate": 0.003, "loss": 4.2761, "step": 1588 }, { "epoch": 0.01589, "grad_norm": 0.6040948221660047, "learning_rate": 0.003, "loss": 4.2694, "step": 1589 }, { "epoch": 0.0159, "grad_norm": 0.5541532228259608, "learning_rate": 0.003, "loss": 4.2601, "step": 1590 }, { "epoch": 0.01591, "grad_norm": 0.5732941807574177, "learning_rate": 0.003, "loss": 4.2648, "step": 1591 }, { "epoch": 0.01592, "grad_norm": 0.6004965962049119, "learning_rate": 0.003, "loss": 4.2552, "step": 1592 }, { "epoch": 0.01593, "grad_norm": 0.6750400712704605, "learning_rate": 0.003, "loss": 4.2608, "step": 1593 }, { "epoch": 0.01594, "grad_norm": 0.6685590290117844, "learning_rate": 0.003, "loss": 4.2444, "step": 1594 }, { "epoch": 0.01595, "grad_norm": 0.6394603041102755, "learning_rate": 0.003, "loss": 4.27, "step": 1595 }, { "epoch": 0.01596, "grad_norm": 0.5525501078218852, "learning_rate": 0.003, "loss": 4.2582, "step": 1596 }, { "epoch": 0.01597, "grad_norm": 0.5429531891975062, "learning_rate": 0.003, "loss": 4.2192, "step": 1597 }, { "epoch": 0.01598, "grad_norm": 0.5494281969618091, "learning_rate": 0.003, "loss": 4.2123, "step": 1598 }, { "epoch": 0.01599, "grad_norm": 0.48563975518620667, "learning_rate": 0.003, "loss": 4.2331, "step": 1599 }, { "epoch": 0.016, "grad_norm": 0.4293130690644179, "learning_rate": 0.003, "loss": 4.1975, "step": 1600 }, { "epoch": 0.01601, "grad_norm": 0.4385393368882689, "learning_rate": 0.003, "loss": 4.2457, "step": 1601 }, { "epoch": 0.01602, "grad_norm": 0.6149074026063737, "learning_rate": 0.003, "loss": 4.2243, "step": 1602 }, { "epoch": 0.01603, "grad_norm": 0.7618943354871598, "learning_rate": 0.003, "loss": 4.2459, "step": 1603 }, { "epoch": 0.01604, "grad_norm": 0.8231087133002348, "learning_rate": 0.003, "loss": 4.2421, "step": 1604 }, { "epoch": 0.01605, "grad_norm": 0.792939816240345, "learning_rate": 0.003, "loss": 4.2225, "step": 1605 }, { "epoch": 0.01606, "grad_norm": 0.8283386222980521, "learning_rate": 0.003, "loss": 4.2264, "step": 1606 }, { "epoch": 0.01607, "grad_norm": 0.7929167281458711, "learning_rate": 0.003, "loss": 4.2525, "step": 1607 }, { "epoch": 0.01608, "grad_norm": 0.8238114241929788, "learning_rate": 0.003, "loss": 4.2293, "step": 1608 }, { "epoch": 0.01609, "grad_norm": 0.8482072969488592, "learning_rate": 0.003, "loss": 4.2448, "step": 1609 }, { "epoch": 0.0161, "grad_norm": 0.9731734226169589, "learning_rate": 0.003, "loss": 4.2157, "step": 1610 }, { "epoch": 0.01611, "grad_norm": 0.8477820620248173, "learning_rate": 0.003, "loss": 4.2328, "step": 1611 }, { "epoch": 0.01612, "grad_norm": 0.8003056368656658, "learning_rate": 0.003, "loss": 4.2224, "step": 1612 }, { "epoch": 0.01613, "grad_norm": 0.9384869181655229, "learning_rate": 0.003, "loss": 4.2759, "step": 1613 }, { "epoch": 0.01614, "grad_norm": 1.094095683053054, "learning_rate": 0.003, "loss": 4.305, "step": 1614 }, { "epoch": 0.01615, "grad_norm": 1.0454054633439653, "learning_rate": 0.003, "loss": 4.284, "step": 1615 }, { "epoch": 0.01616, "grad_norm": 0.728394920780058, "learning_rate": 0.003, "loss": 4.2459, "step": 1616 }, { "epoch": 0.01617, "grad_norm": 0.7455143593181587, "learning_rate": 0.003, "loss": 4.2373, "step": 1617 }, { "epoch": 0.01618, "grad_norm": 0.6872936552392757, "learning_rate": 0.003, "loss": 4.2451, "step": 1618 }, { "epoch": 0.01619, "grad_norm": 0.5634358260711833, "learning_rate": 0.003, "loss": 4.2274, "step": 1619 }, { "epoch": 0.0162, "grad_norm": 0.5499920921999998, "learning_rate": 0.003, "loss": 4.231, "step": 1620 }, { "epoch": 0.01621, "grad_norm": 0.4948415173461953, "learning_rate": 0.003, "loss": 4.226, "step": 1621 }, { "epoch": 0.01622, "grad_norm": 0.45485020618430855, "learning_rate": 0.003, "loss": 4.2573, "step": 1622 }, { "epoch": 0.01623, "grad_norm": 0.4128235375978694, "learning_rate": 0.003, "loss": 4.2306, "step": 1623 }, { "epoch": 0.01624, "grad_norm": 0.35723113909525034, "learning_rate": 0.003, "loss": 4.2422, "step": 1624 }, { "epoch": 0.01625, "grad_norm": 0.3930391853636471, "learning_rate": 0.003, "loss": 4.2175, "step": 1625 }, { "epoch": 0.01626, "grad_norm": 0.3980902109371374, "learning_rate": 0.003, "loss": 4.2372, "step": 1626 }, { "epoch": 0.01627, "grad_norm": 0.4192958733549867, "learning_rate": 0.003, "loss": 4.2057, "step": 1627 }, { "epoch": 0.01628, "grad_norm": 0.4456736368427327, "learning_rate": 0.003, "loss": 4.2132, "step": 1628 }, { "epoch": 0.01629, "grad_norm": 0.46084356058160403, "learning_rate": 0.003, "loss": 4.2138, "step": 1629 }, { "epoch": 0.0163, "grad_norm": 0.4494988261781255, "learning_rate": 0.003, "loss": 4.2025, "step": 1630 }, { "epoch": 0.01631, "grad_norm": 0.49049706860110326, "learning_rate": 0.003, "loss": 4.2248, "step": 1631 }, { "epoch": 0.01632, "grad_norm": 0.4941434115670244, "learning_rate": 0.003, "loss": 4.234, "step": 1632 }, { "epoch": 0.01633, "grad_norm": 0.5015101022129131, "learning_rate": 0.003, "loss": 4.2072, "step": 1633 }, { "epoch": 0.01634, "grad_norm": 0.5355988850166129, "learning_rate": 0.003, "loss": 4.2264, "step": 1634 }, { "epoch": 0.01635, "grad_norm": 0.6114378514764486, "learning_rate": 0.003, "loss": 4.2077, "step": 1635 }, { "epoch": 0.01636, "grad_norm": 0.659185840591732, "learning_rate": 0.003, "loss": 4.2261, "step": 1636 }, { "epoch": 0.01637, "grad_norm": 0.6890891308278232, "learning_rate": 0.003, "loss": 4.2122, "step": 1637 }, { "epoch": 0.01638, "grad_norm": 0.7917384547475219, "learning_rate": 0.003, "loss": 4.2254, "step": 1638 }, { "epoch": 0.01639, "grad_norm": 0.8310494258877913, "learning_rate": 0.003, "loss": 4.2496, "step": 1639 }, { "epoch": 0.0164, "grad_norm": 0.6945218446546085, "learning_rate": 0.003, "loss": 4.2286, "step": 1640 }, { "epoch": 0.01641, "grad_norm": 0.643774136427719, "learning_rate": 0.003, "loss": 4.2028, "step": 1641 }, { "epoch": 0.01642, "grad_norm": 0.5630395282689907, "learning_rate": 0.003, "loss": 4.2418, "step": 1642 }, { "epoch": 0.01643, "grad_norm": 0.6319019547985867, "learning_rate": 0.003, "loss": 4.23, "step": 1643 }, { "epoch": 0.01644, "grad_norm": 0.6923812193124619, "learning_rate": 0.003, "loss": 4.2497, "step": 1644 }, { "epoch": 0.01645, "grad_norm": 0.7835455696779042, "learning_rate": 0.003, "loss": 4.2006, "step": 1645 }, { "epoch": 0.01646, "grad_norm": 0.9362858528690088, "learning_rate": 0.003, "loss": 4.2431, "step": 1646 }, { "epoch": 0.01647, "grad_norm": 1.2289381301321516, "learning_rate": 0.003, "loss": 4.255, "step": 1647 }, { "epoch": 0.01648, "grad_norm": 0.8093728261853784, "learning_rate": 0.003, "loss": 4.2206, "step": 1648 }, { "epoch": 0.01649, "grad_norm": 0.7780860637025212, "learning_rate": 0.003, "loss": 4.2535, "step": 1649 }, { "epoch": 0.0165, "grad_norm": 0.8631062415473619, "learning_rate": 0.003, "loss": 4.2375, "step": 1650 }, { "epoch": 0.01651, "grad_norm": 0.9142840285195195, "learning_rate": 0.003, "loss": 4.2523, "step": 1651 }, { "epoch": 0.01652, "grad_norm": 0.8336324309893678, "learning_rate": 0.003, "loss": 4.2094, "step": 1652 }, { "epoch": 0.01653, "grad_norm": 0.856140708339795, "learning_rate": 0.003, "loss": 4.2552, "step": 1653 }, { "epoch": 0.01654, "grad_norm": 0.9035014747718915, "learning_rate": 0.003, "loss": 4.2428, "step": 1654 }, { "epoch": 0.01655, "grad_norm": 0.8007220508865799, "learning_rate": 0.003, "loss": 4.2521, "step": 1655 }, { "epoch": 0.01656, "grad_norm": 0.8140605513593369, "learning_rate": 0.003, "loss": 4.2202, "step": 1656 }, { "epoch": 0.01657, "grad_norm": 0.752017726600531, "learning_rate": 0.003, "loss": 4.2402, "step": 1657 }, { "epoch": 0.01658, "grad_norm": 0.8846697411940587, "learning_rate": 0.003, "loss": 4.2338, "step": 1658 }, { "epoch": 0.01659, "grad_norm": 0.8979228683366066, "learning_rate": 0.003, "loss": 4.2417, "step": 1659 }, { "epoch": 0.0166, "grad_norm": 0.73714542235099, "learning_rate": 0.003, "loss": 4.2224, "step": 1660 }, { "epoch": 0.01661, "grad_norm": 0.6566854277166256, "learning_rate": 0.003, "loss": 4.2212, "step": 1661 }, { "epoch": 0.01662, "grad_norm": 0.5217494005958339, "learning_rate": 0.003, "loss": 4.2208, "step": 1662 }, { "epoch": 0.01663, "grad_norm": 0.4794160667989573, "learning_rate": 0.003, "loss": 4.2395, "step": 1663 }, { "epoch": 0.01664, "grad_norm": 0.5082285006914213, "learning_rate": 0.003, "loss": 4.2563, "step": 1664 }, { "epoch": 0.01665, "grad_norm": 0.5164271930929455, "learning_rate": 0.003, "loss": 4.2188, "step": 1665 }, { "epoch": 0.01666, "grad_norm": 0.6299180885641164, "learning_rate": 0.003, "loss": 4.2234, "step": 1666 }, { "epoch": 0.01667, "grad_norm": 0.6409430167727818, "learning_rate": 0.003, "loss": 4.2382, "step": 1667 }, { "epoch": 0.01668, "grad_norm": 0.6129313286956495, "learning_rate": 0.003, "loss": 4.2415, "step": 1668 }, { "epoch": 0.01669, "grad_norm": 0.5586023730002122, "learning_rate": 0.003, "loss": 4.223, "step": 1669 }, { "epoch": 0.0167, "grad_norm": 0.6736111064245857, "learning_rate": 0.003, "loss": 4.2169, "step": 1670 }, { "epoch": 0.01671, "grad_norm": 0.7441939131840011, "learning_rate": 0.003, "loss": 4.2115, "step": 1671 }, { "epoch": 0.01672, "grad_norm": 0.8159394752701435, "learning_rate": 0.003, "loss": 4.2214, "step": 1672 }, { "epoch": 0.01673, "grad_norm": 0.7952390142878619, "learning_rate": 0.003, "loss": 4.2239, "step": 1673 }, { "epoch": 0.01674, "grad_norm": 0.7199816431285693, "learning_rate": 0.003, "loss": 4.2398, "step": 1674 }, { "epoch": 0.01675, "grad_norm": 0.7812011305713198, "learning_rate": 0.003, "loss": 4.2244, "step": 1675 }, { "epoch": 0.01676, "grad_norm": 0.7469280078216791, "learning_rate": 0.003, "loss": 4.2309, "step": 1676 }, { "epoch": 0.01677, "grad_norm": 0.7271162906259458, "learning_rate": 0.003, "loss": 4.2196, "step": 1677 }, { "epoch": 0.01678, "grad_norm": 0.8881037129933885, "learning_rate": 0.003, "loss": 4.2406, "step": 1678 }, { "epoch": 0.01679, "grad_norm": 0.7948545784278963, "learning_rate": 0.003, "loss": 4.2229, "step": 1679 }, { "epoch": 0.0168, "grad_norm": 0.7189446111250655, "learning_rate": 0.003, "loss": 4.2374, "step": 1680 }, { "epoch": 0.01681, "grad_norm": 0.7033815727690789, "learning_rate": 0.003, "loss": 4.2384, "step": 1681 }, { "epoch": 0.01682, "grad_norm": 0.6685587436768649, "learning_rate": 0.003, "loss": 4.231, "step": 1682 }, { "epoch": 0.01683, "grad_norm": 0.637929998688511, "learning_rate": 0.003, "loss": 4.241, "step": 1683 }, { "epoch": 0.01684, "grad_norm": 0.628959376324859, "learning_rate": 0.003, "loss": 4.2154, "step": 1684 }, { "epoch": 0.01685, "grad_norm": 0.6231765249837203, "learning_rate": 0.003, "loss": 4.231, "step": 1685 }, { "epoch": 0.01686, "grad_norm": 0.5820543845215673, "learning_rate": 0.003, "loss": 4.2226, "step": 1686 }, { "epoch": 0.01687, "grad_norm": 0.5908668431711505, "learning_rate": 0.003, "loss": 4.2167, "step": 1687 }, { "epoch": 0.01688, "grad_norm": 0.6244492735925097, "learning_rate": 0.003, "loss": 4.2038, "step": 1688 }, { "epoch": 0.01689, "grad_norm": 0.601372798813749, "learning_rate": 0.003, "loss": 4.2192, "step": 1689 }, { "epoch": 0.0169, "grad_norm": 0.5539912290934804, "learning_rate": 0.003, "loss": 4.2069, "step": 1690 }, { "epoch": 0.01691, "grad_norm": 0.6406257238232621, "learning_rate": 0.003, "loss": 4.2196, "step": 1691 }, { "epoch": 0.01692, "grad_norm": 0.6715764673442781, "learning_rate": 0.003, "loss": 4.2214, "step": 1692 }, { "epoch": 0.01693, "grad_norm": 0.594965855834695, "learning_rate": 0.003, "loss": 4.2461, "step": 1693 }, { "epoch": 0.01694, "grad_norm": 0.5534146564551462, "learning_rate": 0.003, "loss": 4.1943, "step": 1694 }, { "epoch": 0.01695, "grad_norm": 0.6691590053804509, "learning_rate": 0.003, "loss": 4.2273, "step": 1695 }, { "epoch": 0.01696, "grad_norm": 0.7988479106506707, "learning_rate": 0.003, "loss": 4.2265, "step": 1696 }, { "epoch": 0.01697, "grad_norm": 0.9378853835820298, "learning_rate": 0.003, "loss": 4.2129, "step": 1697 }, { "epoch": 0.01698, "grad_norm": 0.8351693624536569, "learning_rate": 0.003, "loss": 4.2098, "step": 1698 }, { "epoch": 0.01699, "grad_norm": 0.6689256544140322, "learning_rate": 0.003, "loss": 4.2116, "step": 1699 }, { "epoch": 0.017, "grad_norm": 0.718191815671613, "learning_rate": 0.003, "loss": 4.2331, "step": 1700 }, { "epoch": 0.01701, "grad_norm": 0.6236805335228212, "learning_rate": 0.003, "loss": 4.2114, "step": 1701 }, { "epoch": 0.01702, "grad_norm": 0.6198051313671201, "learning_rate": 0.003, "loss": 4.235, "step": 1702 }, { "epoch": 0.01703, "grad_norm": 0.6124735544668184, "learning_rate": 0.003, "loss": 4.2192, "step": 1703 }, { "epoch": 0.01704, "grad_norm": 0.5788411907923544, "learning_rate": 0.003, "loss": 4.2046, "step": 1704 }, { "epoch": 0.01705, "grad_norm": 0.528954361543718, "learning_rate": 0.003, "loss": 4.2048, "step": 1705 }, { "epoch": 0.01706, "grad_norm": 0.5207913048275148, "learning_rate": 0.003, "loss": 4.203, "step": 1706 }, { "epoch": 0.01707, "grad_norm": 0.49956087831547324, "learning_rate": 0.003, "loss": 4.2234, "step": 1707 }, { "epoch": 0.01708, "grad_norm": 0.5265055046037553, "learning_rate": 0.003, "loss": 4.2005, "step": 1708 }, { "epoch": 0.01709, "grad_norm": 0.601511173671749, "learning_rate": 0.003, "loss": 4.2245, "step": 1709 }, { "epoch": 0.0171, "grad_norm": 0.6457072596477392, "learning_rate": 0.003, "loss": 4.1751, "step": 1710 }, { "epoch": 0.01711, "grad_norm": 0.6931354519414087, "learning_rate": 0.003, "loss": 4.206, "step": 1711 }, { "epoch": 0.01712, "grad_norm": 0.7843884994764989, "learning_rate": 0.003, "loss": 4.2207, "step": 1712 }, { "epoch": 0.01713, "grad_norm": 0.8034349282711927, "learning_rate": 0.003, "loss": 4.1971, "step": 1713 }, { "epoch": 0.01714, "grad_norm": 0.8979650985806761, "learning_rate": 0.003, "loss": 4.2412, "step": 1714 }, { "epoch": 0.01715, "grad_norm": 0.9199811287834594, "learning_rate": 0.003, "loss": 4.2496, "step": 1715 }, { "epoch": 0.01716, "grad_norm": 0.8606879750752993, "learning_rate": 0.003, "loss": 4.2283, "step": 1716 }, { "epoch": 0.01717, "grad_norm": 0.9133738914940963, "learning_rate": 0.003, "loss": 4.2649, "step": 1717 }, { "epoch": 0.01718, "grad_norm": 0.9751937973978706, "learning_rate": 0.003, "loss": 4.2385, "step": 1718 }, { "epoch": 0.01719, "grad_norm": 0.9249947621239035, "learning_rate": 0.003, "loss": 4.2632, "step": 1719 }, { "epoch": 0.0172, "grad_norm": 0.9364522247503431, "learning_rate": 0.003, "loss": 4.2208, "step": 1720 }, { "epoch": 0.01721, "grad_norm": 0.8201093609998811, "learning_rate": 0.003, "loss": 4.2353, "step": 1721 }, { "epoch": 0.01722, "grad_norm": 0.8214210127594672, "learning_rate": 0.003, "loss": 4.2288, "step": 1722 }, { "epoch": 0.01723, "grad_norm": 0.8471893471073342, "learning_rate": 0.003, "loss": 4.2399, "step": 1723 }, { "epoch": 0.01724, "grad_norm": 0.7683690946961591, "learning_rate": 0.003, "loss": 4.2434, "step": 1724 }, { "epoch": 0.01725, "grad_norm": 0.8115047065658012, "learning_rate": 0.003, "loss": 4.2272, "step": 1725 }, { "epoch": 0.01726, "grad_norm": 0.8092210144832711, "learning_rate": 0.003, "loss": 4.2174, "step": 1726 }, { "epoch": 0.01727, "grad_norm": 0.8550625584545292, "learning_rate": 0.003, "loss": 4.226, "step": 1727 }, { "epoch": 0.01728, "grad_norm": 0.8021206512196133, "learning_rate": 0.003, "loss": 4.2298, "step": 1728 }, { "epoch": 0.01729, "grad_norm": 0.7363522392419929, "learning_rate": 0.003, "loss": 4.2336, "step": 1729 }, { "epoch": 0.0173, "grad_norm": 0.6975599031052584, "learning_rate": 0.003, "loss": 4.2051, "step": 1730 }, { "epoch": 0.01731, "grad_norm": 0.6221588088033019, "learning_rate": 0.003, "loss": 4.2243, "step": 1731 }, { "epoch": 0.01732, "grad_norm": 0.612618335472448, "learning_rate": 0.003, "loss": 4.2315, "step": 1732 }, { "epoch": 0.01733, "grad_norm": 0.6541617874624247, "learning_rate": 0.003, "loss": 4.2125, "step": 1733 }, { "epoch": 0.01734, "grad_norm": 0.5863531048899692, "learning_rate": 0.003, "loss": 4.2205, "step": 1734 }, { "epoch": 0.01735, "grad_norm": 0.5699608850382641, "learning_rate": 0.003, "loss": 4.1915, "step": 1735 }, { "epoch": 0.01736, "grad_norm": 0.5871625865225932, "learning_rate": 0.003, "loss": 4.1898, "step": 1736 }, { "epoch": 0.01737, "grad_norm": 0.5870381595261084, "learning_rate": 0.003, "loss": 4.2212, "step": 1737 }, { "epoch": 0.01738, "grad_norm": 0.5498612952733963, "learning_rate": 0.003, "loss": 4.2039, "step": 1738 }, { "epoch": 0.01739, "grad_norm": 0.6117901077151079, "learning_rate": 0.003, "loss": 4.2218, "step": 1739 }, { "epoch": 0.0174, "grad_norm": 0.6161361385546128, "learning_rate": 0.003, "loss": 4.2194, "step": 1740 }, { "epoch": 0.01741, "grad_norm": 0.6356544908412293, "learning_rate": 0.003, "loss": 4.2288, "step": 1741 }, { "epoch": 0.01742, "grad_norm": 0.6972113147848942, "learning_rate": 0.003, "loss": 4.2174, "step": 1742 }, { "epoch": 0.01743, "grad_norm": 0.76783001607299, "learning_rate": 0.003, "loss": 4.2327, "step": 1743 }, { "epoch": 0.01744, "grad_norm": 0.7771532804911003, "learning_rate": 0.003, "loss": 4.2177, "step": 1744 }, { "epoch": 0.01745, "grad_norm": 0.61106822860021, "learning_rate": 0.003, "loss": 4.1781, "step": 1745 }, { "epoch": 0.01746, "grad_norm": 0.6089323419748714, "learning_rate": 0.003, "loss": 4.2031, "step": 1746 }, { "epoch": 0.01747, "grad_norm": 0.6068526864216675, "learning_rate": 0.003, "loss": 4.2005, "step": 1747 }, { "epoch": 0.01748, "grad_norm": 0.6143873963395499, "learning_rate": 0.003, "loss": 4.194, "step": 1748 }, { "epoch": 0.01749, "grad_norm": 0.505834715053384, "learning_rate": 0.003, "loss": 4.1866, "step": 1749 }, { "epoch": 0.0175, "grad_norm": 0.4183071099190176, "learning_rate": 0.003, "loss": 4.1986, "step": 1750 }, { "epoch": 0.01751, "grad_norm": 0.4200651633429046, "learning_rate": 0.003, "loss": 4.1916, "step": 1751 }, { "epoch": 0.01752, "grad_norm": 0.42294753534060125, "learning_rate": 0.003, "loss": 4.2318, "step": 1752 }, { "epoch": 0.01753, "grad_norm": 0.4040582052822703, "learning_rate": 0.003, "loss": 4.1786, "step": 1753 }, { "epoch": 0.01754, "grad_norm": 0.39927824671480916, "learning_rate": 0.003, "loss": 4.2108, "step": 1754 }, { "epoch": 0.01755, "grad_norm": 0.39324726452057945, "learning_rate": 0.003, "loss": 4.1497, "step": 1755 }, { "epoch": 0.01756, "grad_norm": 0.40168143250637856, "learning_rate": 0.003, "loss": 4.227, "step": 1756 }, { "epoch": 0.01757, "grad_norm": 0.4561700566055392, "learning_rate": 0.003, "loss": 4.2081, "step": 1757 }, { "epoch": 0.01758, "grad_norm": 0.5466402873638766, "learning_rate": 0.003, "loss": 4.2157, "step": 1758 }, { "epoch": 0.01759, "grad_norm": 0.6515131335626271, "learning_rate": 0.003, "loss": 4.1877, "step": 1759 }, { "epoch": 0.0176, "grad_norm": 0.7221307724120931, "learning_rate": 0.003, "loss": 4.2153, "step": 1760 }, { "epoch": 0.01761, "grad_norm": 0.7514386530231217, "learning_rate": 0.003, "loss": 4.2014, "step": 1761 }, { "epoch": 0.01762, "grad_norm": 0.8563467654669285, "learning_rate": 0.003, "loss": 4.2266, "step": 1762 }, { "epoch": 0.01763, "grad_norm": 1.0251781966749487, "learning_rate": 0.003, "loss": 4.2304, "step": 1763 }, { "epoch": 0.01764, "grad_norm": 1.1022130069351062, "learning_rate": 0.003, "loss": 4.204, "step": 1764 }, { "epoch": 0.01765, "grad_norm": 0.9424736955387317, "learning_rate": 0.003, "loss": 4.2315, "step": 1765 }, { "epoch": 0.01766, "grad_norm": 0.9227888573535311, "learning_rate": 0.003, "loss": 4.2455, "step": 1766 }, { "epoch": 0.01767, "grad_norm": 0.8152567408138972, "learning_rate": 0.003, "loss": 4.2043, "step": 1767 }, { "epoch": 0.01768, "grad_norm": 0.8952526282025846, "learning_rate": 0.003, "loss": 4.2178, "step": 1768 }, { "epoch": 0.01769, "grad_norm": 0.9903660885622939, "learning_rate": 0.003, "loss": 4.221, "step": 1769 }, { "epoch": 0.0177, "grad_norm": 1.1434768458973932, "learning_rate": 0.003, "loss": 4.2196, "step": 1770 }, { "epoch": 0.01771, "grad_norm": 1.1463152027154602, "learning_rate": 0.003, "loss": 4.2541, "step": 1771 }, { "epoch": 0.01772, "grad_norm": 0.9320499048871318, "learning_rate": 0.003, "loss": 4.2202, "step": 1772 }, { "epoch": 0.01773, "grad_norm": 0.9158947764816099, "learning_rate": 0.003, "loss": 4.2126, "step": 1773 }, { "epoch": 0.01774, "grad_norm": 1.0234123818722582, "learning_rate": 0.003, "loss": 4.25, "step": 1774 }, { "epoch": 0.01775, "grad_norm": 1.0396053755051766, "learning_rate": 0.003, "loss": 4.2387, "step": 1775 }, { "epoch": 0.01776, "grad_norm": 0.8699023391257188, "learning_rate": 0.003, "loss": 4.2279, "step": 1776 }, { "epoch": 0.01777, "grad_norm": 0.7504359108002385, "learning_rate": 0.003, "loss": 4.2409, "step": 1777 }, { "epoch": 0.01778, "grad_norm": 0.6686463646026519, "learning_rate": 0.003, "loss": 4.2296, "step": 1778 }, { "epoch": 0.01779, "grad_norm": 0.627501081983313, "learning_rate": 0.003, "loss": 4.2441, "step": 1779 }, { "epoch": 0.0178, "grad_norm": 0.6074782341836313, "learning_rate": 0.003, "loss": 4.213, "step": 1780 }, { "epoch": 0.01781, "grad_norm": 0.6632306275046532, "learning_rate": 0.003, "loss": 4.2271, "step": 1781 }, { "epoch": 0.01782, "grad_norm": 0.6482376732581775, "learning_rate": 0.003, "loss": 4.2498, "step": 1782 }, { "epoch": 0.01783, "grad_norm": 0.6647693064019089, "learning_rate": 0.003, "loss": 4.229, "step": 1783 }, { "epoch": 0.01784, "grad_norm": 0.7433144987650033, "learning_rate": 0.003, "loss": 4.2289, "step": 1784 }, { "epoch": 0.01785, "grad_norm": 0.8023042672883854, "learning_rate": 0.003, "loss": 4.1929, "step": 1785 }, { "epoch": 0.01786, "grad_norm": 0.7590350402092263, "learning_rate": 0.003, "loss": 4.2189, "step": 1786 }, { "epoch": 0.01787, "grad_norm": 0.6044391074634635, "learning_rate": 0.003, "loss": 4.223, "step": 1787 }, { "epoch": 0.01788, "grad_norm": 0.5734225415780418, "learning_rate": 0.003, "loss": 4.2291, "step": 1788 }, { "epoch": 0.01789, "grad_norm": 0.5619094879913861, "learning_rate": 0.003, "loss": 4.1969, "step": 1789 }, { "epoch": 0.0179, "grad_norm": 0.5903172908647163, "learning_rate": 0.003, "loss": 4.2152, "step": 1790 }, { "epoch": 0.01791, "grad_norm": 0.48196952921006103, "learning_rate": 0.003, "loss": 4.2224, "step": 1791 }, { "epoch": 0.01792, "grad_norm": 0.451545250518652, "learning_rate": 0.003, "loss": 4.2239, "step": 1792 }, { "epoch": 0.01793, "grad_norm": 0.521884061957321, "learning_rate": 0.003, "loss": 4.1852, "step": 1793 }, { "epoch": 0.01794, "grad_norm": 0.5052209264531972, "learning_rate": 0.003, "loss": 4.1997, "step": 1794 }, { "epoch": 0.01795, "grad_norm": 0.4700227936734472, "learning_rate": 0.003, "loss": 4.1984, "step": 1795 }, { "epoch": 0.01796, "grad_norm": 0.48384581004820526, "learning_rate": 0.003, "loss": 4.2111, "step": 1796 }, { "epoch": 0.01797, "grad_norm": 0.5674789805804968, "learning_rate": 0.003, "loss": 4.1791, "step": 1797 }, { "epoch": 0.01798, "grad_norm": 0.5733510313406954, "learning_rate": 0.003, "loss": 4.1913, "step": 1798 }, { "epoch": 0.01799, "grad_norm": 0.6643001766771917, "learning_rate": 0.003, "loss": 4.2106, "step": 1799 }, { "epoch": 0.018, "grad_norm": 0.7693742824268138, "learning_rate": 0.003, "loss": 4.2019, "step": 1800 }, { "epoch": 0.01801, "grad_norm": 0.7789742970956256, "learning_rate": 0.003, "loss": 4.2028, "step": 1801 }, { "epoch": 0.01802, "grad_norm": 0.7483411999928193, "learning_rate": 0.003, "loss": 4.2174, "step": 1802 }, { "epoch": 0.01803, "grad_norm": 0.6912125156211221, "learning_rate": 0.003, "loss": 4.2209, "step": 1803 }, { "epoch": 0.01804, "grad_norm": 0.6110578547720885, "learning_rate": 0.003, "loss": 4.1902, "step": 1804 }, { "epoch": 0.01805, "grad_norm": 0.5506478742000366, "learning_rate": 0.003, "loss": 4.2069, "step": 1805 }, { "epoch": 0.01806, "grad_norm": 0.6061937579522265, "learning_rate": 0.003, "loss": 4.1674, "step": 1806 }, { "epoch": 0.01807, "grad_norm": 0.5932129898926418, "learning_rate": 0.003, "loss": 4.2038, "step": 1807 }, { "epoch": 0.01808, "grad_norm": 0.5444224258882361, "learning_rate": 0.003, "loss": 4.1921, "step": 1808 }, { "epoch": 0.01809, "grad_norm": 0.5399756231214402, "learning_rate": 0.003, "loss": 4.2143, "step": 1809 }, { "epoch": 0.0181, "grad_norm": 0.4832914161347141, "learning_rate": 0.003, "loss": 4.2061, "step": 1810 }, { "epoch": 0.01811, "grad_norm": 0.5091858383706224, "learning_rate": 0.003, "loss": 4.1923, "step": 1811 }, { "epoch": 0.01812, "grad_norm": 0.4960424818906239, "learning_rate": 0.003, "loss": 4.1761, "step": 1812 }, { "epoch": 0.01813, "grad_norm": 0.4858161245409072, "learning_rate": 0.003, "loss": 4.2194, "step": 1813 }, { "epoch": 0.01814, "grad_norm": 0.5128609502675707, "learning_rate": 0.003, "loss": 4.1853, "step": 1814 }, { "epoch": 0.01815, "grad_norm": 0.4696917504376051, "learning_rate": 0.003, "loss": 4.1713, "step": 1815 }, { "epoch": 0.01816, "grad_norm": 0.4276704853681577, "learning_rate": 0.003, "loss": 4.1969, "step": 1816 }, { "epoch": 0.01817, "grad_norm": 0.49742714220067663, "learning_rate": 0.003, "loss": 4.1904, "step": 1817 }, { "epoch": 0.01818, "grad_norm": 0.6269759675631359, "learning_rate": 0.003, "loss": 4.1931, "step": 1818 }, { "epoch": 0.01819, "grad_norm": 0.7520796750217182, "learning_rate": 0.003, "loss": 4.1832, "step": 1819 }, { "epoch": 0.0182, "grad_norm": 0.9014481818810978, "learning_rate": 0.003, "loss": 4.2101, "step": 1820 }, { "epoch": 0.01821, "grad_norm": 0.7861765129999728, "learning_rate": 0.003, "loss": 4.1954, "step": 1821 }, { "epoch": 0.01822, "grad_norm": 0.6190127192025973, "learning_rate": 0.003, "loss": 4.2, "step": 1822 }, { "epoch": 0.01823, "grad_norm": 0.7778520136299403, "learning_rate": 0.003, "loss": 4.2211, "step": 1823 }, { "epoch": 0.01824, "grad_norm": 0.7843362143306014, "learning_rate": 0.003, "loss": 4.2055, "step": 1824 }, { "epoch": 0.01825, "grad_norm": 0.7380149583575544, "learning_rate": 0.003, "loss": 4.2108, "step": 1825 }, { "epoch": 0.01826, "grad_norm": 0.7040620507144565, "learning_rate": 0.003, "loss": 4.2188, "step": 1826 }, { "epoch": 0.01827, "grad_norm": 0.8207902713006542, "learning_rate": 0.003, "loss": 4.1955, "step": 1827 }, { "epoch": 0.01828, "grad_norm": 0.8054878482534694, "learning_rate": 0.003, "loss": 4.2093, "step": 1828 }, { "epoch": 0.01829, "grad_norm": 0.8245790564775142, "learning_rate": 0.003, "loss": 4.2301, "step": 1829 }, { "epoch": 0.0183, "grad_norm": 0.8846767870010911, "learning_rate": 0.003, "loss": 4.2277, "step": 1830 }, { "epoch": 0.01831, "grad_norm": 0.8289617540456092, "learning_rate": 0.003, "loss": 4.2216, "step": 1831 }, { "epoch": 0.01832, "grad_norm": 0.731274205528004, "learning_rate": 0.003, "loss": 4.2108, "step": 1832 }, { "epoch": 0.01833, "grad_norm": 0.7928566591291573, "learning_rate": 0.003, "loss": 4.2459, "step": 1833 }, { "epoch": 0.01834, "grad_norm": 0.7576155251040918, "learning_rate": 0.003, "loss": 4.2123, "step": 1834 }, { "epoch": 0.01835, "grad_norm": 0.7931842778411774, "learning_rate": 0.003, "loss": 4.214, "step": 1835 }, { "epoch": 0.01836, "grad_norm": 0.6445999325297602, "learning_rate": 0.003, "loss": 4.2036, "step": 1836 }, { "epoch": 0.01837, "grad_norm": 0.680643728796186, "learning_rate": 0.003, "loss": 4.208, "step": 1837 }, { "epoch": 0.01838, "grad_norm": 0.6239843687734086, "learning_rate": 0.003, "loss": 4.2079, "step": 1838 }, { "epoch": 0.01839, "grad_norm": 0.6182658027468049, "learning_rate": 0.003, "loss": 4.2277, "step": 1839 }, { "epoch": 0.0184, "grad_norm": 0.5890463339530954, "learning_rate": 0.003, "loss": 4.2247, "step": 1840 }, { "epoch": 0.01841, "grad_norm": 0.5863839914398512, "learning_rate": 0.003, "loss": 4.1619, "step": 1841 }, { "epoch": 0.01842, "grad_norm": 0.5858091006393259, "learning_rate": 0.003, "loss": 4.1952, "step": 1842 }, { "epoch": 0.01843, "grad_norm": 0.5624576041074113, "learning_rate": 0.003, "loss": 4.1755, "step": 1843 }, { "epoch": 0.01844, "grad_norm": 0.5465927738839299, "learning_rate": 0.003, "loss": 4.1732, "step": 1844 }, { "epoch": 0.01845, "grad_norm": 0.6027290912828838, "learning_rate": 0.003, "loss": 4.1768, "step": 1845 }, { "epoch": 0.01846, "grad_norm": 0.7027351856984548, "learning_rate": 0.003, "loss": 4.2065, "step": 1846 }, { "epoch": 0.01847, "grad_norm": 0.8199216861922868, "learning_rate": 0.003, "loss": 4.2072, "step": 1847 }, { "epoch": 0.01848, "grad_norm": 0.7370764094470499, "learning_rate": 0.003, "loss": 4.2221, "step": 1848 }, { "epoch": 0.01849, "grad_norm": 0.6895980947043083, "learning_rate": 0.003, "loss": 4.2176, "step": 1849 }, { "epoch": 0.0185, "grad_norm": 0.6744601685013915, "learning_rate": 0.003, "loss": 4.2071, "step": 1850 }, { "epoch": 0.01851, "grad_norm": 0.7592763226230413, "learning_rate": 0.003, "loss": 4.1894, "step": 1851 }, { "epoch": 0.01852, "grad_norm": 0.7512384494982312, "learning_rate": 0.003, "loss": 4.1913, "step": 1852 }, { "epoch": 0.01853, "grad_norm": 0.642959606761384, "learning_rate": 0.003, "loss": 4.1897, "step": 1853 }, { "epoch": 0.01854, "grad_norm": 0.6302719538000866, "learning_rate": 0.003, "loss": 4.2198, "step": 1854 }, { "epoch": 0.01855, "grad_norm": 0.5865658475812112, "learning_rate": 0.003, "loss": 4.2046, "step": 1855 }, { "epoch": 0.01856, "grad_norm": 0.6166190320727692, "learning_rate": 0.003, "loss": 4.189, "step": 1856 }, { "epoch": 0.01857, "grad_norm": 0.7181557293855338, "learning_rate": 0.003, "loss": 4.1771, "step": 1857 }, { "epoch": 0.01858, "grad_norm": 0.8412048999626534, "learning_rate": 0.003, "loss": 4.1934, "step": 1858 }, { "epoch": 0.01859, "grad_norm": 0.9023360494815195, "learning_rate": 0.003, "loss": 4.1912, "step": 1859 }, { "epoch": 0.0186, "grad_norm": 0.7898380300240098, "learning_rate": 0.003, "loss": 4.1718, "step": 1860 }, { "epoch": 0.01861, "grad_norm": 0.7208167303209774, "learning_rate": 0.003, "loss": 4.2166, "step": 1861 }, { "epoch": 0.01862, "grad_norm": 0.7104369497590945, "learning_rate": 0.003, "loss": 4.179, "step": 1862 }, { "epoch": 0.01863, "grad_norm": 0.7138975727375058, "learning_rate": 0.003, "loss": 4.1882, "step": 1863 }, { "epoch": 0.01864, "grad_norm": 0.6519153740387305, "learning_rate": 0.003, "loss": 4.2193, "step": 1864 }, { "epoch": 0.01865, "grad_norm": 0.7057872052617501, "learning_rate": 0.003, "loss": 4.1967, "step": 1865 }, { "epoch": 0.01866, "grad_norm": 0.7579696750373845, "learning_rate": 0.003, "loss": 4.203, "step": 1866 }, { "epoch": 0.01867, "grad_norm": 0.7437399572447418, "learning_rate": 0.003, "loss": 4.1772, "step": 1867 }, { "epoch": 0.01868, "grad_norm": 0.7143070391507745, "learning_rate": 0.003, "loss": 4.1947, "step": 1868 }, { "epoch": 0.01869, "grad_norm": 0.7078429320794878, "learning_rate": 0.003, "loss": 4.2102, "step": 1869 }, { "epoch": 0.0187, "grad_norm": 0.7908878960855552, "learning_rate": 0.003, "loss": 4.2031, "step": 1870 }, { "epoch": 0.01871, "grad_norm": 0.9307560381931724, "learning_rate": 0.003, "loss": 4.1837, "step": 1871 }, { "epoch": 0.01872, "grad_norm": 0.937846873989918, "learning_rate": 0.003, "loss": 4.2367, "step": 1872 }, { "epoch": 0.01873, "grad_norm": 0.7588221299073229, "learning_rate": 0.003, "loss": 4.1923, "step": 1873 }, { "epoch": 0.01874, "grad_norm": 0.6728512390013429, "learning_rate": 0.003, "loss": 4.1913, "step": 1874 }, { "epoch": 0.01875, "grad_norm": 0.6514462862930852, "learning_rate": 0.003, "loss": 4.21, "step": 1875 }, { "epoch": 0.01876, "grad_norm": 0.703173016887, "learning_rate": 0.003, "loss": 4.2153, "step": 1876 }, { "epoch": 0.01877, "grad_norm": 0.73028206380223, "learning_rate": 0.003, "loss": 4.2227, "step": 1877 }, { "epoch": 0.01878, "grad_norm": 0.7555785531530169, "learning_rate": 0.003, "loss": 4.2069, "step": 1878 }, { "epoch": 0.01879, "grad_norm": 0.7934996955230512, "learning_rate": 0.003, "loss": 4.196, "step": 1879 }, { "epoch": 0.0188, "grad_norm": 0.7476047962254878, "learning_rate": 0.003, "loss": 4.199, "step": 1880 }, { "epoch": 0.01881, "grad_norm": 0.732632895668458, "learning_rate": 0.003, "loss": 4.195, "step": 1881 }, { "epoch": 0.01882, "grad_norm": 0.7676196664715533, "learning_rate": 0.003, "loss": 4.2236, "step": 1882 }, { "epoch": 0.01883, "grad_norm": 0.774042907112019, "learning_rate": 0.003, "loss": 4.197, "step": 1883 }, { "epoch": 0.01884, "grad_norm": 0.6501154660141607, "learning_rate": 0.003, "loss": 4.1919, "step": 1884 }, { "epoch": 0.01885, "grad_norm": 0.5559497730489893, "learning_rate": 0.003, "loss": 4.1659, "step": 1885 }, { "epoch": 0.01886, "grad_norm": 0.5108169963928473, "learning_rate": 0.003, "loss": 4.2034, "step": 1886 }, { "epoch": 0.01887, "grad_norm": 0.4802312262485427, "learning_rate": 0.003, "loss": 4.1941, "step": 1887 }, { "epoch": 0.01888, "grad_norm": 0.44491425211803826, "learning_rate": 0.003, "loss": 4.1789, "step": 1888 }, { "epoch": 0.01889, "grad_norm": 0.4616155311993016, "learning_rate": 0.003, "loss": 4.1923, "step": 1889 }, { "epoch": 0.0189, "grad_norm": 0.4932376764256785, "learning_rate": 0.003, "loss": 4.1724, "step": 1890 }, { "epoch": 0.01891, "grad_norm": 0.5809213606136114, "learning_rate": 0.003, "loss": 4.2219, "step": 1891 }, { "epoch": 0.01892, "grad_norm": 0.7827791825078997, "learning_rate": 0.003, "loss": 4.2063, "step": 1892 }, { "epoch": 0.01893, "grad_norm": 0.9634537072256092, "learning_rate": 0.003, "loss": 4.1968, "step": 1893 }, { "epoch": 0.01894, "grad_norm": 0.8975581900781968, "learning_rate": 0.003, "loss": 4.2047, "step": 1894 }, { "epoch": 0.01895, "grad_norm": 0.670439659623389, "learning_rate": 0.003, "loss": 4.1794, "step": 1895 }, { "epoch": 0.01896, "grad_norm": 0.774532069243431, "learning_rate": 0.003, "loss": 4.208, "step": 1896 }, { "epoch": 0.01897, "grad_norm": 0.7575182930280945, "learning_rate": 0.003, "loss": 4.2077, "step": 1897 }, { "epoch": 0.01898, "grad_norm": 0.6342553387149622, "learning_rate": 0.003, "loss": 4.1723, "step": 1898 }, { "epoch": 0.01899, "grad_norm": 0.6743268097098117, "learning_rate": 0.003, "loss": 4.1867, "step": 1899 }, { "epoch": 0.019, "grad_norm": 0.6675442258402475, "learning_rate": 0.003, "loss": 4.1804, "step": 1900 }, { "epoch": 0.01901, "grad_norm": 0.6448176087551704, "learning_rate": 0.003, "loss": 4.2267, "step": 1901 }, { "epoch": 0.01902, "grad_norm": 0.6309260453072276, "learning_rate": 0.003, "loss": 4.2038, "step": 1902 }, { "epoch": 0.01903, "grad_norm": 0.5175825025006746, "learning_rate": 0.003, "loss": 4.1533, "step": 1903 }, { "epoch": 0.01904, "grad_norm": 0.44558661360578977, "learning_rate": 0.003, "loss": 4.2075, "step": 1904 }, { "epoch": 0.01905, "grad_norm": 0.4519539268740215, "learning_rate": 0.003, "loss": 4.1828, "step": 1905 }, { "epoch": 0.01906, "grad_norm": 0.4534257888011715, "learning_rate": 0.003, "loss": 4.1993, "step": 1906 }, { "epoch": 0.01907, "grad_norm": 0.4205401145833596, "learning_rate": 0.003, "loss": 4.1917, "step": 1907 }, { "epoch": 0.01908, "grad_norm": 0.4717335154892585, "learning_rate": 0.003, "loss": 4.2064, "step": 1908 }, { "epoch": 0.01909, "grad_norm": 0.5307249224766913, "learning_rate": 0.003, "loss": 4.1715, "step": 1909 }, { "epoch": 0.0191, "grad_norm": 0.6322393539173273, "learning_rate": 0.003, "loss": 4.1858, "step": 1910 }, { "epoch": 0.01911, "grad_norm": 0.6545216933386392, "learning_rate": 0.003, "loss": 4.1623, "step": 1911 }, { "epoch": 0.01912, "grad_norm": 0.5663347338798967, "learning_rate": 0.003, "loss": 4.1711, "step": 1912 }, { "epoch": 0.01913, "grad_norm": 0.5706339861620788, "learning_rate": 0.003, "loss": 4.188, "step": 1913 }, { "epoch": 0.01914, "grad_norm": 0.6036176037925194, "learning_rate": 0.003, "loss": 4.2071, "step": 1914 }, { "epoch": 0.01915, "grad_norm": 0.5703771194775653, "learning_rate": 0.003, "loss": 4.1909, "step": 1915 }, { "epoch": 0.01916, "grad_norm": 0.6012322903506473, "learning_rate": 0.003, "loss": 4.173, "step": 1916 }, { "epoch": 0.01917, "grad_norm": 0.7091647735109047, "learning_rate": 0.003, "loss": 4.1961, "step": 1917 }, { "epoch": 0.01918, "grad_norm": 0.8129980711183025, "learning_rate": 0.003, "loss": 4.1895, "step": 1918 }, { "epoch": 0.01919, "grad_norm": 1.0293282506765802, "learning_rate": 0.003, "loss": 4.2041, "step": 1919 }, { "epoch": 0.0192, "grad_norm": 0.9313730998487869, "learning_rate": 0.003, "loss": 4.2061, "step": 1920 }, { "epoch": 0.01921, "grad_norm": 0.8720092800409024, "learning_rate": 0.003, "loss": 4.1913, "step": 1921 }, { "epoch": 0.01922, "grad_norm": 0.7996945150379353, "learning_rate": 0.003, "loss": 4.2091, "step": 1922 }, { "epoch": 0.01923, "grad_norm": 0.8800137161869123, "learning_rate": 0.003, "loss": 4.2165, "step": 1923 }, { "epoch": 0.01924, "grad_norm": 1.0182126450186646, "learning_rate": 0.003, "loss": 4.2271, "step": 1924 }, { "epoch": 0.01925, "grad_norm": 0.9807294623812044, "learning_rate": 0.003, "loss": 4.2164, "step": 1925 }, { "epoch": 0.01926, "grad_norm": 0.8494587539441437, "learning_rate": 0.003, "loss": 4.2224, "step": 1926 }, { "epoch": 0.01927, "grad_norm": 0.7179780359821647, "learning_rate": 0.003, "loss": 4.2071, "step": 1927 }, { "epoch": 0.01928, "grad_norm": 0.7281644181803769, "learning_rate": 0.003, "loss": 4.1812, "step": 1928 }, { "epoch": 0.01929, "grad_norm": 0.7065947298847162, "learning_rate": 0.003, "loss": 4.1992, "step": 1929 }, { "epoch": 0.0193, "grad_norm": 0.5649201338939857, "learning_rate": 0.003, "loss": 4.1851, "step": 1930 }, { "epoch": 0.01931, "grad_norm": 0.5271540347343493, "learning_rate": 0.003, "loss": 4.1849, "step": 1931 }, { "epoch": 0.01932, "grad_norm": 0.45255335182017054, "learning_rate": 0.003, "loss": 4.1909, "step": 1932 }, { "epoch": 0.01933, "grad_norm": 0.47680900276175325, "learning_rate": 0.003, "loss": 4.1783, "step": 1933 }, { "epoch": 0.01934, "grad_norm": 0.5504435487639365, "learning_rate": 0.003, "loss": 4.1833, "step": 1934 }, { "epoch": 0.01935, "grad_norm": 0.6345773523818935, "learning_rate": 0.003, "loss": 4.1989, "step": 1935 }, { "epoch": 0.01936, "grad_norm": 0.7991398209168664, "learning_rate": 0.003, "loss": 4.1875, "step": 1936 }, { "epoch": 0.01937, "grad_norm": 0.9725224419371103, "learning_rate": 0.003, "loss": 4.2069, "step": 1937 }, { "epoch": 0.01938, "grad_norm": 0.8008224777593416, "learning_rate": 0.003, "loss": 4.2024, "step": 1938 }, { "epoch": 0.01939, "grad_norm": 0.6377996497038249, "learning_rate": 0.003, "loss": 4.2204, "step": 1939 }, { "epoch": 0.0194, "grad_norm": 0.7081879235874463, "learning_rate": 0.003, "loss": 4.1973, "step": 1940 }, { "epoch": 0.01941, "grad_norm": 0.6250228653949582, "learning_rate": 0.003, "loss": 4.1784, "step": 1941 }, { "epoch": 0.01942, "grad_norm": 0.6717463944474127, "learning_rate": 0.003, "loss": 4.2019, "step": 1942 }, { "epoch": 0.01943, "grad_norm": 0.6654540389471064, "learning_rate": 0.003, "loss": 4.1745, "step": 1943 }, { "epoch": 0.01944, "grad_norm": 0.6610991638775895, "learning_rate": 0.003, "loss": 4.2146, "step": 1944 }, { "epoch": 0.01945, "grad_norm": 0.6283793764328768, "learning_rate": 0.003, "loss": 4.1837, "step": 1945 }, { "epoch": 0.01946, "grad_norm": 0.563815590170461, "learning_rate": 0.003, "loss": 4.175, "step": 1946 }, { "epoch": 0.01947, "grad_norm": 0.5825718696133029, "learning_rate": 0.003, "loss": 4.1841, "step": 1947 }, { "epoch": 0.01948, "grad_norm": 0.7034835120761256, "learning_rate": 0.003, "loss": 4.2096, "step": 1948 }, { "epoch": 0.01949, "grad_norm": 0.9237001793340572, "learning_rate": 0.003, "loss": 4.209, "step": 1949 }, { "epoch": 0.0195, "grad_norm": 0.9365393224462327, "learning_rate": 0.003, "loss": 4.207, "step": 1950 }, { "epoch": 0.01951, "grad_norm": 0.8087831479581115, "learning_rate": 0.003, "loss": 4.194, "step": 1951 }, { "epoch": 0.01952, "grad_norm": 0.8819853779113177, "learning_rate": 0.003, "loss": 4.2078, "step": 1952 }, { "epoch": 0.01953, "grad_norm": 0.9866084074346885, "learning_rate": 0.003, "loss": 4.2291, "step": 1953 }, { "epoch": 0.01954, "grad_norm": 1.037114417680343, "learning_rate": 0.003, "loss": 4.2303, "step": 1954 }, { "epoch": 0.01955, "grad_norm": 0.9479637499094957, "learning_rate": 0.003, "loss": 4.2225, "step": 1955 }, { "epoch": 0.01956, "grad_norm": 0.8994010519419664, "learning_rate": 0.003, "loss": 4.2135, "step": 1956 }, { "epoch": 0.01957, "grad_norm": 0.8842871012128143, "learning_rate": 0.003, "loss": 4.1851, "step": 1957 }, { "epoch": 0.01958, "grad_norm": 0.9376407923168335, "learning_rate": 0.003, "loss": 4.1829, "step": 1958 }, { "epoch": 0.01959, "grad_norm": 0.8263866600223667, "learning_rate": 0.003, "loss": 4.2075, "step": 1959 }, { "epoch": 0.0196, "grad_norm": 0.6863533699796172, "learning_rate": 0.003, "loss": 4.1925, "step": 1960 }, { "epoch": 0.01961, "grad_norm": 0.6116137760392246, "learning_rate": 0.003, "loss": 4.191, "step": 1961 }, { "epoch": 0.01962, "grad_norm": 0.6395227196191122, "learning_rate": 0.003, "loss": 4.1912, "step": 1962 }, { "epoch": 0.01963, "grad_norm": 0.5883418831705517, "learning_rate": 0.003, "loss": 4.1771, "step": 1963 }, { "epoch": 0.01964, "grad_norm": 0.5921487580120356, "learning_rate": 0.003, "loss": 4.2037, "step": 1964 }, { "epoch": 0.01965, "grad_norm": 0.5536329776274996, "learning_rate": 0.003, "loss": 4.1947, "step": 1965 }, { "epoch": 0.01966, "grad_norm": 0.47388805459585637, "learning_rate": 0.003, "loss": 4.1925, "step": 1966 }, { "epoch": 0.01967, "grad_norm": 0.4786574861196878, "learning_rate": 0.003, "loss": 4.1683, "step": 1967 }, { "epoch": 0.01968, "grad_norm": 0.49077496594088527, "learning_rate": 0.003, "loss": 4.1875, "step": 1968 }, { "epoch": 0.01969, "grad_norm": 0.5052802929518135, "learning_rate": 0.003, "loss": 4.1833, "step": 1969 }, { "epoch": 0.0197, "grad_norm": 0.5295838660446354, "learning_rate": 0.003, "loss": 4.1837, "step": 1970 }, { "epoch": 0.01971, "grad_norm": 0.5513602113626036, "learning_rate": 0.003, "loss": 4.1684, "step": 1971 }, { "epoch": 0.01972, "grad_norm": 0.5210985598522027, "learning_rate": 0.003, "loss": 4.1701, "step": 1972 }, { "epoch": 0.01973, "grad_norm": 0.6615280275024347, "learning_rate": 0.003, "loss": 4.1567, "step": 1973 }, { "epoch": 0.01974, "grad_norm": 0.728061728231907, "learning_rate": 0.003, "loss": 4.1988, "step": 1974 }, { "epoch": 0.01975, "grad_norm": 0.7196409318491062, "learning_rate": 0.003, "loss": 4.1895, "step": 1975 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.82961946394624e+16, "train_batch_size": 512, "trial_name": null, "trial_params": null }