{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9991150442477876, "eval_steps": 500, "global_step": 2260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008849557522123894, "grad_norm": 2.106339463731836, "learning_rate": 2.5000000000000004e-07, "loss": 1.3744, "step": 1 }, { "epoch": 0.0017699115044247787, "grad_norm": 1.5105734285508876, "learning_rate": 5.000000000000001e-07, "loss": 1.1543, "step": 2 }, { "epoch": 0.002654867256637168, "grad_norm": 1.6749506066232989, "learning_rate": 7.5e-07, "loss": 1.315, "step": 3 }, { "epoch": 0.0035398230088495575, "grad_norm": 1.7430195727628652, "learning_rate": 1.0000000000000002e-06, "loss": 1.3294, "step": 4 }, { "epoch": 0.004424778761061947, "grad_norm": 2.143337075138686, "learning_rate": 1.25e-06, "loss": 1.2227, "step": 5 }, { "epoch": 0.005309734513274336, "grad_norm": 2.1055664281812243, "learning_rate": 1.5e-06, "loss": 1.282, "step": 6 }, { "epoch": 0.006194690265486726, "grad_norm": 2.281855125337273, "learning_rate": 1.75e-06, "loss": 1.3369, "step": 7 }, { "epoch": 0.007079646017699115, "grad_norm": 2.3908462755134448, "learning_rate": 2.0000000000000003e-06, "loss": 1.2283, "step": 8 }, { "epoch": 0.007964601769911504, "grad_norm": 1.747562822101487, "learning_rate": 2.25e-06, "loss": 1.3123, "step": 9 }, { "epoch": 0.008849557522123894, "grad_norm": 1.732266128437899, "learning_rate": 2.5e-06, "loss": 1.2368, "step": 10 }, { "epoch": 0.009734513274336283, "grad_norm": 2.384971844119039, "learning_rate": 2.7500000000000004e-06, "loss": 1.1892, "step": 11 }, { "epoch": 0.010619469026548672, "grad_norm": 1.6545504474142574, "learning_rate": 3e-06, "loss": 0.9094, "step": 12 }, { "epoch": 0.011504424778761062, "grad_norm": 2.2455927433948966, "learning_rate": 3.2500000000000002e-06, "loss": 1.2892, "step": 13 }, { "epoch": 0.012389380530973451, "grad_norm": 1.4521424185830174, "learning_rate": 3.5e-06, "loss": 1.1334, "step": 14 }, { "epoch": 0.01327433628318584, "grad_norm": 1.5818540908165248, "learning_rate": 3.7500000000000005e-06, "loss": 1.2522, "step": 15 }, { "epoch": 0.01415929203539823, "grad_norm": 1.7777052791924028, "learning_rate": 4.000000000000001e-06, "loss": 1.2011, "step": 16 }, { "epoch": 0.01504424778761062, "grad_norm": 3.357870050775763, "learning_rate": 4.25e-06, "loss": 1.2937, "step": 17 }, { "epoch": 0.01592920353982301, "grad_norm": 1.8503986085786266, "learning_rate": 4.5e-06, "loss": 1.2751, "step": 18 }, { "epoch": 0.016814159292035398, "grad_norm": 1.648606407454642, "learning_rate": 4.75e-06, "loss": 1.4626, "step": 19 }, { "epoch": 0.017699115044247787, "grad_norm": 2.1357574986557912, "learning_rate": 5e-06, "loss": 1.266, "step": 20 }, { "epoch": 0.018584070796460177, "grad_norm": 1.6166661619275249, "learning_rate": 5.2500000000000006e-06, "loss": 1.5541, "step": 21 }, { "epoch": 0.019469026548672566, "grad_norm": 6.617789342610703, "learning_rate": 5.500000000000001e-06, "loss": 1.5204, "step": 22 }, { "epoch": 0.020353982300884955, "grad_norm": 1.8932016403825505, "learning_rate": 5.75e-06, "loss": 1.2488, "step": 23 }, { "epoch": 0.021238938053097345, "grad_norm": 2.0605283088900785, "learning_rate": 6e-06, "loss": 1.3947, "step": 24 }, { "epoch": 0.022123893805309734, "grad_norm": 1.8819699113836863, "learning_rate": 6.25e-06, "loss": 1.3318, "step": 25 }, { "epoch": 0.023008849557522124, "grad_norm": 1.6069834622478092, "learning_rate": 6.5000000000000004e-06, "loss": 1.089, "step": 26 }, { "epoch": 0.023893805309734513, "grad_norm": 2.204920957254516, "learning_rate": 6.750000000000001e-06, "loss": 1.2617, "step": 27 }, { "epoch": 0.024778761061946902, "grad_norm": 2.1748018547717654, "learning_rate": 7e-06, "loss": 1.3647, "step": 28 }, { "epoch": 0.02566371681415929, "grad_norm": 6.1063382668778345, "learning_rate": 7.25e-06, "loss": 1.3002, "step": 29 }, { "epoch": 0.02654867256637168, "grad_norm": 1.4407191725781503, "learning_rate": 7.500000000000001e-06, "loss": 1.3444, "step": 30 }, { "epoch": 0.02743362831858407, "grad_norm": 1.5408542981317885, "learning_rate": 7.75e-06, "loss": 1.1818, "step": 31 }, { "epoch": 0.02831858407079646, "grad_norm": 3.7699210060727886, "learning_rate": 8.000000000000001e-06, "loss": 1.4476, "step": 32 }, { "epoch": 0.02920353982300885, "grad_norm": 2.0576698179659636, "learning_rate": 8.25e-06, "loss": 1.1289, "step": 33 }, { "epoch": 0.03008849557522124, "grad_norm": 6.485642503098495, "learning_rate": 8.5e-06, "loss": 1.1318, "step": 34 }, { "epoch": 0.030973451327433628, "grad_norm": 1.9653889268026394, "learning_rate": 8.750000000000001e-06, "loss": 1.2212, "step": 35 }, { "epoch": 0.03185840707964602, "grad_norm": 1.7144483487323567, "learning_rate": 9e-06, "loss": 1.4872, "step": 36 }, { "epoch": 0.03274336283185841, "grad_norm": 2.85832916872879, "learning_rate": 9.250000000000001e-06, "loss": 1.3008, "step": 37 }, { "epoch": 0.033628318584070796, "grad_norm": 1.9291696862275731, "learning_rate": 9.5e-06, "loss": 1.3362, "step": 38 }, { "epoch": 0.034513274336283185, "grad_norm": 1.7912257872874406, "learning_rate": 9.75e-06, "loss": 1.2119, "step": 39 }, { "epoch": 0.035398230088495575, "grad_norm": 1.3696234180807896, "learning_rate": 1e-05, "loss": 1.114, "step": 40 }, { "epoch": 0.036283185840707964, "grad_norm": 1.5786517682762398, "learning_rate": 9.999998770626895e-06, "loss": 1.452, "step": 41 }, { "epoch": 0.03716814159292035, "grad_norm": 2.42562679627461, "learning_rate": 9.99999508250818e-06, "loss": 1.4073, "step": 42 }, { "epoch": 0.03805309734513274, "grad_norm": 2.0201914947045125, "learning_rate": 9.999988935645673e-06, "loss": 1.1332, "step": 43 }, { "epoch": 0.03893805309734513, "grad_norm": 1.3794932088095848, "learning_rate": 9.999980330042391e-06, "loss": 1.1558, "step": 44 }, { "epoch": 0.03982300884955752, "grad_norm": 2.0117823019227443, "learning_rate": 9.999969265702571e-06, "loss": 1.2022, "step": 45 }, { "epoch": 0.04070796460176991, "grad_norm": 2.1725813480883422, "learning_rate": 9.999955742631653e-06, "loss": 1.5591, "step": 46 }, { "epoch": 0.0415929203539823, "grad_norm": 2.142242924025013, "learning_rate": 9.999939760836287e-06, "loss": 1.1975, "step": 47 }, { "epoch": 0.04247787610619469, "grad_norm": 2.099750280568012, "learning_rate": 9.999921320324328e-06, "loss": 1.4589, "step": 48 }, { "epoch": 0.04336283185840708, "grad_norm": 1.6468369242903702, "learning_rate": 9.999900421104848e-06, "loss": 1.1888, "step": 49 }, { "epoch": 0.04424778761061947, "grad_norm": 2.167562495271245, "learning_rate": 9.999877063188124e-06, "loss": 1.3554, "step": 50 }, { "epoch": 0.04513274336283186, "grad_norm": 2.120076764627322, "learning_rate": 9.999851246585641e-06, "loss": 1.0536, "step": 51 }, { "epoch": 0.04601769911504425, "grad_norm": 1.5377929102282335, "learning_rate": 9.999822971310096e-06, "loss": 1.4181, "step": 52 }, { "epoch": 0.046902654867256637, "grad_norm": 1.423972863379274, "learning_rate": 9.999792237375392e-06, "loss": 1.1166, "step": 53 }, { "epoch": 0.047787610619469026, "grad_norm": 1.5669348701783705, "learning_rate": 9.99975904479664e-06, "loss": 1.3596, "step": 54 }, { "epoch": 0.048672566371681415, "grad_norm": 5.865175901173417, "learning_rate": 9.999723393590169e-06, "loss": 1.1397, "step": 55 }, { "epoch": 0.049557522123893805, "grad_norm": 3.078784972882926, "learning_rate": 9.999685283773504e-06, "loss": 1.2719, "step": 56 }, { "epoch": 0.050442477876106194, "grad_norm": 1.4717887955175926, "learning_rate": 9.99964471536539e-06, "loss": 1.2013, "step": 57 }, { "epoch": 0.05132743362831858, "grad_norm": 1.7404773138676433, "learning_rate": 9.999601688385771e-06, "loss": 1.2103, "step": 58 }, { "epoch": 0.05221238938053097, "grad_norm": 1.6363411809520576, "learning_rate": 9.999556202855812e-06, "loss": 1.3865, "step": 59 }, { "epoch": 0.05309734513274336, "grad_norm": 2.204721561903993, "learning_rate": 9.999508258797876e-06, "loss": 1.3196, "step": 60 }, { "epoch": 0.05398230088495575, "grad_norm": 1.680117896043576, "learning_rate": 9.999457856235542e-06, "loss": 1.3015, "step": 61 }, { "epoch": 0.05486725663716814, "grad_norm": 2.0823844948599604, "learning_rate": 9.999404995193593e-06, "loss": 1.303, "step": 62 }, { "epoch": 0.05575221238938053, "grad_norm": 2.110454364156204, "learning_rate": 9.999349675698025e-06, "loss": 1.2679, "step": 63 }, { "epoch": 0.05663716814159292, "grad_norm": 3.05910270831914, "learning_rate": 9.999291897776043e-06, "loss": 1.6213, "step": 64 }, { "epoch": 0.05752212389380531, "grad_norm": 1.665203893362491, "learning_rate": 9.999231661456056e-06, "loss": 1.1071, "step": 65 }, { "epoch": 0.0584070796460177, "grad_norm": 2.212161066084012, "learning_rate": 9.999168966767683e-06, "loss": 1.2396, "step": 66 }, { "epoch": 0.05929203539823009, "grad_norm": 2.2097061109088783, "learning_rate": 9.999103813741761e-06, "loss": 1.2946, "step": 67 }, { "epoch": 0.06017699115044248, "grad_norm": 1.9941355404770835, "learning_rate": 9.999036202410324e-06, "loss": 1.4842, "step": 68 }, { "epoch": 0.061061946902654866, "grad_norm": 1.608985926346185, "learning_rate": 9.998966132806623e-06, "loss": 1.367, "step": 69 }, { "epoch": 0.061946902654867256, "grad_norm": 1.6189177343338328, "learning_rate": 9.998893604965111e-06, "loss": 1.2667, "step": 70 }, { "epoch": 0.06283185840707965, "grad_norm": 1.979819494199703, "learning_rate": 9.998818618921458e-06, "loss": 1.2099, "step": 71 }, { "epoch": 0.06371681415929203, "grad_norm": 2.255645828147679, "learning_rate": 9.998741174712534e-06, "loss": 1.5324, "step": 72 }, { "epoch": 0.06460176991150443, "grad_norm": 2.3757308295751804, "learning_rate": 9.998661272376424e-06, "loss": 1.228, "step": 73 }, { "epoch": 0.06548672566371681, "grad_norm": 2.415081688467693, "learning_rate": 9.99857891195242e-06, "loss": 1.3029, "step": 74 }, { "epoch": 0.06637168141592921, "grad_norm": 1.6269423119055877, "learning_rate": 9.998494093481022e-06, "loss": 1.327, "step": 75 }, { "epoch": 0.06725663716814159, "grad_norm": 1.7441811662592015, "learning_rate": 9.99840681700394e-06, "loss": 0.8723, "step": 76 }, { "epoch": 0.06814159292035399, "grad_norm": 1.4800987610156806, "learning_rate": 9.998317082564093e-06, "loss": 1.4789, "step": 77 }, { "epoch": 0.06902654867256637, "grad_norm": 2.425095144796416, "learning_rate": 9.998224890205606e-06, "loss": 1.4784, "step": 78 }, { "epoch": 0.06991150442477877, "grad_norm": 2.2295815118372375, "learning_rate": 9.998130239973816e-06, "loss": 1.1663, "step": 79 }, { "epoch": 0.07079646017699115, "grad_norm": 2.130765950908652, "learning_rate": 9.998033131915266e-06, "loss": 1.4019, "step": 80 }, { "epoch": 0.07168141592920355, "grad_norm": 4.398790053759135, "learning_rate": 9.997933566077709e-06, "loss": 1.6084, "step": 81 }, { "epoch": 0.07256637168141593, "grad_norm": 4.123380242945179, "learning_rate": 9.997831542510107e-06, "loss": 1.3459, "step": 82 }, { "epoch": 0.07345132743362832, "grad_norm": 2.0788785033613846, "learning_rate": 9.99772706126263e-06, "loss": 1.3291, "step": 83 }, { "epoch": 0.0743362831858407, "grad_norm": 1.5887424723178736, "learning_rate": 9.997620122386658e-06, "loss": 1.1042, "step": 84 }, { "epoch": 0.0752212389380531, "grad_norm": 1.288311015331904, "learning_rate": 9.997510725934774e-06, "loss": 1.0449, "step": 85 }, { "epoch": 0.07610619469026549, "grad_norm": 2.5061648545618525, "learning_rate": 9.997398871960778e-06, "loss": 1.3613, "step": 86 }, { "epoch": 0.07699115044247788, "grad_norm": 1.5885110143600036, "learning_rate": 9.997284560519672e-06, "loss": 1.2658, "step": 87 }, { "epoch": 0.07787610619469026, "grad_norm": 2.216834428360423, "learning_rate": 9.997167791667668e-06, "loss": 1.0864, "step": 88 }, { "epoch": 0.07876106194690266, "grad_norm": 2.929719014536817, "learning_rate": 9.997048565462188e-06, "loss": 1.17, "step": 89 }, { "epoch": 0.07964601769911504, "grad_norm": 2.582752043580876, "learning_rate": 9.996926881961862e-06, "loss": 1.2782, "step": 90 }, { "epoch": 0.08053097345132744, "grad_norm": 2.3765940672495542, "learning_rate": 9.996802741226526e-06, "loss": 1.1691, "step": 91 }, { "epoch": 0.08141592920353982, "grad_norm": 2.0197186066491484, "learning_rate": 9.996676143317227e-06, "loss": 1.4105, "step": 92 }, { "epoch": 0.08230088495575222, "grad_norm": 1.5701325057069015, "learning_rate": 9.99654708829622e-06, "loss": 1.386, "step": 93 }, { "epoch": 0.0831858407079646, "grad_norm": 2.1098292817712028, "learning_rate": 9.996415576226967e-06, "loss": 1.2432, "step": 94 }, { "epoch": 0.084070796460177, "grad_norm": 1.8158093605510917, "learning_rate": 9.99628160717414e-06, "loss": 1.4574, "step": 95 }, { "epoch": 0.08495575221238938, "grad_norm": 2.397975174104846, "learning_rate": 9.996145181203616e-06, "loss": 1.4544, "step": 96 }, { "epoch": 0.08584070796460178, "grad_norm": 2.917341646521041, "learning_rate": 9.996006298382484e-06, "loss": 1.13, "step": 97 }, { "epoch": 0.08672566371681416, "grad_norm": 3.0273996791471878, "learning_rate": 9.99586495877904e-06, "loss": 1.2654, "step": 98 }, { "epoch": 0.08761061946902655, "grad_norm": 22.698336061640962, "learning_rate": 9.995721162462785e-06, "loss": 1.5905, "step": 99 }, { "epoch": 0.08849557522123894, "grad_norm": 1.6713918605946168, "learning_rate": 9.995574909504434e-06, "loss": 1.1465, "step": 100 }, { "epoch": 0.08938053097345133, "grad_norm": 2.3057511843891882, "learning_rate": 9.995426199975906e-06, "loss": 1.3753, "step": 101 }, { "epoch": 0.09026548672566372, "grad_norm": 1.4197183195268506, "learning_rate": 9.99527503395033e-06, "loss": 1.1226, "step": 102 }, { "epoch": 0.09115044247787611, "grad_norm": 2.2944893268245212, "learning_rate": 9.995121411502037e-06, "loss": 1.1534, "step": 103 }, { "epoch": 0.0920353982300885, "grad_norm": 2.551398682899462, "learning_rate": 9.994965332706574e-06, "loss": 1.1304, "step": 104 }, { "epoch": 0.09292035398230089, "grad_norm": 2.151351321278214, "learning_rate": 9.994806797640692e-06, "loss": 1.3171, "step": 105 }, { "epoch": 0.09380530973451327, "grad_norm": 1.6404407842550301, "learning_rate": 9.994645806382353e-06, "loss": 1.2404, "step": 106 }, { "epoch": 0.09469026548672567, "grad_norm": 1.955146650542099, "learning_rate": 9.99448235901072e-06, "loss": 1.0375, "step": 107 }, { "epoch": 0.09557522123893805, "grad_norm": 3.060086131522053, "learning_rate": 9.994316455606172e-06, "loss": 1.1526, "step": 108 }, { "epoch": 0.09646017699115045, "grad_norm": 1.6809192258655545, "learning_rate": 9.99414809625029e-06, "loss": 1.2699, "step": 109 }, { "epoch": 0.09734513274336283, "grad_norm": 1.4715304582895026, "learning_rate": 9.993977281025862e-06, "loss": 1.2004, "step": 110 }, { "epoch": 0.09823008849557523, "grad_norm": 1.8730326348910076, "learning_rate": 9.993804010016891e-06, "loss": 1.283, "step": 111 }, { "epoch": 0.09911504424778761, "grad_norm": 3.4740093774734833, "learning_rate": 9.993628283308582e-06, "loss": 1.4179, "step": 112 }, { "epoch": 0.1, "grad_norm": 4.613726407254826, "learning_rate": 9.993450100987346e-06, "loss": 1.3937, "step": 113 }, { "epoch": 0.10088495575221239, "grad_norm": 5.04798041119589, "learning_rate": 9.993269463140805e-06, "loss": 1.1483, "step": 114 }, { "epoch": 0.10176991150442478, "grad_norm": 1.9327893567163577, "learning_rate": 9.99308636985779e-06, "loss": 1.3283, "step": 115 }, { "epoch": 0.10265486725663717, "grad_norm": 1.5225686419004625, "learning_rate": 9.992900821228333e-06, "loss": 1.4935, "step": 116 }, { "epoch": 0.10353982300884956, "grad_norm": 3.027765551166018, "learning_rate": 9.99271281734368e-06, "loss": 1.5318, "step": 117 }, { "epoch": 0.10442477876106195, "grad_norm": 2.418216252119143, "learning_rate": 9.99252235829628e-06, "loss": 1.4974, "step": 118 }, { "epoch": 0.10530973451327434, "grad_norm": 3.8285208056376487, "learning_rate": 9.992329444179793e-06, "loss": 1.2257, "step": 119 }, { "epoch": 0.10619469026548672, "grad_norm": 4.579772183332124, "learning_rate": 9.992134075089085e-06, "loss": 1.2489, "step": 120 }, { "epoch": 0.10707964601769912, "grad_norm": 4.497540329674831, "learning_rate": 9.991936251120224e-06, "loss": 1.0856, "step": 121 }, { "epoch": 0.1079646017699115, "grad_norm": 1.5802899566837967, "learning_rate": 9.991735972370495e-06, "loss": 1.1822, "step": 122 }, { "epoch": 0.1088495575221239, "grad_norm": 1.8808695810821894, "learning_rate": 9.991533238938382e-06, "loss": 0.9849, "step": 123 }, { "epoch": 0.10973451327433628, "grad_norm": 5.381321627519825, "learning_rate": 9.99132805092358e-06, "loss": 1.4219, "step": 124 }, { "epoch": 0.11061946902654868, "grad_norm": 5.268428237132689, "learning_rate": 9.991120408426991e-06, "loss": 1.4609, "step": 125 }, { "epoch": 0.11150442477876106, "grad_norm": 1.4857630495002763, "learning_rate": 9.99091031155072e-06, "loss": 1.3565, "step": 126 }, { "epoch": 0.11238938053097346, "grad_norm": 1.6594413391618987, "learning_rate": 9.990697760398085e-06, "loss": 1.0978, "step": 127 }, { "epoch": 0.11327433628318584, "grad_norm": 3.4049754502531515, "learning_rate": 9.990482755073607e-06, "loss": 1.291, "step": 128 }, { "epoch": 0.11415929203539824, "grad_norm": 4.040654855012943, "learning_rate": 9.990265295683014e-06, "loss": 1.6493, "step": 129 }, { "epoch": 0.11504424778761062, "grad_norm": 1.7755331912676693, "learning_rate": 9.990045382333243e-06, "loss": 1.4423, "step": 130 }, { "epoch": 0.11592920353982301, "grad_norm": 14.138189516003235, "learning_rate": 9.989823015132433e-06, "loss": 1.1231, "step": 131 }, { "epoch": 0.1168141592920354, "grad_norm": 2.250923196484513, "learning_rate": 9.989598194189938e-06, "loss": 1.5074, "step": 132 }, { "epoch": 0.11769911504424779, "grad_norm": 4.218347505335163, "learning_rate": 9.989370919616308e-06, "loss": 1.2549, "step": 133 }, { "epoch": 0.11858407079646018, "grad_norm": 1.786198620774865, "learning_rate": 9.989141191523309e-06, "loss": 1.4056, "step": 134 }, { "epoch": 0.11946902654867257, "grad_norm": 6.051397601678817, "learning_rate": 9.988909010023908e-06, "loss": 1.087, "step": 135 }, { "epoch": 0.12035398230088495, "grad_norm": 2.8738452979527582, "learning_rate": 9.98867437523228e-06, "loss": 1.0379, "step": 136 }, { "epoch": 0.12123893805309735, "grad_norm": 1.9318046030216827, "learning_rate": 9.98843728726381e-06, "loss": 1.2427, "step": 137 }, { "epoch": 0.12212389380530973, "grad_norm": 2.0434675331079286, "learning_rate": 9.98819774623508e-06, "loss": 0.9622, "step": 138 }, { "epoch": 0.12300884955752213, "grad_norm": 4.060721180336629, "learning_rate": 9.987955752263888e-06, "loss": 1.4053, "step": 139 }, { "epoch": 0.12389380530973451, "grad_norm": 3.1148816584935104, "learning_rate": 9.987711305469232e-06, "loss": 1.4765, "step": 140 }, { "epoch": 0.12477876106194691, "grad_norm": 1.8894994711343975, "learning_rate": 9.987464405971318e-06, "loss": 1.3686, "step": 141 }, { "epoch": 0.1256637168141593, "grad_norm": 3.720042114027315, "learning_rate": 9.987215053891564e-06, "loss": 1.369, "step": 142 }, { "epoch": 0.12654867256637167, "grad_norm": 1.8541905026929657, "learning_rate": 9.986963249352584e-06, "loss": 1.1501, "step": 143 }, { "epoch": 0.12743362831858407, "grad_norm": 4.376537250647029, "learning_rate": 9.986708992478202e-06, "loss": 1.3364, "step": 144 }, { "epoch": 0.12831858407079647, "grad_norm": 2.294513558207053, "learning_rate": 9.986452283393452e-06, "loss": 1.129, "step": 145 }, { "epoch": 0.12920353982300886, "grad_norm": 6.491466961589033, "learning_rate": 9.986193122224568e-06, "loss": 1.1243, "step": 146 }, { "epoch": 0.13008849557522123, "grad_norm": 1.68671921216604, "learning_rate": 9.985931509098994e-06, "loss": 1.2134, "step": 147 }, { "epoch": 0.13097345132743363, "grad_norm": 5.340003844279616, "learning_rate": 9.985667444145378e-06, "loss": 1.146, "step": 148 }, { "epoch": 0.13185840707964602, "grad_norm": 3.0672201505842844, "learning_rate": 9.985400927493572e-06, "loss": 1.3213, "step": 149 }, { "epoch": 0.13274336283185842, "grad_norm": 2.539467816665046, "learning_rate": 9.985131959274637e-06, "loss": 1.1107, "step": 150 }, { "epoch": 0.1336283185840708, "grad_norm": 1.6364894515323483, "learning_rate": 9.984860539620835e-06, "loss": 1.2738, "step": 151 }, { "epoch": 0.13451327433628318, "grad_norm": 1.5452139573070158, "learning_rate": 9.984586668665641e-06, "loss": 1.1356, "step": 152 }, { "epoch": 0.13539823008849558, "grad_norm": 3.96775567211081, "learning_rate": 9.984310346543728e-06, "loss": 1.3977, "step": 153 }, { "epoch": 0.13628318584070798, "grad_norm": 4.060357146521268, "learning_rate": 9.984031573390977e-06, "loss": 1.407, "step": 154 }, { "epoch": 0.13716814159292035, "grad_norm": 8.528194605693052, "learning_rate": 9.983750349344476e-06, "loss": 1.4903, "step": 155 }, { "epoch": 0.13805309734513274, "grad_norm": 2.639189595586812, "learning_rate": 9.983466674542516e-06, "loss": 1.024, "step": 156 }, { "epoch": 0.13893805309734514, "grad_norm": 2.309511434917322, "learning_rate": 9.983180549124594e-06, "loss": 1.2968, "step": 157 }, { "epoch": 0.13982300884955753, "grad_norm": 4.854772032586159, "learning_rate": 9.98289197323141e-06, "loss": 1.1981, "step": 158 }, { "epoch": 0.1407079646017699, "grad_norm": 2.1874333318713943, "learning_rate": 9.982600947004875e-06, "loss": 1.3222, "step": 159 }, { "epoch": 0.1415929203539823, "grad_norm": 3.840468313198694, "learning_rate": 9.982307470588097e-06, "loss": 1.3087, "step": 160 }, { "epoch": 0.1424778761061947, "grad_norm": 6.4905105018971625, "learning_rate": 9.982011544125394e-06, "loss": 1.2583, "step": 161 }, { "epoch": 0.1433628318584071, "grad_norm": 9.375080360460279, "learning_rate": 9.98171316776229e-06, "loss": 1.1789, "step": 162 }, { "epoch": 0.14424778761061946, "grad_norm": 2.6873168431120305, "learning_rate": 9.981412341645508e-06, "loss": 1.4424, "step": 163 }, { "epoch": 0.14513274336283186, "grad_norm": 2.0697888188503173, "learning_rate": 9.98110906592298e-06, "loss": 1.1816, "step": 164 }, { "epoch": 0.14601769911504425, "grad_norm": 1.9274085867817112, "learning_rate": 9.980803340743842e-06, "loss": 1.1063, "step": 165 }, { "epoch": 0.14690265486725665, "grad_norm": 1.680946640245512, "learning_rate": 9.980495166258437e-06, "loss": 1.1403, "step": 166 }, { "epoch": 0.14778761061946902, "grad_norm": 1.9419531226152704, "learning_rate": 9.980184542618306e-06, "loss": 1.3682, "step": 167 }, { "epoch": 0.1486725663716814, "grad_norm": 2.727350540230249, "learning_rate": 9.979871469976197e-06, "loss": 1.3981, "step": 168 }, { "epoch": 0.1495575221238938, "grad_norm": 1.9956955215091174, "learning_rate": 9.979555948486063e-06, "loss": 1.3094, "step": 169 }, { "epoch": 0.1504424778761062, "grad_norm": 1.8358348493722312, "learning_rate": 9.979237978303066e-06, "loss": 1.2948, "step": 170 }, { "epoch": 0.15132743362831858, "grad_norm": 2.7540312420834843, "learning_rate": 9.978917559583565e-06, "loss": 1.1643, "step": 171 }, { "epoch": 0.15221238938053097, "grad_norm": 10.985013305924243, "learning_rate": 9.978594692485127e-06, "loss": 0.9601, "step": 172 }, { "epoch": 0.15309734513274337, "grad_norm": 1.9016848566014999, "learning_rate": 9.978269377166517e-06, "loss": 1.1673, "step": 173 }, { "epoch": 0.15398230088495576, "grad_norm": 2.9880839854928927, "learning_rate": 9.977941613787714e-06, "loss": 1.2551, "step": 174 }, { "epoch": 0.15486725663716813, "grad_norm": 13.111797669739648, "learning_rate": 9.977611402509893e-06, "loss": 1.1002, "step": 175 }, { "epoch": 0.15575221238938053, "grad_norm": 3.4816200508206925, "learning_rate": 9.977278743495434e-06, "loss": 1.3279, "step": 176 }, { "epoch": 0.15663716814159293, "grad_norm": 14.560462894565802, "learning_rate": 9.976943636907924e-06, "loss": 1.2941, "step": 177 }, { "epoch": 0.15752212389380532, "grad_norm": 1.8904066524429728, "learning_rate": 9.97660608291215e-06, "loss": 1.1987, "step": 178 }, { "epoch": 0.1584070796460177, "grad_norm": 3.3619595591525777, "learning_rate": 9.976266081674107e-06, "loss": 1.1302, "step": 179 }, { "epoch": 0.1592920353982301, "grad_norm": 4.809387605940993, "learning_rate": 9.975923633360985e-06, "loss": 0.9869, "step": 180 }, { "epoch": 0.16017699115044248, "grad_norm": 1.726226622495568, "learning_rate": 9.975578738141188e-06, "loss": 1.2044, "step": 181 }, { "epoch": 0.16106194690265488, "grad_norm": 5.545437863848784, "learning_rate": 9.975231396184313e-06, "loss": 1.4947, "step": 182 }, { "epoch": 0.16194690265486725, "grad_norm": 17.688299362265273, "learning_rate": 9.974881607661168e-06, "loss": 1.405, "step": 183 }, { "epoch": 0.16283185840707964, "grad_norm": 2.190952653935449, "learning_rate": 9.974529372743762e-06, "loss": 1.2799, "step": 184 }, { "epoch": 0.16371681415929204, "grad_norm": 2.630906062098555, "learning_rate": 9.974174691605306e-06, "loss": 1.3954, "step": 185 }, { "epoch": 0.16460176991150444, "grad_norm": 2.0567856184006037, "learning_rate": 9.973817564420212e-06, "loss": 1.491, "step": 186 }, { "epoch": 0.1654867256637168, "grad_norm": 2.276231423356681, "learning_rate": 9.973457991364098e-06, "loss": 1.2651, "step": 187 }, { "epoch": 0.1663716814159292, "grad_norm": 1.53006599486584, "learning_rate": 9.973095972613785e-06, "loss": 1.1206, "step": 188 }, { "epoch": 0.1672566371681416, "grad_norm": 3.156021302394108, "learning_rate": 9.972731508347296e-06, "loss": 1.1459, "step": 189 }, { "epoch": 0.168141592920354, "grad_norm": 3.4089344807073565, "learning_rate": 9.972364598743851e-06, "loss": 1.1703, "step": 190 }, { "epoch": 0.16902654867256636, "grad_norm": 4.163656704488586, "learning_rate": 9.971995243983883e-06, "loss": 1.3787, "step": 191 }, { "epoch": 0.16991150442477876, "grad_norm": 2.534583062948447, "learning_rate": 9.97162344424902e-06, "loss": 1.3725, "step": 192 }, { "epoch": 0.17079646017699116, "grad_norm": 3.755729096508455, "learning_rate": 9.971249199722095e-06, "loss": 1.3051, "step": 193 }, { "epoch": 0.17168141592920355, "grad_norm": 1.8728805414791814, "learning_rate": 9.970872510587142e-06, "loss": 1.4469, "step": 194 }, { "epoch": 0.17256637168141592, "grad_norm": 1.599250875384866, "learning_rate": 9.970493377029396e-06, "loss": 0.8646, "step": 195 }, { "epoch": 0.17345132743362832, "grad_norm": 2.9383488319003646, "learning_rate": 9.970111799235298e-06, "loss": 1.3047, "step": 196 }, { "epoch": 0.1743362831858407, "grad_norm": 6.713303355696724, "learning_rate": 9.969727777392488e-06, "loss": 1.1116, "step": 197 }, { "epoch": 0.1752212389380531, "grad_norm": 1.7122983175951767, "learning_rate": 9.969341311689807e-06, "loss": 1.2964, "step": 198 }, { "epoch": 0.17610619469026548, "grad_norm": 4.027415939539951, "learning_rate": 9.9689524023173e-06, "loss": 1.1712, "step": 199 }, { "epoch": 0.17699115044247787, "grad_norm": 5.79263627741334, "learning_rate": 9.968561049466214e-06, "loss": 1.3614, "step": 200 }, { "epoch": 0.17787610619469027, "grad_norm": 2.4650028413203993, "learning_rate": 9.968167253328995e-06, "loss": 1.1875, "step": 201 }, { "epoch": 0.17876106194690267, "grad_norm": 6.099631039411052, "learning_rate": 9.967771014099292e-06, "loss": 1.2291, "step": 202 }, { "epoch": 0.17964601769911503, "grad_norm": 1.6045887797918976, "learning_rate": 9.967372331971959e-06, "loss": 1.2087, "step": 203 }, { "epoch": 0.18053097345132743, "grad_norm": 3.1398190946886615, "learning_rate": 9.966971207143041e-06, "loss": 1.3798, "step": 204 }, { "epoch": 0.18141592920353983, "grad_norm": 1.6380269941319128, "learning_rate": 9.966567639809796e-06, "loss": 1.2176, "step": 205 }, { "epoch": 0.18230088495575222, "grad_norm": 1.6206698209820896, "learning_rate": 9.966161630170676e-06, "loss": 1.0739, "step": 206 }, { "epoch": 0.1831858407079646, "grad_norm": 2.390700631568957, "learning_rate": 9.965753178425336e-06, "loss": 1.2784, "step": 207 }, { "epoch": 0.184070796460177, "grad_norm": 3.5051861036743692, "learning_rate": 9.965342284774633e-06, "loss": 1.2655, "step": 208 }, { "epoch": 0.18495575221238938, "grad_norm": 1.4725136621557064, "learning_rate": 9.964928949420621e-06, "loss": 1.1306, "step": 209 }, { "epoch": 0.18584070796460178, "grad_norm": 1.6383059736428502, "learning_rate": 9.96451317256656e-06, "loss": 1.2302, "step": 210 }, { "epoch": 0.18672566371681415, "grad_norm": 2.232191796141461, "learning_rate": 9.964094954416908e-06, "loss": 1.3403, "step": 211 }, { "epoch": 0.18761061946902655, "grad_norm": 2.3641943587123797, "learning_rate": 9.963674295177321e-06, "loss": 1.2137, "step": 212 }, { "epoch": 0.18849557522123894, "grad_norm": 1.4695071952326233, "learning_rate": 9.96325119505466e-06, "loss": 1.3089, "step": 213 }, { "epoch": 0.18938053097345134, "grad_norm": 1.970145330570456, "learning_rate": 9.962825654256984e-06, "loss": 1.1696, "step": 214 }, { "epoch": 0.1902654867256637, "grad_norm": 11.4277798156, "learning_rate": 9.962397672993552e-06, "loss": 1.2375, "step": 215 }, { "epoch": 0.1911504424778761, "grad_norm": 2.395975613522335, "learning_rate": 9.961967251474823e-06, "loss": 1.2713, "step": 216 }, { "epoch": 0.1920353982300885, "grad_norm": 3.7638286548255104, "learning_rate": 9.961534389912455e-06, "loss": 1.1126, "step": 217 }, { "epoch": 0.1929203539823009, "grad_norm": 1.3961346252289852, "learning_rate": 9.961099088519311e-06, "loss": 1.2614, "step": 218 }, { "epoch": 0.19380530973451326, "grad_norm": 2.3623388983635616, "learning_rate": 9.960661347509447e-06, "loss": 1.318, "step": 219 }, { "epoch": 0.19469026548672566, "grad_norm": 2.4721133223876257, "learning_rate": 9.960221167098124e-06, "loss": 1.4508, "step": 220 }, { "epoch": 0.19557522123893806, "grad_norm": 1.8764124654424452, "learning_rate": 9.9597785475018e-06, "loss": 1.3686, "step": 221 }, { "epoch": 0.19646017699115045, "grad_norm": 2.5065317159171934, "learning_rate": 9.95933348893813e-06, "loss": 1.24, "step": 222 }, { "epoch": 0.19734513274336282, "grad_norm": 1.6819328321218259, "learning_rate": 9.958885991625975e-06, "loss": 1.2583, "step": 223 }, { "epoch": 0.19823008849557522, "grad_norm": 2.829191967338729, "learning_rate": 9.958436055785391e-06, "loss": 1.2769, "step": 224 }, { "epoch": 0.19911504424778761, "grad_norm": 2.4408818646192523, "learning_rate": 9.957983681637632e-06, "loss": 1.3373, "step": 225 }, { "epoch": 0.2, "grad_norm": 2.5330344483281677, "learning_rate": 9.957528869405153e-06, "loss": 1.2906, "step": 226 }, { "epoch": 0.20088495575221238, "grad_norm": 1.7761566849866997, "learning_rate": 9.957071619311608e-06, "loss": 1.1235, "step": 227 }, { "epoch": 0.20176991150442478, "grad_norm": 6.346262833788042, "learning_rate": 9.956611931581849e-06, "loss": 1.2564, "step": 228 }, { "epoch": 0.20265486725663717, "grad_norm": 2.911583210917221, "learning_rate": 9.956149806441927e-06, "loss": 1.4855, "step": 229 }, { "epoch": 0.20353982300884957, "grad_norm": 2.7796780697924826, "learning_rate": 9.955685244119092e-06, "loss": 1.5058, "step": 230 }, { "epoch": 0.20442477876106194, "grad_norm": 2.3807027505190095, "learning_rate": 9.955218244841794e-06, "loss": 1.4187, "step": 231 }, { "epoch": 0.20530973451327433, "grad_norm": 3.1579472838590843, "learning_rate": 9.954748808839675e-06, "loss": 1.3513, "step": 232 }, { "epoch": 0.20619469026548673, "grad_norm": 1.792162780818138, "learning_rate": 9.954276936343583e-06, "loss": 1.3673, "step": 233 }, { "epoch": 0.20707964601769913, "grad_norm": 2.1099257381783416, "learning_rate": 9.95380262758556e-06, "loss": 1.0061, "step": 234 }, { "epoch": 0.2079646017699115, "grad_norm": 11.005395043800336, "learning_rate": 9.95332588279885e-06, "loss": 1.2723, "step": 235 }, { "epoch": 0.2088495575221239, "grad_norm": 8.547586229004494, "learning_rate": 9.952846702217886e-06, "loss": 1.3047, "step": 236 }, { "epoch": 0.2097345132743363, "grad_norm": 2.800463481523076, "learning_rate": 9.952365086078311e-06, "loss": 1.214, "step": 237 }, { "epoch": 0.21061946902654868, "grad_norm": 3.597446860797548, "learning_rate": 9.951881034616954e-06, "loss": 1.3481, "step": 238 }, { "epoch": 0.21150442477876105, "grad_norm": 4.136891375175812, "learning_rate": 9.951394548071852e-06, "loss": 1.156, "step": 239 }, { "epoch": 0.21238938053097345, "grad_norm": 1.8805307398239894, "learning_rate": 9.950905626682229e-06, "loss": 1.2997, "step": 240 }, { "epoch": 0.21327433628318584, "grad_norm": 3.527069611916172, "learning_rate": 9.950414270688515e-06, "loss": 1.2222, "step": 241 }, { "epoch": 0.21415929203539824, "grad_norm": 5.448934781019782, "learning_rate": 9.949920480332334e-06, "loss": 1.3328, "step": 242 }, { "epoch": 0.2150442477876106, "grad_norm": 3.4419806735298213, "learning_rate": 9.949424255856506e-06, "loss": 1.273, "step": 243 }, { "epoch": 0.215929203539823, "grad_norm": 4.166170098273219, "learning_rate": 9.94892559750505e-06, "loss": 1.1358, "step": 244 }, { "epoch": 0.2168141592920354, "grad_norm": 2.5284374667339913, "learning_rate": 9.948424505523178e-06, "loss": 1.41, "step": 245 }, { "epoch": 0.2176991150442478, "grad_norm": 1.8981875174303033, "learning_rate": 9.947920980157305e-06, "loss": 1.1431, "step": 246 }, { "epoch": 0.21858407079646017, "grad_norm": 2.4636827836846313, "learning_rate": 9.947415021655041e-06, "loss": 1.3564, "step": 247 }, { "epoch": 0.21946902654867256, "grad_norm": 2.0778840125659612, "learning_rate": 9.946906630265184e-06, "loss": 1.1461, "step": 248 }, { "epoch": 0.22035398230088496, "grad_norm": 1.8571628269071778, "learning_rate": 9.946395806237741e-06, "loss": 1.232, "step": 249 }, { "epoch": 0.22123893805309736, "grad_norm": 2.4073822155259315, "learning_rate": 9.945882549823906e-06, "loss": 1.5498, "step": 250 }, { "epoch": 0.22212389380530972, "grad_norm": 1.6900338220439164, "learning_rate": 9.945366861276074e-06, "loss": 1.3164, "step": 251 }, { "epoch": 0.22300884955752212, "grad_norm": 3.4006574808876486, "learning_rate": 9.944848740847834e-06, "loss": 1.2298, "step": 252 }, { "epoch": 0.22389380530973452, "grad_norm": 1.6985988931943383, "learning_rate": 9.944328188793971e-06, "loss": 1.0111, "step": 253 }, { "epoch": 0.2247787610619469, "grad_norm": 3.450695059927729, "learning_rate": 9.943805205370467e-06, "loss": 1.1177, "step": 254 }, { "epoch": 0.22566371681415928, "grad_norm": 2.0875575355770857, "learning_rate": 9.9432797908345e-06, "loss": 1.2268, "step": 255 }, { "epoch": 0.22654867256637168, "grad_norm": 2.0160533525554927, "learning_rate": 9.942751945444437e-06, "loss": 1.4031, "step": 256 }, { "epoch": 0.22743362831858407, "grad_norm": 15.441313413936841, "learning_rate": 9.94222166945985e-06, "loss": 1.2051, "step": 257 }, { "epoch": 0.22831858407079647, "grad_norm": 3.5728625865941606, "learning_rate": 9.941688963141502e-06, "loss": 1.0774, "step": 258 }, { "epoch": 0.22920353982300884, "grad_norm": 2.460208271089753, "learning_rate": 9.941153826751349e-06, "loss": 1.3251, "step": 259 }, { "epoch": 0.23008849557522124, "grad_norm": 2.0348631571922975, "learning_rate": 9.940616260552545e-06, "loss": 1.374, "step": 260 }, { "epoch": 0.23097345132743363, "grad_norm": 2.410642867826453, "learning_rate": 9.940076264809438e-06, "loss": 1.3821, "step": 261 }, { "epoch": 0.23185840707964603, "grad_norm": 2.4319625845440433, "learning_rate": 9.939533839787567e-06, "loss": 0.9794, "step": 262 }, { "epoch": 0.2327433628318584, "grad_norm": 6.524509374661863, "learning_rate": 9.938988985753674e-06, "loss": 1.3375, "step": 263 }, { "epoch": 0.2336283185840708, "grad_norm": 7.757595171536015, "learning_rate": 9.938441702975689e-06, "loss": 1.2178, "step": 264 }, { "epoch": 0.2345132743362832, "grad_norm": 1.8030623984955558, "learning_rate": 9.937891991722736e-06, "loss": 1.3963, "step": 265 }, { "epoch": 0.23539823008849559, "grad_norm": 1.8753741097901682, "learning_rate": 9.937339852265138e-06, "loss": 1.2458, "step": 266 }, { "epoch": 0.23628318584070795, "grad_norm": 6.065218270195743, "learning_rate": 9.936785284874409e-06, "loss": 1.6086, "step": 267 }, { "epoch": 0.23716814159292035, "grad_norm": 3.914152997316887, "learning_rate": 9.936228289823253e-06, "loss": 1.1895, "step": 268 }, { "epoch": 0.23805309734513275, "grad_norm": 2.6590979405694797, "learning_rate": 9.935668867385575e-06, "loss": 1.0646, "step": 269 }, { "epoch": 0.23893805309734514, "grad_norm": 1.9464358625193448, "learning_rate": 9.935107017836472e-06, "loss": 1.0745, "step": 270 }, { "epoch": 0.2398230088495575, "grad_norm": 2.2942537305485966, "learning_rate": 9.93454274145223e-06, "loss": 1.2814, "step": 271 }, { "epoch": 0.2407079646017699, "grad_norm": 2.8980633738892565, "learning_rate": 9.933976038510334e-06, "loss": 1.4433, "step": 272 }, { "epoch": 0.2415929203539823, "grad_norm": 1.7793209491804163, "learning_rate": 9.933406909289456e-06, "loss": 1.406, "step": 273 }, { "epoch": 0.2424778761061947, "grad_norm": 1.7236883187662706, "learning_rate": 9.93283535406947e-06, "loss": 1.2825, "step": 274 }, { "epoch": 0.24336283185840707, "grad_norm": 1.9626957791310344, "learning_rate": 9.932261373131433e-06, "loss": 1.4977, "step": 275 }, { "epoch": 0.24424778761061947, "grad_norm": 1.8901398456491711, "learning_rate": 9.931684966757605e-06, "loss": 1.1388, "step": 276 }, { "epoch": 0.24513274336283186, "grad_norm": 1.5926720959963308, "learning_rate": 9.931106135231428e-06, "loss": 1.5617, "step": 277 }, { "epoch": 0.24601769911504426, "grad_norm": 1.8570168362831228, "learning_rate": 9.930524878837544e-06, "loss": 1.2685, "step": 278 }, { "epoch": 0.24690265486725663, "grad_norm": 2.8191823467860035, "learning_rate": 9.929941197861787e-06, "loss": 1.5764, "step": 279 }, { "epoch": 0.24778761061946902, "grad_norm": 1.349306512461668, "learning_rate": 9.92935509259118e-06, "loss": 1.2508, "step": 280 }, { "epoch": 0.24867256637168142, "grad_norm": 2.7453810651094455, "learning_rate": 9.928766563313941e-06, "loss": 0.9388, "step": 281 }, { "epoch": 0.24955752212389382, "grad_norm": 1.4548970845652462, "learning_rate": 9.928175610319477e-06, "loss": 1.206, "step": 282 }, { "epoch": 0.2504424778761062, "grad_norm": 2.0863646952849004, "learning_rate": 9.92758223389839e-06, "loss": 1.1304, "step": 283 }, { "epoch": 0.2513274336283186, "grad_norm": 2.1558439774406066, "learning_rate": 9.926986434342471e-06, "loss": 1.3247, "step": 284 }, { "epoch": 0.252212389380531, "grad_norm": 1.991012422392566, "learning_rate": 9.926388211944707e-06, "loss": 1.2824, "step": 285 }, { "epoch": 0.25309734513274335, "grad_norm": 3.2783590387663333, "learning_rate": 9.925787566999271e-06, "loss": 1.5618, "step": 286 }, { "epoch": 0.25398230088495577, "grad_norm": 1.6873983476563035, "learning_rate": 9.92518449980153e-06, "loss": 1.3836, "step": 287 }, { "epoch": 0.25486725663716814, "grad_norm": 1.8460655333219362, "learning_rate": 9.924579010648042e-06, "loss": 1.5339, "step": 288 }, { "epoch": 0.2557522123893805, "grad_norm": 1.680800675923085, "learning_rate": 9.923971099836556e-06, "loss": 1.5387, "step": 289 }, { "epoch": 0.25663716814159293, "grad_norm": 1.2959706530424298, "learning_rate": 9.92336076766601e-06, "loss": 1.1368, "step": 290 }, { "epoch": 0.2575221238938053, "grad_norm": 2.21581417768174, "learning_rate": 9.922748014436539e-06, "loss": 1.2503, "step": 291 }, { "epoch": 0.2584070796460177, "grad_norm": 1.6670943117863473, "learning_rate": 9.922132840449459e-06, "loss": 1.1634, "step": 292 }, { "epoch": 0.2592920353982301, "grad_norm": 1.4272887723376069, "learning_rate": 9.921515246007283e-06, "loss": 1.206, "step": 293 }, { "epoch": 0.26017699115044246, "grad_norm": 2.210309124481113, "learning_rate": 9.920895231413714e-06, "loss": 1.4347, "step": 294 }, { "epoch": 0.2610619469026549, "grad_norm": 2.318399931212704, "learning_rate": 9.920272796973643e-06, "loss": 1.1986, "step": 295 }, { "epoch": 0.26194690265486725, "grad_norm": 3.307468046108552, "learning_rate": 9.91964794299315e-06, "loss": 1.3045, "step": 296 }, { "epoch": 0.2628318584070796, "grad_norm": 2.3707563997446908, "learning_rate": 9.919020669779506e-06, "loss": 1.4075, "step": 297 }, { "epoch": 0.26371681415929205, "grad_norm": 3.4116132837843796, "learning_rate": 9.918390977641176e-06, "loss": 1.0433, "step": 298 }, { "epoch": 0.2646017699115044, "grad_norm": 8.157407277551416, "learning_rate": 9.917758866887808e-06, "loss": 1.1361, "step": 299 }, { "epoch": 0.26548672566371684, "grad_norm": 1.7844780892910876, "learning_rate": 9.917124337830242e-06, "loss": 1.1856, "step": 300 }, { "epoch": 0.2663716814159292, "grad_norm": 1.6512215697936987, "learning_rate": 9.916487390780509e-06, "loss": 1.1512, "step": 301 }, { "epoch": 0.2672566371681416, "grad_norm": 2.3475885299094457, "learning_rate": 9.915848026051825e-06, "loss": 1.1966, "step": 302 }, { "epoch": 0.268141592920354, "grad_norm": 1.8467464430357163, "learning_rate": 9.915206243958597e-06, "loss": 1.1392, "step": 303 }, { "epoch": 0.26902654867256637, "grad_norm": 2.8517265698631076, "learning_rate": 9.914562044816424e-06, "loss": 1.5706, "step": 304 }, { "epoch": 0.26991150442477874, "grad_norm": 3.5381848936888565, "learning_rate": 9.913915428942087e-06, "loss": 1.3408, "step": 305 }, { "epoch": 0.27079646017699116, "grad_norm": 2.6117926071381263, "learning_rate": 9.91326639665356e-06, "loss": 1.145, "step": 306 }, { "epoch": 0.27168141592920353, "grad_norm": 5.168208315341732, "learning_rate": 9.912614948270004e-06, "loss": 1.327, "step": 307 }, { "epoch": 0.27256637168141595, "grad_norm": 3.487486094555104, "learning_rate": 9.911961084111769e-06, "loss": 1.3452, "step": 308 }, { "epoch": 0.2734513274336283, "grad_norm": 2.1226612287447018, "learning_rate": 9.911304804500391e-06, "loss": 1.1734, "step": 309 }, { "epoch": 0.2743362831858407, "grad_norm": 3.2111814572507935, "learning_rate": 9.910646109758596e-06, "loss": 1.2672, "step": 310 }, { "epoch": 0.2752212389380531, "grad_norm": 1.434912408955727, "learning_rate": 9.909985000210295e-06, "loss": 0.8444, "step": 311 }, { "epoch": 0.2761061946902655, "grad_norm": 2.595918160859852, "learning_rate": 9.909321476180594e-06, "loss": 1.2922, "step": 312 }, { "epoch": 0.27699115044247785, "grad_norm": 3.009276077574387, "learning_rate": 9.908655537995772e-06, "loss": 1.5296, "step": 313 }, { "epoch": 0.2778761061946903, "grad_norm": 2.2063377986489785, "learning_rate": 9.907987185983307e-06, "loss": 1.3572, "step": 314 }, { "epoch": 0.27876106194690264, "grad_norm": 1.69367510061472, "learning_rate": 9.907316420471864e-06, "loss": 1.3549, "step": 315 }, { "epoch": 0.27964601769911507, "grad_norm": 1.5931442640222426, "learning_rate": 9.906643241791288e-06, "loss": 1.1295, "step": 316 }, { "epoch": 0.28053097345132744, "grad_norm": 4.791567127829935, "learning_rate": 9.905967650272612e-06, "loss": 1.1837, "step": 317 }, { "epoch": 0.2814159292035398, "grad_norm": 1.9230648932840269, "learning_rate": 9.905289646248063e-06, "loss": 1.2447, "step": 318 }, { "epoch": 0.28230088495575223, "grad_norm": 1.385901347921339, "learning_rate": 9.904609230051045e-06, "loss": 1.1674, "step": 319 }, { "epoch": 0.2831858407079646, "grad_norm": 2.0307640039513424, "learning_rate": 9.903926402016153e-06, "loss": 1.476, "step": 320 }, { "epoch": 0.28407079646017697, "grad_norm": 1.7240880668402425, "learning_rate": 9.903241162479169e-06, "loss": 1.2923, "step": 321 }, { "epoch": 0.2849557522123894, "grad_norm": 2.1736846986980605, "learning_rate": 9.902553511777055e-06, "loss": 1.3646, "step": 322 }, { "epoch": 0.28584070796460176, "grad_norm": 2.2609089044213007, "learning_rate": 9.901863450247967e-06, "loss": 1.1147, "step": 323 }, { "epoch": 0.2867256637168142, "grad_norm": 3.472645200415937, "learning_rate": 9.90117097823124e-06, "loss": 1.1591, "step": 324 }, { "epoch": 0.28761061946902655, "grad_norm": 7.915549500778585, "learning_rate": 9.9004760960674e-06, "loss": 1.2587, "step": 325 }, { "epoch": 0.2884955752212389, "grad_norm": 2.0545636839733246, "learning_rate": 9.899778804098148e-06, "loss": 1.0962, "step": 326 }, { "epoch": 0.28938053097345134, "grad_norm": 2.9147749210801535, "learning_rate": 9.899079102666382e-06, "loss": 1.2141, "step": 327 }, { "epoch": 0.2902654867256637, "grad_norm": 1.6616734785683749, "learning_rate": 9.898376992116179e-06, "loss": 1.311, "step": 328 }, { "epoch": 0.2911504424778761, "grad_norm": 2.628778890000797, "learning_rate": 9.8976724727928e-06, "loss": 1.297, "step": 329 }, { "epoch": 0.2920353982300885, "grad_norm": 4.170820819410935, "learning_rate": 9.896965545042692e-06, "loss": 1.3517, "step": 330 }, { "epoch": 0.2929203539823009, "grad_norm": 3.8297103708211617, "learning_rate": 9.896256209213487e-06, "loss": 1.4381, "step": 331 }, { "epoch": 0.2938053097345133, "grad_norm": 1.6387404970299553, "learning_rate": 9.895544465654002e-06, "loss": 1.001, "step": 332 }, { "epoch": 0.29469026548672567, "grad_norm": 2.3135726196952016, "learning_rate": 9.894830314714232e-06, "loss": 1.299, "step": 333 }, { "epoch": 0.29557522123893804, "grad_norm": 2.0699403559179537, "learning_rate": 9.894113756745362e-06, "loss": 1.3018, "step": 334 }, { "epoch": 0.29646017699115046, "grad_norm": 2.1150398594321755, "learning_rate": 9.893394792099762e-06, "loss": 1.3508, "step": 335 }, { "epoch": 0.2973451327433628, "grad_norm": 1.6440509832134937, "learning_rate": 9.892673421130979e-06, "loss": 0.9764, "step": 336 }, { "epoch": 0.2982300884955752, "grad_norm": 2.8715811949157417, "learning_rate": 9.891949644193745e-06, "loss": 1.2024, "step": 337 }, { "epoch": 0.2991150442477876, "grad_norm": 1.5671213501130892, "learning_rate": 9.891223461643982e-06, "loss": 1.3605, "step": 338 }, { "epoch": 0.3, "grad_norm": 1.679482140472365, "learning_rate": 9.890494873838785e-06, "loss": 1.0768, "step": 339 }, { "epoch": 0.3008849557522124, "grad_norm": 2.0017717812334004, "learning_rate": 9.889763881136439e-06, "loss": 1.5419, "step": 340 }, { "epoch": 0.3017699115044248, "grad_norm": 2.0489804171072836, "learning_rate": 9.88903048389641e-06, "loss": 1.0015, "step": 341 }, { "epoch": 0.30265486725663715, "grad_norm": 3.940163769758684, "learning_rate": 9.888294682479342e-06, "loss": 1.1337, "step": 342 }, { "epoch": 0.3035398230088496, "grad_norm": 2.879672552460691, "learning_rate": 9.887556477247066e-06, "loss": 1.3482, "step": 343 }, { "epoch": 0.30442477876106194, "grad_norm": 2.2004433874274865, "learning_rate": 9.886815868562596e-06, "loss": 1.4071, "step": 344 }, { "epoch": 0.3053097345132743, "grad_norm": 1.8534688684532001, "learning_rate": 9.886072856790125e-06, "loss": 1.0397, "step": 345 }, { "epoch": 0.30619469026548674, "grad_norm": 25.73188408233745, "learning_rate": 9.885327442295027e-06, "loss": 1.7802, "step": 346 }, { "epoch": 0.3070796460176991, "grad_norm": 1.7470686658079, "learning_rate": 9.884579625443859e-06, "loss": 1.1909, "step": 347 }, { "epoch": 0.30796460176991153, "grad_norm": 2.9789946517240407, "learning_rate": 9.883829406604363e-06, "loss": 1.3949, "step": 348 }, { "epoch": 0.3088495575221239, "grad_norm": 1.640797998571061, "learning_rate": 9.883076786145454e-06, "loss": 1.3292, "step": 349 }, { "epoch": 0.30973451327433627, "grad_norm": 1.7518947598401653, "learning_rate": 9.882321764437234e-06, "loss": 1.1608, "step": 350 }, { "epoch": 0.3106194690265487, "grad_norm": 7.389440167394963, "learning_rate": 9.881564341850987e-06, "loss": 1.1317, "step": 351 }, { "epoch": 0.31150442477876106, "grad_norm": 2.3677058168178027, "learning_rate": 9.88080451875917e-06, "loss": 1.2073, "step": 352 }, { "epoch": 0.3123893805309734, "grad_norm": 3.430162079173438, "learning_rate": 9.88004229553543e-06, "loss": 1.1144, "step": 353 }, { "epoch": 0.31327433628318585, "grad_norm": 1.7073698016663987, "learning_rate": 9.879277672554586e-06, "loss": 1.1101, "step": 354 }, { "epoch": 0.3141592920353982, "grad_norm": 2.3848020100443583, "learning_rate": 9.878510650192644e-06, "loss": 1.3401, "step": 355 }, { "epoch": 0.31504424778761064, "grad_norm": 1.671466354158794, "learning_rate": 9.877741228826785e-06, "loss": 1.1691, "step": 356 }, { "epoch": 0.315929203539823, "grad_norm": 2.3678222392113284, "learning_rate": 9.87696940883537e-06, "loss": 1.2223, "step": 357 }, { "epoch": 0.3168141592920354, "grad_norm": 1.9125757907542629, "learning_rate": 9.876195190597944e-06, "loss": 1.1534, "step": 358 }, { "epoch": 0.3176991150442478, "grad_norm": 2.09097344660005, "learning_rate": 9.875418574495227e-06, "loss": 1.226, "step": 359 }, { "epoch": 0.3185840707964602, "grad_norm": 2.0998751893768755, "learning_rate": 9.874639560909118e-06, "loss": 1.1596, "step": 360 }, { "epoch": 0.31946902654867254, "grad_norm": 2.0602323688811337, "learning_rate": 9.8738581502227e-06, "loss": 1.4566, "step": 361 }, { "epoch": 0.32035398230088497, "grad_norm": 1.7010332222978504, "learning_rate": 9.873074342820225e-06, "loss": 1.2281, "step": 362 }, { "epoch": 0.32123893805309733, "grad_norm": 2.726500543083113, "learning_rate": 9.872288139087136e-06, "loss": 1.2704, "step": 363 }, { "epoch": 0.32212389380530976, "grad_norm": 2.424159326232836, "learning_rate": 9.871499539410045e-06, "loss": 1.4332, "step": 364 }, { "epoch": 0.3230088495575221, "grad_norm": 2.439531551635802, "learning_rate": 9.870708544176744e-06, "loss": 1.3416, "step": 365 }, { "epoch": 0.3238938053097345, "grad_norm": 4.066828071464188, "learning_rate": 9.869915153776209e-06, "loss": 1.4259, "step": 366 }, { "epoch": 0.3247787610619469, "grad_norm": 2.4250481648259425, "learning_rate": 9.869119368598583e-06, "loss": 1.3634, "step": 367 }, { "epoch": 0.3256637168141593, "grad_norm": 1.667402722233621, "learning_rate": 9.868321189035196e-06, "loss": 1.0942, "step": 368 }, { "epoch": 0.32654867256637166, "grad_norm": 1.8648621399420688, "learning_rate": 9.867520615478554e-06, "loss": 1.1037, "step": 369 }, { "epoch": 0.3274336283185841, "grad_norm": 2.1307895850283556, "learning_rate": 9.866717648322335e-06, "loss": 1.1144, "step": 370 }, { "epoch": 0.32831858407079645, "grad_norm": 1.9909053093808458, "learning_rate": 9.8659122879614e-06, "loss": 1.0339, "step": 371 }, { "epoch": 0.3292035398230089, "grad_norm": 1.5968635857137303, "learning_rate": 9.865104534791782e-06, "loss": 1.1678, "step": 372 }, { "epoch": 0.33008849557522124, "grad_norm": 4.027983840247675, "learning_rate": 9.864294389210696e-06, "loss": 1.3516, "step": 373 }, { "epoch": 0.3309734513274336, "grad_norm": 2.1409074492569404, "learning_rate": 9.863481851616528e-06, "loss": 1.4794, "step": 374 }, { "epoch": 0.33185840707964603, "grad_norm": 8.554803315424873, "learning_rate": 9.862666922408844e-06, "loss": 1.0924, "step": 375 }, { "epoch": 0.3327433628318584, "grad_norm": 1.326731045364508, "learning_rate": 9.861849601988384e-06, "loss": 1.1416, "step": 376 }, { "epoch": 0.33362831858407077, "grad_norm": 1.8511768492552865, "learning_rate": 9.861029890757066e-06, "loss": 1.3584, "step": 377 }, { "epoch": 0.3345132743362832, "grad_norm": 4.225701949587194, "learning_rate": 9.86020778911798e-06, "loss": 1.0669, "step": 378 }, { "epoch": 0.33539823008849556, "grad_norm": 1.9316430798998132, "learning_rate": 9.859383297475395e-06, "loss": 1.4016, "step": 379 }, { "epoch": 0.336283185840708, "grad_norm": 1.7842138797459588, "learning_rate": 9.858556416234755e-06, "loss": 0.9912, "step": 380 }, { "epoch": 0.33716814159292036, "grad_norm": 1.4967044080713934, "learning_rate": 9.857727145802677e-06, "loss": 1.2918, "step": 381 }, { "epoch": 0.3380530973451327, "grad_norm": 5.118311146823387, "learning_rate": 9.856895486586957e-06, "loss": 0.9864, "step": 382 }, { "epoch": 0.33893805309734515, "grad_norm": 1.7074639445780142, "learning_rate": 9.85606143899656e-06, "loss": 1.0211, "step": 383 }, { "epoch": 0.3398230088495575, "grad_norm": 1.6944828130794298, "learning_rate": 9.855225003441629e-06, "loss": 0.9233, "step": 384 }, { "epoch": 0.3407079646017699, "grad_norm": 1.6666911369368507, "learning_rate": 9.854386180333479e-06, "loss": 0.9975, "step": 385 }, { "epoch": 0.3415929203539823, "grad_norm": 1.6774024532833465, "learning_rate": 9.853544970084602e-06, "loss": 1.2068, "step": 386 }, { "epoch": 0.3424778761061947, "grad_norm": 2.0284480091332675, "learning_rate": 9.852701373108665e-06, "loss": 1.1566, "step": 387 }, { "epoch": 0.3433628318584071, "grad_norm": 1.5153633003793472, "learning_rate": 9.8518553898205e-06, "loss": 1.3245, "step": 388 }, { "epoch": 0.34424778761061947, "grad_norm": 1.9282354724400224, "learning_rate": 9.851007020636125e-06, "loss": 1.1654, "step": 389 }, { "epoch": 0.34513274336283184, "grad_norm": 5.176765993722311, "learning_rate": 9.850156265972722e-06, "loss": 1.2249, "step": 390 }, { "epoch": 0.34601769911504426, "grad_norm": 1.8128804253362745, "learning_rate": 9.849303126248649e-06, "loss": 1.1577, "step": 391 }, { "epoch": 0.34690265486725663, "grad_norm": 4.687490743488281, "learning_rate": 9.848447601883436e-06, "loss": 1.0666, "step": 392 }, { "epoch": 0.347787610619469, "grad_norm": 2.9711791528725366, "learning_rate": 9.847589693297787e-06, "loss": 1.1983, "step": 393 }, { "epoch": 0.3486725663716814, "grad_norm": 52.435392713689225, "learning_rate": 9.84672940091358e-06, "loss": 1.4239, "step": 394 }, { "epoch": 0.3495575221238938, "grad_norm": 2.6409971818970726, "learning_rate": 9.845866725153861e-06, "loss": 1.2919, "step": 395 }, { "epoch": 0.3504424778761062, "grad_norm": 10.589168725957451, "learning_rate": 9.84500166644285e-06, "loss": 1.2198, "step": 396 }, { "epoch": 0.3513274336283186, "grad_norm": 2.1101774435350547, "learning_rate": 9.844134225205941e-06, "loss": 1.1605, "step": 397 }, { "epoch": 0.35221238938053095, "grad_norm": 4.426262830604028, "learning_rate": 9.843264401869696e-06, "loss": 1.1602, "step": 398 }, { "epoch": 0.3530973451327434, "grad_norm": 5.35704058954812, "learning_rate": 9.84239219686185e-06, "loss": 1.2567, "step": 399 }, { "epoch": 0.35398230088495575, "grad_norm": 2.4934419596231954, "learning_rate": 9.841517610611309e-06, "loss": 1.2381, "step": 400 }, { "epoch": 0.3548672566371681, "grad_norm": 1.4915831474135794, "learning_rate": 9.84064064354815e-06, "loss": 1.2088, "step": 401 }, { "epoch": 0.35575221238938054, "grad_norm": 2.442549383458422, "learning_rate": 9.83976129610362e-06, "loss": 1.2165, "step": 402 }, { "epoch": 0.3566371681415929, "grad_norm": 2.984551327090413, "learning_rate": 9.838879568710143e-06, "loss": 1.2583, "step": 403 }, { "epoch": 0.35752212389380533, "grad_norm": 2.0326012820487054, "learning_rate": 9.8379954618013e-06, "loss": 1.3112, "step": 404 }, { "epoch": 0.3584070796460177, "grad_norm": 10.823342823663682, "learning_rate": 9.837108975811855e-06, "loss": 1.2051, "step": 405 }, { "epoch": 0.35929203539823007, "grad_norm": 4.866553963320614, "learning_rate": 9.836220111177735e-06, "loss": 1.2621, "step": 406 }, { "epoch": 0.3601769911504425, "grad_norm": 4.196834887459422, "learning_rate": 9.835328868336038e-06, "loss": 1.3114, "step": 407 }, { "epoch": 0.36106194690265486, "grad_norm": 1.6640896161845604, "learning_rate": 9.834435247725032e-06, "loss": 1.0851, "step": 408 }, { "epoch": 0.36194690265486723, "grad_norm": 3.882299799217267, "learning_rate": 9.833539249784158e-06, "loss": 1.2266, "step": 409 }, { "epoch": 0.36283185840707965, "grad_norm": 1.9024842231981267, "learning_rate": 9.832640874954017e-06, "loss": 1.2125, "step": 410 }, { "epoch": 0.363716814159292, "grad_norm": 1.9280369148718064, "learning_rate": 9.831740123676387e-06, "loss": 1.1495, "step": 411 }, { "epoch": 0.36460176991150445, "grad_norm": 1.6788946234323854, "learning_rate": 9.830836996394212e-06, "loss": 1.3263, "step": 412 }, { "epoch": 0.3654867256637168, "grad_norm": 3.995930040308025, "learning_rate": 9.829931493551603e-06, "loss": 1.3851, "step": 413 }, { "epoch": 0.3663716814159292, "grad_norm": 1.699850013302917, "learning_rate": 9.82902361559384e-06, "loss": 1.195, "step": 414 }, { "epoch": 0.3672566371681416, "grad_norm": 2.621131766128849, "learning_rate": 9.828113362967373e-06, "loss": 1.2736, "step": 415 }, { "epoch": 0.368141592920354, "grad_norm": 1.8778616099927359, "learning_rate": 9.827200736119815e-06, "loss": 1.3555, "step": 416 }, { "epoch": 0.36902654867256635, "grad_norm": 2.3153028165249347, "learning_rate": 9.826285735499953e-06, "loss": 1.1436, "step": 417 }, { "epoch": 0.36991150442477877, "grad_norm": 1.4438167080881963, "learning_rate": 9.825368361557738e-06, "loss": 1.2763, "step": 418 }, { "epoch": 0.37079646017699114, "grad_norm": 3.046038850238262, "learning_rate": 9.824448614744283e-06, "loss": 1.2948, "step": 419 }, { "epoch": 0.37168141592920356, "grad_norm": 7.256510896853475, "learning_rate": 9.82352649551188e-06, "loss": 1.155, "step": 420 }, { "epoch": 0.37256637168141593, "grad_norm": 1.9018408857644271, "learning_rate": 9.822602004313975e-06, "loss": 1.3207, "step": 421 }, { "epoch": 0.3734513274336283, "grad_norm": 1.9533139236658885, "learning_rate": 9.821675141605187e-06, "loss": 1.0812, "step": 422 }, { "epoch": 0.3743362831858407, "grad_norm": 2.199063895464222, "learning_rate": 9.8207459078413e-06, "loss": 0.8404, "step": 423 }, { "epoch": 0.3752212389380531, "grad_norm": 1.9865383872667037, "learning_rate": 9.819814303479268e-06, "loss": 1.1833, "step": 424 }, { "epoch": 0.37610619469026546, "grad_norm": 1.733548818057734, "learning_rate": 9.8188803289772e-06, "loss": 1.1241, "step": 425 }, { "epoch": 0.3769911504424779, "grad_norm": 2.801361828871665, "learning_rate": 9.817943984794381e-06, "loss": 0.9316, "step": 426 }, { "epoch": 0.37787610619469025, "grad_norm": 1.9155559984432424, "learning_rate": 9.817005271391258e-06, "loss": 1.2802, "step": 427 }, { "epoch": 0.3787610619469027, "grad_norm": 2.0267147164927346, "learning_rate": 9.816064189229441e-06, "loss": 1.4897, "step": 428 }, { "epoch": 0.37964601769911505, "grad_norm": 3.226881440096096, "learning_rate": 9.815120738771708e-06, "loss": 1.1947, "step": 429 }, { "epoch": 0.3805309734513274, "grad_norm": 2.4387965616279805, "learning_rate": 9.814174920481999e-06, "loss": 1.2108, "step": 430 }, { "epoch": 0.38141592920353984, "grad_norm": 1.484389938454375, "learning_rate": 9.813226734825418e-06, "loss": 1.1765, "step": 431 }, { "epoch": 0.3823008849557522, "grad_norm": 1.9360696169645588, "learning_rate": 9.812276182268236e-06, "loss": 1.5223, "step": 432 }, { "epoch": 0.3831858407079646, "grad_norm": 1.7148036868640055, "learning_rate": 9.81132326327789e-06, "loss": 1.1804, "step": 433 }, { "epoch": 0.384070796460177, "grad_norm": 2.304054150979792, "learning_rate": 9.810367978322971e-06, "loss": 1.2345, "step": 434 }, { "epoch": 0.38495575221238937, "grad_norm": 1.8768440250516738, "learning_rate": 9.809410327873244e-06, "loss": 1.1941, "step": 435 }, { "epoch": 0.3858407079646018, "grad_norm": 1.4672740961590018, "learning_rate": 9.80845031239963e-06, "loss": 1.3473, "step": 436 }, { "epoch": 0.38672566371681416, "grad_norm": 2.4185536013643287, "learning_rate": 9.807487932374217e-06, "loss": 1.2663, "step": 437 }, { "epoch": 0.38761061946902653, "grad_norm": 1.6125466238846025, "learning_rate": 9.806523188270257e-06, "loss": 1.2167, "step": 438 }, { "epoch": 0.38849557522123895, "grad_norm": 1.6230227949658158, "learning_rate": 9.80555608056216e-06, "loss": 1.4001, "step": 439 }, { "epoch": 0.3893805309734513, "grad_norm": 1.9671909963960186, "learning_rate": 9.804586609725499e-06, "loss": 1.3876, "step": 440 }, { "epoch": 0.3902654867256637, "grad_norm": 1.5137675040088623, "learning_rate": 9.80361477623701e-06, "loss": 1.3842, "step": 441 }, { "epoch": 0.3911504424778761, "grad_norm": 4.244861423353005, "learning_rate": 9.802640580574598e-06, "loss": 1.2241, "step": 442 }, { "epoch": 0.3920353982300885, "grad_norm": 1.7943132563171298, "learning_rate": 9.801664023217318e-06, "loss": 1.0862, "step": 443 }, { "epoch": 0.3929203539823009, "grad_norm": 1.519783710859336, "learning_rate": 9.80068510464539e-06, "loss": 1.1839, "step": 444 }, { "epoch": 0.3938053097345133, "grad_norm": 2.040353665645219, "learning_rate": 9.799703825340198e-06, "loss": 1.5683, "step": 445 }, { "epoch": 0.39469026548672564, "grad_norm": 2.233719648322153, "learning_rate": 9.798720185784288e-06, "loss": 1.4581, "step": 446 }, { "epoch": 0.39557522123893807, "grad_norm": 3.0134840506790534, "learning_rate": 9.797734186461361e-06, "loss": 1.4419, "step": 447 }, { "epoch": 0.39646017699115044, "grad_norm": 1.3889225213543095, "learning_rate": 9.79674582785628e-06, "loss": 1.1219, "step": 448 }, { "epoch": 0.3973451327433628, "grad_norm": 1.419817489088537, "learning_rate": 9.795755110455074e-06, "loss": 1.1971, "step": 449 }, { "epoch": 0.39823008849557523, "grad_norm": 1.3825402452771676, "learning_rate": 9.794762034744925e-06, "loss": 0.8208, "step": 450 }, { "epoch": 0.3991150442477876, "grad_norm": 3.5260245188894515, "learning_rate": 9.793766601214177e-06, "loss": 1.4206, "step": 451 }, { "epoch": 0.4, "grad_norm": 1.5511266356595954, "learning_rate": 9.792768810352332e-06, "loss": 1.175, "step": 452 }, { "epoch": 0.4008849557522124, "grad_norm": 1.9070507149609472, "learning_rate": 9.791768662650059e-06, "loss": 1.46, "step": 453 }, { "epoch": 0.40176991150442476, "grad_norm": 1.9332110567223362, "learning_rate": 9.790766158599172e-06, "loss": 1.1513, "step": 454 }, { "epoch": 0.4026548672566372, "grad_norm": 1.5716871106747061, "learning_rate": 9.789761298692658e-06, "loss": 1.4165, "step": 455 }, { "epoch": 0.40353982300884955, "grad_norm": 1.3653926040732343, "learning_rate": 9.788754083424654e-06, "loss": 1.003, "step": 456 }, { "epoch": 0.4044247787610619, "grad_norm": 1.5000970372152227, "learning_rate": 9.787744513290456e-06, "loss": 1.0246, "step": 457 }, { "epoch": 0.40530973451327434, "grad_norm": 1.5779400757135935, "learning_rate": 9.78673258878652e-06, "loss": 1.4582, "step": 458 }, { "epoch": 0.4061946902654867, "grad_norm": 2.9328660523661587, "learning_rate": 9.78571831041046e-06, "loss": 1.197, "step": 459 }, { "epoch": 0.40707964601769914, "grad_norm": 2.074414855806992, "learning_rate": 9.784701678661045e-06, "loss": 1.3456, "step": 460 }, { "epoch": 0.4079646017699115, "grad_norm": 1.703471677841335, "learning_rate": 9.783682694038206e-06, "loss": 1.4204, "step": 461 }, { "epoch": 0.4088495575221239, "grad_norm": 1.6426638349189722, "learning_rate": 9.782661357043024e-06, "loss": 1.2618, "step": 462 }, { "epoch": 0.4097345132743363, "grad_norm": 2.2089451790101893, "learning_rate": 9.781637668177742e-06, "loss": 1.2238, "step": 463 }, { "epoch": 0.41061946902654867, "grad_norm": 1.4381617097840318, "learning_rate": 9.78061162794576e-06, "loss": 1.3689, "step": 464 }, { "epoch": 0.41150442477876104, "grad_norm": 2.719808852660407, "learning_rate": 9.779583236851632e-06, "loss": 1.197, "step": 465 }, { "epoch": 0.41238938053097346, "grad_norm": 1.4400117233921343, "learning_rate": 9.778552495401067e-06, "loss": 1.2086, "step": 466 }, { "epoch": 0.41327433628318583, "grad_norm": 1.7178577687005714, "learning_rate": 9.777519404100933e-06, "loss": 1.4589, "step": 467 }, { "epoch": 0.41415929203539825, "grad_norm": 1.635897128242605, "learning_rate": 9.77648396345925e-06, "loss": 1.3677, "step": 468 }, { "epoch": 0.4150442477876106, "grad_norm": 1.5671807794596186, "learning_rate": 9.775446173985194e-06, "loss": 1.2871, "step": 469 }, { "epoch": 0.415929203539823, "grad_norm": 1.504230363526691, "learning_rate": 9.774406036189104e-06, "loss": 1.1727, "step": 470 }, { "epoch": 0.4168141592920354, "grad_norm": 1.4548412780218067, "learning_rate": 9.773363550582458e-06, "loss": 1.3512, "step": 471 }, { "epoch": 0.4176991150442478, "grad_norm": 1.4474951663124793, "learning_rate": 9.772318717677905e-06, "loss": 1.1615, "step": 472 }, { "epoch": 0.41858407079646015, "grad_norm": 1.5235224643917837, "learning_rate": 9.771271537989236e-06, "loss": 1.187, "step": 473 }, { "epoch": 0.4194690265486726, "grad_norm": 1.4504636206349457, "learning_rate": 9.770222012031404e-06, "loss": 1.1144, "step": 474 }, { "epoch": 0.42035398230088494, "grad_norm": 1.6588672213589837, "learning_rate": 9.769170140320509e-06, "loss": 1.3504, "step": 475 }, { "epoch": 0.42123893805309737, "grad_norm": 1.8690309183787777, "learning_rate": 9.768115923373811e-06, "loss": 1.56, "step": 476 }, { "epoch": 0.42212389380530974, "grad_norm": 1.4072250383155183, "learning_rate": 9.767059361709719e-06, "loss": 0.856, "step": 477 }, { "epoch": 0.4230088495575221, "grad_norm": 1.6068310545848317, "learning_rate": 9.766000455847798e-06, "loss": 1.348, "step": 478 }, { "epoch": 0.42389380530973453, "grad_norm": 1.3068122880524942, "learning_rate": 9.764939206308763e-06, "loss": 1.0024, "step": 479 }, { "epoch": 0.4247787610619469, "grad_norm": 1.5789424927457516, "learning_rate": 9.763875613614482e-06, "loss": 1.2393, "step": 480 }, { "epoch": 0.42566371681415927, "grad_norm": 1.6878608773461858, "learning_rate": 9.762809678287977e-06, "loss": 1.2279, "step": 481 }, { "epoch": 0.4265486725663717, "grad_norm": 1.4852608696549723, "learning_rate": 9.76174140085342e-06, "loss": 1.2043, "step": 482 }, { "epoch": 0.42743362831858406, "grad_norm": 1.6420439736925427, "learning_rate": 9.760670781836136e-06, "loss": 1.4016, "step": 483 }, { "epoch": 0.4283185840707965, "grad_norm": 1.6002498679550097, "learning_rate": 9.759597821762603e-06, "loss": 1.042, "step": 484 }, { "epoch": 0.42920353982300885, "grad_norm": 1.3848558672126017, "learning_rate": 9.758522521160446e-06, "loss": 1.1964, "step": 485 }, { "epoch": 0.4300884955752212, "grad_norm": 3.3349681936177067, "learning_rate": 9.757444880558443e-06, "loss": 1.2711, "step": 486 }, { "epoch": 0.43097345132743364, "grad_norm": 1.3940389992163469, "learning_rate": 9.756364900486525e-06, "loss": 1.2614, "step": 487 }, { "epoch": 0.431858407079646, "grad_norm": 1.4185996877451994, "learning_rate": 9.755282581475769e-06, "loss": 1.2928, "step": 488 }, { "epoch": 0.4327433628318584, "grad_norm": 2.164743977421491, "learning_rate": 9.754197924058406e-06, "loss": 1.5701, "step": 489 }, { "epoch": 0.4336283185840708, "grad_norm": 1.4434133769013313, "learning_rate": 9.753110928767816e-06, "loss": 0.9033, "step": 490 }, { "epoch": 0.4345132743362832, "grad_norm": 1.5477587947590654, "learning_rate": 9.752021596138525e-06, "loss": 1.0394, "step": 491 }, { "epoch": 0.4353982300884956, "grad_norm": 1.4159518098048476, "learning_rate": 9.750929926706216e-06, "loss": 1.148, "step": 492 }, { "epoch": 0.43628318584070797, "grad_norm": 1.4814034666005997, "learning_rate": 9.749835921007713e-06, "loss": 1.0929, "step": 493 }, { "epoch": 0.43716814159292033, "grad_norm": 1.4820476787516597, "learning_rate": 9.748739579580995e-06, "loss": 1.1875, "step": 494 }, { "epoch": 0.43805309734513276, "grad_norm": 1.8223677680956318, "learning_rate": 9.747640902965185e-06, "loss": 1.1185, "step": 495 }, { "epoch": 0.4389380530973451, "grad_norm": 1.9073412172401691, "learning_rate": 9.746539891700558e-06, "loss": 1.2914, "step": 496 }, { "epoch": 0.4398230088495575, "grad_norm": 1.6637642492060363, "learning_rate": 9.745436546328533e-06, "loss": 1.0545, "step": 497 }, { "epoch": 0.4407079646017699, "grad_norm": 1.4628985297810517, "learning_rate": 9.744330867391682e-06, "loss": 1.0342, "step": 498 }, { "epoch": 0.4415929203539823, "grad_norm": 1.578150447819209, "learning_rate": 9.743222855433722e-06, "loss": 1.1831, "step": 499 }, { "epoch": 0.4424778761061947, "grad_norm": 1.722989643517729, "learning_rate": 9.742112510999516e-06, "loss": 1.1879, "step": 500 }, { "epoch": 0.4433628318584071, "grad_norm": 1.6461410604283393, "learning_rate": 9.740999834635073e-06, "loss": 1.1345, "step": 501 }, { "epoch": 0.44424778761061945, "grad_norm": 1.5204245759406783, "learning_rate": 9.739884826887554e-06, "loss": 1.1564, "step": 502 }, { "epoch": 0.4451327433628319, "grad_norm": 1.539112500627524, "learning_rate": 9.73876748830526e-06, "loss": 1.132, "step": 503 }, { "epoch": 0.44601769911504424, "grad_norm": 2.40496523196518, "learning_rate": 9.737647819437645e-06, "loss": 1.2372, "step": 504 }, { "epoch": 0.4469026548672566, "grad_norm": 1.547836789151419, "learning_rate": 9.736525820835303e-06, "loss": 1.1617, "step": 505 }, { "epoch": 0.44778761061946903, "grad_norm": 1.5714460354223738, "learning_rate": 9.735401493049977e-06, "loss": 1.2605, "step": 506 }, { "epoch": 0.4486725663716814, "grad_norm": 1.5870886579291794, "learning_rate": 9.734274836634554e-06, "loss": 1.4622, "step": 507 }, { "epoch": 0.4495575221238938, "grad_norm": 1.9134027889333405, "learning_rate": 9.733145852143065e-06, "loss": 1.2314, "step": 508 }, { "epoch": 0.4504424778761062, "grad_norm": 1.490105625196465, "learning_rate": 9.73201454013069e-06, "loss": 1.2066, "step": 509 }, { "epoch": 0.45132743362831856, "grad_norm": 1.3637715742449827, "learning_rate": 9.730880901153747e-06, "loss": 1.1342, "step": 510 }, { "epoch": 0.452212389380531, "grad_norm": 2.0649986329008247, "learning_rate": 9.729744935769708e-06, "loss": 1.0183, "step": 511 }, { "epoch": 0.45309734513274336, "grad_norm": 1.5905275685114284, "learning_rate": 9.728606644537177e-06, "loss": 1.0805, "step": 512 }, { "epoch": 0.4539823008849557, "grad_norm": 1.8064833165246643, "learning_rate": 9.727466028015913e-06, "loss": 1.4992, "step": 513 }, { "epoch": 0.45486725663716815, "grad_norm": 2.179740389366877, "learning_rate": 9.72632308676681e-06, "loss": 1.1714, "step": 514 }, { "epoch": 0.4557522123893805, "grad_norm": 2.070399863804374, "learning_rate": 9.725177821351906e-06, "loss": 1.1259, "step": 515 }, { "epoch": 0.45663716814159294, "grad_norm": 1.7267316617883541, "learning_rate": 9.72403023233439e-06, "loss": 1.2703, "step": 516 }, { "epoch": 0.4575221238938053, "grad_norm": 1.5686620083055716, "learning_rate": 9.722880320278587e-06, "loss": 1.0877, "step": 517 }, { "epoch": 0.4584070796460177, "grad_norm": 1.6042890249898052, "learning_rate": 9.721728085749964e-06, "loss": 1.2774, "step": 518 }, { "epoch": 0.4592920353982301, "grad_norm": 1.5352263796108883, "learning_rate": 9.72057352931513e-06, "loss": 1.2519, "step": 519 }, { "epoch": 0.46017699115044247, "grad_norm": 2.6398728151309827, "learning_rate": 9.719416651541839e-06, "loss": 1.3491, "step": 520 }, { "epoch": 0.46106194690265484, "grad_norm": 2.231864951028027, "learning_rate": 9.718257452998986e-06, "loss": 1.2827, "step": 521 }, { "epoch": 0.46194690265486726, "grad_norm": 1.4714771585167983, "learning_rate": 9.717095934256602e-06, "loss": 1.1241, "step": 522 }, { "epoch": 0.46283185840707963, "grad_norm": 1.5819754001983841, "learning_rate": 9.715932095885867e-06, "loss": 1.3789, "step": 523 }, { "epoch": 0.46371681415929206, "grad_norm": 1.835544496303231, "learning_rate": 9.714765938459097e-06, "loss": 1.3913, "step": 524 }, { "epoch": 0.4646017699115044, "grad_norm": 1.7955530690912733, "learning_rate": 9.713597462549747e-06, "loss": 1.1401, "step": 525 }, { "epoch": 0.4654867256637168, "grad_norm": 1.4305181365835489, "learning_rate": 9.712426668732415e-06, "loss": 1.2528, "step": 526 }, { "epoch": 0.4663716814159292, "grad_norm": 1.8989255549196868, "learning_rate": 9.711253557582839e-06, "loss": 1.0936, "step": 527 }, { "epoch": 0.4672566371681416, "grad_norm": 1.6652022091469214, "learning_rate": 9.710078129677895e-06, "loss": 1.4432, "step": 528 }, { "epoch": 0.46814159292035395, "grad_norm": 1.6030892246516457, "learning_rate": 9.7089003855956e-06, "loss": 1.208, "step": 529 }, { "epoch": 0.4690265486725664, "grad_norm": 1.6203452578970017, "learning_rate": 9.707720325915105e-06, "loss": 1.2281, "step": 530 }, { "epoch": 0.46991150442477875, "grad_norm": 1.7674867125388563, "learning_rate": 9.706537951216706e-06, "loss": 1.4079, "step": 531 }, { "epoch": 0.47079646017699117, "grad_norm": 1.434805970469598, "learning_rate": 9.705353262081836e-06, "loss": 1.3329, "step": 532 }, { "epoch": 0.47168141592920354, "grad_norm": 1.3662213706042279, "learning_rate": 9.704166259093063e-06, "loss": 0.8948, "step": 533 }, { "epoch": 0.4725663716814159, "grad_norm": 1.5910350364974106, "learning_rate": 9.702976942834096e-06, "loss": 1.2377, "step": 534 }, { "epoch": 0.47345132743362833, "grad_norm": 1.9632630361052983, "learning_rate": 9.70178531388978e-06, "loss": 1.2209, "step": 535 }, { "epoch": 0.4743362831858407, "grad_norm": 1.8454123916268528, "learning_rate": 9.700591372846096e-06, "loss": 1.3173, "step": 536 }, { "epoch": 0.47522123893805307, "grad_norm": 2.2766550350862755, "learning_rate": 9.699395120290166e-06, "loss": 1.387, "step": 537 }, { "epoch": 0.4761061946902655, "grad_norm": 1.8509715761672045, "learning_rate": 9.698196556810246e-06, "loss": 1.3439, "step": 538 }, { "epoch": 0.47699115044247786, "grad_norm": 1.6506644638523822, "learning_rate": 9.696995682995727e-06, "loss": 1.1501, "step": 539 }, { "epoch": 0.4778761061946903, "grad_norm": 1.5723794866336638, "learning_rate": 9.69579249943714e-06, "loss": 0.9931, "step": 540 }, { "epoch": 0.47876106194690266, "grad_norm": 1.4756888232060097, "learning_rate": 9.694587006726148e-06, "loss": 1.1331, "step": 541 }, { "epoch": 0.479646017699115, "grad_norm": 1.394881801937423, "learning_rate": 9.693379205455551e-06, "loss": 1.0724, "step": 542 }, { "epoch": 0.48053097345132745, "grad_norm": 3.210816381669762, "learning_rate": 9.692169096219286e-06, "loss": 1.1183, "step": 543 }, { "epoch": 0.4814159292035398, "grad_norm": 1.6802576608139148, "learning_rate": 9.690956679612422e-06, "loss": 1.3377, "step": 544 }, { "epoch": 0.4823008849557522, "grad_norm": 1.488717854714179, "learning_rate": 9.689741956231163e-06, "loss": 1.077, "step": 545 }, { "epoch": 0.4831858407079646, "grad_norm": 1.7264096793651948, "learning_rate": 9.688524926672851e-06, "loss": 1.3034, "step": 546 }, { "epoch": 0.484070796460177, "grad_norm": 1.7123622843124875, "learning_rate": 9.687305591535957e-06, "loss": 1.2055, "step": 547 }, { "epoch": 0.4849557522123894, "grad_norm": 1.4398559501661605, "learning_rate": 9.686083951420089e-06, "loss": 1.1869, "step": 548 }, { "epoch": 0.48584070796460177, "grad_norm": 1.3836520278478912, "learning_rate": 9.684860006925988e-06, "loss": 1.2816, "step": 549 }, { "epoch": 0.48672566371681414, "grad_norm": 3.301559729516578, "learning_rate": 9.683633758655529e-06, "loss": 1.4143, "step": 550 }, { "epoch": 0.48761061946902656, "grad_norm": 1.4511505473564503, "learning_rate": 9.682405207211714e-06, "loss": 1.1787, "step": 551 }, { "epoch": 0.48849557522123893, "grad_norm": 2.6544396339455685, "learning_rate": 9.681174353198687e-06, "loss": 1.4952, "step": 552 }, { "epoch": 0.4893805309734513, "grad_norm": 1.435189362880871, "learning_rate": 9.679941197221717e-06, "loss": 1.1668, "step": 553 }, { "epoch": 0.4902654867256637, "grad_norm": 1.974876088067606, "learning_rate": 9.67870573988721e-06, "loss": 1.1143, "step": 554 }, { "epoch": 0.4911504424778761, "grad_norm": 1.9711587308346394, "learning_rate": 9.677467981802697e-06, "loss": 1.4538, "step": 555 }, { "epoch": 0.4920353982300885, "grad_norm": 1.9184263627188964, "learning_rate": 9.67622792357685e-06, "loss": 1.35, "step": 556 }, { "epoch": 0.4929203539823009, "grad_norm": 1.5305342457407995, "learning_rate": 9.674985565819461e-06, "loss": 1.0808, "step": 557 }, { "epoch": 0.49380530973451325, "grad_norm": 1.5735849985261035, "learning_rate": 9.673740909141463e-06, "loss": 1.1556, "step": 558 }, { "epoch": 0.4946902654867257, "grad_norm": 2.1503557098549475, "learning_rate": 9.672493954154914e-06, "loss": 1.2578, "step": 559 }, { "epoch": 0.49557522123893805, "grad_norm": 5.158866901466978, "learning_rate": 9.671244701472999e-06, "loss": 1.2881, "step": 560 }, { "epoch": 0.4964601769911504, "grad_norm": 1.7697901182459659, "learning_rate": 9.669993151710045e-06, "loss": 1.0998, "step": 561 }, { "epoch": 0.49734513274336284, "grad_norm": 1.1955442892796118, "learning_rate": 9.668739305481493e-06, "loss": 1.0254, "step": 562 }, { "epoch": 0.4982300884955752, "grad_norm": 1.459877689348858, "learning_rate": 9.667483163403927e-06, "loss": 1.103, "step": 563 }, { "epoch": 0.49911504424778763, "grad_norm": 1.8825064147118742, "learning_rate": 9.666224726095048e-06, "loss": 1.1829, "step": 564 }, { "epoch": 0.5, "grad_norm": 1.5734639533002948, "learning_rate": 9.664963994173695e-06, "loss": 1.2689, "step": 565 }, { "epoch": 0.5008849557522124, "grad_norm": 1.61842588636153, "learning_rate": 9.663700968259832e-06, "loss": 1.3633, "step": 566 }, { "epoch": 0.5017699115044247, "grad_norm": 1.8594292352877815, "learning_rate": 9.662435648974552e-06, "loss": 1.4553, "step": 567 }, { "epoch": 0.5026548672566372, "grad_norm": 1.46765345729372, "learning_rate": 9.661168036940071e-06, "loss": 1.3119, "step": 568 }, { "epoch": 0.5035398230088496, "grad_norm": 1.4807524730711643, "learning_rate": 9.659898132779741e-06, "loss": 1.319, "step": 569 }, { "epoch": 0.504424778761062, "grad_norm": 1.903070449855407, "learning_rate": 9.658625937118033e-06, "loss": 1.4073, "step": 570 }, { "epoch": 0.5053097345132743, "grad_norm": 1.4818922748099912, "learning_rate": 9.65735145058055e-06, "loss": 1.0699, "step": 571 }, { "epoch": 0.5061946902654867, "grad_norm": 3.0376789939840996, "learning_rate": 9.656074673794018e-06, "loss": 1.1792, "step": 572 }, { "epoch": 0.5070796460176992, "grad_norm": 1.5302119212492507, "learning_rate": 9.654795607386294e-06, "loss": 1.2973, "step": 573 }, { "epoch": 0.5079646017699115, "grad_norm": 3.27797674743426, "learning_rate": 9.653514251986354e-06, "loss": 1.2862, "step": 574 }, { "epoch": 0.5088495575221239, "grad_norm": 1.8259577010543617, "learning_rate": 9.652230608224306e-06, "loss": 1.3203, "step": 575 }, { "epoch": 0.5097345132743363, "grad_norm": 1.3818266504063175, "learning_rate": 9.650944676731383e-06, "loss": 0.8823, "step": 576 }, { "epoch": 0.5106194690265486, "grad_norm": 2.4611005824288794, "learning_rate": 9.649656458139936e-06, "loss": 1.1872, "step": 577 }, { "epoch": 0.511504424778761, "grad_norm": 1.6662903368444395, "learning_rate": 9.648365953083447e-06, "loss": 1.3129, "step": 578 }, { "epoch": 0.5123893805309735, "grad_norm": 1.4333088155427736, "learning_rate": 9.647073162196524e-06, "loss": 1.1996, "step": 579 }, { "epoch": 0.5132743362831859, "grad_norm": 1.5247625687957762, "learning_rate": 9.645778086114892e-06, "loss": 1.1131, "step": 580 }, { "epoch": 0.5141592920353982, "grad_norm": 1.484725214522865, "learning_rate": 9.644480725475404e-06, "loss": 1.2473, "step": 581 }, { "epoch": 0.5150442477876106, "grad_norm": 1.2752214312696393, "learning_rate": 9.64318108091604e-06, "loss": 1.1017, "step": 582 }, { "epoch": 0.515929203539823, "grad_norm": 2.161243295820818, "learning_rate": 9.641879153075897e-06, "loss": 1.0328, "step": 583 }, { "epoch": 0.5168141592920354, "grad_norm": 1.3230353391542127, "learning_rate": 9.640574942595195e-06, "loss": 1.1098, "step": 584 }, { "epoch": 0.5176991150442478, "grad_norm": 1.3778148085623574, "learning_rate": 9.639268450115282e-06, "loss": 1.1806, "step": 585 }, { "epoch": 0.5185840707964602, "grad_norm": 1.586598492824982, "learning_rate": 9.637959676278621e-06, "loss": 1.4825, "step": 586 }, { "epoch": 0.5194690265486726, "grad_norm": 1.7567641341485882, "learning_rate": 9.636648621728804e-06, "loss": 1.1021, "step": 587 }, { "epoch": 0.5203539823008849, "grad_norm": 1.8837678605248689, "learning_rate": 9.635335287110538e-06, "loss": 1.2953, "step": 588 }, { "epoch": 0.5212389380530974, "grad_norm": 1.7750046458914532, "learning_rate": 9.634019673069656e-06, "loss": 1.2891, "step": 589 }, { "epoch": 0.5221238938053098, "grad_norm": 6.510307083239151, "learning_rate": 9.632701780253111e-06, "loss": 1.2508, "step": 590 }, { "epoch": 0.5230088495575221, "grad_norm": 1.4341225482612099, "learning_rate": 9.631381609308975e-06, "loss": 1.2781, "step": 591 }, { "epoch": 0.5238938053097345, "grad_norm": 1.418361129196937, "learning_rate": 9.63005916088644e-06, "loss": 1.1675, "step": 592 }, { "epoch": 0.5247787610619469, "grad_norm": 1.5154105477481472, "learning_rate": 9.62873443563582e-06, "loss": 1.034, "step": 593 }, { "epoch": 0.5256637168141592, "grad_norm": 1.78103558475923, "learning_rate": 9.62740743420855e-06, "loss": 1.1047, "step": 594 }, { "epoch": 0.5265486725663717, "grad_norm": 1.55281517565287, "learning_rate": 9.626078157257176e-06, "loss": 1.1273, "step": 595 }, { "epoch": 0.5274336283185841, "grad_norm": 1.8535459000435746, "learning_rate": 9.624746605435373e-06, "loss": 1.3608, "step": 596 }, { "epoch": 0.5283185840707965, "grad_norm": 1.4945134309278085, "learning_rate": 9.623412779397932e-06, "loss": 1.166, "step": 597 }, { "epoch": 0.5292035398230088, "grad_norm": 3.3469062162917798, "learning_rate": 9.622076679800757e-06, "loss": 1.2868, "step": 598 }, { "epoch": 0.5300884955752212, "grad_norm": 2.5898427540039983, "learning_rate": 9.620738307300876e-06, "loss": 1.1894, "step": 599 }, { "epoch": 0.5309734513274337, "grad_norm": 1.478918782296963, "learning_rate": 9.619397662556434e-06, "loss": 0.9233, "step": 600 }, { "epoch": 0.531858407079646, "grad_norm": 1.5034953096115884, "learning_rate": 9.61805474622669e-06, "loss": 1.132, "step": 601 }, { "epoch": 0.5327433628318584, "grad_norm": 1.4936264230687224, "learning_rate": 9.616709558972024e-06, "loss": 1.399, "step": 602 }, { "epoch": 0.5336283185840708, "grad_norm": 1.6327133764144968, "learning_rate": 9.615362101453927e-06, "loss": 1.444, "step": 603 }, { "epoch": 0.5345132743362832, "grad_norm": 2.7327085427667, "learning_rate": 9.614012374335014e-06, "loss": 1.3593, "step": 604 }, { "epoch": 0.5353982300884956, "grad_norm": 1.5461982294188659, "learning_rate": 9.612660378279011e-06, "loss": 1.0793, "step": 605 }, { "epoch": 0.536283185840708, "grad_norm": 1.7085270668951433, "learning_rate": 9.611306113950762e-06, "loss": 1.2023, "step": 606 }, { "epoch": 0.5371681415929204, "grad_norm": 1.7449738964114168, "learning_rate": 9.609949582016223e-06, "loss": 1.3443, "step": 607 }, { "epoch": 0.5380530973451327, "grad_norm": 2.31148448672424, "learning_rate": 9.608590783142471e-06, "loss": 1.5428, "step": 608 }, { "epoch": 0.5389380530973451, "grad_norm": 1.4656201686579047, "learning_rate": 9.60722971799769e-06, "loss": 1.1978, "step": 609 }, { "epoch": 0.5398230088495575, "grad_norm": 1.4633652880646313, "learning_rate": 9.605866387251186e-06, "loss": 1.1457, "step": 610 }, { "epoch": 0.54070796460177, "grad_norm": 1.4532348945802318, "learning_rate": 9.604500791573376e-06, "loss": 1.0844, "step": 611 }, { "epoch": 0.5415929203539823, "grad_norm": 1.6119074654485004, "learning_rate": 9.60313293163579e-06, "loss": 1.153, "step": 612 }, { "epoch": 0.5424778761061947, "grad_norm": 1.474007172002865, "learning_rate": 9.60176280811107e-06, "loss": 1.1756, "step": 613 }, { "epoch": 0.5433628318584071, "grad_norm": 1.6875952942611925, "learning_rate": 9.600390421672976e-06, "loss": 1.3539, "step": 614 }, { "epoch": 0.5442477876106194, "grad_norm": 2.065836936291748, "learning_rate": 9.599015772996376e-06, "loss": 0.9715, "step": 615 }, { "epoch": 0.5451327433628319, "grad_norm": 2.877104459799078, "learning_rate": 9.597638862757255e-06, "loss": 1.2976, "step": 616 }, { "epoch": 0.5460176991150443, "grad_norm": 4.14866979393378, "learning_rate": 9.596259691632703e-06, "loss": 1.2679, "step": 617 }, { "epoch": 0.5469026548672566, "grad_norm": 1.3709060106781963, "learning_rate": 9.594878260300933e-06, "loss": 1.176, "step": 618 }, { "epoch": 0.547787610619469, "grad_norm": 2.2126612777110894, "learning_rate": 9.593494569441257e-06, "loss": 1.0508, "step": 619 }, { "epoch": 0.5486725663716814, "grad_norm": 2.136163183963028, "learning_rate": 9.592108619734107e-06, "loss": 1.3673, "step": 620 }, { "epoch": 0.5495575221238939, "grad_norm": 2.0721561218178715, "learning_rate": 9.590720411861022e-06, "loss": 1.3424, "step": 621 }, { "epoch": 0.5504424778761062, "grad_norm": 1.5947382090653288, "learning_rate": 9.58932994650465e-06, "loss": 1.268, "step": 622 }, { "epoch": 0.5513274336283186, "grad_norm": 1.5824219117772933, "learning_rate": 9.587937224348753e-06, "loss": 1.348, "step": 623 }, { "epoch": 0.552212389380531, "grad_norm": 1.408366698565368, "learning_rate": 9.586542246078203e-06, "loss": 1.0918, "step": 624 }, { "epoch": 0.5530973451327433, "grad_norm": 1.7323123133912537, "learning_rate": 9.585145012378979e-06, "loss": 1.1328, "step": 625 }, { "epoch": 0.5539823008849557, "grad_norm": 2.47932129802536, "learning_rate": 9.583745523938166e-06, "loss": 1.115, "step": 626 }, { "epoch": 0.5548672566371682, "grad_norm": 2.406917632058555, "learning_rate": 9.582343781443966e-06, "loss": 1.2814, "step": 627 }, { "epoch": 0.5557522123893806, "grad_norm": 1.5704043069206366, "learning_rate": 9.58093978558568e-06, "loss": 1.3309, "step": 628 }, { "epoch": 0.5566371681415929, "grad_norm": 1.3978002740909756, "learning_rate": 9.579533537053726e-06, "loss": 1.187, "step": 629 }, { "epoch": 0.5575221238938053, "grad_norm": 1.5914166023361571, "learning_rate": 9.578125036539625e-06, "loss": 1.2214, "step": 630 }, { "epoch": 0.5584070796460177, "grad_norm": 1.9743482520068731, "learning_rate": 9.576714284736005e-06, "loss": 1.3828, "step": 631 }, { "epoch": 0.5592920353982301, "grad_norm": 1.4586734967017183, "learning_rate": 9.5753012823366e-06, "loss": 1.0158, "step": 632 }, { "epoch": 0.5601769911504425, "grad_norm": 1.4726486582576088, "learning_rate": 9.573886030036258e-06, "loss": 1.1726, "step": 633 }, { "epoch": 0.5610619469026549, "grad_norm": 1.7971621831221274, "learning_rate": 9.572468528530923e-06, "loss": 1.2459, "step": 634 }, { "epoch": 0.5619469026548672, "grad_norm": 1.4027906283215694, "learning_rate": 9.571048778517655e-06, "loss": 1.3407, "step": 635 }, { "epoch": 0.5628318584070796, "grad_norm": 1.2946871296658973, "learning_rate": 9.569626780694611e-06, "loss": 1.0482, "step": 636 }, { "epoch": 0.5637168141592921, "grad_norm": 2.3548182546298118, "learning_rate": 9.56820253576106e-06, "loss": 0.9876, "step": 637 }, { "epoch": 0.5646017699115045, "grad_norm": 3.1192293155388455, "learning_rate": 9.566776044417372e-06, "loss": 1.1364, "step": 638 }, { "epoch": 0.5654867256637168, "grad_norm": 1.5806033858325859, "learning_rate": 9.565347307365024e-06, "loss": 1.2028, "step": 639 }, { "epoch": 0.5663716814159292, "grad_norm": 1.5998307694053044, "learning_rate": 9.563916325306595e-06, "loss": 1.0228, "step": 640 }, { "epoch": 0.5672566371681416, "grad_norm": 1.6111728547988104, "learning_rate": 9.562483098945771e-06, "loss": 1.2616, "step": 641 }, { "epoch": 0.5681415929203539, "grad_norm": 1.7337958207208664, "learning_rate": 9.561047628987338e-06, "loss": 1.0777, "step": 642 }, { "epoch": 0.5690265486725664, "grad_norm": 1.2625204912763301, "learning_rate": 9.559609916137192e-06, "loss": 1.1074, "step": 643 }, { "epoch": 0.5699115044247788, "grad_norm": 1.3784467479476137, "learning_rate": 9.55816996110232e-06, "loss": 1.2271, "step": 644 }, { "epoch": 0.5707964601769911, "grad_norm": 2.089565303340047, "learning_rate": 9.556727764590822e-06, "loss": 1.2684, "step": 645 }, { "epoch": 0.5716814159292035, "grad_norm": 1.5317953205453307, "learning_rate": 9.555283327311901e-06, "loss": 1.1815, "step": 646 }, { "epoch": 0.5725663716814159, "grad_norm": 1.9385929268214193, "learning_rate": 9.553836649975852e-06, "loss": 1.323, "step": 647 }, { "epoch": 0.5734513274336284, "grad_norm": 1.7621453224045556, "learning_rate": 9.552387733294081e-06, "loss": 1.0507, "step": 648 }, { "epoch": 0.5743362831858407, "grad_norm": 1.4407356799660136, "learning_rate": 9.55093657797909e-06, "loss": 1.1608, "step": 649 }, { "epoch": 0.5752212389380531, "grad_norm": 1.3735108669757656, "learning_rate": 9.549483184744483e-06, "loss": 1.0417, "step": 650 }, { "epoch": 0.5761061946902655, "grad_norm": 2.110024126952182, "learning_rate": 9.548027554304969e-06, "loss": 1.1797, "step": 651 }, { "epoch": 0.5769911504424778, "grad_norm": 1.2950983031718106, "learning_rate": 9.546569687376349e-06, "loss": 1.4204, "step": 652 }, { "epoch": 0.5778761061946903, "grad_norm": 3.7338362698435668, "learning_rate": 9.545109584675528e-06, "loss": 1.552, "step": 653 }, { "epoch": 0.5787610619469027, "grad_norm": 2.4195061100061435, "learning_rate": 9.54364724692051e-06, "loss": 1.1393, "step": 654 }, { "epoch": 0.5796460176991151, "grad_norm": 2.9358652473671456, "learning_rate": 9.542182674830404e-06, "loss": 1.288, "step": 655 }, { "epoch": 0.5805309734513274, "grad_norm": 1.3414610560883071, "learning_rate": 9.540715869125407e-06, "loss": 1.3074, "step": 656 }, { "epoch": 0.5814159292035398, "grad_norm": 1.5163634264314163, "learning_rate": 9.53924683052682e-06, "loss": 1.3292, "step": 657 }, { "epoch": 0.5823008849557522, "grad_norm": 1.514630272459369, "learning_rate": 9.537775559757046e-06, "loss": 1.2431, "step": 658 }, { "epoch": 0.5831858407079646, "grad_norm": 1.8593571447306676, "learning_rate": 9.536302057539574e-06, "loss": 0.9031, "step": 659 }, { "epoch": 0.584070796460177, "grad_norm": 1.3294328020487711, "learning_rate": 9.534826324599002e-06, "loss": 1.077, "step": 660 }, { "epoch": 0.5849557522123894, "grad_norm": 2.0890418362631658, "learning_rate": 9.53334836166102e-06, "loss": 1.3013, "step": 661 }, { "epoch": 0.5858407079646017, "grad_norm": 1.2992252537315336, "learning_rate": 9.531868169452419e-06, "loss": 1.1555, "step": 662 }, { "epoch": 0.5867256637168141, "grad_norm": 1.4952241099524357, "learning_rate": 9.530385748701074e-06, "loss": 1.0681, "step": 663 }, { "epoch": 0.5876106194690266, "grad_norm": 1.5514530133740487, "learning_rate": 9.528901100135971e-06, "loss": 1.373, "step": 664 }, { "epoch": 0.588495575221239, "grad_norm": 1.4871569935625184, "learning_rate": 9.527414224487182e-06, "loss": 1.0498, "step": 665 }, { "epoch": 0.5893805309734513, "grad_norm": 1.2102150339151596, "learning_rate": 9.525925122485877e-06, "loss": 0.9927, "step": 666 }, { "epoch": 0.5902654867256637, "grad_norm": 2.3440312461075234, "learning_rate": 9.524433794864324e-06, "loss": 0.9617, "step": 667 }, { "epoch": 0.5911504424778761, "grad_norm": 1.3916637949014543, "learning_rate": 9.522940242355877e-06, "loss": 1.1852, "step": 668 }, { "epoch": 0.5920353982300885, "grad_norm": 1.6844491557811254, "learning_rate": 9.521444465694993e-06, "loss": 1.0903, "step": 669 }, { "epoch": 0.5929203539823009, "grad_norm": 1.6966821599588242, "learning_rate": 9.519946465617217e-06, "loss": 1.0068, "step": 670 }, { "epoch": 0.5938053097345133, "grad_norm": 1.512147563934271, "learning_rate": 9.518446242859192e-06, "loss": 1.2468, "step": 671 }, { "epoch": 0.5946902654867257, "grad_norm": 2.7549118733145597, "learning_rate": 9.51694379815865e-06, "loss": 1.3744, "step": 672 }, { "epoch": 0.595575221238938, "grad_norm": 1.4060470273423071, "learning_rate": 9.515439132254414e-06, "loss": 1.2134, "step": 673 }, { "epoch": 0.5964601769911504, "grad_norm": 1.3054489334386665, "learning_rate": 9.513932245886409e-06, "loss": 1.079, "step": 674 }, { "epoch": 0.5973451327433629, "grad_norm": 2.0876038828516306, "learning_rate": 9.512423139795637e-06, "loss": 1.4922, "step": 675 }, { "epoch": 0.5982300884955752, "grad_norm": 1.7432356593591254, "learning_rate": 9.510911814724208e-06, "loss": 1.13, "step": 676 }, { "epoch": 0.5991150442477876, "grad_norm": 1.396620424430167, "learning_rate": 9.509398271415308e-06, "loss": 1.1374, "step": 677 }, { "epoch": 0.6, "grad_norm": 1.7525404084550666, "learning_rate": 9.507882510613226e-06, "loss": 1.4146, "step": 678 }, { "epoch": 0.6008849557522123, "grad_norm": 1.344778382379383, "learning_rate": 9.506364533063335e-06, "loss": 1.1232, "step": 679 }, { "epoch": 0.6017699115044248, "grad_norm": 1.7080873447011158, "learning_rate": 9.504844339512096e-06, "loss": 1.3256, "step": 680 }, { "epoch": 0.6026548672566372, "grad_norm": 2.5560434845204445, "learning_rate": 9.503321930707068e-06, "loss": 0.9658, "step": 681 }, { "epoch": 0.6035398230088496, "grad_norm": 1.4783596563023342, "learning_rate": 9.501797307396889e-06, "loss": 1.1833, "step": 682 }, { "epoch": 0.6044247787610619, "grad_norm": 2.7176762506438674, "learning_rate": 9.500270470331296e-06, "loss": 1.311, "step": 683 }, { "epoch": 0.6053097345132743, "grad_norm": 1.604293088692499, "learning_rate": 9.498741420261109e-06, "loss": 1.1371, "step": 684 }, { "epoch": 0.6061946902654868, "grad_norm": 1.5144490521831242, "learning_rate": 9.497210157938234e-06, "loss": 1.081, "step": 685 }, { "epoch": 0.6070796460176991, "grad_norm": 1.6329620333718777, "learning_rate": 9.495676684115673e-06, "loss": 1.3834, "step": 686 }, { "epoch": 0.6079646017699115, "grad_norm": 1.4516404529716624, "learning_rate": 9.494140999547505e-06, "loss": 1.0686, "step": 687 }, { "epoch": 0.6088495575221239, "grad_norm": 1.7718289599346813, "learning_rate": 9.492603104988907e-06, "loss": 1.0963, "step": 688 }, { "epoch": 0.6097345132743363, "grad_norm": 1.4061668390200797, "learning_rate": 9.491063001196134e-06, "loss": 0.9362, "step": 689 }, { "epoch": 0.6106194690265486, "grad_norm": 1.414398207568135, "learning_rate": 9.489520688926534e-06, "loss": 1.0778, "step": 690 }, { "epoch": 0.6115044247787611, "grad_norm": 1.5791684897319684, "learning_rate": 9.487976168938535e-06, "loss": 1.3769, "step": 691 }, { "epoch": 0.6123893805309735, "grad_norm": 1.5536612144927673, "learning_rate": 9.486429441991655e-06, "loss": 1.0744, "step": 692 }, { "epoch": 0.6132743362831858, "grad_norm": 1.6604800507459978, "learning_rate": 9.484880508846495e-06, "loss": 1.1914, "step": 693 }, { "epoch": 0.6141592920353982, "grad_norm": 1.434116324081411, "learning_rate": 9.483329370264743e-06, "loss": 1.1573, "step": 694 }, { "epoch": 0.6150442477876106, "grad_norm": 2.1604751910632953, "learning_rate": 9.481776027009167e-06, "loss": 1.0689, "step": 695 }, { "epoch": 0.6159292035398231, "grad_norm": 3.158594429100956, "learning_rate": 9.480220479843627e-06, "loss": 1.2891, "step": 696 }, { "epoch": 0.6168141592920354, "grad_norm": 1.6437581097773024, "learning_rate": 9.47866272953306e-06, "loss": 1.369, "step": 697 }, { "epoch": 0.6176991150442478, "grad_norm": 1.401028578974123, "learning_rate": 9.477102776843486e-06, "loss": 1.376, "step": 698 }, { "epoch": 0.6185840707964602, "grad_norm": 1.6494522389359576, "learning_rate": 9.475540622542015e-06, "loss": 1.5793, "step": 699 }, { "epoch": 0.6194690265486725, "grad_norm": 1.5284387293411268, "learning_rate": 9.473976267396831e-06, "loss": 1.0306, "step": 700 }, { "epoch": 0.620353982300885, "grad_norm": 1.5199238438101301, "learning_rate": 9.472409712177207e-06, "loss": 1.2698, "step": 701 }, { "epoch": 0.6212389380530974, "grad_norm": 1.6990712625916833, "learning_rate": 9.470840957653497e-06, "loss": 1.0943, "step": 702 }, { "epoch": 0.6221238938053097, "grad_norm": 1.4834383353563945, "learning_rate": 9.469270004597131e-06, "loss": 1.0455, "step": 703 }, { "epoch": 0.6230088495575221, "grad_norm": 1.3285622366882868, "learning_rate": 9.467696853780625e-06, "loss": 1.3422, "step": 704 }, { "epoch": 0.6238938053097345, "grad_norm": 1.4973069503477934, "learning_rate": 9.466121505977577e-06, "loss": 1.3187, "step": 705 }, { "epoch": 0.6247787610619469, "grad_norm": 1.3050766786121655, "learning_rate": 9.464543961962662e-06, "loss": 1.0025, "step": 706 }, { "epoch": 0.6256637168141593, "grad_norm": 1.5378713302895155, "learning_rate": 9.462964222511634e-06, "loss": 1.2062, "step": 707 }, { "epoch": 0.6265486725663717, "grad_norm": 1.6023606716824783, "learning_rate": 9.46138228840133e-06, "loss": 1.2214, "step": 708 }, { "epoch": 0.6274336283185841, "grad_norm": 1.59965729635927, "learning_rate": 9.459798160409668e-06, "loss": 1.445, "step": 709 }, { "epoch": 0.6283185840707964, "grad_norm": 1.611838412289609, "learning_rate": 9.458211839315636e-06, "loss": 1.2788, "step": 710 }, { "epoch": 0.6292035398230088, "grad_norm": 1.6331178816761864, "learning_rate": 9.45662332589931e-06, "loss": 1.2105, "step": 711 }, { "epoch": 0.6300884955752213, "grad_norm": 1.4067896700496654, "learning_rate": 9.45503262094184e-06, "loss": 0.9173, "step": 712 }, { "epoch": 0.6309734513274337, "grad_norm": 1.8438416815780378, "learning_rate": 9.453439725225452e-06, "loss": 1.1723, "step": 713 }, { "epoch": 0.631858407079646, "grad_norm": 1.4249180760487976, "learning_rate": 9.451844639533453e-06, "loss": 0.96, "step": 714 }, { "epoch": 0.6327433628318584, "grad_norm": 1.6196535926341111, "learning_rate": 9.450247364650227e-06, "loss": 1.2459, "step": 715 }, { "epoch": 0.6336283185840708, "grad_norm": 1.4816135180673369, "learning_rate": 9.448647901361229e-06, "loss": 1.453, "step": 716 }, { "epoch": 0.6345132743362832, "grad_norm": 3.0835724901195283, "learning_rate": 9.447046250452994e-06, "loss": 1.3484, "step": 717 }, { "epoch": 0.6353982300884956, "grad_norm": 1.4318269374949162, "learning_rate": 9.445442412713137e-06, "loss": 1.0104, "step": 718 }, { "epoch": 0.636283185840708, "grad_norm": 1.7567731680978886, "learning_rate": 9.443836388930339e-06, "loss": 1.2513, "step": 719 }, { "epoch": 0.6371681415929203, "grad_norm": 1.8122835429073254, "learning_rate": 9.442228179894362e-06, "loss": 1.3442, "step": 720 }, { "epoch": 0.6380530973451327, "grad_norm": 1.5525640859616912, "learning_rate": 9.440617786396044e-06, "loss": 1.3524, "step": 721 }, { "epoch": 0.6389380530973451, "grad_norm": 1.567893542193636, "learning_rate": 9.439005209227293e-06, "loss": 1.2086, "step": 722 }, { "epoch": 0.6398230088495576, "grad_norm": 1.930931977390317, "learning_rate": 9.437390449181092e-06, "loss": 1.3835, "step": 723 }, { "epoch": 0.6407079646017699, "grad_norm": 1.352443443387227, "learning_rate": 9.4357735070515e-06, "loss": 0.9895, "step": 724 }, { "epoch": 0.6415929203539823, "grad_norm": 1.4215774914195312, "learning_rate": 9.434154383633647e-06, "loss": 1.2769, "step": 725 }, { "epoch": 0.6424778761061947, "grad_norm": 1.549622949743659, "learning_rate": 9.432533079723734e-06, "loss": 1.2691, "step": 726 }, { "epoch": 0.643362831858407, "grad_norm": 1.59983549512889, "learning_rate": 9.430909596119036e-06, "loss": 1.3917, "step": 727 }, { "epoch": 0.6442477876106195, "grad_norm": 7.979068738798665, "learning_rate": 9.4292839336179e-06, "loss": 1.2391, "step": 728 }, { "epoch": 0.6451327433628319, "grad_norm": 1.6359428342408429, "learning_rate": 9.427656093019746e-06, "loss": 1.3946, "step": 729 }, { "epoch": 0.6460176991150443, "grad_norm": 1.885597928880566, "learning_rate": 9.426026075125062e-06, "loss": 0.915, "step": 730 }, { "epoch": 0.6469026548672566, "grad_norm": 1.587775296215366, "learning_rate": 9.424393880735407e-06, "loss": 0.9446, "step": 731 }, { "epoch": 0.647787610619469, "grad_norm": 1.3026420793467897, "learning_rate": 9.422759510653414e-06, "loss": 1.0348, "step": 732 }, { "epoch": 0.6486725663716815, "grad_norm": 1.5690785319116276, "learning_rate": 9.421122965682782e-06, "loss": 1.1899, "step": 733 }, { "epoch": 0.6495575221238938, "grad_norm": 1.8616831360533352, "learning_rate": 9.419484246628279e-06, "loss": 1.1506, "step": 734 }, { "epoch": 0.6504424778761062, "grad_norm": 1.352916411520574, "learning_rate": 9.417843354295748e-06, "loss": 0.8856, "step": 735 }, { "epoch": 0.6513274336283186, "grad_norm": 1.4591341480829714, "learning_rate": 9.416200289492092e-06, "loss": 1.0924, "step": 736 }, { "epoch": 0.6522123893805309, "grad_norm": 1.2261121661024863, "learning_rate": 9.41455505302529e-06, "loss": 1.2203, "step": 737 }, { "epoch": 0.6530973451327433, "grad_norm": 1.256085383265856, "learning_rate": 9.412907645704385e-06, "loss": 1.1221, "step": 738 }, { "epoch": 0.6539823008849558, "grad_norm": 1.560836386136113, "learning_rate": 9.411258068339487e-06, "loss": 1.4943, "step": 739 }, { "epoch": 0.6548672566371682, "grad_norm": 1.2586174806706696, "learning_rate": 9.409606321741776e-06, "loss": 1.1493, "step": 740 }, { "epoch": 0.6557522123893805, "grad_norm": 1.5642597937866771, "learning_rate": 9.407952406723496e-06, "loss": 1.0145, "step": 741 }, { "epoch": 0.6566371681415929, "grad_norm": 2.655913668492653, "learning_rate": 9.40629632409796e-06, "loss": 1.2449, "step": 742 }, { "epoch": 0.6575221238938053, "grad_norm": 1.4058917588496282, "learning_rate": 9.404638074679544e-06, "loss": 1.1117, "step": 743 }, { "epoch": 0.6584070796460177, "grad_norm": 1.4999951615723472, "learning_rate": 9.40297765928369e-06, "loss": 1.1846, "step": 744 }, { "epoch": 0.6592920353982301, "grad_norm": 1.4784180504428006, "learning_rate": 9.401315078726908e-06, "loss": 1.205, "step": 745 }, { "epoch": 0.6601769911504425, "grad_norm": 1.708704193114729, "learning_rate": 9.399650333826771e-06, "loss": 1.3059, "step": 746 }, { "epoch": 0.6610619469026549, "grad_norm": 3.0270420212218196, "learning_rate": 9.397983425401915e-06, "loss": 1.1515, "step": 747 }, { "epoch": 0.6619469026548672, "grad_norm": 1.4835023172941324, "learning_rate": 9.39631435427204e-06, "loss": 1.3944, "step": 748 }, { "epoch": 0.6628318584070797, "grad_norm": 2.6121313115475235, "learning_rate": 9.394643121257914e-06, "loss": 1.0471, "step": 749 }, { "epoch": 0.6637168141592921, "grad_norm": 3.5678988602541803, "learning_rate": 9.39296972718136e-06, "loss": 1.1822, "step": 750 }, { "epoch": 0.6646017699115044, "grad_norm": 1.5435696019662108, "learning_rate": 9.39129417286527e-06, "loss": 1.3309, "step": 751 }, { "epoch": 0.6654867256637168, "grad_norm": 1.49381977791648, "learning_rate": 9.389616459133597e-06, "loss": 1.1969, "step": 752 }, { "epoch": 0.6663716814159292, "grad_norm": 1.570367800116359, "learning_rate": 9.387936586811357e-06, "loss": 1.3803, "step": 753 }, { "epoch": 0.6672566371681415, "grad_norm": 1.5608906181188729, "learning_rate": 9.386254556724622e-06, "loss": 1.3458, "step": 754 }, { "epoch": 0.668141592920354, "grad_norm": 1.70803153881905, "learning_rate": 9.384570369700531e-06, "loss": 1.1023, "step": 755 }, { "epoch": 0.6690265486725664, "grad_norm": 1.4559227845184208, "learning_rate": 9.382884026567286e-06, "loss": 1.1887, "step": 756 }, { "epoch": 0.6699115044247788, "grad_norm": 1.6447275476699392, "learning_rate": 9.381195528154137e-06, "loss": 1.1387, "step": 757 }, { "epoch": 0.6707964601769911, "grad_norm": 1.6715131338330742, "learning_rate": 9.379504875291408e-06, "loss": 1.3524, "step": 758 }, { "epoch": 0.6716814159292035, "grad_norm": 1.910308570172055, "learning_rate": 9.377812068810473e-06, "loss": 1.37, "step": 759 }, { "epoch": 0.672566371681416, "grad_norm": 2.6408221370278886, "learning_rate": 9.376117109543769e-06, "loss": 1.1715, "step": 760 }, { "epoch": 0.6734513274336283, "grad_norm": 1.5763298061872941, "learning_rate": 9.374419998324792e-06, "loss": 1.0592, "step": 761 }, { "epoch": 0.6743362831858407, "grad_norm": 1.5377479977716626, "learning_rate": 9.372720735988094e-06, "loss": 0.9428, "step": 762 }, { "epoch": 0.6752212389380531, "grad_norm": 2.315598416095765, "learning_rate": 9.371019323369287e-06, "loss": 1.4122, "step": 763 }, { "epoch": 0.6761061946902654, "grad_norm": 1.5655443611161677, "learning_rate": 9.369315761305039e-06, "loss": 1.2978, "step": 764 }, { "epoch": 0.6769911504424779, "grad_norm": 1.5915763720732574, "learning_rate": 9.367610050633076e-06, "loss": 1.1858, "step": 765 }, { "epoch": 0.6778761061946903, "grad_norm": 1.3853135810105934, "learning_rate": 9.365902192192176e-06, "loss": 1.202, "step": 766 }, { "epoch": 0.6787610619469027, "grad_norm": 1.6969644551559924, "learning_rate": 9.364192186822184e-06, "loss": 1.609, "step": 767 }, { "epoch": 0.679646017699115, "grad_norm": 1.4187736719545674, "learning_rate": 9.362480035363987e-06, "loss": 1.2774, "step": 768 }, { "epoch": 0.6805309734513274, "grad_norm": 4.802110083583092, "learning_rate": 9.360765738659538e-06, "loss": 1.0981, "step": 769 }, { "epoch": 0.6814159292035398, "grad_norm": 1.8705414901058257, "learning_rate": 9.35904929755184e-06, "loss": 1.077, "step": 770 }, { "epoch": 0.6823008849557523, "grad_norm": 1.6655458116452535, "learning_rate": 9.357330712884953e-06, "loss": 1.2563, "step": 771 }, { "epoch": 0.6831858407079646, "grad_norm": 3.994344088822336, "learning_rate": 9.355609985503988e-06, "loss": 1.1848, "step": 772 }, { "epoch": 0.684070796460177, "grad_norm": 1.4361271563505886, "learning_rate": 9.353887116255111e-06, "loss": 1.1094, "step": 773 }, { "epoch": 0.6849557522123894, "grad_norm": 1.4089452159897262, "learning_rate": 9.352162105985544e-06, "loss": 1.252, "step": 774 }, { "epoch": 0.6858407079646017, "grad_norm": 1.5402406847598171, "learning_rate": 9.350434955543557e-06, "loss": 1.2185, "step": 775 }, { "epoch": 0.6867256637168142, "grad_norm": 1.5649641474996097, "learning_rate": 9.348705665778479e-06, "loss": 1.2395, "step": 776 }, { "epoch": 0.6876106194690266, "grad_norm": 2.9602028535125307, "learning_rate": 9.34697423754068e-06, "loss": 1.2633, "step": 777 }, { "epoch": 0.6884955752212389, "grad_norm": 1.6503930013516226, "learning_rate": 9.345240671681595e-06, "loss": 1.0902, "step": 778 }, { "epoch": 0.6893805309734513, "grad_norm": 1.5490695705857473, "learning_rate": 9.3435049690537e-06, "loss": 1.0347, "step": 779 }, { "epoch": 0.6902654867256637, "grad_norm": 1.8527137003300929, "learning_rate": 9.341767130510529e-06, "loss": 1.1525, "step": 780 }, { "epoch": 0.6911504424778762, "grad_norm": 1.6978147163659227, "learning_rate": 9.340027156906657e-06, "loss": 1.2705, "step": 781 }, { "epoch": 0.6920353982300885, "grad_norm": 1.5160531908082027, "learning_rate": 9.338285049097722e-06, "loss": 1.3335, "step": 782 }, { "epoch": 0.6929203539823009, "grad_norm": 1.4697315860223692, "learning_rate": 9.336540807940397e-06, "loss": 1.1474, "step": 783 }, { "epoch": 0.6938053097345133, "grad_norm": 1.4291562761065668, "learning_rate": 9.334794434292416e-06, "loss": 1.2008, "step": 784 }, { "epoch": 0.6946902654867256, "grad_norm": 1.4931290401605681, "learning_rate": 9.333045929012557e-06, "loss": 1.2476, "step": 785 }, { "epoch": 0.695575221238938, "grad_norm": 1.5056424670598383, "learning_rate": 9.331295292960642e-06, "loss": 1.1572, "step": 786 }, { "epoch": 0.6964601769911505, "grad_norm": 2.151228406135392, "learning_rate": 9.32954252699755e-06, "loss": 1.1679, "step": 787 }, { "epoch": 0.6973451327433628, "grad_norm": 1.5113865459396911, "learning_rate": 9.327787631985197e-06, "loss": 1.2057, "step": 788 }, { "epoch": 0.6982300884955752, "grad_norm": 1.5427402426939427, "learning_rate": 9.326030608786558e-06, "loss": 1.2553, "step": 789 }, { "epoch": 0.6991150442477876, "grad_norm": 1.417416015414266, "learning_rate": 9.324271458265642e-06, "loss": 1.1016, "step": 790 }, { "epoch": 0.7, "grad_norm": 1.3182311115544108, "learning_rate": 9.322510181287512e-06, "loss": 1.067, "step": 791 }, { "epoch": 0.7008849557522124, "grad_norm": 1.530908494920914, "learning_rate": 9.320746778718274e-06, "loss": 1.1646, "step": 792 }, { "epoch": 0.7017699115044248, "grad_norm": 1.9696562589248594, "learning_rate": 9.318981251425081e-06, "loss": 0.9653, "step": 793 }, { "epoch": 0.7026548672566372, "grad_norm": 2.0541714274382246, "learning_rate": 9.31721360027613e-06, "loss": 1.1145, "step": 794 }, { "epoch": 0.7035398230088495, "grad_norm": 1.7074050850645999, "learning_rate": 9.315443826140663e-06, "loss": 1.0572, "step": 795 }, { "epoch": 0.7044247787610619, "grad_norm": 2.203109850297397, "learning_rate": 9.31367192988896e-06, "loss": 1.4655, "step": 796 }, { "epoch": 0.7053097345132744, "grad_norm": 1.3084207912675079, "learning_rate": 9.311897912392354e-06, "loss": 1.149, "step": 797 }, { "epoch": 0.7061946902654868, "grad_norm": 1.733261106422532, "learning_rate": 9.310121774523217e-06, "loss": 1.2766, "step": 798 }, { "epoch": 0.7070796460176991, "grad_norm": 1.568240600376525, "learning_rate": 9.308343517154963e-06, "loss": 1.1243, "step": 799 }, { "epoch": 0.7079646017699115, "grad_norm": 1.7723530593475336, "learning_rate": 9.306563141162046e-06, "loss": 1.286, "step": 800 }, { "epoch": 0.7088495575221239, "grad_norm": 4.256546813221697, "learning_rate": 9.304780647419967e-06, "loss": 1.1218, "step": 801 }, { "epoch": 0.7097345132743362, "grad_norm": 1.421848489930022, "learning_rate": 9.302996036805265e-06, "loss": 1.2784, "step": 802 }, { "epoch": 0.7106194690265487, "grad_norm": 1.6931327442359136, "learning_rate": 9.301209310195523e-06, "loss": 1.2036, "step": 803 }, { "epoch": 0.7115044247787611, "grad_norm": 2.301449729045146, "learning_rate": 9.29942046846936e-06, "loss": 1.3208, "step": 804 }, { "epoch": 0.7123893805309734, "grad_norm": 1.4956510101045706, "learning_rate": 9.29762951250644e-06, "loss": 1.0728, "step": 805 }, { "epoch": 0.7132743362831858, "grad_norm": 1.5093446987915768, "learning_rate": 9.29583644318746e-06, "loss": 0.8886, "step": 806 }, { "epoch": 0.7141592920353982, "grad_norm": 1.321289695250669, "learning_rate": 9.294041261394164e-06, "loss": 1.0864, "step": 807 }, { "epoch": 0.7150442477876107, "grad_norm": 1.5985417301813996, "learning_rate": 9.292243968009332e-06, "loss": 1.0865, "step": 808 }, { "epoch": 0.715929203539823, "grad_norm": 1.4661005614812148, "learning_rate": 9.290444563916778e-06, "loss": 0.9862, "step": 809 }, { "epoch": 0.7168141592920354, "grad_norm": 1.2619219797419254, "learning_rate": 9.288643050001362e-06, "loss": 1.1337, "step": 810 }, { "epoch": 0.7176991150442478, "grad_norm": 11.089605808370584, "learning_rate": 9.286839427148973e-06, "loss": 1.2517, "step": 811 }, { "epoch": 0.7185840707964601, "grad_norm": 1.4172287534184147, "learning_rate": 9.285033696246544e-06, "loss": 1.0962, "step": 812 }, { "epoch": 0.7194690265486726, "grad_norm": 1.6174359511464105, "learning_rate": 9.283225858182038e-06, "loss": 1.1367, "step": 813 }, { "epoch": 0.720353982300885, "grad_norm": 2.6065146135943853, "learning_rate": 9.281415913844465e-06, "loss": 1.4294, "step": 814 }, { "epoch": 0.7212389380530974, "grad_norm": 1.6817053423715727, "learning_rate": 9.279603864123858e-06, "loss": 1.2573, "step": 815 }, { "epoch": 0.7221238938053097, "grad_norm": 1.4556310601508886, "learning_rate": 9.27778970991129e-06, "loss": 1.4441, "step": 816 }, { "epoch": 0.7230088495575221, "grad_norm": 1.3329757903120532, "learning_rate": 9.275973452098877e-06, "loss": 1.1127, "step": 817 }, { "epoch": 0.7238938053097345, "grad_norm": 1.7414500222105247, "learning_rate": 9.274155091579756e-06, "loss": 1.2144, "step": 818 }, { "epoch": 0.7247787610619469, "grad_norm": 2.508355223781366, "learning_rate": 9.272334629248106e-06, "loss": 1.5371, "step": 819 }, { "epoch": 0.7256637168141593, "grad_norm": 1.4594641502218602, "learning_rate": 9.270512065999139e-06, "loss": 1.1566, "step": 820 }, { "epoch": 0.7265486725663717, "grad_norm": 1.3025777378573073, "learning_rate": 9.268687402729097e-06, "loss": 1.1443, "step": 821 }, { "epoch": 0.727433628318584, "grad_norm": 1.789811329472013, "learning_rate": 9.26686064033526e-06, "loss": 1.2866, "step": 822 }, { "epoch": 0.7283185840707964, "grad_norm": 1.2314343531312364, "learning_rate": 9.265031779715934e-06, "loss": 0.9899, "step": 823 }, { "epoch": 0.7292035398230089, "grad_norm": 1.345915980692999, "learning_rate": 9.263200821770462e-06, "loss": 1.1982, "step": 824 }, { "epoch": 0.7300884955752213, "grad_norm": 1.4760967855546148, "learning_rate": 9.261367767399215e-06, "loss": 0.9243, "step": 825 }, { "epoch": 0.7309734513274336, "grad_norm": 1.3458971340277457, "learning_rate": 9.259532617503598e-06, "loss": 1.0724, "step": 826 }, { "epoch": 0.731858407079646, "grad_norm": 1.8010176212271585, "learning_rate": 9.25769537298604e-06, "loss": 1.2243, "step": 827 }, { "epoch": 0.7327433628318584, "grad_norm": 1.6112967578485091, "learning_rate": 9.255856034750008e-06, "loss": 1.177, "step": 828 }, { "epoch": 0.7336283185840708, "grad_norm": 1.516882055103393, "learning_rate": 9.254014603699995e-06, "loss": 1.2123, "step": 829 }, { "epoch": 0.7345132743362832, "grad_norm": 1.5158584187990787, "learning_rate": 9.252171080741525e-06, "loss": 1.1373, "step": 830 }, { "epoch": 0.7353982300884956, "grad_norm": 1.0836331202035208, "learning_rate": 9.250325466781145e-06, "loss": 1.0878, "step": 831 }, { "epoch": 0.736283185840708, "grad_norm": 1.2586376250744484, "learning_rate": 9.248477762726438e-06, "loss": 0.9318, "step": 832 }, { "epoch": 0.7371681415929203, "grad_norm": 2.3472961911196504, "learning_rate": 9.246627969486008e-06, "loss": 1.2482, "step": 833 }, { "epoch": 0.7380530973451327, "grad_norm": 1.5242979049648984, "learning_rate": 9.244776087969492e-06, "loss": 1.2734, "step": 834 }, { "epoch": 0.7389380530973452, "grad_norm": 1.3957156737573084, "learning_rate": 9.242922119087548e-06, "loss": 1.0331, "step": 835 }, { "epoch": 0.7398230088495575, "grad_norm": 1.3510063967929664, "learning_rate": 9.241066063751868e-06, "loss": 1.0057, "step": 836 }, { "epoch": 0.7407079646017699, "grad_norm": 1.9142217278683133, "learning_rate": 9.239207922875163e-06, "loss": 1.0716, "step": 837 }, { "epoch": 0.7415929203539823, "grad_norm": 1.6712420811531044, "learning_rate": 9.237347697371173e-06, "loss": 1.0632, "step": 838 }, { "epoch": 0.7424778761061946, "grad_norm": 1.8685018805020221, "learning_rate": 9.235485388154663e-06, "loss": 1.2341, "step": 839 }, { "epoch": 0.7433628318584071, "grad_norm": 1.4021190676630013, "learning_rate": 9.233620996141421e-06, "loss": 1.2609, "step": 840 }, { "epoch": 0.7442477876106195, "grad_norm": 2.513512993736198, "learning_rate": 9.231754522248261e-06, "loss": 1.2552, "step": 841 }, { "epoch": 0.7451327433628319, "grad_norm": 5.03267992061944, "learning_rate": 9.229885967393022e-06, "loss": 0.9405, "step": 842 }, { "epoch": 0.7460176991150442, "grad_norm": 1.5909716488489256, "learning_rate": 9.228015332494562e-06, "loss": 1.1695, "step": 843 }, { "epoch": 0.7469026548672566, "grad_norm": 1.460198611053523, "learning_rate": 9.226142618472765e-06, "loss": 1.1953, "step": 844 }, { "epoch": 0.7477876106194691, "grad_norm": 1.7239116232265732, "learning_rate": 9.224267826248536e-06, "loss": 1.2915, "step": 845 }, { "epoch": 0.7486725663716814, "grad_norm": 1.892641459421171, "learning_rate": 9.222390956743805e-06, "loss": 1.2448, "step": 846 }, { "epoch": 0.7495575221238938, "grad_norm": 1.6684618851421928, "learning_rate": 9.220512010881518e-06, "loss": 1.4129, "step": 847 }, { "epoch": 0.7504424778761062, "grad_norm": 1.3815987501399751, "learning_rate": 9.218630989585647e-06, "loss": 1.3485, "step": 848 }, { "epoch": 0.7513274336283186, "grad_norm": 1.5259882740285071, "learning_rate": 9.216747893781181e-06, "loss": 1.07, "step": 849 }, { "epoch": 0.7522123893805309, "grad_norm": 2.094015160336484, "learning_rate": 9.214862724394133e-06, "loss": 1.176, "step": 850 }, { "epoch": 0.7530973451327434, "grad_norm": 1.5312417947086114, "learning_rate": 9.212975482351534e-06, "loss": 0.9628, "step": 851 }, { "epoch": 0.7539823008849558, "grad_norm": 5.4952157273384294, "learning_rate": 9.211086168581433e-06, "loss": 1.399, "step": 852 }, { "epoch": 0.7548672566371681, "grad_norm": 1.469482262298714, "learning_rate": 9.209194784012898e-06, "loss": 1.1842, "step": 853 }, { "epoch": 0.7557522123893805, "grad_norm": 1.44738975772603, "learning_rate": 9.207301329576017e-06, "loss": 1.2505, "step": 854 }, { "epoch": 0.7566371681415929, "grad_norm": 1.6017779344838323, "learning_rate": 9.205405806201895e-06, "loss": 1.1818, "step": 855 }, { "epoch": 0.7575221238938054, "grad_norm": 2.2958174671961995, "learning_rate": 9.203508214822652e-06, "loss": 1.1796, "step": 856 }, { "epoch": 0.7584070796460177, "grad_norm": 1.4130087220445802, "learning_rate": 9.201608556371428e-06, "loss": 1.2053, "step": 857 }, { "epoch": 0.7592920353982301, "grad_norm": 1.7825420542128307, "learning_rate": 9.19970683178238e-06, "loss": 1.1391, "step": 858 }, { "epoch": 0.7601769911504425, "grad_norm": 1.9039664427052005, "learning_rate": 9.19780304199068e-06, "loss": 1.2523, "step": 859 }, { "epoch": 0.7610619469026548, "grad_norm": 5.1452914105062995, "learning_rate": 9.195897187932513e-06, "loss": 1.3228, "step": 860 }, { "epoch": 0.7619469026548673, "grad_norm": 1.2245369636660859, "learning_rate": 9.19398927054508e-06, "loss": 1.1165, "step": 861 }, { "epoch": 0.7628318584070797, "grad_norm": 2.0533400768477232, "learning_rate": 9.192079290766603e-06, "loss": 1.1693, "step": 862 }, { "epoch": 0.763716814159292, "grad_norm": 1.6042675059548432, "learning_rate": 9.190167249536308e-06, "loss": 1.1413, "step": 863 }, { "epoch": 0.7646017699115044, "grad_norm": 1.5247279330520185, "learning_rate": 9.188253147794443e-06, "loss": 1.3235, "step": 864 }, { "epoch": 0.7654867256637168, "grad_norm": 1.6240264713693442, "learning_rate": 9.186336986482267e-06, "loss": 1.4257, "step": 865 }, { "epoch": 0.7663716814159292, "grad_norm": 1.846120891896264, "learning_rate": 9.184418766542046e-06, "loss": 1.2747, "step": 866 }, { "epoch": 0.7672566371681416, "grad_norm": 2.344787913694964, "learning_rate": 9.182498488917068e-06, "loss": 1.1145, "step": 867 }, { "epoch": 0.768141592920354, "grad_norm": 1.5208417196197703, "learning_rate": 9.180576154551628e-06, "loss": 1.2255, "step": 868 }, { "epoch": 0.7690265486725664, "grad_norm": 2.3708252992890366, "learning_rate": 9.17865176439103e-06, "loss": 1.1824, "step": 869 }, { "epoch": 0.7699115044247787, "grad_norm": 2.7773636339390584, "learning_rate": 9.176725319381589e-06, "loss": 1.3038, "step": 870 }, { "epoch": 0.7707964601769911, "grad_norm": 1.3852801259606948, "learning_rate": 9.17479682047064e-06, "loss": 1.1056, "step": 871 }, { "epoch": 0.7716814159292036, "grad_norm": 1.2565631088693634, "learning_rate": 9.172866268606514e-06, "loss": 1.0407, "step": 872 }, { "epoch": 0.772566371681416, "grad_norm": 2.1633467896407836, "learning_rate": 9.170933664738563e-06, "loss": 1.3905, "step": 873 }, { "epoch": 0.7734513274336283, "grad_norm": 1.525679397569262, "learning_rate": 9.168999009817141e-06, "loss": 1.2595, "step": 874 }, { "epoch": 0.7743362831858407, "grad_norm": 3.6147267927458113, "learning_rate": 9.167062304793615e-06, "loss": 1.3339, "step": 875 }, { "epoch": 0.7752212389380531, "grad_norm": 1.3625093939532498, "learning_rate": 9.165123550620357e-06, "loss": 1.0961, "step": 876 }, { "epoch": 0.7761061946902655, "grad_norm": 1.7942706924508764, "learning_rate": 9.163182748250747e-06, "loss": 1.1952, "step": 877 }, { "epoch": 0.7769911504424779, "grad_norm": 1.5169482783916817, "learning_rate": 9.161239898639173e-06, "loss": 1.314, "step": 878 }, { "epoch": 0.7778761061946903, "grad_norm": 1.4315156230712662, "learning_rate": 9.159295002741034e-06, "loss": 1.0511, "step": 879 }, { "epoch": 0.7787610619469026, "grad_norm": 1.8012869557654054, "learning_rate": 9.157348061512728e-06, "loss": 1.1572, "step": 880 }, { "epoch": 0.779646017699115, "grad_norm": 1.7197485682432394, "learning_rate": 9.15539907591166e-06, "loss": 1.1458, "step": 881 }, { "epoch": 0.7805309734513274, "grad_norm": 1.6384347196471878, "learning_rate": 9.153448046896244e-06, "loss": 1.0464, "step": 882 }, { "epoch": 0.7814159292035399, "grad_norm": 1.492911495844966, "learning_rate": 9.151494975425899e-06, "loss": 1.1353, "step": 883 }, { "epoch": 0.7823008849557522, "grad_norm": 1.5848070033370065, "learning_rate": 9.149539862461044e-06, "loss": 1.3736, "step": 884 }, { "epoch": 0.7831858407079646, "grad_norm": 1.6189053673602405, "learning_rate": 9.147582708963103e-06, "loss": 1.1529, "step": 885 }, { "epoch": 0.784070796460177, "grad_norm": 2.2922540010670915, "learning_rate": 9.14562351589451e-06, "loss": 1.1805, "step": 886 }, { "epoch": 0.7849557522123893, "grad_norm": 1.9068968440457927, "learning_rate": 9.143662284218691e-06, "loss": 1.1507, "step": 887 }, { "epoch": 0.7858407079646018, "grad_norm": 1.52934458841108, "learning_rate": 9.141699014900084e-06, "loss": 0.9682, "step": 888 }, { "epoch": 0.7867256637168142, "grad_norm": 2.472586872483963, "learning_rate": 9.139733708904122e-06, "loss": 1.2332, "step": 889 }, { "epoch": 0.7876106194690266, "grad_norm": 1.2615959069116933, "learning_rate": 9.137766367197246e-06, "loss": 1.1562, "step": 890 }, { "epoch": 0.7884955752212389, "grad_norm": 2.699900074111602, "learning_rate": 9.135796990746892e-06, "loss": 1.1782, "step": 891 }, { "epoch": 0.7893805309734513, "grad_norm": 2.6340287163695546, "learning_rate": 9.133825580521502e-06, "loss": 1.0122, "step": 892 }, { "epoch": 0.7902654867256638, "grad_norm": 1.406134374473168, "learning_rate": 9.131852137490513e-06, "loss": 0.8808, "step": 893 }, { "epoch": 0.7911504424778761, "grad_norm": 1.9974401612434358, "learning_rate": 9.129876662624366e-06, "loss": 1.3113, "step": 894 }, { "epoch": 0.7920353982300885, "grad_norm": 3.4205379471652795, "learning_rate": 9.1278991568945e-06, "loss": 1.2276, "step": 895 }, { "epoch": 0.7929203539823009, "grad_norm": 1.2803824763885254, "learning_rate": 9.125919621273348e-06, "loss": 1.0374, "step": 896 }, { "epoch": 0.7938053097345132, "grad_norm": 1.532553410137506, "learning_rate": 9.12393805673435e-06, "loss": 1.277, "step": 897 }, { "epoch": 0.7946902654867256, "grad_norm": 1.581547852975556, "learning_rate": 9.121954464251934e-06, "loss": 1.3123, "step": 898 }, { "epoch": 0.7955752212389381, "grad_norm": 1.4143999152532505, "learning_rate": 9.119968844801534e-06, "loss": 1.2231, "step": 899 }, { "epoch": 0.7964601769911505, "grad_norm": 2.509910282900688, "learning_rate": 9.117981199359575e-06, "loss": 1.1891, "step": 900 }, { "epoch": 0.7973451327433628, "grad_norm": 1.459029747180383, "learning_rate": 9.11599152890348e-06, "loss": 1.1475, "step": 901 }, { "epoch": 0.7982300884955752, "grad_norm": 1.8056969414014934, "learning_rate": 9.113999834411669e-06, "loss": 1.1904, "step": 902 }, { "epoch": 0.7991150442477876, "grad_norm": 1.5091418867859692, "learning_rate": 9.112006116863557e-06, "loss": 1.1268, "step": 903 }, { "epoch": 0.8, "grad_norm": 1.445205339178034, "learning_rate": 9.110010377239552e-06, "loss": 1.1882, "step": 904 }, { "epoch": 0.8008849557522124, "grad_norm": 1.1657427102643088, "learning_rate": 9.108012616521055e-06, "loss": 1.1068, "step": 905 }, { "epoch": 0.8017699115044248, "grad_norm": 1.4612419910862502, "learning_rate": 9.106012835690466e-06, "loss": 1.2622, "step": 906 }, { "epoch": 0.8026548672566372, "grad_norm": 1.5440094692409403, "learning_rate": 9.104011035731177e-06, "loss": 1.2564, "step": 907 }, { "epoch": 0.8035398230088495, "grad_norm": 1.4804575296762015, "learning_rate": 9.102007217627568e-06, "loss": 1.1553, "step": 908 }, { "epoch": 0.804424778761062, "grad_norm": 1.4849271649407088, "learning_rate": 9.10000138236502e-06, "loss": 1.3068, "step": 909 }, { "epoch": 0.8053097345132744, "grad_norm": 6.410864692465082, "learning_rate": 9.097993530929895e-06, "loss": 1.5501, "step": 910 }, { "epoch": 0.8061946902654867, "grad_norm": 1.6239112666022033, "learning_rate": 9.095983664309557e-06, "loss": 1.0691, "step": 911 }, { "epoch": 0.8070796460176991, "grad_norm": 1.4623561529390097, "learning_rate": 9.093971783492354e-06, "loss": 1.0215, "step": 912 }, { "epoch": 0.8079646017699115, "grad_norm": 1.8750113652566538, "learning_rate": 9.091957889467629e-06, "loss": 1.3564, "step": 913 }, { "epoch": 0.8088495575221238, "grad_norm": 1.8661350749867451, "learning_rate": 9.08994198322571e-06, "loss": 1.3507, "step": 914 }, { "epoch": 0.8097345132743363, "grad_norm": 2.0169082333051773, "learning_rate": 9.08792406575792e-06, "loss": 1.3631, "step": 915 }, { "epoch": 0.8106194690265487, "grad_norm": 1.4761704393093904, "learning_rate": 9.085904138056567e-06, "loss": 1.0108, "step": 916 }, { "epoch": 0.8115044247787611, "grad_norm": 2.861526294327625, "learning_rate": 9.083882201114948e-06, "loss": 1.3153, "step": 917 }, { "epoch": 0.8123893805309734, "grad_norm": 1.4715738389424755, "learning_rate": 9.081858255927351e-06, "loss": 1.1481, "step": 918 }, { "epoch": 0.8132743362831858, "grad_norm": 1.6299179255288574, "learning_rate": 9.079832303489049e-06, "loss": 1.2191, "step": 919 }, { "epoch": 0.8141592920353983, "grad_norm": 1.9843584607508302, "learning_rate": 9.077804344796302e-06, "loss": 1.0036, "step": 920 }, { "epoch": 0.8150442477876106, "grad_norm": 2.555558794315002, "learning_rate": 9.075774380846356e-06, "loss": 1.1703, "step": 921 }, { "epoch": 0.815929203539823, "grad_norm": 1.2597939019487112, "learning_rate": 9.073742412637448e-06, "loss": 1.2254, "step": 922 }, { "epoch": 0.8168141592920354, "grad_norm": 2.0862310736638148, "learning_rate": 9.071708441168795e-06, "loss": 1.0981, "step": 923 }, { "epoch": 0.8176991150442477, "grad_norm": 1.4518582240768805, "learning_rate": 9.069672467440598e-06, "loss": 1.2719, "step": 924 }, { "epoch": 0.8185840707964602, "grad_norm": 2.7512674648143554, "learning_rate": 9.067634492454048e-06, "loss": 1.0525, "step": 925 }, { "epoch": 0.8194690265486726, "grad_norm": 1.623869349177925, "learning_rate": 9.065594517211318e-06, "loss": 1.2113, "step": 926 }, { "epoch": 0.820353982300885, "grad_norm": 1.6183541616797188, "learning_rate": 9.063552542715564e-06, "loss": 1.3194, "step": 927 }, { "epoch": 0.8212389380530973, "grad_norm": 1.3168905217171358, "learning_rate": 9.061508569970926e-06, "loss": 1.0428, "step": 928 }, { "epoch": 0.8221238938053097, "grad_norm": 2.0314195935810275, "learning_rate": 9.059462599982525e-06, "loss": 1.4444, "step": 929 }, { "epoch": 0.8230088495575221, "grad_norm": 2.266399625433218, "learning_rate": 9.057414633756466e-06, "loss": 1.4294, "step": 930 }, { "epoch": 0.8238938053097346, "grad_norm": 1.6485616689179774, "learning_rate": 9.055364672299833e-06, "loss": 1.1517, "step": 931 }, { "epoch": 0.8247787610619469, "grad_norm": 1.455816595790672, "learning_rate": 9.053312716620695e-06, "loss": 1.0609, "step": 932 }, { "epoch": 0.8256637168141593, "grad_norm": 1.489369514590979, "learning_rate": 9.051258767728098e-06, "loss": 0.9725, "step": 933 }, { "epoch": 0.8265486725663717, "grad_norm": 2.4299033632440574, "learning_rate": 9.049202826632072e-06, "loss": 1.1344, "step": 934 }, { "epoch": 0.827433628318584, "grad_norm": 2.136380897064594, "learning_rate": 9.047144894343622e-06, "loss": 1.4485, "step": 935 }, { "epoch": 0.8283185840707965, "grad_norm": 2.2674812280067833, "learning_rate": 9.045084971874738e-06, "loss": 1.273, "step": 936 }, { "epoch": 0.8292035398230089, "grad_norm": 2.1373545243076078, "learning_rate": 9.043023060238382e-06, "loss": 1.3428, "step": 937 }, { "epoch": 0.8300884955752212, "grad_norm": 2.981027728253393, "learning_rate": 9.0409591604485e-06, "loss": 1.2014, "step": 938 }, { "epoch": 0.8309734513274336, "grad_norm": 1.5169443433585454, "learning_rate": 9.03889327352001e-06, "loss": 1.1222, "step": 939 }, { "epoch": 0.831858407079646, "grad_norm": 13.55940838204484, "learning_rate": 9.036825400468814e-06, "loss": 1.3124, "step": 940 }, { "epoch": 0.8327433628318585, "grad_norm": 1.6210309313431348, "learning_rate": 9.034755542311785e-06, "loss": 1.2695, "step": 941 }, { "epoch": 0.8336283185840708, "grad_norm": 1.5531181328340684, "learning_rate": 9.03268370006677e-06, "loss": 1.2743, "step": 942 }, { "epoch": 0.8345132743362832, "grad_norm": 1.5056701071440963, "learning_rate": 9.030609874752604e-06, "loss": 1.1691, "step": 943 }, { "epoch": 0.8353982300884956, "grad_norm": 1.406963141219612, "learning_rate": 9.028534067389087e-06, "loss": 0.8358, "step": 944 }, { "epoch": 0.8362831858407079, "grad_norm": 2.240636276031629, "learning_rate": 9.026456278996992e-06, "loss": 1.2094, "step": 945 }, { "epoch": 0.8371681415929203, "grad_norm": 3.2422982834688834, "learning_rate": 9.024376510598071e-06, "loss": 1.0597, "step": 946 }, { "epoch": 0.8380530973451328, "grad_norm": 1.30729892063389, "learning_rate": 9.022294763215051e-06, "loss": 1.1302, "step": 947 }, { "epoch": 0.8389380530973451, "grad_norm": 1.6059815488796243, "learning_rate": 9.020211037871626e-06, "loss": 1.1252, "step": 948 }, { "epoch": 0.8398230088495575, "grad_norm": 2.1606706127513773, "learning_rate": 9.018125335592471e-06, "loss": 1.0062, "step": 949 }, { "epoch": 0.8407079646017699, "grad_norm": 2.8236738093282656, "learning_rate": 9.016037657403225e-06, "loss": 1.2372, "step": 950 }, { "epoch": 0.8415929203539823, "grad_norm": 1.6684586423830494, "learning_rate": 9.013948004330503e-06, "loss": 1.1112, "step": 951 }, { "epoch": 0.8424778761061947, "grad_norm": 1.1549509362906405, "learning_rate": 9.011856377401891e-06, "loss": 0.896, "step": 952 }, { "epoch": 0.8433628318584071, "grad_norm": 1.6678243496413447, "learning_rate": 9.009762777645944e-06, "loss": 1.3057, "step": 953 }, { "epoch": 0.8442477876106195, "grad_norm": 1.8083683428586004, "learning_rate": 9.007667206092188e-06, "loss": 1.0283, "step": 954 }, { "epoch": 0.8451327433628318, "grad_norm": 1.6327977401355718, "learning_rate": 9.005569663771119e-06, "loss": 1.1268, "step": 955 }, { "epoch": 0.8460176991150442, "grad_norm": 4.1177572944879675, "learning_rate": 9.003470151714203e-06, "loss": 1.1624, "step": 956 }, { "epoch": 0.8469026548672567, "grad_norm": 1.6666958225323387, "learning_rate": 9.001368670953872e-06, "loss": 1.0371, "step": 957 }, { "epoch": 0.8477876106194691, "grad_norm": 1.8369787888350884, "learning_rate": 8.999265222523529e-06, "loss": 1.3295, "step": 958 }, { "epoch": 0.8486725663716814, "grad_norm": 2.0796663032334504, "learning_rate": 8.99715980745754e-06, "loss": 1.2084, "step": 959 }, { "epoch": 0.8495575221238938, "grad_norm": 1.7019825773649113, "learning_rate": 8.995052426791247e-06, "loss": 1.2404, "step": 960 }, { "epoch": 0.8504424778761062, "grad_norm": 2.222168024130456, "learning_rate": 8.992943081560946e-06, "loss": 1.4744, "step": 961 }, { "epoch": 0.8513274336283185, "grad_norm": 2.41585171478144, "learning_rate": 8.990831772803911e-06, "loss": 1.3132, "step": 962 }, { "epoch": 0.852212389380531, "grad_norm": 2.066164027787731, "learning_rate": 8.988718501558376e-06, "loss": 1.2067, "step": 963 }, { "epoch": 0.8530973451327434, "grad_norm": 1.3782116697557147, "learning_rate": 8.986603268863536e-06, "loss": 1.2155, "step": 964 }, { "epoch": 0.8539823008849557, "grad_norm": 1.9894947593589445, "learning_rate": 8.98448607575956e-06, "loss": 1.1086, "step": 965 }, { "epoch": 0.8548672566371681, "grad_norm": 1.4028618533024788, "learning_rate": 8.982366923287575e-06, "loss": 1.0612, "step": 966 }, { "epoch": 0.8557522123893805, "grad_norm": 2.4854892055443503, "learning_rate": 8.980245812489672e-06, "loss": 1.0475, "step": 967 }, { "epoch": 0.856637168141593, "grad_norm": 2.3087947974572125, "learning_rate": 8.978122744408905e-06, "loss": 1.2431, "step": 968 }, { "epoch": 0.8575221238938053, "grad_norm": 1.4848755198033312, "learning_rate": 8.975997720089294e-06, "loss": 1.0911, "step": 969 }, { "epoch": 0.8584070796460177, "grad_norm": 1.6241587603293803, "learning_rate": 8.973870740575814e-06, "loss": 1.3212, "step": 970 }, { "epoch": 0.8592920353982301, "grad_norm": 2.4759435441289, "learning_rate": 8.971741806914409e-06, "loss": 1.2896, "step": 971 }, { "epoch": 0.8601769911504424, "grad_norm": 1.161414735874842, "learning_rate": 8.969610920151976e-06, "loss": 1.0734, "step": 972 }, { "epoch": 0.8610619469026549, "grad_norm": 1.2768090349925367, "learning_rate": 8.967478081336383e-06, "loss": 1.0083, "step": 973 }, { "epoch": 0.8619469026548673, "grad_norm": 1.3511908701048951, "learning_rate": 8.965343291516448e-06, "loss": 0.7622, "step": 974 }, { "epoch": 0.8628318584070797, "grad_norm": 1.7394515158151405, "learning_rate": 8.963206551741951e-06, "loss": 1.4014, "step": 975 }, { "epoch": 0.863716814159292, "grad_norm": 1.628686256669386, "learning_rate": 8.961067863063638e-06, "loss": 1.1863, "step": 976 }, { "epoch": 0.8646017699115044, "grad_norm": 2.3633842620074645, "learning_rate": 8.958927226533202e-06, "loss": 1.0199, "step": 977 }, { "epoch": 0.8654867256637168, "grad_norm": 1.3276137563641035, "learning_rate": 8.956784643203303e-06, "loss": 1.2344, "step": 978 }, { "epoch": 0.8663716814159292, "grad_norm": 3.839480655182702, "learning_rate": 8.95464011412755e-06, "loss": 1.0669, "step": 979 }, { "epoch": 0.8672566371681416, "grad_norm": 1.3324406174979937, "learning_rate": 8.952493640360518e-06, "loss": 1.046, "step": 980 }, { "epoch": 0.868141592920354, "grad_norm": 1.8525544510586403, "learning_rate": 8.950345222957733e-06, "loss": 1.2725, "step": 981 }, { "epoch": 0.8690265486725663, "grad_norm": 1.4397140368605077, "learning_rate": 8.948194862975676e-06, "loss": 1.2706, "step": 982 }, { "epoch": 0.8699115044247787, "grad_norm": 1.8454677321288475, "learning_rate": 8.946042561471786e-06, "loss": 1.4876, "step": 983 }, { "epoch": 0.8707964601769912, "grad_norm": 1.5421135040911387, "learning_rate": 8.943888319504456e-06, "loss": 1.2948, "step": 984 }, { "epoch": 0.8716814159292036, "grad_norm": 1.478257644642202, "learning_rate": 8.941732138133032e-06, "loss": 1.3101, "step": 985 }, { "epoch": 0.8725663716814159, "grad_norm": 1.6296482435774589, "learning_rate": 8.939574018417815e-06, "loss": 1.2991, "step": 986 }, { "epoch": 0.8734513274336283, "grad_norm": 1.4982496648038752, "learning_rate": 8.937413961420058e-06, "loss": 1.0709, "step": 987 }, { "epoch": 0.8743362831858407, "grad_norm": 1.162831207939779, "learning_rate": 8.93525196820197e-06, "loss": 0.9042, "step": 988 }, { "epoch": 0.8752212389380531, "grad_norm": 1.3385708493737594, "learning_rate": 8.933088039826706e-06, "loss": 0.9257, "step": 989 }, { "epoch": 0.8761061946902655, "grad_norm": 1.9178088120273382, "learning_rate": 8.930922177358379e-06, "loss": 1.0796, "step": 990 }, { "epoch": 0.8769911504424779, "grad_norm": 1.5547028182165654, "learning_rate": 8.928754381862049e-06, "loss": 1.6229, "step": 991 }, { "epoch": 0.8778761061946903, "grad_norm": 2.015243468233858, "learning_rate": 8.926584654403725e-06, "loss": 1.2306, "step": 992 }, { "epoch": 0.8787610619469026, "grad_norm": 1.6315494374541348, "learning_rate": 8.924412996050373e-06, "loss": 0.956, "step": 993 }, { "epoch": 0.879646017699115, "grad_norm": 1.8014470998261225, "learning_rate": 8.922239407869904e-06, "loss": 1.2626, "step": 994 }, { "epoch": 0.8805309734513275, "grad_norm": 1.3722147895941181, "learning_rate": 8.920063890931177e-06, "loss": 1.1419, "step": 995 }, { "epoch": 0.8814159292035398, "grad_norm": 1.8434632495060437, "learning_rate": 8.917886446304001e-06, "loss": 1.1613, "step": 996 }, { "epoch": 0.8823008849557522, "grad_norm": 1.5175686293644757, "learning_rate": 8.915707075059133e-06, "loss": 1.3214, "step": 997 }, { "epoch": 0.8831858407079646, "grad_norm": 2.722058093628202, "learning_rate": 8.913525778268275e-06, "loss": 1.1919, "step": 998 }, { "epoch": 0.8840707964601769, "grad_norm": 1.920983262596426, "learning_rate": 8.911342557004084e-06, "loss": 1.1263, "step": 999 }, { "epoch": 0.8849557522123894, "grad_norm": 1.8801463306478368, "learning_rate": 8.90915741234015e-06, "loss": 1.2293, "step": 1000 }, { "epoch": 0.8858407079646018, "grad_norm": 1.3415900207685116, "learning_rate": 8.90697034535102e-06, "loss": 0.8686, "step": 1001 }, { "epoch": 0.8867256637168142, "grad_norm": 1.3437096734255491, "learning_rate": 8.904781357112185e-06, "loss": 1.3125, "step": 1002 }, { "epoch": 0.8876106194690265, "grad_norm": 5.728030946259185, "learning_rate": 8.902590448700072e-06, "loss": 1.5421, "step": 1003 }, { "epoch": 0.8884955752212389, "grad_norm": 1.3246754398528369, "learning_rate": 8.900397621192063e-06, "loss": 1.1895, "step": 1004 }, { "epoch": 0.8893805309734514, "grad_norm": 1.4281154700561167, "learning_rate": 8.89820287566648e-06, "loss": 1.1128, "step": 1005 }, { "epoch": 0.8902654867256637, "grad_norm": 3.152473848158449, "learning_rate": 8.896006213202584e-06, "loss": 1.4283, "step": 1006 }, { "epoch": 0.8911504424778761, "grad_norm": 2.0536488713796874, "learning_rate": 8.893807634880585e-06, "loss": 1.4984, "step": 1007 }, { "epoch": 0.8920353982300885, "grad_norm": 1.3190122408928944, "learning_rate": 8.89160714178163e-06, "loss": 1.0814, "step": 1008 }, { "epoch": 0.8929203539823009, "grad_norm": 1.4912454232270387, "learning_rate": 8.889404734987813e-06, "loss": 1.3232, "step": 1009 }, { "epoch": 0.8938053097345132, "grad_norm": 1.3371310057661168, "learning_rate": 8.88720041558216e-06, "loss": 1.2347, "step": 1010 }, { "epoch": 0.8946902654867257, "grad_norm": 1.3898081699266878, "learning_rate": 8.884994184648652e-06, "loss": 1.1924, "step": 1011 }, { "epoch": 0.8955752212389381, "grad_norm": 1.5030849971344185, "learning_rate": 8.882786043272193e-06, "loss": 1.206, "step": 1012 }, { "epoch": 0.8964601769911504, "grad_norm": 2.038280627433564, "learning_rate": 8.88057599253864e-06, "loss": 1.1875, "step": 1013 }, { "epoch": 0.8973451327433628, "grad_norm": 2.1223796545625446, "learning_rate": 8.878364033534781e-06, "loss": 1.533, "step": 1014 }, { "epoch": 0.8982300884955752, "grad_norm": 1.724653793297445, "learning_rate": 8.876150167348348e-06, "loss": 1.1908, "step": 1015 }, { "epoch": 0.8991150442477877, "grad_norm": 1.6581245030292244, "learning_rate": 8.873934395068006e-06, "loss": 1.1913, "step": 1016 }, { "epoch": 0.9, "grad_norm": 1.987866999491713, "learning_rate": 8.871716717783359e-06, "loss": 1.2546, "step": 1017 }, { "epoch": 0.9008849557522124, "grad_norm": 1.4350368562854503, "learning_rate": 8.86949713658495e-06, "loss": 1.1287, "step": 1018 }, { "epoch": 0.9017699115044248, "grad_norm": 1.3956708875549961, "learning_rate": 8.867275652564253e-06, "loss": 1.4844, "step": 1019 }, { "epoch": 0.9026548672566371, "grad_norm": 1.1937139531907923, "learning_rate": 8.865052266813686e-06, "loss": 0.7858, "step": 1020 }, { "epoch": 0.9035398230088496, "grad_norm": 2.3580664490221683, "learning_rate": 8.862826980426593e-06, "loss": 1.207, "step": 1021 }, { "epoch": 0.904424778761062, "grad_norm": 1.4542232073848733, "learning_rate": 8.860599794497259e-06, "loss": 1.2566, "step": 1022 }, { "epoch": 0.9053097345132743, "grad_norm": 1.843291393044054, "learning_rate": 8.8583707101209e-06, "loss": 1.1419, "step": 1023 }, { "epoch": 0.9061946902654867, "grad_norm": 7.20722337156487, "learning_rate": 8.856139728393667e-06, "loss": 1.5111, "step": 1024 }, { "epoch": 0.9070796460176991, "grad_norm": 1.609690847504069, "learning_rate": 8.853906850412644e-06, "loss": 1.3438, "step": 1025 }, { "epoch": 0.9079646017699115, "grad_norm": 1.5116163848083575, "learning_rate": 8.851672077275846e-06, "loss": 1.0131, "step": 1026 }, { "epoch": 0.9088495575221239, "grad_norm": 1.724582153968045, "learning_rate": 8.849435410082224e-06, "loss": 1.206, "step": 1027 }, { "epoch": 0.9097345132743363, "grad_norm": 1.5091396532151522, "learning_rate": 8.847196849931651e-06, "loss": 1.3132, "step": 1028 }, { "epoch": 0.9106194690265487, "grad_norm": 1.5256984699847116, "learning_rate": 8.844956397924944e-06, "loss": 1.0408, "step": 1029 }, { "epoch": 0.911504424778761, "grad_norm": 1.3655499301466634, "learning_rate": 8.842714055163841e-06, "loss": 1.0386, "step": 1030 }, { "epoch": 0.9123893805309734, "grad_norm": 1.3680564853588861, "learning_rate": 8.840469822751013e-06, "loss": 1.2581, "step": 1031 }, { "epoch": 0.9132743362831859, "grad_norm": 1.5574324432042193, "learning_rate": 8.838223701790057e-06, "loss": 1.2362, "step": 1032 }, { "epoch": 0.9141592920353983, "grad_norm": 1.1463336096413124, "learning_rate": 8.835975693385504e-06, "loss": 0.9712, "step": 1033 }, { "epoch": 0.9150442477876106, "grad_norm": 1.3524933781161343, "learning_rate": 8.833725798642809e-06, "loss": 1.1108, "step": 1034 }, { "epoch": 0.915929203539823, "grad_norm": 1.3800720470485248, "learning_rate": 8.831474018668356e-06, "loss": 1.1363, "step": 1035 }, { "epoch": 0.9168141592920354, "grad_norm": 1.2936672531993798, "learning_rate": 8.829220354569457e-06, "loss": 1.0924, "step": 1036 }, { "epoch": 0.9176991150442478, "grad_norm": 1.5418114522461537, "learning_rate": 8.82696480745435e-06, "loss": 1.4407, "step": 1037 }, { "epoch": 0.9185840707964602, "grad_norm": 1.5221987846648202, "learning_rate": 8.824707378432198e-06, "loss": 1.2106, "step": 1038 }, { "epoch": 0.9194690265486726, "grad_norm": 1.4986048112264931, "learning_rate": 8.822448068613089e-06, "loss": 1.1874, "step": 1039 }, { "epoch": 0.9203539823008849, "grad_norm": 1.9307515632087642, "learning_rate": 8.820186879108038e-06, "loss": 1.1005, "step": 1040 }, { "epoch": 0.9212389380530973, "grad_norm": 2.1495964576916844, "learning_rate": 8.817923811028984e-06, "loss": 1.2897, "step": 1041 }, { "epoch": 0.9221238938053097, "grad_norm": 1.5933587407560095, "learning_rate": 8.815658865488785e-06, "loss": 1.1933, "step": 1042 }, { "epoch": 0.9230088495575222, "grad_norm": 1.7363848557388506, "learning_rate": 8.813392043601232e-06, "loss": 1.2666, "step": 1043 }, { "epoch": 0.9238938053097345, "grad_norm": 1.3913012779437697, "learning_rate": 8.81112334648103e-06, "loss": 0.9741, "step": 1044 }, { "epoch": 0.9247787610619469, "grad_norm": 1.2482095314613872, "learning_rate": 8.80885277524381e-06, "loss": 1.1506, "step": 1045 }, { "epoch": 0.9256637168141593, "grad_norm": 2.7930042666192123, "learning_rate": 8.80658033100612e-06, "loss": 1.2685, "step": 1046 }, { "epoch": 0.9265486725663716, "grad_norm": 1.3957629712180297, "learning_rate": 8.804306014885438e-06, "loss": 1.255, "step": 1047 }, { "epoch": 0.9274336283185841, "grad_norm": 1.3239575629413027, "learning_rate": 8.802029828000157e-06, "loss": 1.0306, "step": 1048 }, { "epoch": 0.9283185840707965, "grad_norm": 1.4218717681416733, "learning_rate": 8.799751771469585e-06, "loss": 1.1494, "step": 1049 }, { "epoch": 0.9292035398230089, "grad_norm": 1.4659538691327683, "learning_rate": 8.797471846413957e-06, "loss": 1.2139, "step": 1050 }, { "epoch": 0.9300884955752212, "grad_norm": 1.81177970668574, "learning_rate": 8.795190053954428e-06, "loss": 1.3269, "step": 1051 }, { "epoch": 0.9309734513274336, "grad_norm": 3.161238110132213, "learning_rate": 8.792906395213064e-06, "loss": 0.942, "step": 1052 }, { "epoch": 0.9318584070796461, "grad_norm": 1.720440735643883, "learning_rate": 8.790620871312852e-06, "loss": 1.1973, "step": 1053 }, { "epoch": 0.9327433628318584, "grad_norm": 1.3224786156934385, "learning_rate": 8.788333483377699e-06, "loss": 1.1911, "step": 1054 }, { "epoch": 0.9336283185840708, "grad_norm": 1.4344817428372367, "learning_rate": 8.786044232532423e-06, "loss": 1.2697, "step": 1055 }, { "epoch": 0.9345132743362832, "grad_norm": 1.4549799414890576, "learning_rate": 8.783753119902766e-06, "loss": 1.1167, "step": 1056 }, { "epoch": 0.9353982300884955, "grad_norm": 1.6007290382353991, "learning_rate": 8.781460146615379e-06, "loss": 1.3697, "step": 1057 }, { "epoch": 0.9362831858407079, "grad_norm": 1.4448666372197925, "learning_rate": 8.779165313797828e-06, "loss": 1.0894, "step": 1058 }, { "epoch": 0.9371681415929204, "grad_norm": 1.4636888897730165, "learning_rate": 8.776868622578596e-06, "loss": 1.2531, "step": 1059 }, { "epoch": 0.9380530973451328, "grad_norm": 1.7641336032234436, "learning_rate": 8.77457007408708e-06, "loss": 1.3287, "step": 1060 }, { "epoch": 0.9389380530973451, "grad_norm": 1.4898871810937198, "learning_rate": 8.77226966945359e-06, "loss": 0.973, "step": 1061 }, { "epoch": 0.9398230088495575, "grad_norm": 1.5968589223368643, "learning_rate": 8.769967409809348e-06, "loss": 0.9749, "step": 1062 }, { "epoch": 0.9407079646017699, "grad_norm": 3.6132441105502355, "learning_rate": 8.767663296286488e-06, "loss": 1.2904, "step": 1063 }, { "epoch": 0.9415929203539823, "grad_norm": 1.784251204590787, "learning_rate": 8.765357330018056e-06, "loss": 1.4109, "step": 1064 }, { "epoch": 0.9424778761061947, "grad_norm": 1.2908981338939876, "learning_rate": 8.763049512138009e-06, "loss": 1.2534, "step": 1065 }, { "epoch": 0.9433628318584071, "grad_norm": 1.5446374397075704, "learning_rate": 8.760739843781214e-06, "loss": 1.2071, "step": 1066 }, { "epoch": 0.9442477876106194, "grad_norm": 1.7274720771611705, "learning_rate": 8.75842832608345e-06, "loss": 1.0873, "step": 1067 }, { "epoch": 0.9451327433628318, "grad_norm": 4.9452693886725525, "learning_rate": 8.756114960181405e-06, "loss": 1.0516, "step": 1068 }, { "epoch": 0.9460176991150443, "grad_norm": 1.7159824144989764, "learning_rate": 8.753799747212672e-06, "loss": 1.251, "step": 1069 }, { "epoch": 0.9469026548672567, "grad_norm": 1.6703569515993126, "learning_rate": 8.751482688315758e-06, "loss": 1.2445, "step": 1070 }, { "epoch": 0.947787610619469, "grad_norm": 1.4542056281556044, "learning_rate": 8.749163784630073e-06, "loss": 1.1648, "step": 1071 }, { "epoch": 0.9486725663716814, "grad_norm": 3.5118959356623884, "learning_rate": 8.746843037295936e-06, "loss": 1.4449, "step": 1072 }, { "epoch": 0.9495575221238938, "grad_norm": 2.469543460888822, "learning_rate": 8.744520447454576e-06, "loss": 1.115, "step": 1073 }, { "epoch": 0.9504424778761061, "grad_norm": 1.6048423661436357, "learning_rate": 8.742196016248121e-06, "loss": 1.389, "step": 1074 }, { "epoch": 0.9513274336283186, "grad_norm": 1.443558583648265, "learning_rate": 8.739869744819609e-06, "loss": 1.2885, "step": 1075 }, { "epoch": 0.952212389380531, "grad_norm": 1.648243782051673, "learning_rate": 8.737541634312985e-06, "loss": 1.2955, "step": 1076 }, { "epoch": 0.9530973451327434, "grad_norm": 1.4720416153739864, "learning_rate": 8.735211685873092e-06, "loss": 0.9577, "step": 1077 }, { "epoch": 0.9539823008849557, "grad_norm": 1.451072015039017, "learning_rate": 8.732879900645682e-06, "loss": 1.1824, "step": 1078 }, { "epoch": 0.9548672566371681, "grad_norm": 1.4601659175504782, "learning_rate": 8.73054627977741e-06, "loss": 1.2014, "step": 1079 }, { "epoch": 0.9557522123893806, "grad_norm": 1.5420597145676846, "learning_rate": 8.728210824415829e-06, "loss": 1.1698, "step": 1080 }, { "epoch": 0.9566371681415929, "grad_norm": 1.6638006446016826, "learning_rate": 8.7258735357094e-06, "loss": 1.1003, "step": 1081 }, { "epoch": 0.9575221238938053, "grad_norm": 1.5184062769366171, "learning_rate": 8.723534414807483e-06, "loss": 1.1823, "step": 1082 }, { "epoch": 0.9584070796460177, "grad_norm": 1.2699844387998185, "learning_rate": 8.721193462860335e-06, "loss": 1.0028, "step": 1083 }, { "epoch": 0.95929203539823, "grad_norm": 1.7769901930230618, "learning_rate": 8.718850681019125e-06, "loss": 1.1588, "step": 1084 }, { "epoch": 0.9601769911504425, "grad_norm": 1.8998096626953906, "learning_rate": 8.716506070435906e-06, "loss": 0.907, "step": 1085 }, { "epoch": 0.9610619469026549, "grad_norm": 1.7682019996799954, "learning_rate": 8.714159632263644e-06, "loss": 1.0622, "step": 1086 }, { "epoch": 0.9619469026548673, "grad_norm": 1.427426201718087, "learning_rate": 8.711811367656194e-06, "loss": 1.3259, "step": 1087 }, { "epoch": 0.9628318584070796, "grad_norm": 2.4858510821311963, "learning_rate": 8.70946127776832e-06, "loss": 0.9874, "step": 1088 }, { "epoch": 0.963716814159292, "grad_norm": 1.5343511432516848, "learning_rate": 8.707109363755668e-06, "loss": 1.2266, "step": 1089 }, { "epoch": 0.9646017699115044, "grad_norm": 1.7385085647878038, "learning_rate": 8.704755626774796e-06, "loss": 1.2225, "step": 1090 }, { "epoch": 0.9654867256637168, "grad_norm": 1.399503193615026, "learning_rate": 8.702400067983152e-06, "loss": 1.0974, "step": 1091 }, { "epoch": 0.9663716814159292, "grad_norm": 1.6578279913826957, "learning_rate": 8.70004268853908e-06, "loss": 1.0394, "step": 1092 }, { "epoch": 0.9672566371681416, "grad_norm": 1.4674164571447519, "learning_rate": 8.697683489601816e-06, "loss": 1.0078, "step": 1093 }, { "epoch": 0.968141592920354, "grad_norm": 1.836592043884921, "learning_rate": 8.695322472331497e-06, "loss": 1.162, "step": 1094 }, { "epoch": 0.9690265486725663, "grad_norm": 1.7835380880576663, "learning_rate": 8.692959637889153e-06, "loss": 1.0626, "step": 1095 }, { "epoch": 0.9699115044247788, "grad_norm": 1.8239297798096323, "learning_rate": 8.690594987436705e-06, "loss": 1.0232, "step": 1096 }, { "epoch": 0.9707964601769912, "grad_norm": 1.4534643185313652, "learning_rate": 8.688228522136966e-06, "loss": 0.9988, "step": 1097 }, { "epoch": 0.9716814159292035, "grad_norm": 2.1289785534901564, "learning_rate": 8.685860243153645e-06, "loss": 1.2154, "step": 1098 }, { "epoch": 0.9725663716814159, "grad_norm": 2.06611517185568, "learning_rate": 8.683490151651342e-06, "loss": 1.115, "step": 1099 }, { "epoch": 0.9734513274336283, "grad_norm": 1.3873282660998183, "learning_rate": 8.681118248795548e-06, "loss": 1.062, "step": 1100 }, { "epoch": 0.9743362831858408, "grad_norm": 1.4355090744076857, "learning_rate": 8.678744535752643e-06, "loss": 0.9892, "step": 1101 }, { "epoch": 0.9752212389380531, "grad_norm": 2.0111816051539484, "learning_rate": 8.6763690136899e-06, "loss": 1.4361, "step": 1102 }, { "epoch": 0.9761061946902655, "grad_norm": 1.5574022250552269, "learning_rate": 8.673991683775477e-06, "loss": 1.1819, "step": 1103 }, { "epoch": 0.9769911504424779, "grad_norm": 2.583154146758275, "learning_rate": 8.671612547178428e-06, "loss": 0.9439, "step": 1104 }, { "epoch": 0.9778761061946902, "grad_norm": 1.693845734468008, "learning_rate": 8.66923160506869e-06, "loss": 1.0945, "step": 1105 }, { "epoch": 0.9787610619469026, "grad_norm": 1.757075823093965, "learning_rate": 8.666848858617091e-06, "loss": 1.3323, "step": 1106 }, { "epoch": 0.9796460176991151, "grad_norm": 2.865682595325627, "learning_rate": 8.664464308995342e-06, "loss": 1.1506, "step": 1107 }, { "epoch": 0.9805309734513274, "grad_norm": 2.030047826178392, "learning_rate": 8.662077957376045e-06, "loss": 1.2964, "step": 1108 }, { "epoch": 0.9814159292035398, "grad_norm": 1.5478440539984728, "learning_rate": 8.659689804932687e-06, "loss": 1.1147, "step": 1109 }, { "epoch": 0.9823008849557522, "grad_norm": 3.717751491099766, "learning_rate": 8.65729985283964e-06, "loss": 1.1081, "step": 1110 }, { "epoch": 0.9831858407079646, "grad_norm": 1.8588776024637987, "learning_rate": 8.65490810227216e-06, "loss": 1.2188, "step": 1111 }, { "epoch": 0.984070796460177, "grad_norm": 1.4591545869050324, "learning_rate": 8.652514554406388e-06, "loss": 1.1107, "step": 1112 }, { "epoch": 0.9849557522123894, "grad_norm": 1.3152966078856292, "learning_rate": 8.650119210419353e-06, "loss": 1.1714, "step": 1113 }, { "epoch": 0.9858407079646018, "grad_norm": 1.563488459106201, "learning_rate": 8.64772207148896e-06, "loss": 1.4679, "step": 1114 }, { "epoch": 0.9867256637168141, "grad_norm": 1.6150703908562196, "learning_rate": 8.645323138794002e-06, "loss": 1.0015, "step": 1115 }, { "epoch": 0.9876106194690265, "grad_norm": 1.8647883245529595, "learning_rate": 8.642922413514151e-06, "loss": 1.2099, "step": 1116 }, { "epoch": 0.988495575221239, "grad_norm": 1.354525412454188, "learning_rate": 8.640519896829963e-06, "loss": 1.2443, "step": 1117 }, { "epoch": 0.9893805309734514, "grad_norm": 1.5746305216114274, "learning_rate": 8.638115589922875e-06, "loss": 1.131, "step": 1118 }, { "epoch": 0.9902654867256637, "grad_norm": 1.4941522995106122, "learning_rate": 8.635709493975199e-06, "loss": 0.9781, "step": 1119 }, { "epoch": 0.9911504424778761, "grad_norm": 1.5969838346492242, "learning_rate": 8.633301610170136e-06, "loss": 1.2457, "step": 1120 }, { "epoch": 0.9920353982300885, "grad_norm": 6.25910664934728, "learning_rate": 8.630891939691756e-06, "loss": 1.2569, "step": 1121 }, { "epoch": 0.9929203539823008, "grad_norm": 1.438701824083638, "learning_rate": 8.628480483725016e-06, "loss": 1.2654, "step": 1122 }, { "epoch": 0.9938053097345133, "grad_norm": 1.7141710482316699, "learning_rate": 8.626067243455748e-06, "loss": 1.2187, "step": 1123 }, { "epoch": 0.9946902654867257, "grad_norm": 1.6525381075871957, "learning_rate": 8.62365222007066e-06, "loss": 1.2862, "step": 1124 }, { "epoch": 0.995575221238938, "grad_norm": 1.3677677123697178, "learning_rate": 8.621235414757337e-06, "loss": 1.1415, "step": 1125 }, { "epoch": 0.9964601769911504, "grad_norm": 1.5510774178742215, "learning_rate": 8.61881682870424e-06, "loss": 1.1281, "step": 1126 }, { "epoch": 0.9973451327433628, "grad_norm": 1.4104993804577821, "learning_rate": 8.616396463100709e-06, "loss": 1.1725, "step": 1127 }, { "epoch": 0.9982300884955753, "grad_norm": 1.7633191488890578, "learning_rate": 8.613974319136959e-06, "loss": 1.7731, "step": 1128 }, { "epoch": 0.9991150442477876, "grad_norm": 2.167715199834539, "learning_rate": 8.611550398004074e-06, "loss": 1.1655, "step": 1129 }, { "epoch": 1.0, "grad_norm": 1.2942679037044045, "learning_rate": 8.609124700894017e-06, "loss": 1.1078, "step": 1130 }, { "epoch": 1.0008849557522124, "grad_norm": 1.9315757115293954, "learning_rate": 8.606697228999621e-06, "loss": 1.3741, "step": 1131 }, { "epoch": 1.0008849557522124, "grad_norm": 1.8735705423973807, "learning_rate": 8.604267983514595e-06, "loss": 1.0446, "step": 1132 }, { "epoch": 1.0017699115044247, "grad_norm": 1.9481811123156147, "learning_rate": 8.60183696563352e-06, "loss": 1.1506, "step": 1133 }, { "epoch": 1.002654867256637, "grad_norm": 1.2186333743343443, "learning_rate": 8.599404176551843e-06, "loss": 1.0794, "step": 1134 }, { "epoch": 1.0035398230088495, "grad_norm": 1.2281114385444434, "learning_rate": 8.59696961746589e-06, "loss": 0.9693, "step": 1135 }, { "epoch": 1.0044247787610618, "grad_norm": 1.4735420026601707, "learning_rate": 8.594533289572852e-06, "loss": 1.1214, "step": 1136 }, { "epoch": 1.0053097345132744, "grad_norm": 1.7409612160912231, "learning_rate": 8.592095194070793e-06, "loss": 1.2016, "step": 1137 }, { "epoch": 1.0061946902654868, "grad_norm": 1.42668125945203, "learning_rate": 8.589655332158641e-06, "loss": 1.0528, "step": 1138 }, { "epoch": 1.0070796460176992, "grad_norm": 4.58386297814834, "learning_rate": 8.587213705036202e-06, "loss": 1.0809, "step": 1139 }, { "epoch": 1.0079646017699115, "grad_norm": 1.529876703170467, "learning_rate": 8.584770313904138e-06, "loss": 1.2797, "step": 1140 }, { "epoch": 1.008849557522124, "grad_norm": 1.4178377989920692, "learning_rate": 8.582325159963987e-06, "loss": 1.028, "step": 1141 }, { "epoch": 1.0097345132743363, "grad_norm": 1.2917507446010335, "learning_rate": 8.579878244418154e-06, "loss": 1.0796, "step": 1142 }, { "epoch": 1.0106194690265486, "grad_norm": 1.4171318885466975, "learning_rate": 8.577429568469906e-06, "loss": 1.0413, "step": 1143 }, { "epoch": 1.011504424778761, "grad_norm": 1.4598902878192066, "learning_rate": 8.574979133323378e-06, "loss": 1.1844, "step": 1144 }, { "epoch": 1.0123893805309734, "grad_norm": 1.2564419148209869, "learning_rate": 8.572526940183567e-06, "loss": 0.8819, "step": 1145 }, { "epoch": 1.0132743362831858, "grad_norm": 1.694183984843691, "learning_rate": 8.570072990256342e-06, "loss": 1.2813, "step": 1146 }, { "epoch": 1.0141592920353983, "grad_norm": 1.774848177371664, "learning_rate": 8.567617284748427e-06, "loss": 1.3895, "step": 1147 }, { "epoch": 1.0150442477876107, "grad_norm": 1.5307869782045258, "learning_rate": 8.565159824867415e-06, "loss": 0.8971, "step": 1148 }, { "epoch": 1.015929203539823, "grad_norm": 1.3820308271931172, "learning_rate": 8.562700611821761e-06, "loss": 1.1494, "step": 1149 }, { "epoch": 1.0168141592920354, "grad_norm": 1.4165109057370309, "learning_rate": 8.560239646820779e-06, "loss": 1.0644, "step": 1150 }, { "epoch": 1.0176991150442478, "grad_norm": 1.2213162232364927, "learning_rate": 8.557776931074646e-06, "loss": 0.9402, "step": 1151 }, { "epoch": 1.0185840707964602, "grad_norm": 1.5137016536926549, "learning_rate": 8.555312465794402e-06, "loss": 1.2138, "step": 1152 }, { "epoch": 1.0194690265486726, "grad_norm": 1.4894089073084267, "learning_rate": 8.552846252191949e-06, "loss": 1.0216, "step": 1153 }, { "epoch": 1.020353982300885, "grad_norm": 1.4955448694594908, "learning_rate": 8.550378291480041e-06, "loss": 1.0148, "step": 1154 }, { "epoch": 1.0212389380530973, "grad_norm": 1.5809240840305634, "learning_rate": 8.547908584872298e-06, "loss": 1.1678, "step": 1155 }, { "epoch": 1.0221238938053097, "grad_norm": 1.3094633445535968, "learning_rate": 8.545437133583195e-06, "loss": 1.1123, "step": 1156 }, { "epoch": 1.023008849557522, "grad_norm": 1.562241244307305, "learning_rate": 8.542963938828067e-06, "loss": 1.0316, "step": 1157 }, { "epoch": 1.0238938053097346, "grad_norm": 1.5544343257134727, "learning_rate": 8.540489001823109e-06, "loss": 0.8886, "step": 1158 }, { "epoch": 1.024778761061947, "grad_norm": 1.492430250152058, "learning_rate": 8.538012323785362e-06, "loss": 1.1243, "step": 1159 }, { "epoch": 1.0256637168141594, "grad_norm": 1.3578264893963468, "learning_rate": 8.535533905932739e-06, "loss": 1.3502, "step": 1160 }, { "epoch": 1.0265486725663717, "grad_norm": 1.2639741821032824, "learning_rate": 8.533053749483992e-06, "loss": 0.9283, "step": 1161 }, { "epoch": 1.027433628318584, "grad_norm": 1.2433871718430485, "learning_rate": 8.530571855658743e-06, "loss": 1.0816, "step": 1162 }, { "epoch": 1.0283185840707965, "grad_norm": 1.3283769809615835, "learning_rate": 8.528088225677457e-06, "loss": 1.0673, "step": 1163 }, { "epoch": 1.0292035398230088, "grad_norm": 1.3895733857639754, "learning_rate": 8.52560286076146e-06, "loss": 1.1293, "step": 1164 }, { "epoch": 1.0300884955752212, "grad_norm": 1.4642259468318144, "learning_rate": 8.523115762132925e-06, "loss": 1.2409, "step": 1165 }, { "epoch": 1.0309734513274336, "grad_norm": 1.3354897978159035, "learning_rate": 8.520626931014884e-06, "loss": 1.1753, "step": 1166 }, { "epoch": 1.031858407079646, "grad_norm": 1.4751083604960589, "learning_rate": 8.518136368631216e-06, "loss": 1.0054, "step": 1167 }, { "epoch": 1.0327433628318583, "grad_norm": 1.4215954877212185, "learning_rate": 8.515644076206652e-06, "loss": 1.1429, "step": 1168 }, { "epoch": 1.033628318584071, "grad_norm": 1.4156045287336048, "learning_rate": 8.513150054966778e-06, "loss": 0.991, "step": 1169 }, { "epoch": 1.0345132743362833, "grad_norm": 1.4703439444655786, "learning_rate": 8.510654306138028e-06, "loss": 1.1371, "step": 1170 }, { "epoch": 1.0353982300884956, "grad_norm": 1.678180634011109, "learning_rate": 8.50815683094768e-06, "loss": 1.1323, "step": 1171 }, { "epoch": 1.036283185840708, "grad_norm": 5.181517844841743, "learning_rate": 8.505657630623867e-06, "loss": 1.1037, "step": 1172 }, { "epoch": 1.0371681415929204, "grad_norm": 1.5340341893229916, "learning_rate": 8.503156706395572e-06, "loss": 1.0908, "step": 1173 }, { "epoch": 1.0380530973451327, "grad_norm": 1.4946978454271744, "learning_rate": 8.500654059492618e-06, "loss": 1.0349, "step": 1174 }, { "epoch": 1.038938053097345, "grad_norm": 1.516200480547998, "learning_rate": 8.498149691145685e-06, "loss": 1.4722, "step": 1175 }, { "epoch": 1.0398230088495575, "grad_norm": 1.1932466986209578, "learning_rate": 8.495643602586287e-06, "loss": 1.1527, "step": 1176 }, { "epoch": 1.0407079646017698, "grad_norm": 1.447778427969622, "learning_rate": 8.493135795046799e-06, "loss": 0.9693, "step": 1177 }, { "epoch": 1.0415929203539822, "grad_norm": 1.2853549643961202, "learning_rate": 8.49062626976043e-06, "loss": 1.0434, "step": 1178 }, { "epoch": 1.0424778761061946, "grad_norm": 1.7750662047353607, "learning_rate": 8.488115027961234e-06, "loss": 1.1121, "step": 1179 }, { "epoch": 1.0433628318584072, "grad_norm": 1.8479813744844802, "learning_rate": 8.485602070884118e-06, "loss": 1.399, "step": 1180 }, { "epoch": 1.0442477876106195, "grad_norm": 1.4048759530095989, "learning_rate": 8.48308739976482e-06, "loss": 1.1888, "step": 1181 }, { "epoch": 1.045132743362832, "grad_norm": 2.3481353492135293, "learning_rate": 8.480571015839935e-06, "loss": 1.0596, "step": 1182 }, { "epoch": 1.0460176991150443, "grad_norm": 1.7097432557248837, "learning_rate": 8.47805292034689e-06, "loss": 1.3475, "step": 1183 }, { "epoch": 1.0469026548672566, "grad_norm": 1.400355440860478, "learning_rate": 8.475533114523954e-06, "loss": 1.1691, "step": 1184 }, { "epoch": 1.047787610619469, "grad_norm": 1.5252678160315847, "learning_rate": 8.473011599610244e-06, "loss": 1.2697, "step": 1185 }, { "epoch": 1.0486725663716814, "grad_norm": 3.1557186107620656, "learning_rate": 8.470488376845709e-06, "loss": 1.3429, "step": 1186 }, { "epoch": 1.0495575221238937, "grad_norm": 2.052683884435358, "learning_rate": 8.467963447471144e-06, "loss": 1.3192, "step": 1187 }, { "epoch": 1.0504424778761061, "grad_norm": 1.4064265507873024, "learning_rate": 8.465436812728181e-06, "loss": 0.9541, "step": 1188 }, { "epoch": 1.0513274336283185, "grad_norm": 1.4915444229333392, "learning_rate": 8.462908473859289e-06, "loss": 1.0567, "step": 1189 }, { "epoch": 1.052212389380531, "grad_norm": 10.058957310685757, "learning_rate": 8.460378432107779e-06, "loss": 1.2617, "step": 1190 }, { "epoch": 1.0530973451327434, "grad_norm": 1.5473927607469151, "learning_rate": 8.457846688717797e-06, "loss": 1.0627, "step": 1191 }, { "epoch": 1.0539823008849558, "grad_norm": 2.3192217257573224, "learning_rate": 8.455313244934324e-06, "loss": 1.2097, "step": 1192 }, { "epoch": 1.0548672566371682, "grad_norm": 1.3434187767800752, "learning_rate": 8.45277810200318e-06, "loss": 1.1704, "step": 1193 }, { "epoch": 1.0557522123893806, "grad_norm": 1.2432410611449516, "learning_rate": 8.450241261171022e-06, "loss": 0.8951, "step": 1194 }, { "epoch": 1.056637168141593, "grad_norm": 1.4892116087002907, "learning_rate": 8.447702723685335e-06, "loss": 0.9337, "step": 1195 }, { "epoch": 1.0575221238938053, "grad_norm": 3.466196399335579, "learning_rate": 8.445162490794447e-06, "loss": 1.1455, "step": 1196 }, { "epoch": 1.0584070796460177, "grad_norm": 1.4138469297590115, "learning_rate": 8.442620563747512e-06, "loss": 1.1702, "step": 1197 }, { "epoch": 1.05929203539823, "grad_norm": 1.7924655682503081, "learning_rate": 8.440076943794523e-06, "loss": 1.1926, "step": 1198 }, { "epoch": 1.0601769911504424, "grad_norm": 1.3619382849272776, "learning_rate": 8.437531632186305e-06, "loss": 1.0752, "step": 1199 }, { "epoch": 1.0610619469026548, "grad_norm": 1.4139496532723836, "learning_rate": 8.43498463017451e-06, "loss": 1.1797, "step": 1200 }, { "epoch": 1.0619469026548674, "grad_norm": 1.1329959800003802, "learning_rate": 8.432435939011623e-06, "loss": 0.8463, "step": 1201 }, { "epoch": 1.0628318584070797, "grad_norm": 1.3887353440171488, "learning_rate": 8.429885559950965e-06, "loss": 1.0395, "step": 1202 }, { "epoch": 1.063716814159292, "grad_norm": 1.9402847773146525, "learning_rate": 8.42733349424668e-06, "loss": 0.9163, "step": 1203 }, { "epoch": 1.0646017699115045, "grad_norm": 1.3201467550482813, "learning_rate": 8.424779743153747e-06, "loss": 0.8354, "step": 1204 }, { "epoch": 1.0654867256637168, "grad_norm": 1.5421732191244408, "learning_rate": 8.422224307927967e-06, "loss": 1.4379, "step": 1205 }, { "epoch": 1.0663716814159292, "grad_norm": 1.4489091279540696, "learning_rate": 8.419667189825977e-06, "loss": 1.2621, "step": 1206 }, { "epoch": 1.0672566371681416, "grad_norm": 1.3137483051293515, "learning_rate": 8.417108390105238e-06, "loss": 1.1285, "step": 1207 }, { "epoch": 1.068141592920354, "grad_norm": 1.3873559796354729, "learning_rate": 8.414547910024035e-06, "loss": 1.1574, "step": 1208 }, { "epoch": 1.0690265486725663, "grad_norm": 1.5292097991962161, "learning_rate": 8.411985750841484e-06, "loss": 0.9494, "step": 1209 }, { "epoch": 1.0699115044247787, "grad_norm": 2.3128079020620578, "learning_rate": 8.409421913817526e-06, "loss": 0.9108, "step": 1210 }, { "epoch": 1.0707964601769913, "grad_norm": 3.6811682216979658, "learning_rate": 8.406856400212924e-06, "loss": 1.1306, "step": 1211 }, { "epoch": 1.0716814159292036, "grad_norm": 2.6990956602997667, "learning_rate": 8.404289211289267e-06, "loss": 1.1911, "step": 1212 }, { "epoch": 1.072566371681416, "grad_norm": 1.1831468333763722, "learning_rate": 8.40172034830897e-06, "loss": 1.0962, "step": 1213 }, { "epoch": 1.0734513274336284, "grad_norm": 1.822304373767993, "learning_rate": 8.399149812535269e-06, "loss": 1.2997, "step": 1214 }, { "epoch": 1.0743362831858407, "grad_norm": 1.5070799077916055, "learning_rate": 8.396577605232221e-06, "loss": 1.2389, "step": 1215 }, { "epoch": 1.075221238938053, "grad_norm": 1.640115095771238, "learning_rate": 8.39400372766471e-06, "loss": 0.9315, "step": 1216 }, { "epoch": 1.0761061946902655, "grad_norm": 3.466821411652683, "learning_rate": 8.391428181098435e-06, "loss": 1.2462, "step": 1217 }, { "epoch": 1.0769911504424778, "grad_norm": 1.1671906761031687, "learning_rate": 8.388850966799921e-06, "loss": 0.9571, "step": 1218 }, { "epoch": 1.0778761061946902, "grad_norm": 1.2480812129597987, "learning_rate": 8.386272086036514e-06, "loss": 0.8617, "step": 1219 }, { "epoch": 1.0787610619469026, "grad_norm": 1.3544244129088803, "learning_rate": 8.383691540076372e-06, "loss": 1.1409, "step": 1220 }, { "epoch": 1.079646017699115, "grad_norm": 1.3207701815304922, "learning_rate": 8.381109330188479e-06, "loss": 1.1921, "step": 1221 }, { "epoch": 1.0805309734513275, "grad_norm": 1.7350177480680098, "learning_rate": 8.378525457642633e-06, "loss": 0.9841, "step": 1222 }, { "epoch": 1.08141592920354, "grad_norm": 1.3878519450439268, "learning_rate": 8.375939923709453e-06, "loss": 1.0014, "step": 1223 }, { "epoch": 1.0823008849557523, "grad_norm": 1.416777429785744, "learning_rate": 8.373352729660373e-06, "loss": 1.1045, "step": 1224 }, { "epoch": 1.0831858407079646, "grad_norm": 2.784440007471356, "learning_rate": 8.370763876767642e-06, "loss": 0.9897, "step": 1225 }, { "epoch": 1.084070796460177, "grad_norm": 1.2955845398652888, "learning_rate": 8.36817336630433e-06, "loss": 0.9658, "step": 1226 }, { "epoch": 1.0849557522123894, "grad_norm": 1.6266812590349786, "learning_rate": 8.365581199544316e-06, "loss": 1.061, "step": 1227 }, { "epoch": 1.0858407079646017, "grad_norm": 6.240675139213699, "learning_rate": 8.362987377762295e-06, "loss": 1.1781, "step": 1228 }, { "epoch": 1.0867256637168141, "grad_norm": 2.1530372719029662, "learning_rate": 8.36039190223378e-06, "loss": 1.3681, "step": 1229 }, { "epoch": 1.0876106194690265, "grad_norm": 1.2366914951808703, "learning_rate": 8.357794774235094e-06, "loss": 1.1912, "step": 1230 }, { "epoch": 1.0884955752212389, "grad_norm": 1.4202611277135024, "learning_rate": 8.355195995043368e-06, "loss": 1.1054, "step": 1231 }, { "epoch": 1.0893805309734512, "grad_norm": 1.4459053326666613, "learning_rate": 8.352595565936554e-06, "loss": 1.0265, "step": 1232 }, { "epoch": 1.0902654867256638, "grad_norm": 1.8976365001298074, "learning_rate": 8.349993488193411e-06, "loss": 1.0343, "step": 1233 }, { "epoch": 1.0911504424778762, "grad_norm": 1.4969581139344024, "learning_rate": 8.347389763093507e-06, "loss": 1.101, "step": 1234 }, { "epoch": 1.0920353982300885, "grad_norm": 1.183514661040638, "learning_rate": 8.344784391917225e-06, "loss": 0.8602, "step": 1235 }, { "epoch": 1.092920353982301, "grad_norm": 1.2029259158346, "learning_rate": 8.34217737594575e-06, "loss": 1.1947, "step": 1236 }, { "epoch": 1.0938053097345133, "grad_norm": 1.3726256234854872, "learning_rate": 8.339568716461082e-06, "loss": 1.1941, "step": 1237 }, { "epoch": 1.0946902654867257, "grad_norm": 1.4716628187874725, "learning_rate": 8.336958414746027e-06, "loss": 1.0715, "step": 1238 }, { "epoch": 1.095575221238938, "grad_norm": 1.1164105869793306, "learning_rate": 8.3343464720842e-06, "loss": 1.0312, "step": 1239 }, { "epoch": 1.0964601769911504, "grad_norm": 1.1263250008263548, "learning_rate": 8.331732889760021e-06, "loss": 0.9111, "step": 1240 }, { "epoch": 1.0973451327433628, "grad_norm": 1.2761301358851804, "learning_rate": 8.329117669058717e-06, "loss": 1.0782, "step": 1241 }, { "epoch": 1.0982300884955751, "grad_norm": 1.8414607343157219, "learning_rate": 8.326500811266321e-06, "loss": 1.0608, "step": 1242 }, { "epoch": 1.0991150442477875, "grad_norm": 1.2427020409438583, "learning_rate": 8.323882317669672e-06, "loss": 1.1103, "step": 1243 }, { "epoch": 1.1, "grad_norm": 1.4408628776601284, "learning_rate": 8.32126218955641e-06, "loss": 1.1761, "step": 1244 }, { "epoch": 1.1008849557522125, "grad_norm": 1.452901811721267, "learning_rate": 8.318640428214982e-06, "loss": 0.9281, "step": 1245 }, { "epoch": 1.1017699115044248, "grad_norm": 1.2723277488774214, "learning_rate": 8.316017034934638e-06, "loss": 1.1891, "step": 1246 }, { "epoch": 1.1026548672566372, "grad_norm": 1.4010051970729813, "learning_rate": 8.313392011005428e-06, "loss": 1.1779, "step": 1247 }, { "epoch": 1.1035398230088496, "grad_norm": 1.521662230203516, "learning_rate": 8.310765357718207e-06, "loss": 0.9919, "step": 1248 }, { "epoch": 1.104424778761062, "grad_norm": 4.376888040877593, "learning_rate": 8.30813707636463e-06, "loss": 1.1065, "step": 1249 }, { "epoch": 1.1053097345132743, "grad_norm": 1.4474385911925391, "learning_rate": 8.305507168237152e-06, "loss": 1.1021, "step": 1250 }, { "epoch": 1.1061946902654867, "grad_norm": 1.2397555624130792, "learning_rate": 8.302875634629027e-06, "loss": 1.1087, "step": 1251 }, { "epoch": 1.107079646017699, "grad_norm": 1.748307285836397, "learning_rate": 8.30024247683431e-06, "loss": 1.2028, "step": 1252 }, { "epoch": 1.1079646017699114, "grad_norm": 1.5840160670005883, "learning_rate": 8.297607696147855e-06, "loss": 1.038, "step": 1253 }, { "epoch": 1.108849557522124, "grad_norm": 1.5482904428573159, "learning_rate": 8.294971293865315e-06, "loss": 1.0613, "step": 1254 }, { "epoch": 1.1097345132743364, "grad_norm": 1.4067094784470229, "learning_rate": 8.292333271283135e-06, "loss": 1.0274, "step": 1255 }, { "epoch": 1.1106194690265487, "grad_norm": 2.129408019269671, "learning_rate": 8.289693629698564e-06, "loss": 1.0657, "step": 1256 }, { "epoch": 1.111504424778761, "grad_norm": 1.6213597760137162, "learning_rate": 8.287052370409643e-06, "loss": 1.0193, "step": 1257 }, { "epoch": 1.1123893805309735, "grad_norm": 1.6449303676986609, "learning_rate": 8.284409494715208e-06, "loss": 1.1346, "step": 1258 }, { "epoch": 1.1132743362831858, "grad_norm": 2.17643143254864, "learning_rate": 8.281765003914892e-06, "loss": 1.422, "step": 1259 }, { "epoch": 1.1141592920353982, "grad_norm": 1.303453299019362, "learning_rate": 8.279118899309121e-06, "loss": 1.227, "step": 1260 }, { "epoch": 1.1150442477876106, "grad_norm": 1.2856806833209642, "learning_rate": 8.276471182199115e-06, "loss": 1.1456, "step": 1261 }, { "epoch": 1.115929203539823, "grad_norm": 1.7731619084277643, "learning_rate": 8.273821853886888e-06, "loss": 0.9737, "step": 1262 }, { "epoch": 1.1168141592920353, "grad_norm": 1.6368315588173794, "learning_rate": 8.271170915675245e-06, "loss": 1.1753, "step": 1263 }, { "epoch": 1.117699115044248, "grad_norm": 1.4369808391790835, "learning_rate": 8.268518368867781e-06, "loss": 0.9967, "step": 1264 }, { "epoch": 1.1185840707964603, "grad_norm": 1.3232478665343992, "learning_rate": 8.265864214768883e-06, "loss": 0.8891, "step": 1265 }, { "epoch": 1.1194690265486726, "grad_norm": 1.286953291732408, "learning_rate": 8.263208454683735e-06, "loss": 1.1399, "step": 1266 }, { "epoch": 1.120353982300885, "grad_norm": 2.6695847669279775, "learning_rate": 8.2605510899183e-06, "loss": 0.9601, "step": 1267 }, { "epoch": 1.1212389380530974, "grad_norm": 1.3072175781952347, "learning_rate": 8.257892121779335e-06, "loss": 0.9519, "step": 1268 }, { "epoch": 1.1221238938053097, "grad_norm": 1.5312078642568938, "learning_rate": 8.255231551574389e-06, "loss": 1.1336, "step": 1269 }, { "epoch": 1.1230088495575221, "grad_norm": 1.607077371725657, "learning_rate": 8.252569380611793e-06, "loss": 1.1381, "step": 1270 }, { "epoch": 1.1238938053097345, "grad_norm": 1.3157451863398923, "learning_rate": 8.249905610200666e-06, "loss": 1.0913, "step": 1271 }, { "epoch": 1.1247787610619469, "grad_norm": 1.5686504635087053, "learning_rate": 8.247240241650918e-06, "loss": 1.0421, "step": 1272 }, { "epoch": 1.1256637168141592, "grad_norm": 2.063005490076947, "learning_rate": 8.244573276273242e-06, "loss": 1.1511, "step": 1273 }, { "epoch": 1.1265486725663716, "grad_norm": 1.3399252894061215, "learning_rate": 8.241904715379115e-06, "loss": 1.1826, "step": 1274 }, { "epoch": 1.1274336283185842, "grad_norm": 1.4290120673864657, "learning_rate": 8.2392345602808e-06, "loss": 1.4211, "step": 1275 }, { "epoch": 1.1283185840707965, "grad_norm": 1.4927336170344483, "learning_rate": 8.236562812291341e-06, "loss": 1.4905, "step": 1276 }, { "epoch": 1.129203539823009, "grad_norm": 4.883001399876412, "learning_rate": 8.233889472724574e-06, "loss": 1.1722, "step": 1277 }, { "epoch": 1.1300884955752213, "grad_norm": 1.2995956706964644, "learning_rate": 8.231214542895107e-06, "loss": 0.9784, "step": 1278 }, { "epoch": 1.1309734513274337, "grad_norm": 1.3395099553653786, "learning_rate": 8.228538024118338e-06, "loss": 1.0036, "step": 1279 }, { "epoch": 1.131858407079646, "grad_norm": 1.3858503265737778, "learning_rate": 8.22585991771044e-06, "loss": 1.2133, "step": 1280 }, { "epoch": 1.1327433628318584, "grad_norm": 1.3645493943830829, "learning_rate": 8.22318022498837e-06, "loss": 1.2018, "step": 1281 }, { "epoch": 1.1336283185840708, "grad_norm": 1.2385947602043845, "learning_rate": 8.220498947269866e-06, "loss": 0.9934, "step": 1282 }, { "epoch": 1.1345132743362831, "grad_norm": 1.3000745581084525, "learning_rate": 8.217816085873445e-06, "loss": 0.8514, "step": 1283 }, { "epoch": 1.1353982300884955, "grad_norm": 1.4484341090283783, "learning_rate": 8.215131642118401e-06, "loss": 1.137, "step": 1284 }, { "epoch": 1.1362831858407079, "grad_norm": 1.23380887971531, "learning_rate": 8.21244561732481e-06, "loss": 1.0806, "step": 1285 }, { "epoch": 1.1371681415929205, "grad_norm": 6.953686653826788, "learning_rate": 8.209758012813515e-06, "loss": 1.3819, "step": 1286 }, { "epoch": 1.1380530973451328, "grad_norm": 1.5044728061030737, "learning_rate": 8.207068829906153e-06, "loss": 0.8469, "step": 1287 }, { "epoch": 1.1389380530973452, "grad_norm": 1.30080517573963, "learning_rate": 8.204378069925121e-06, "loss": 1.325, "step": 1288 }, { "epoch": 1.1398230088495576, "grad_norm": 1.5875474985469469, "learning_rate": 8.201685734193602e-06, "loss": 1.3103, "step": 1289 }, { "epoch": 1.14070796460177, "grad_norm": 1.7005153923120897, "learning_rate": 8.198991824035546e-06, "loss": 0.9821, "step": 1290 }, { "epoch": 1.1415929203539823, "grad_norm": 2.03721533667952, "learning_rate": 8.196296340775687e-06, "loss": 1.3259, "step": 1291 }, { "epoch": 1.1424778761061947, "grad_norm": 1.5336693684436054, "learning_rate": 8.193599285739522e-06, "loss": 1.1492, "step": 1292 }, { "epoch": 1.143362831858407, "grad_norm": 1.4465616159426342, "learning_rate": 8.190900660253327e-06, "loss": 1.187, "step": 1293 }, { "epoch": 1.1442477876106194, "grad_norm": 1.4458879587822457, "learning_rate": 8.18820046564415e-06, "loss": 1.2523, "step": 1294 }, { "epoch": 1.1451327433628318, "grad_norm": 2.4241765475559047, "learning_rate": 8.185498703239808e-06, "loss": 1.2037, "step": 1295 }, { "epoch": 1.1460176991150441, "grad_norm": 1.3808439996018236, "learning_rate": 8.182795374368893e-06, "loss": 1.0265, "step": 1296 }, { "epoch": 1.1469026548672567, "grad_norm": 1.14894743491785, "learning_rate": 8.180090480360764e-06, "loss": 1.0778, "step": 1297 }, { "epoch": 1.147787610619469, "grad_norm": 1.7284862162355228, "learning_rate": 8.177384022545549e-06, "loss": 1.242, "step": 1298 }, { "epoch": 1.1486725663716815, "grad_norm": 2.068911356712327, "learning_rate": 8.174676002254149e-06, "loss": 1.0094, "step": 1299 }, { "epoch": 1.1495575221238938, "grad_norm": 1.3682121797738154, "learning_rate": 8.171966420818227e-06, "loss": 0.7969, "step": 1300 }, { "epoch": 1.1504424778761062, "grad_norm": 1.3886565297288254, "learning_rate": 8.169255279570223e-06, "loss": 1.1013, "step": 1301 }, { "epoch": 1.1513274336283186, "grad_norm": 1.789214882428885, "learning_rate": 8.166542579843337e-06, "loss": 1.0734, "step": 1302 }, { "epoch": 1.152212389380531, "grad_norm": 1.5176778069924544, "learning_rate": 8.163828322971533e-06, "loss": 1.2711, "step": 1303 }, { "epoch": 1.1530973451327433, "grad_norm": 1.3470829397626622, "learning_rate": 8.16111251028955e-06, "loss": 1.2986, "step": 1304 }, { "epoch": 1.1539823008849557, "grad_norm": 1.581361714745579, "learning_rate": 8.158395143132882e-06, "loss": 1.0943, "step": 1305 }, { "epoch": 1.154867256637168, "grad_norm": 4.252397739138023, "learning_rate": 8.155676222837799e-06, "loss": 0.9309, "step": 1306 }, { "epoch": 1.1557522123893804, "grad_norm": 1.5283470798556804, "learning_rate": 8.15295575074132e-06, "loss": 1.0757, "step": 1307 }, { "epoch": 1.156637168141593, "grad_norm": 1.40293119753076, "learning_rate": 8.15023372818124e-06, "loss": 1.1722, "step": 1308 }, { "epoch": 1.1575221238938054, "grad_norm": 1.5044114007482372, "learning_rate": 8.14751015649611e-06, "loss": 1.2169, "step": 1309 }, { "epoch": 1.1584070796460177, "grad_norm": 1.4374489807448494, "learning_rate": 8.144785037025246e-06, "loss": 1.1481, "step": 1310 }, { "epoch": 1.1592920353982301, "grad_norm": 1.3552833892317093, "learning_rate": 8.142058371108724e-06, "loss": 1.1417, "step": 1311 }, { "epoch": 1.1601769911504425, "grad_norm": 1.4415955160861216, "learning_rate": 8.139330160087374e-06, "loss": 1.1899, "step": 1312 }, { "epoch": 1.1610619469026549, "grad_norm": 2.201609019857072, "learning_rate": 8.136600405302796e-06, "loss": 1.123, "step": 1313 }, { "epoch": 1.1619469026548672, "grad_norm": 1.1785615780303735, "learning_rate": 8.133869108097349e-06, "loss": 1.0643, "step": 1314 }, { "epoch": 1.1628318584070796, "grad_norm": 1.5243807999294388, "learning_rate": 8.131136269814139e-06, "loss": 1.2695, "step": 1315 }, { "epoch": 1.163716814159292, "grad_norm": 1.4054673547365695, "learning_rate": 8.12840189179704e-06, "loss": 0.9965, "step": 1316 }, { "epoch": 1.1646017699115045, "grad_norm": 1.2248758813708776, "learning_rate": 8.125665975390682e-06, "loss": 0.9835, "step": 1317 }, { "epoch": 1.1654867256637167, "grad_norm": 2.6551346808087257, "learning_rate": 8.122928521940448e-06, "loss": 1.0752, "step": 1318 }, { "epoch": 1.1663716814159293, "grad_norm": 1.3353601765497374, "learning_rate": 8.120189532792478e-06, "loss": 1.0975, "step": 1319 }, { "epoch": 1.1672566371681417, "grad_norm": 1.4572555543910926, "learning_rate": 8.117449009293668e-06, "loss": 1.1907, "step": 1320 }, { "epoch": 1.168141592920354, "grad_norm": 1.4162220125351708, "learning_rate": 8.11470695279167e-06, "loss": 0.9835, "step": 1321 }, { "epoch": 1.1690265486725664, "grad_norm": 1.8466722467172658, "learning_rate": 8.111963364634889e-06, "loss": 0.9579, "step": 1322 }, { "epoch": 1.1699115044247788, "grad_norm": 1.3298520126500197, "learning_rate": 8.109218246172478e-06, "loss": 0.9559, "step": 1323 }, { "epoch": 1.1707964601769911, "grad_norm": 1.512114499086275, "learning_rate": 8.106471598754353e-06, "loss": 1.1153, "step": 1324 }, { "epoch": 1.1716814159292035, "grad_norm": 1.205456460857661, "learning_rate": 8.103723423731169e-06, "loss": 0.8799, "step": 1325 }, { "epoch": 1.1725663716814159, "grad_norm": 1.4549796384778826, "learning_rate": 8.100973722454344e-06, "loss": 1.0358, "step": 1326 }, { "epoch": 1.1734513274336282, "grad_norm": 1.2592223195146188, "learning_rate": 8.09822249627604e-06, "loss": 1.0229, "step": 1327 }, { "epoch": 1.1743362831858408, "grad_norm": 1.2932977040734883, "learning_rate": 8.095469746549172e-06, "loss": 1.2278, "step": 1328 }, { "epoch": 1.1752212389380532, "grad_norm": 2.370710318046772, "learning_rate": 8.092715474627398e-06, "loss": 1.5416, "step": 1329 }, { "epoch": 1.1761061946902656, "grad_norm": 1.4764517203693046, "learning_rate": 8.089959681865134e-06, "loss": 1.2038, "step": 1330 }, { "epoch": 1.176991150442478, "grad_norm": 1.5755483907585954, "learning_rate": 8.087202369617534e-06, "loss": 1.1598, "step": 1331 }, { "epoch": 1.1778761061946903, "grad_norm": 1.7103795723713184, "learning_rate": 8.084443539240511e-06, "loss": 1.3691, "step": 1332 }, { "epoch": 1.1787610619469027, "grad_norm": 1.5440085228726423, "learning_rate": 8.081683192090714e-06, "loss": 1.3772, "step": 1333 }, { "epoch": 1.179646017699115, "grad_norm": 2.4034737212438326, "learning_rate": 8.07892132952554e-06, "loss": 1.4134, "step": 1334 }, { "epoch": 1.1805309734513274, "grad_norm": 1.3681240316273817, "learning_rate": 8.076157952903134e-06, "loss": 1.087, "step": 1335 }, { "epoch": 1.1814159292035398, "grad_norm": 1.4522769749726032, "learning_rate": 8.073393063582386e-06, "loss": 1.4476, "step": 1336 }, { "epoch": 1.1823008849557521, "grad_norm": 1.2229664750295213, "learning_rate": 8.070626662922928e-06, "loss": 1.0737, "step": 1337 }, { "epoch": 1.1831858407079645, "grad_norm": 1.4893151151282742, "learning_rate": 8.067858752285134e-06, "loss": 1.2674, "step": 1338 }, { "epoch": 1.184070796460177, "grad_norm": 1.3607544919888095, "learning_rate": 8.06508933303012e-06, "loss": 1.2356, "step": 1339 }, { "epoch": 1.1849557522123895, "grad_norm": 1.5891302622262882, "learning_rate": 8.062318406519751e-06, "loss": 1.1787, "step": 1340 }, { "epoch": 1.1858407079646018, "grad_norm": 1.6224077597698057, "learning_rate": 8.059545974116625e-06, "loss": 0.8735, "step": 1341 }, { "epoch": 1.1867256637168142, "grad_norm": 2.310094487498882, "learning_rate": 8.056772037184083e-06, "loss": 1.1513, "step": 1342 }, { "epoch": 1.1876106194690266, "grad_norm": 1.5554585125120806, "learning_rate": 8.053996597086208e-06, "loss": 0.9165, "step": 1343 }, { "epoch": 1.188495575221239, "grad_norm": 1.3059885109139053, "learning_rate": 8.051219655187818e-06, "loss": 1.0873, "step": 1344 }, { "epoch": 1.1893805309734513, "grad_norm": 1.5242859215899003, "learning_rate": 8.048441212854477e-06, "loss": 1.2407, "step": 1345 }, { "epoch": 1.1902654867256637, "grad_norm": 1.5294230836893858, "learning_rate": 8.045661271452475e-06, "loss": 1.0441, "step": 1346 }, { "epoch": 1.191150442477876, "grad_norm": 1.390831172624902, "learning_rate": 8.042879832348853e-06, "loss": 1.332, "step": 1347 }, { "epoch": 1.1920353982300884, "grad_norm": 1.1625643210218315, "learning_rate": 8.040096896911379e-06, "loss": 0.9526, "step": 1348 }, { "epoch": 1.1929203539823008, "grad_norm": 1.3942692536207837, "learning_rate": 8.037312466508555e-06, "loss": 1.4095, "step": 1349 }, { "epoch": 1.1938053097345134, "grad_norm": 1.7796669809861494, "learning_rate": 8.034526542509629e-06, "loss": 1.2139, "step": 1350 }, { "epoch": 1.1946902654867257, "grad_norm": 1.728669185365813, "learning_rate": 8.031739126284574e-06, "loss": 1.1251, "step": 1351 }, { "epoch": 1.1955752212389381, "grad_norm": 1.272634246791433, "learning_rate": 8.0289502192041e-06, "loss": 0.9637, "step": 1352 }, { "epoch": 1.1964601769911505, "grad_norm": 1.2882346618392473, "learning_rate": 8.02615982263965e-06, "loss": 0.8035, "step": 1353 }, { "epoch": 1.1973451327433628, "grad_norm": 1.78271092819043, "learning_rate": 8.0233679379634e-06, "loss": 1.5801, "step": 1354 }, { "epoch": 1.1982300884955752, "grad_norm": 1.580585588111612, "learning_rate": 8.020574566548258e-06, "loss": 1.0809, "step": 1355 }, { "epoch": 1.1991150442477876, "grad_norm": 1.3328442000258705, "learning_rate": 8.017779709767857e-06, "loss": 1.1868, "step": 1356 }, { "epoch": 1.2, "grad_norm": 1.3768515831284562, "learning_rate": 8.014983368996573e-06, "loss": 1.0087, "step": 1357 }, { "epoch": 1.2008849557522123, "grad_norm": 1.946445019059083, "learning_rate": 8.0121855456095e-06, "loss": 1.3371, "step": 1358 }, { "epoch": 1.2017699115044247, "grad_norm": 1.439368928740964, "learning_rate": 8.009386240982468e-06, "loss": 1.2597, "step": 1359 }, { "epoch": 1.202654867256637, "grad_norm": 1.5102292408169598, "learning_rate": 8.00658545649203e-06, "loss": 1.1381, "step": 1360 }, { "epoch": 1.2035398230088497, "grad_norm": 1.4579664219451312, "learning_rate": 8.003783193515473e-06, "loss": 1.1067, "step": 1361 }, { "epoch": 1.204424778761062, "grad_norm": 1.649689953711718, "learning_rate": 8.000979453430804e-06, "loss": 1.128, "step": 1362 }, { "epoch": 1.2053097345132744, "grad_norm": 1.2684448247306286, "learning_rate": 7.998174237616763e-06, "loss": 1.1242, "step": 1363 }, { "epoch": 1.2061946902654868, "grad_norm": 1.3901877512760807, "learning_rate": 7.995367547452811e-06, "loss": 0.9737, "step": 1364 }, { "epoch": 1.2070796460176991, "grad_norm": 1.5608056936734203, "learning_rate": 7.992559384319138e-06, "loss": 1.2694, "step": 1365 }, { "epoch": 1.2079646017699115, "grad_norm": 1.9533541506793466, "learning_rate": 7.989749749596653e-06, "loss": 0.9398, "step": 1366 }, { "epoch": 1.2088495575221239, "grad_norm": 1.5854350444922967, "learning_rate": 7.986938644666995e-06, "loss": 1.1261, "step": 1367 }, { "epoch": 1.2097345132743362, "grad_norm": 1.3817595774323013, "learning_rate": 7.984126070912519e-06, "loss": 0.9984, "step": 1368 }, { "epoch": 1.2106194690265486, "grad_norm": 2.1425460136621197, "learning_rate": 7.98131202971631e-06, "loss": 1.1374, "step": 1369 }, { "epoch": 1.211504424778761, "grad_norm": 1.464318332110233, "learning_rate": 7.978496522462167e-06, "loss": 1.2819, "step": 1370 }, { "epoch": 1.2123893805309733, "grad_norm": 1.4553212368856787, "learning_rate": 7.975679550534617e-06, "loss": 1.1348, "step": 1371 }, { "epoch": 1.213274336283186, "grad_norm": 1.7206629394680624, "learning_rate": 7.972861115318902e-06, "loss": 1.0113, "step": 1372 }, { "epoch": 1.2141592920353983, "grad_norm": 1.4810156930409042, "learning_rate": 7.970041218200983e-06, "loss": 1.0915, "step": 1373 }, { "epoch": 1.2150442477876107, "grad_norm": 1.2805354882059816, "learning_rate": 7.967219860567547e-06, "loss": 0.9533, "step": 1374 }, { "epoch": 1.215929203539823, "grad_norm": 1.2162780730562361, "learning_rate": 7.964397043805994e-06, "loss": 1.1239, "step": 1375 }, { "epoch": 1.2168141592920354, "grad_norm": 1.28238995410991, "learning_rate": 7.961572769304437e-06, "loss": 1.2045, "step": 1376 }, { "epoch": 1.2176991150442478, "grad_norm": 1.4929472268504422, "learning_rate": 7.958747038451715e-06, "loss": 1.2669, "step": 1377 }, { "epoch": 1.2185840707964601, "grad_norm": 1.3844598449508043, "learning_rate": 7.95591985263738e-06, "loss": 1.0178, "step": 1378 }, { "epoch": 1.2194690265486725, "grad_norm": 1.4350272526619456, "learning_rate": 7.953091213251694e-06, "loss": 1.2206, "step": 1379 }, { "epoch": 1.2203539823008849, "grad_norm": 1.4633907790598224, "learning_rate": 7.950261121685642e-06, "loss": 1.1236, "step": 1380 }, { "epoch": 1.2212389380530975, "grad_norm": 1.3320711031442702, "learning_rate": 7.947429579330917e-06, "loss": 1.0588, "step": 1381 }, { "epoch": 1.2221238938053096, "grad_norm": 1.3908083416018187, "learning_rate": 7.944596587579931e-06, "loss": 1.2438, "step": 1382 }, { "epoch": 1.2230088495575222, "grad_norm": 1.482277615365509, "learning_rate": 7.941762147825803e-06, "loss": 1.088, "step": 1383 }, { "epoch": 1.2238938053097346, "grad_norm": 1.6246009618514095, "learning_rate": 7.938926261462366e-06, "loss": 1.1984, "step": 1384 }, { "epoch": 1.224778761061947, "grad_norm": 1.2227978650062565, "learning_rate": 7.936088929884168e-06, "loss": 1.1862, "step": 1385 }, { "epoch": 1.2256637168141593, "grad_norm": 2.291639592354343, "learning_rate": 7.933250154486462e-06, "loss": 0.9939, "step": 1386 }, { "epoch": 1.2265486725663717, "grad_norm": 1.8938530393978268, "learning_rate": 7.930409936665214e-06, "loss": 1.0161, "step": 1387 }, { "epoch": 1.227433628318584, "grad_norm": 1.291590291542015, "learning_rate": 7.927568277817099e-06, "loss": 0.7592, "step": 1388 }, { "epoch": 1.2283185840707964, "grad_norm": 1.2179883404608471, "learning_rate": 7.924725179339501e-06, "loss": 1.0151, "step": 1389 }, { "epoch": 1.2292035398230088, "grad_norm": 1.307211662914542, "learning_rate": 7.921880642630514e-06, "loss": 1.196, "step": 1390 }, { "epoch": 1.2300884955752212, "grad_norm": 2.0939389456727318, "learning_rate": 7.919034669088933e-06, "loss": 1.1765, "step": 1391 }, { "epoch": 1.2309734513274337, "grad_norm": 1.4755646167390906, "learning_rate": 7.916187260114264e-06, "loss": 1.1107, "step": 1392 }, { "epoch": 1.2318584070796461, "grad_norm": 2.0497143881119424, "learning_rate": 7.913338417106718e-06, "loss": 1.1566, "step": 1393 }, { "epoch": 1.2327433628318585, "grad_norm": 1.6179876336026373, "learning_rate": 7.910488141467215e-06, "loss": 1.2042, "step": 1394 }, { "epoch": 1.2336283185840708, "grad_norm": 1.1989379284458086, "learning_rate": 7.90763643459737e-06, "loss": 1.0192, "step": 1395 }, { "epoch": 1.2345132743362832, "grad_norm": 1.6133593124602392, "learning_rate": 7.904783297899515e-06, "loss": 1.0269, "step": 1396 }, { "epoch": 1.2353982300884956, "grad_norm": 1.4732164133647578, "learning_rate": 7.90192873277667e-06, "loss": 1.1692, "step": 1397 }, { "epoch": 1.236283185840708, "grad_norm": 1.9182813231982363, "learning_rate": 7.89907274063257e-06, "loss": 1.1794, "step": 1398 }, { "epoch": 1.2371681415929203, "grad_norm": 1.728573560673791, "learning_rate": 7.896215322871646e-06, "loss": 1.1609, "step": 1399 }, { "epoch": 1.2380530973451327, "grad_norm": 1.425547905825057, "learning_rate": 7.89335648089903e-06, "loss": 1.2639, "step": 1400 }, { "epoch": 1.238938053097345, "grad_norm": 1.4446290414056948, "learning_rate": 7.890496216120557e-06, "loss": 1.1991, "step": 1401 }, { "epoch": 1.2398230088495574, "grad_norm": 1.4183438362481193, "learning_rate": 7.887634529942759e-06, "loss": 1.1727, "step": 1402 }, { "epoch": 1.24070796460177, "grad_norm": 1.3605521569728962, "learning_rate": 7.884771423772867e-06, "loss": 1.0269, "step": 1403 }, { "epoch": 1.2415929203539824, "grad_norm": 1.879576472276567, "learning_rate": 7.881906899018815e-06, "loss": 1.3916, "step": 1404 }, { "epoch": 1.2424778761061948, "grad_norm": 1.5231689410192661, "learning_rate": 7.879040957089229e-06, "loss": 1.1741, "step": 1405 }, { "epoch": 1.2433628318584071, "grad_norm": 1.5098788814985589, "learning_rate": 7.87617359939343e-06, "loss": 1.2839, "step": 1406 }, { "epoch": 1.2442477876106195, "grad_norm": 1.7159412518648294, "learning_rate": 7.873304827341444e-06, "loss": 1.2737, "step": 1407 }, { "epoch": 1.2451327433628319, "grad_norm": 1.7480298272716037, "learning_rate": 7.870434642343984e-06, "loss": 1.2774, "step": 1408 }, { "epoch": 1.2460176991150442, "grad_norm": 2.138255879379555, "learning_rate": 7.867563045812466e-06, "loss": 1.4427, "step": 1409 }, { "epoch": 1.2469026548672566, "grad_norm": 3.04859803112163, "learning_rate": 7.864690039158991e-06, "loss": 1.2225, "step": 1410 }, { "epoch": 1.247787610619469, "grad_norm": 1.5129888004737513, "learning_rate": 7.861815623796358e-06, "loss": 1.3061, "step": 1411 }, { "epoch": 1.2486725663716813, "grad_norm": 1.4944585440974436, "learning_rate": 7.858939801138061e-06, "loss": 0.9955, "step": 1412 }, { "epoch": 1.2495575221238937, "grad_norm": 1.9132394196877, "learning_rate": 7.856062572598284e-06, "loss": 1.109, "step": 1413 }, { "epoch": 1.2504424778761063, "grad_norm": 1.5154955881986902, "learning_rate": 7.853183939591898e-06, "loss": 1.3483, "step": 1414 }, { "epoch": 1.2513274336283187, "grad_norm": 1.0973197840832616, "learning_rate": 7.850303903534473e-06, "loss": 0.8399, "step": 1415 }, { "epoch": 1.252212389380531, "grad_norm": 1.3857162565164554, "learning_rate": 7.84742246584226e-06, "loss": 1.0202, "step": 1416 }, { "epoch": 1.2530973451327434, "grad_norm": 1.317108755153368, "learning_rate": 7.844539627932208e-06, "loss": 0.9919, "step": 1417 }, { "epoch": 1.2539823008849558, "grad_norm": 1.38291583104571, "learning_rate": 7.84165539122195e-06, "loss": 1.1136, "step": 1418 }, { "epoch": 1.2548672566371681, "grad_norm": 1.412216656775217, "learning_rate": 7.838769757129804e-06, "loss": 1.1145, "step": 1419 }, { "epoch": 1.2557522123893805, "grad_norm": 1.3895048676245283, "learning_rate": 7.835882727074779e-06, "loss": 1.1493, "step": 1420 }, { "epoch": 1.2566371681415929, "grad_norm": 1.4278418879401145, "learning_rate": 7.832994302476575e-06, "loss": 1.0079, "step": 1421 }, { "epoch": 1.2575221238938052, "grad_norm": 1.5061555103598339, "learning_rate": 7.830104484755566e-06, "loss": 0.8295, "step": 1422 }, { "epoch": 1.2584070796460178, "grad_norm": 2.019069063930834, "learning_rate": 7.82721327533282e-06, "loss": 1.1216, "step": 1423 }, { "epoch": 1.25929203539823, "grad_norm": 1.322209352228468, "learning_rate": 7.82432067563009e-06, "loss": 1.1209, "step": 1424 }, { "epoch": 1.2601769911504426, "grad_norm": 1.3297882260712584, "learning_rate": 7.821426687069805e-06, "loss": 0.9982, "step": 1425 }, { "epoch": 1.261061946902655, "grad_norm": 1.2425909329768166, "learning_rate": 7.818531311075084e-06, "loss": 0.9527, "step": 1426 }, { "epoch": 1.2619469026548673, "grad_norm": 1.3857432416471949, "learning_rate": 7.815634549069728e-06, "loss": 0.8598, "step": 1427 }, { "epoch": 1.2628318584070797, "grad_norm": 1.5634703702011066, "learning_rate": 7.812736402478212e-06, "loss": 1.0313, "step": 1428 }, { "epoch": 1.263716814159292, "grad_norm": 1.4625341195100463, "learning_rate": 7.809836872725702e-06, "loss": 1.1399, "step": 1429 }, { "epoch": 1.2646017699115044, "grad_norm": 1.5213387504101352, "learning_rate": 7.806935961238041e-06, "loss": 0.9257, "step": 1430 }, { "epoch": 1.2654867256637168, "grad_norm": 1.28747173591191, "learning_rate": 7.804033669441745e-06, "loss": 1.1004, "step": 1431 }, { "epoch": 1.2663716814159292, "grad_norm": 1.3207247581875299, "learning_rate": 7.801129998764014e-06, "loss": 1.0585, "step": 1432 }, { "epoch": 1.2672566371681415, "grad_norm": 1.4667668161145202, "learning_rate": 7.79822495063273e-06, "loss": 1.1815, "step": 1433 }, { "epoch": 1.268141592920354, "grad_norm": 1.3514698805058023, "learning_rate": 7.795318526476448e-06, "loss": 1.0026, "step": 1434 }, { "epoch": 1.2690265486725663, "grad_norm": 2.508302711886825, "learning_rate": 7.792410727724395e-06, "loss": 1.1, "step": 1435 }, { "epoch": 1.2699115044247788, "grad_norm": 2.3985296398791216, "learning_rate": 7.789501555806484e-06, "loss": 1.2304, "step": 1436 }, { "epoch": 1.2707964601769912, "grad_norm": 1.5254252641887396, "learning_rate": 7.786591012153294e-06, "loss": 1.2025, "step": 1437 }, { "epoch": 1.2716814159292036, "grad_norm": 1.7027707038132942, "learning_rate": 7.783679098196086e-06, "loss": 1.1773, "step": 1438 }, { "epoch": 1.272566371681416, "grad_norm": 1.4479097736242899, "learning_rate": 7.780765815366791e-06, "loss": 0.9321, "step": 1439 }, { "epoch": 1.2734513274336283, "grad_norm": 1.8734635524605534, "learning_rate": 7.777851165098012e-06, "loss": 1.1079, "step": 1440 }, { "epoch": 1.2743362831858407, "grad_norm": 1.5995442297397555, "learning_rate": 7.774935148823026e-06, "loss": 1.0817, "step": 1441 }, { "epoch": 1.275221238938053, "grad_norm": 1.6360107236679855, "learning_rate": 7.772017767975784e-06, "loss": 0.9831, "step": 1442 }, { "epoch": 1.2761061946902654, "grad_norm": 1.3764963878215755, "learning_rate": 7.769099023990903e-06, "loss": 1.1455, "step": 1443 }, { "epoch": 1.2769911504424778, "grad_norm": 1.503685959006512, "learning_rate": 7.766178918303675e-06, "loss": 1.177, "step": 1444 }, { "epoch": 1.2778761061946904, "grad_norm": 1.8571956096505573, "learning_rate": 7.76325745235006e-06, "loss": 1.3894, "step": 1445 }, { "epoch": 1.2787610619469025, "grad_norm": 1.6761545175752315, "learning_rate": 7.760334627566686e-06, "loss": 1.2476, "step": 1446 }, { "epoch": 1.2796460176991151, "grad_norm": 1.5792709152521491, "learning_rate": 7.757410445390847e-06, "loss": 1.3375, "step": 1447 }, { "epoch": 1.2805309734513275, "grad_norm": 1.7534137688696347, "learning_rate": 7.754484907260513e-06, "loss": 1.177, "step": 1448 }, { "epoch": 1.2814159292035399, "grad_norm": 1.585608931953075, "learning_rate": 7.751558014614312e-06, "loss": 1.5007, "step": 1449 }, { "epoch": 1.2823008849557522, "grad_norm": 2.645386860087754, "learning_rate": 7.748629768891542e-06, "loss": 1.166, "step": 1450 }, { "epoch": 1.2831858407079646, "grad_norm": 3.031195615126375, "learning_rate": 7.745700171532164e-06, "loss": 1.1451, "step": 1451 }, { "epoch": 1.284070796460177, "grad_norm": 1.0032173861586935, "learning_rate": 7.742769223976807e-06, "loss": 0.8281, "step": 1452 }, { "epoch": 1.2849557522123893, "grad_norm": 2.3453244912505795, "learning_rate": 7.739836927666762e-06, "loss": 1.6111, "step": 1453 }, { "epoch": 1.2858407079646017, "grad_norm": 1.3435797853160092, "learning_rate": 7.736903284043985e-06, "loss": 0.9284, "step": 1454 }, { "epoch": 1.286725663716814, "grad_norm": 1.5319936817780835, "learning_rate": 7.73396829455109e-06, "loss": 1.0592, "step": 1455 }, { "epoch": 1.2876106194690267, "grad_norm": 1.5422601396421274, "learning_rate": 7.731031960631354e-06, "loss": 1.1564, "step": 1456 }, { "epoch": 1.2884955752212388, "grad_norm": 1.275235507892593, "learning_rate": 7.728094283728724e-06, "loss": 0.9463, "step": 1457 }, { "epoch": 1.2893805309734514, "grad_norm": 1.3236797716288295, "learning_rate": 7.725155265287796e-06, "loss": 1.0141, "step": 1458 }, { "epoch": 1.2902654867256638, "grad_norm": 1.400148626894062, "learning_rate": 7.722214906753829e-06, "loss": 1.0297, "step": 1459 }, { "epoch": 1.2911504424778761, "grad_norm": 1.3411799352101705, "learning_rate": 7.719273209572745e-06, "loss": 1.0595, "step": 1460 }, { "epoch": 1.2920353982300885, "grad_norm": 1.2917063645870828, "learning_rate": 7.716330175191118e-06, "loss": 1.1014, "step": 1461 }, { "epoch": 1.2929203539823009, "grad_norm": 1.4787688108229127, "learning_rate": 7.713385805056187e-06, "loss": 1.0853, "step": 1462 }, { "epoch": 1.2938053097345132, "grad_norm": 1.4611455380564458, "learning_rate": 7.710440100615841e-06, "loss": 1.2358, "step": 1463 }, { "epoch": 1.2946902654867256, "grad_norm": 1.8956001778090044, "learning_rate": 7.70749306331863e-06, "loss": 1.2743, "step": 1464 }, { "epoch": 1.295575221238938, "grad_norm": 1.4005603720053568, "learning_rate": 7.704544694613755e-06, "loss": 0.8991, "step": 1465 }, { "epoch": 1.2964601769911503, "grad_norm": 1.954123426042762, "learning_rate": 7.701594995951075e-06, "loss": 1.1538, "step": 1466 }, { "epoch": 1.297345132743363, "grad_norm": 1.8623454155371928, "learning_rate": 7.698643968781101e-06, "loss": 1.2133, "step": 1467 }, { "epoch": 1.298230088495575, "grad_norm": 1.6869720824410726, "learning_rate": 7.695691614555002e-06, "loss": 1.1113, "step": 1468 }, { "epoch": 1.2991150442477877, "grad_norm": 1.3248519574129705, "learning_rate": 7.692737934724592e-06, "loss": 1.0563, "step": 1469 }, { "epoch": 1.3, "grad_norm": 1.729840644543766, "learning_rate": 7.689782930742345e-06, "loss": 1.1278, "step": 1470 }, { "epoch": 1.3008849557522124, "grad_norm": 1.4207689906184156, "learning_rate": 7.686826604061375e-06, "loss": 1.0186, "step": 1471 }, { "epoch": 1.3017699115044248, "grad_norm": 1.316508676223997, "learning_rate": 7.68386895613546e-06, "loss": 1.051, "step": 1472 }, { "epoch": 1.3026548672566372, "grad_norm": 1.4096188620469448, "learning_rate": 7.680909988419019e-06, "loss": 0.9118, "step": 1473 }, { "epoch": 1.3035398230088495, "grad_norm": 1.7183667257853827, "learning_rate": 7.677949702367122e-06, "loss": 1.0842, "step": 1474 }, { "epoch": 1.3044247787610619, "grad_norm": 1.2989599528523477, "learning_rate": 7.674988099435487e-06, "loss": 1.0706, "step": 1475 }, { "epoch": 1.3053097345132743, "grad_norm": 1.2171210422893677, "learning_rate": 7.672025181080481e-06, "loss": 0.8571, "step": 1476 }, { "epoch": 1.3061946902654866, "grad_norm": 1.7381521267647366, "learning_rate": 7.669060948759115e-06, "loss": 1.3961, "step": 1477 }, { "epoch": 1.3070796460176992, "grad_norm": 1.3843749947641697, "learning_rate": 7.66609540392905e-06, "loss": 0.9979, "step": 1478 }, { "epoch": 1.3079646017699116, "grad_norm": 1.3690616951522563, "learning_rate": 7.66312854804859e-06, "loss": 1.1477, "step": 1479 }, { "epoch": 1.308849557522124, "grad_norm": 1.4526837919491509, "learning_rate": 7.660160382576683e-06, "loss": 0.8788, "step": 1480 }, { "epoch": 1.3097345132743363, "grad_norm": 1.3711198579046298, "learning_rate": 7.657190908972924e-06, "loss": 1.3075, "step": 1481 }, { "epoch": 1.3106194690265487, "grad_norm": 1.5648548659301291, "learning_rate": 7.654220128697547e-06, "loss": 1.1982, "step": 1482 }, { "epoch": 1.311504424778761, "grad_norm": 1.2450284408056753, "learning_rate": 7.651248043211435e-06, "loss": 0.7875, "step": 1483 }, { "epoch": 1.3123893805309734, "grad_norm": 1.2905550667718286, "learning_rate": 7.648274653976102e-06, "loss": 0.9149, "step": 1484 }, { "epoch": 1.3132743362831858, "grad_norm": 3.239591329385311, "learning_rate": 7.645299962453717e-06, "loss": 1.0919, "step": 1485 }, { "epoch": 1.3141592920353982, "grad_norm": 1.3020421135099518, "learning_rate": 7.64232397010708e-06, "loss": 1.0648, "step": 1486 }, { "epoch": 1.3150442477876108, "grad_norm": 1.0859433716874995, "learning_rate": 7.63934667839963e-06, "loss": 0.9669, "step": 1487 }, { "epoch": 1.315929203539823, "grad_norm": 1.3681987640853308, "learning_rate": 7.636368088795451e-06, "loss": 1.0117, "step": 1488 }, { "epoch": 1.3168141592920355, "grad_norm": 1.3520653174303745, "learning_rate": 7.633388202759262e-06, "loss": 1.1108, "step": 1489 }, { "epoch": 1.3176991150442479, "grad_norm": 2.107754263276839, "learning_rate": 7.630407021756419e-06, "loss": 1.2528, "step": 1490 }, { "epoch": 1.3185840707964602, "grad_norm": 3.4320964275103036, "learning_rate": 7.627424547252914e-06, "loss": 1.1837, "step": 1491 }, { "epoch": 1.3194690265486726, "grad_norm": 1.439358786250422, "learning_rate": 7.624440780715379e-06, "loss": 1.1389, "step": 1492 }, { "epoch": 1.320353982300885, "grad_norm": 1.5882176360893023, "learning_rate": 7.621455723611079e-06, "loss": 1.1679, "step": 1493 }, { "epoch": 1.3212389380530973, "grad_norm": 1.246386965238514, "learning_rate": 7.618469377407911e-06, "loss": 1.074, "step": 1494 }, { "epoch": 1.3221238938053097, "grad_norm": 1.2731285956770086, "learning_rate": 7.615481743574411e-06, "loss": 1.0685, "step": 1495 }, { "epoch": 1.323008849557522, "grad_norm": 1.484499323665891, "learning_rate": 7.612492823579744e-06, "loss": 1.0595, "step": 1496 }, { "epoch": 1.3238938053097344, "grad_norm": 1.283214135867496, "learning_rate": 7.609502618893712e-06, "loss": 0.8347, "step": 1497 }, { "epoch": 1.324778761061947, "grad_norm": 2.0754186854694705, "learning_rate": 7.606511130986742e-06, "loss": 1.0654, "step": 1498 }, { "epoch": 1.3256637168141592, "grad_norm": 1.4955409907943866, "learning_rate": 7.6035183613299e-06, "loss": 1.1648, "step": 1499 }, { "epoch": 1.3265486725663718, "grad_norm": 1.7862536517628176, "learning_rate": 7.600524311394873e-06, "loss": 0.9548, "step": 1500 }, { "epoch": 1.3274336283185841, "grad_norm": 1.385580880291687, "learning_rate": 7.597528982653989e-06, "loss": 0.9946, "step": 1501 }, { "epoch": 1.3283185840707965, "grad_norm": 1.1937056318572852, "learning_rate": 7.594532376580193e-06, "loss": 1.0162, "step": 1502 }, { "epoch": 1.3292035398230089, "grad_norm": 2.331696824011255, "learning_rate": 7.591534494647066e-06, "loss": 1.0581, "step": 1503 }, { "epoch": 1.3300884955752212, "grad_norm": 1.5359763211257886, "learning_rate": 7.588535338328816e-06, "loss": 1.1485, "step": 1504 }, { "epoch": 1.3309734513274336, "grad_norm": 1.9511346338027755, "learning_rate": 7.585534909100274e-06, "loss": 1.0507, "step": 1505 }, { "epoch": 1.331858407079646, "grad_norm": 1.8207397420764135, "learning_rate": 7.582533208436897e-06, "loss": 1.3452, "step": 1506 }, { "epoch": 1.3327433628318583, "grad_norm": 1.4218413401559469, "learning_rate": 7.579530237814774e-06, "loss": 1.1932, "step": 1507 }, { "epoch": 1.3336283185840707, "grad_norm": 1.4847956007707035, "learning_rate": 7.576525998710609e-06, "loss": 0.8941, "step": 1508 }, { "epoch": 1.3345132743362833, "grad_norm": 1.3835113626344795, "learning_rate": 7.573520492601736e-06, "loss": 1.1733, "step": 1509 }, { "epoch": 1.3353982300884955, "grad_norm": 1.2853744747250582, "learning_rate": 7.570513720966108e-06, "loss": 0.9479, "step": 1510 }, { "epoch": 1.336283185840708, "grad_norm": 1.3306190431792344, "learning_rate": 7.567505685282308e-06, "loss": 1.0615, "step": 1511 }, { "epoch": 1.3371681415929204, "grad_norm": 1.4706419800627548, "learning_rate": 7.564496387029532e-06, "loss": 1.1505, "step": 1512 }, { "epoch": 1.3380530973451328, "grad_norm": 2.0839225188283814, "learning_rate": 7.561485827687599e-06, "loss": 1.1264, "step": 1513 }, { "epoch": 1.3389380530973451, "grad_norm": 1.5125868895046333, "learning_rate": 7.558474008736951e-06, "loss": 1.2631, "step": 1514 }, { "epoch": 1.3398230088495575, "grad_norm": 1.5599095104724305, "learning_rate": 7.555460931658647e-06, "loss": 1.1139, "step": 1515 }, { "epoch": 1.3407079646017699, "grad_norm": 1.4758926643490122, "learning_rate": 7.5524465979343665e-06, "loss": 1.2172, "step": 1516 }, { "epoch": 1.3415929203539823, "grad_norm": 2.257640334735848, "learning_rate": 7.549431009046404e-06, "loss": 1.1901, "step": 1517 }, { "epoch": 1.3424778761061946, "grad_norm": 1.4045140806955472, "learning_rate": 7.5464141664776734e-06, "loss": 1.1305, "step": 1518 }, { "epoch": 1.343362831858407, "grad_norm": 1.418952452099267, "learning_rate": 7.543396071711706e-06, "loss": 1.0904, "step": 1519 }, { "epoch": 1.3442477876106196, "grad_norm": 1.3927647704926338, "learning_rate": 7.540376726232648e-06, "loss": 1.1123, "step": 1520 }, { "epoch": 1.3451327433628317, "grad_norm": 1.358406518782085, "learning_rate": 7.537356131525259e-06, "loss": 0.9619, "step": 1521 }, { "epoch": 1.3460176991150443, "grad_norm": 1.44501854428644, "learning_rate": 7.5343342890749135e-06, "loss": 1.3323, "step": 1522 }, { "epoch": 1.3469026548672567, "grad_norm": 1.3168206371397608, "learning_rate": 7.531311200367601e-06, "loss": 1.0804, "step": 1523 }, { "epoch": 1.347787610619469, "grad_norm": 1.4650090005599408, "learning_rate": 7.528286866889924e-06, "loss": 0.9701, "step": 1524 }, { "epoch": 1.3486725663716814, "grad_norm": 1.3543642003764405, "learning_rate": 7.525261290129094e-06, "loss": 1.1785, "step": 1525 }, { "epoch": 1.3495575221238938, "grad_norm": 1.3368618919965305, "learning_rate": 7.522234471572939e-06, "loss": 1.0642, "step": 1526 }, { "epoch": 1.3504424778761062, "grad_norm": 1.3940514929382937, "learning_rate": 7.519206412709893e-06, "loss": 1.0529, "step": 1527 }, { "epoch": 1.3513274336283185, "grad_norm": 1.6313709459272598, "learning_rate": 7.516177115029002e-06, "loss": 1.1957, "step": 1528 }, { "epoch": 1.352212389380531, "grad_norm": 1.338092070123681, "learning_rate": 7.51314658001992e-06, "loss": 1.2647, "step": 1529 }, { "epoch": 1.3530973451327433, "grad_norm": 1.3919484771580786, "learning_rate": 7.51011480917291e-06, "loss": 1.1683, "step": 1530 }, { "epoch": 1.3539823008849559, "grad_norm": 1.742680224798067, "learning_rate": 7.5070818039788455e-06, "loss": 0.9975, "step": 1531 }, { "epoch": 1.354867256637168, "grad_norm": 1.2887482503792247, "learning_rate": 7.504047565929204e-06, "loss": 0.9886, "step": 1532 }, { "epoch": 1.3557522123893806, "grad_norm": 1.324632816599706, "learning_rate": 7.501012096516066e-06, "loss": 1.1043, "step": 1533 }, { "epoch": 1.356637168141593, "grad_norm": 1.5650482916538395, "learning_rate": 7.4979753972321265e-06, "loss": 1.2783, "step": 1534 }, { "epoch": 1.3575221238938053, "grad_norm": 1.4466260993179711, "learning_rate": 7.494937469570675e-06, "loss": 1.1276, "step": 1535 }, { "epoch": 1.3584070796460177, "grad_norm": 1.578588212922502, "learning_rate": 7.491898315025615e-06, "loss": 1.1975, "step": 1536 }, { "epoch": 1.35929203539823, "grad_norm": 1.2810574787173303, "learning_rate": 7.488857935091447e-06, "loss": 1.1074, "step": 1537 }, { "epoch": 1.3601769911504424, "grad_norm": 1.578667943653661, "learning_rate": 7.485816331263273e-06, "loss": 1.0663, "step": 1538 }, { "epoch": 1.3610619469026548, "grad_norm": 2.0245384144808005, "learning_rate": 7.482773505036801e-06, "loss": 1.0155, "step": 1539 }, { "epoch": 1.3619469026548672, "grad_norm": 1.2385793075367142, "learning_rate": 7.4797294579083405e-06, "loss": 0.7425, "step": 1540 }, { "epoch": 1.3628318584070795, "grad_norm": 1.3629781103596041, "learning_rate": 7.476684191374794e-06, "loss": 1.1525, "step": 1541 }, { "epoch": 1.3637168141592921, "grad_norm": 1.7436746146352178, "learning_rate": 7.473637706933676e-06, "loss": 0.9763, "step": 1542 }, { "epoch": 1.3646017699115045, "grad_norm": 2.7235307308704604, "learning_rate": 7.470590006083087e-06, "loss": 1.0304, "step": 1543 }, { "epoch": 1.3654867256637169, "grad_norm": 1.2976723426946806, "learning_rate": 7.467541090321735e-06, "loss": 1.1342, "step": 1544 }, { "epoch": 1.3663716814159292, "grad_norm": 1.3176738541817938, "learning_rate": 7.464490961148921e-06, "loss": 0.9851, "step": 1545 }, { "epoch": 1.3672566371681416, "grad_norm": 1.361976901944264, "learning_rate": 7.4614396200645435e-06, "loss": 1.1358, "step": 1546 }, { "epoch": 1.368141592920354, "grad_norm": 1.4629125302514645, "learning_rate": 7.4583870685690974e-06, "loss": 0.9909, "step": 1547 }, { "epoch": 1.3690265486725663, "grad_norm": 1.7251256815681588, "learning_rate": 7.455333308163673e-06, "loss": 1.3023, "step": 1548 }, { "epoch": 1.3699115044247787, "grad_norm": 1.3593873537179437, "learning_rate": 7.452278340349953e-06, "loss": 1.1823, "step": 1549 }, { "epoch": 1.370796460176991, "grad_norm": 1.3495139200623307, "learning_rate": 7.449222166630218e-06, "loss": 1.15, "step": 1550 }, { "epoch": 1.3716814159292037, "grad_norm": 1.3549877635732204, "learning_rate": 7.446164788507338e-06, "loss": 1.1181, "step": 1551 }, { "epoch": 1.3725663716814158, "grad_norm": 1.297329503705372, "learning_rate": 7.443106207484776e-06, "loss": 1.1832, "step": 1552 }, { "epoch": 1.3734513274336284, "grad_norm": 1.5622898965467458, "learning_rate": 7.440046425066587e-06, "loss": 1.1109, "step": 1553 }, { "epoch": 1.3743362831858408, "grad_norm": 1.250168263156101, "learning_rate": 7.436985442757415e-06, "loss": 1.1495, "step": 1554 }, { "epoch": 1.3752212389380531, "grad_norm": 1.7251201612624487, "learning_rate": 7.4339232620625e-06, "loss": 1.3045, "step": 1555 }, { "epoch": 1.3761061946902655, "grad_norm": 1.2579248747817668, "learning_rate": 7.430859884487663e-06, "loss": 1.0881, "step": 1556 }, { "epoch": 1.3769911504424779, "grad_norm": 3.2827143457607306, "learning_rate": 7.427795311539321e-06, "loss": 1.2036, "step": 1557 }, { "epoch": 1.3778761061946903, "grad_norm": 1.5256394166811473, "learning_rate": 7.424729544724471e-06, "loss": 1.1823, "step": 1558 }, { "epoch": 1.3787610619469026, "grad_norm": 2.230749161338179, "learning_rate": 7.421662585550707e-06, "loss": 1.2715, "step": 1559 }, { "epoch": 1.379646017699115, "grad_norm": 1.486693039436941, "learning_rate": 7.4185944355261996e-06, "loss": 0.9488, "step": 1560 }, { "epoch": 1.3805309734513274, "grad_norm": 1.3435322403894159, "learning_rate": 7.4155250961597106e-06, "loss": 1.3642, "step": 1561 }, { "epoch": 1.38141592920354, "grad_norm": 1.3393658228382161, "learning_rate": 7.4124545689605855e-06, "loss": 0.8765, "step": 1562 }, { "epoch": 1.382300884955752, "grad_norm": 1.119434853197126, "learning_rate": 7.409382855438754e-06, "loss": 1.0156, "step": 1563 }, { "epoch": 1.3831858407079647, "grad_norm": 1.621561405464853, "learning_rate": 7.406309957104727e-06, "loss": 0.957, "step": 1564 }, { "epoch": 1.384070796460177, "grad_norm": 1.7676508237040574, "learning_rate": 7.403235875469603e-06, "loss": 1.0733, "step": 1565 }, { "epoch": 1.3849557522123894, "grad_norm": 1.4200818280181853, "learning_rate": 7.400160612045057e-06, "loss": 1.2115, "step": 1566 }, { "epoch": 1.3858407079646018, "grad_norm": 1.2919325573270544, "learning_rate": 7.397084168343347e-06, "loss": 0.9784, "step": 1567 }, { "epoch": 1.3867256637168142, "grad_norm": 1.1858037768450436, "learning_rate": 7.3940065458773146e-06, "loss": 0.9694, "step": 1568 }, { "epoch": 1.3876106194690265, "grad_norm": 1.3265662336804107, "learning_rate": 7.390927746160377e-06, "loss": 0.7849, "step": 1569 }, { "epoch": 1.388495575221239, "grad_norm": 1.4634150241566013, "learning_rate": 7.3878477707065314e-06, "loss": 1.0441, "step": 1570 }, { "epoch": 1.3893805309734513, "grad_norm": 2.2459428251499696, "learning_rate": 7.384766621030352e-06, "loss": 0.9606, "step": 1571 }, { "epoch": 1.3902654867256636, "grad_norm": 2.140110437941646, "learning_rate": 7.381684298646993e-06, "loss": 1.1752, "step": 1572 }, { "epoch": 1.3911504424778762, "grad_norm": 1.3881014317602716, "learning_rate": 7.378600805072186e-06, "loss": 1.0554, "step": 1573 }, { "epoch": 1.3920353982300884, "grad_norm": 1.235956695517774, "learning_rate": 7.375516141822232e-06, "loss": 1.0478, "step": 1574 }, { "epoch": 1.392920353982301, "grad_norm": 2.0070793220358243, "learning_rate": 7.372430310414017e-06, "loss": 1.0741, "step": 1575 }, { "epoch": 1.3938053097345133, "grad_norm": 1.7800346056391942, "learning_rate": 7.369343312364994e-06, "loss": 1.2061, "step": 1576 }, { "epoch": 1.3946902654867257, "grad_norm": 1.539960833822958, "learning_rate": 7.366255149193192e-06, "loss": 1.1831, "step": 1577 }, { "epoch": 1.395575221238938, "grad_norm": 1.4504475187113512, "learning_rate": 7.363165822417212e-06, "loss": 1.0794, "step": 1578 }, { "epoch": 1.3964601769911504, "grad_norm": 1.2392576878866022, "learning_rate": 7.360075333556229e-06, "loss": 1.0858, "step": 1579 }, { "epoch": 1.3973451327433628, "grad_norm": 1.749502383582017, "learning_rate": 7.3569836841299905e-06, "loss": 1.0239, "step": 1580 }, { "epoch": 1.3982300884955752, "grad_norm": 1.395101827974938, "learning_rate": 7.353890875658807e-06, "loss": 1.0792, "step": 1581 }, { "epoch": 1.3991150442477875, "grad_norm": 1.487018034995209, "learning_rate": 7.350796909663571e-06, "loss": 1.3342, "step": 1582 }, { "epoch": 1.4, "grad_norm": 1.3009987161200667, "learning_rate": 7.3477017876657355e-06, "loss": 1.2094, "step": 1583 }, { "epoch": 1.4008849557522125, "grad_norm": 1.9346082851517579, "learning_rate": 7.344605511187322e-06, "loss": 1.0917, "step": 1584 }, { "epoch": 1.4017699115044246, "grad_norm": 1.2219398893428084, "learning_rate": 7.341508081750928e-06, "loss": 1.0499, "step": 1585 }, { "epoch": 1.4026548672566372, "grad_norm": 1.4568077664254988, "learning_rate": 7.3384095008797065e-06, "loss": 0.9341, "step": 1586 }, { "epoch": 1.4035398230088496, "grad_norm": 1.4767270571668436, "learning_rate": 7.335309770097383e-06, "loss": 1.217, "step": 1587 }, { "epoch": 1.404424778761062, "grad_norm": 1.9361339818069683, "learning_rate": 7.332208890928252e-06, "loss": 1.273, "step": 1588 }, { "epoch": 1.4053097345132743, "grad_norm": 1.4390172837831334, "learning_rate": 7.329106864897163e-06, "loss": 1.0824, "step": 1589 }, { "epoch": 1.4061946902654867, "grad_norm": 1.7768977632807224, "learning_rate": 7.326003693529538e-06, "loss": 1.1923, "step": 1590 }, { "epoch": 1.407079646017699, "grad_norm": 1.35048734702713, "learning_rate": 7.32289937835136e-06, "loss": 1.2166, "step": 1591 }, { "epoch": 1.4079646017699115, "grad_norm": 1.7268381376186626, "learning_rate": 7.319793920889171e-06, "loss": 1.221, "step": 1592 }, { "epoch": 1.4088495575221238, "grad_norm": 1.4149568855531809, "learning_rate": 7.3166873226700794e-06, "loss": 0.9765, "step": 1593 }, { "epoch": 1.4097345132743362, "grad_norm": 1.2060845822569088, "learning_rate": 7.313579585221752e-06, "loss": 0.9643, "step": 1594 }, { "epoch": 1.4106194690265488, "grad_norm": 1.4417921187373874, "learning_rate": 7.310470710072414e-06, "loss": 1.1885, "step": 1595 }, { "epoch": 1.411504424778761, "grad_norm": 1.381300529971979, "learning_rate": 7.3073606987508575e-06, "loss": 1.0586, "step": 1596 }, { "epoch": 1.4123893805309735, "grad_norm": 1.5507635255657088, "learning_rate": 7.304249552786422e-06, "loss": 1.0899, "step": 1597 }, { "epoch": 1.4132743362831859, "grad_norm": 1.3364803840130934, "learning_rate": 7.301137273709017e-06, "loss": 1.1024, "step": 1598 }, { "epoch": 1.4141592920353983, "grad_norm": 4.258300190847668, "learning_rate": 7.298023863049099e-06, "loss": 1.1321, "step": 1599 }, { "epoch": 1.4150442477876106, "grad_norm": 1.6818109981727556, "learning_rate": 7.294909322337689e-06, "loss": 1.0205, "step": 1600 }, { "epoch": 1.415929203539823, "grad_norm": 1.5984488507689834, "learning_rate": 7.291793653106357e-06, "loss": 1.2825, "step": 1601 }, { "epoch": 1.4168141592920354, "grad_norm": 2.2074091859537006, "learning_rate": 7.288676856887233e-06, "loss": 1.2546, "step": 1602 }, { "epoch": 1.4176991150442477, "grad_norm": 1.464878538117375, "learning_rate": 7.2855589352129966e-06, "loss": 1.2298, "step": 1603 }, { "epoch": 1.41858407079646, "grad_norm": 1.7623045141380604, "learning_rate": 7.282439889616887e-06, "loss": 0.9861, "step": 1604 }, { "epoch": 1.4194690265486725, "grad_norm": 1.52510777475721, "learning_rate": 7.279319721632689e-06, "loss": 1.2132, "step": 1605 }, { "epoch": 1.420353982300885, "grad_norm": 1.4022304603947977, "learning_rate": 7.276198432794747e-06, "loss": 1.1023, "step": 1606 }, { "epoch": 1.4212389380530974, "grad_norm": 1.3951203427748198, "learning_rate": 7.273076024637946e-06, "loss": 1.0701, "step": 1607 }, { "epoch": 1.4221238938053098, "grad_norm": 1.2130575063457074, "learning_rate": 7.269952498697734e-06, "loss": 1.0504, "step": 1608 }, { "epoch": 1.4230088495575222, "grad_norm": 4.396867745576411, "learning_rate": 7.266827856510102e-06, "loss": 1.3974, "step": 1609 }, { "epoch": 1.4238938053097345, "grad_norm": 1.1695954724386437, "learning_rate": 7.263702099611586e-06, "loss": 0.949, "step": 1610 }, { "epoch": 1.424778761061947, "grad_norm": 1.4291153478793326, "learning_rate": 7.260575229539278e-06, "loss": 1.1941, "step": 1611 }, { "epoch": 1.4256637168141593, "grad_norm": 2.075530431216336, "learning_rate": 7.257447247830813e-06, "loss": 1.1348, "step": 1612 }, { "epoch": 1.4265486725663716, "grad_norm": 1.41999363924689, "learning_rate": 7.254318156024374e-06, "loss": 1.114, "step": 1613 }, { "epoch": 1.427433628318584, "grad_norm": 1.3529729567493016, "learning_rate": 7.251187955658691e-06, "loss": 1.1536, "step": 1614 }, { "epoch": 1.4283185840707966, "grad_norm": 3.1714420114823274, "learning_rate": 7.248056648273034e-06, "loss": 1.1393, "step": 1615 }, { "epoch": 1.4292035398230087, "grad_norm": 1.6811365615225822, "learning_rate": 7.244924235407224e-06, "loss": 0.9963, "step": 1616 }, { "epoch": 1.4300884955752213, "grad_norm": 1.7324956847681388, "learning_rate": 7.2417907186016215e-06, "loss": 1.2078, "step": 1617 }, { "epoch": 1.4309734513274337, "grad_norm": 1.3025608737220244, "learning_rate": 7.23865609939713e-06, "loss": 1.1033, "step": 1618 }, { "epoch": 1.431858407079646, "grad_norm": 1.4981920674937452, "learning_rate": 7.2355203793352005e-06, "loss": 1.3709, "step": 1619 }, { "epoch": 1.4327433628318584, "grad_norm": 1.3452371587382357, "learning_rate": 7.232383559957815e-06, "loss": 0.9378, "step": 1620 }, { "epoch": 1.4336283185840708, "grad_norm": 1.466939387231396, "learning_rate": 7.2292456428075065e-06, "loss": 1.134, "step": 1621 }, { "epoch": 1.4345132743362832, "grad_norm": 1.230086376035075, "learning_rate": 7.226106629427342e-06, "loss": 1.0528, "step": 1622 }, { "epoch": 1.4353982300884955, "grad_norm": 1.3941011975954287, "learning_rate": 7.222966521360928e-06, "loss": 1.1666, "step": 1623 }, { "epoch": 1.436283185840708, "grad_norm": 1.660413932272789, "learning_rate": 7.219825320152411e-06, "loss": 1.2679, "step": 1624 }, { "epoch": 1.4371681415929203, "grad_norm": 1.2574277636848032, "learning_rate": 7.216683027346475e-06, "loss": 0.8831, "step": 1625 }, { "epoch": 1.4380530973451329, "grad_norm": 1.2858546556742738, "learning_rate": 7.213539644488339e-06, "loss": 1.2204, "step": 1626 }, { "epoch": 1.438938053097345, "grad_norm": 1.402633888094926, "learning_rate": 7.21039517312376e-06, "loss": 1.1667, "step": 1627 }, { "epoch": 1.4398230088495576, "grad_norm": 1.187068620015323, "learning_rate": 7.207249614799028e-06, "loss": 1.0079, "step": 1628 }, { "epoch": 1.44070796460177, "grad_norm": 1.382482065046584, "learning_rate": 7.204102971060971e-06, "loss": 1.0612, "step": 1629 }, { "epoch": 1.4415929203539823, "grad_norm": 1.3400290724052055, "learning_rate": 7.200955243456946e-06, "loss": 0.9539, "step": 1630 }, { "epoch": 1.4424778761061947, "grad_norm": 1.7969759241573038, "learning_rate": 7.197806433534849e-06, "loss": 1.0211, "step": 1631 }, { "epoch": 1.443362831858407, "grad_norm": 1.3919606429735338, "learning_rate": 7.194656542843103e-06, "loss": 1.0668, "step": 1632 }, { "epoch": 1.4442477876106194, "grad_norm": 2.0867310001336157, "learning_rate": 7.191505572930664e-06, "loss": 1.0941, "step": 1633 }, { "epoch": 1.4451327433628318, "grad_norm": 1.3845932350644476, "learning_rate": 7.18835352534702e-06, "loss": 0.9696, "step": 1634 }, { "epoch": 1.4460176991150442, "grad_norm": 2.335462196455774, "learning_rate": 7.185200401642187e-06, "loss": 1.4508, "step": 1635 }, { "epoch": 1.4469026548672566, "grad_norm": 1.4274234601263118, "learning_rate": 7.18204620336671e-06, "loss": 1.1958, "step": 1636 }, { "epoch": 1.4477876106194691, "grad_norm": 1.2661517200203058, "learning_rate": 7.17889093207167e-06, "loss": 1.1513, "step": 1637 }, { "epoch": 1.4486725663716813, "grad_norm": 1.2577031529710123, "learning_rate": 7.17573458930866e-06, "loss": 1.2249, "step": 1638 }, { "epoch": 1.4495575221238939, "grad_norm": 1.3817839576550692, "learning_rate": 7.1725771766298155e-06, "loss": 1.1966, "step": 1639 }, { "epoch": 1.4504424778761063, "grad_norm": 1.1778644963586986, "learning_rate": 7.169418695587791e-06, "loss": 1.0636, "step": 1640 }, { "epoch": 1.4513274336283186, "grad_norm": 1.322327229429169, "learning_rate": 7.1662591477357655e-06, "loss": 1.1685, "step": 1641 }, { "epoch": 1.452212389380531, "grad_norm": 1.2341398766301281, "learning_rate": 7.1630985346274465e-06, "loss": 1.067, "step": 1642 }, { "epoch": 1.4530973451327434, "grad_norm": 1.3526168739268398, "learning_rate": 7.15993685781706e-06, "loss": 1.1194, "step": 1643 }, { "epoch": 1.4539823008849557, "grad_norm": 1.3329439509927683, "learning_rate": 7.15677411885936e-06, "loss": 0.8753, "step": 1644 }, { "epoch": 1.454867256637168, "grad_norm": 2.953250339083495, "learning_rate": 7.153610319309622e-06, "loss": 1.0852, "step": 1645 }, { "epoch": 1.4557522123893805, "grad_norm": 1.4618169839894994, "learning_rate": 7.150445460723638e-06, "loss": 1.069, "step": 1646 }, { "epoch": 1.4566371681415928, "grad_norm": 1.1913205456229177, "learning_rate": 7.14727954465773e-06, "loss": 0.9918, "step": 1647 }, { "epoch": 1.4575221238938054, "grad_norm": 1.2775710142514083, "learning_rate": 7.1441125726687336e-06, "loss": 1.2509, "step": 1648 }, { "epoch": 1.4584070796460176, "grad_norm": 1.5774697922640422, "learning_rate": 7.140944546314001e-06, "loss": 0.9727, "step": 1649 }, { "epoch": 1.4592920353982302, "grad_norm": 1.2893276213696878, "learning_rate": 7.137775467151411e-06, "loss": 0.9925, "step": 1650 }, { "epoch": 1.4601769911504425, "grad_norm": 1.4769929112538955, "learning_rate": 7.134605336739354e-06, "loss": 1.0928, "step": 1651 }, { "epoch": 1.461061946902655, "grad_norm": 1.5274374456989317, "learning_rate": 7.131434156636741e-06, "loss": 1.4085, "step": 1652 }, { "epoch": 1.4619469026548673, "grad_norm": 1.6886702678157197, "learning_rate": 7.128261928402993e-06, "loss": 0.9755, "step": 1653 }, { "epoch": 1.4628318584070796, "grad_norm": 2.203787696933805, "learning_rate": 7.125088653598057e-06, "loss": 1.314, "step": 1654 }, { "epoch": 1.463716814159292, "grad_norm": 1.4920970516600922, "learning_rate": 7.121914333782384e-06, "loss": 1.3083, "step": 1655 }, { "epoch": 1.4646017699115044, "grad_norm": 1.4542289712407437, "learning_rate": 7.118738970516944e-06, "loss": 1.2061, "step": 1656 }, { "epoch": 1.4654867256637167, "grad_norm": 1.3470696948802898, "learning_rate": 7.115562565363221e-06, "loss": 1.1433, "step": 1657 }, { "epoch": 1.466371681415929, "grad_norm": 1.7336907046601957, "learning_rate": 7.1123851198832095e-06, "loss": 1.2896, "step": 1658 }, { "epoch": 1.4672566371681417, "grad_norm": 1.3187278566852607, "learning_rate": 7.109206635639414e-06, "loss": 1.24, "step": 1659 }, { "epoch": 1.4681415929203538, "grad_norm": 1.5853340332453807, "learning_rate": 7.106027114194856e-06, "loss": 1.1339, "step": 1660 }, { "epoch": 1.4690265486725664, "grad_norm": 2.821189479632709, "learning_rate": 7.102846557113057e-06, "loss": 1.0623, "step": 1661 }, { "epoch": 1.4699115044247788, "grad_norm": 1.3920188808701353, "learning_rate": 7.099664965958058e-06, "loss": 0.9581, "step": 1662 }, { "epoch": 1.4707964601769912, "grad_norm": 1.2194749289616802, "learning_rate": 7.096482342294401e-06, "loss": 1.1152, "step": 1663 }, { "epoch": 1.4716814159292035, "grad_norm": 1.2635025611331958, "learning_rate": 7.093298687687141e-06, "loss": 0.9956, "step": 1664 }, { "epoch": 1.472566371681416, "grad_norm": 1.2797734584804603, "learning_rate": 7.090114003701838e-06, "loss": 1.136, "step": 1665 }, { "epoch": 1.4734513274336283, "grad_norm": 1.428300930082789, "learning_rate": 7.086928291904556e-06, "loss": 1.0783, "step": 1666 }, { "epoch": 1.4743362831858406, "grad_norm": 1.4222135984126314, "learning_rate": 7.083741553861866e-06, "loss": 1.1354, "step": 1667 }, { "epoch": 1.475221238938053, "grad_norm": 2.007703981291038, "learning_rate": 7.080553791140848e-06, "loss": 1.1332, "step": 1668 }, { "epoch": 1.4761061946902654, "grad_norm": 1.4875307420178785, "learning_rate": 7.077365005309077e-06, "loss": 1.2031, "step": 1669 }, { "epoch": 1.476991150442478, "grad_norm": 1.4489526574291147, "learning_rate": 7.0741751979346395e-06, "loss": 1.0764, "step": 1670 }, { "epoch": 1.4778761061946903, "grad_norm": 1.3360304252059203, "learning_rate": 7.070984370586119e-06, "loss": 1.1007, "step": 1671 }, { "epoch": 1.4787610619469027, "grad_norm": 1.3816885499401241, "learning_rate": 7.067792524832604e-06, "loss": 1.0154, "step": 1672 }, { "epoch": 1.479646017699115, "grad_norm": 1.4866540049179493, "learning_rate": 7.064599662243681e-06, "loss": 1.3985, "step": 1673 }, { "epoch": 1.4805309734513274, "grad_norm": 1.33465699521692, "learning_rate": 7.061405784389438e-06, "loss": 1.2273, "step": 1674 }, { "epoch": 1.4814159292035398, "grad_norm": 1.480100727139653, "learning_rate": 7.058210892840461e-06, "loss": 0.9899, "step": 1675 }, { "epoch": 1.4823008849557522, "grad_norm": 1.3124821548595607, "learning_rate": 7.055014989167837e-06, "loss": 1.1365, "step": 1676 }, { "epoch": 1.4831858407079646, "grad_norm": 1.491622427366188, "learning_rate": 7.051818074943148e-06, "loss": 1.3124, "step": 1677 }, { "epoch": 1.484070796460177, "grad_norm": 1.5568486374520083, "learning_rate": 7.048620151738478e-06, "loss": 1.1463, "step": 1678 }, { "epoch": 1.4849557522123895, "grad_norm": 1.5956576143205823, "learning_rate": 7.045421221126397e-06, "loss": 1.0413, "step": 1679 }, { "epoch": 1.4858407079646017, "grad_norm": 1.388531935640759, "learning_rate": 7.042221284679982e-06, "loss": 1.1245, "step": 1680 }, { "epoch": 1.4867256637168142, "grad_norm": 1.440004786560123, "learning_rate": 7.039020343972796e-06, "loss": 1.0605, "step": 1681 }, { "epoch": 1.4876106194690266, "grad_norm": 1.3840134211975312, "learning_rate": 7.035818400578901e-06, "loss": 0.9651, "step": 1682 }, { "epoch": 1.488495575221239, "grad_norm": 1.7793658003297208, "learning_rate": 7.032615456072849e-06, "loss": 0.8079, "step": 1683 }, { "epoch": 1.4893805309734514, "grad_norm": 1.3219422900698747, "learning_rate": 7.029411512029687e-06, "loss": 1.0874, "step": 1684 }, { "epoch": 1.4902654867256637, "grad_norm": 1.3418150048417414, "learning_rate": 7.026206570024949e-06, "loss": 1.1469, "step": 1685 }, { "epoch": 1.491150442477876, "grad_norm": 1.1638773595986631, "learning_rate": 7.023000631634668e-06, "loss": 1.0653, "step": 1686 }, { "epoch": 1.4920353982300885, "grad_norm": 1.9898828609537937, "learning_rate": 7.019793698435358e-06, "loss": 1.0883, "step": 1687 }, { "epoch": 1.4929203539823008, "grad_norm": 1.987708858943816, "learning_rate": 7.016585772004026e-06, "loss": 1.5141, "step": 1688 }, { "epoch": 1.4938053097345132, "grad_norm": 1.7521593614913835, "learning_rate": 7.013376853918169e-06, "loss": 1.4491, "step": 1689 }, { "epoch": 1.4946902654867258, "grad_norm": 4.030248434902002, "learning_rate": 7.010166945755768e-06, "loss": 1.0547, "step": 1690 }, { "epoch": 1.495575221238938, "grad_norm": 1.3997454869347454, "learning_rate": 7.0069560490952956e-06, "loss": 0.8927, "step": 1691 }, { "epoch": 1.4964601769911505, "grad_norm": 2.2317434164266605, "learning_rate": 7.0037441655157045e-06, "loss": 1.167, "step": 1692 }, { "epoch": 1.497345132743363, "grad_norm": 1.5950504233048084, "learning_rate": 7.00053129659644e-06, "loss": 1.1615, "step": 1693 }, { "epoch": 1.4982300884955753, "grad_norm": 1.4751948045929728, "learning_rate": 6.997317443917424e-06, "loss": 1.2247, "step": 1694 }, { "epoch": 1.4991150442477876, "grad_norm": 1.3805778510027018, "learning_rate": 6.9941026090590705e-06, "loss": 1.0064, "step": 1695 }, { "epoch": 1.5, "grad_norm": 1.1468844773071774, "learning_rate": 6.990886793602268e-06, "loss": 1.1737, "step": 1696 }, { "epoch": 1.5008849557522124, "grad_norm": 1.9779883411813555, "learning_rate": 6.9876699991283926e-06, "loss": 0.9245, "step": 1697 }, { "epoch": 1.5017699115044247, "grad_norm": 1.5891563594677827, "learning_rate": 6.9844522272193005e-06, "loss": 1.0003, "step": 1698 }, { "epoch": 1.5026548672566373, "grad_norm": 1.2851066667163331, "learning_rate": 6.9812334794573285e-06, "loss": 1.096, "step": 1699 }, { "epoch": 1.5035398230088495, "grad_norm": 1.3743619643183806, "learning_rate": 6.978013757425295e-06, "loss": 1.1238, "step": 1700 }, { "epoch": 1.504424778761062, "grad_norm": 1.5455660456292355, "learning_rate": 6.974793062706494e-06, "loss": 1.2267, "step": 1701 }, { "epoch": 1.5053097345132742, "grad_norm": 1.4841400062621997, "learning_rate": 6.9715713968847e-06, "loss": 1.092, "step": 1702 }, { "epoch": 1.5061946902654868, "grad_norm": 1.6843561699230911, "learning_rate": 6.968348761544166e-06, "loss": 1.0013, "step": 1703 }, { "epoch": 1.5070796460176992, "grad_norm": 2.6904545547796697, "learning_rate": 6.965125158269619e-06, "loss": 1.348, "step": 1704 }, { "epoch": 1.5079646017699115, "grad_norm": 1.2592825030775325, "learning_rate": 6.961900588646264e-06, "loss": 1.1058, "step": 1705 }, { "epoch": 1.508849557522124, "grad_norm": 1.6962353680287388, "learning_rate": 6.95867505425978e-06, "loss": 0.908, "step": 1706 }, { "epoch": 1.5097345132743363, "grad_norm": 1.3118048078296733, "learning_rate": 6.955448556696324e-06, "loss": 1.2239, "step": 1707 }, { "epoch": 1.5106194690265486, "grad_norm": 1.155088389172363, "learning_rate": 6.9522210975425186e-06, "loss": 0.8731, "step": 1708 }, { "epoch": 1.511504424778761, "grad_norm": 1.397986373614464, "learning_rate": 6.94899267838547e-06, "loss": 1.1616, "step": 1709 }, { "epoch": 1.5123893805309736, "grad_norm": 1.6717405426058363, "learning_rate": 6.945763300812746e-06, "loss": 1.2254, "step": 1710 }, { "epoch": 1.5132743362831858, "grad_norm": 1.775139910898796, "learning_rate": 6.9425329664123945e-06, "loss": 1.0278, "step": 1711 }, { "epoch": 1.5141592920353983, "grad_norm": 1.4703876078017115, "learning_rate": 6.939301676772927e-06, "loss": 1.2802, "step": 1712 }, { "epoch": 1.5150442477876105, "grad_norm": 1.4566363735293697, "learning_rate": 6.936069433483329e-06, "loss": 1.4485, "step": 1713 }, { "epoch": 1.515929203539823, "grad_norm": 1.254067986201956, "learning_rate": 6.932836238133054e-06, "loss": 0.9822, "step": 1714 }, { "epoch": 1.5168141592920354, "grad_norm": 1.465579458753173, "learning_rate": 6.929602092312023e-06, "loss": 1.1605, "step": 1715 }, { "epoch": 1.5176991150442478, "grad_norm": 1.215884923517938, "learning_rate": 6.926366997610624e-06, "loss": 1.0195, "step": 1716 }, { "epoch": 1.5185840707964602, "grad_norm": 1.3713156539757536, "learning_rate": 6.923130955619714e-06, "loss": 1.0861, "step": 1717 }, { "epoch": 1.5194690265486726, "grad_norm": 2.071029868722044, "learning_rate": 6.919893967930613e-06, "loss": 1.011, "step": 1718 }, { "epoch": 1.520353982300885, "grad_norm": 1.322003072026693, "learning_rate": 6.91665603613511e-06, "loss": 0.8646, "step": 1719 }, { "epoch": 1.5212389380530973, "grad_norm": 1.666122649284126, "learning_rate": 6.913417161825449e-06, "loss": 0.954, "step": 1720 }, { "epoch": 1.5221238938053099, "grad_norm": 1.8991932338527182, "learning_rate": 6.9101773465943504e-06, "loss": 0.9814, "step": 1721 }, { "epoch": 1.523008849557522, "grad_norm": 1.4075909966907736, "learning_rate": 6.906936592034988e-06, "loss": 0.9755, "step": 1722 }, { "epoch": 1.5238938053097346, "grad_norm": 1.4678638392386592, "learning_rate": 6.903694899741001e-06, "loss": 1.177, "step": 1723 }, { "epoch": 1.5247787610619468, "grad_norm": 1.1894427021220964, "learning_rate": 6.90045227130649e-06, "loss": 1.0597, "step": 1724 }, { "epoch": 1.5256637168141594, "grad_norm": 1.390380747612413, "learning_rate": 6.897208708326013e-06, "loss": 1.1948, "step": 1725 }, { "epoch": 1.5265486725663717, "grad_norm": 1.5736196957463655, "learning_rate": 6.893964212394592e-06, "loss": 0.9969, "step": 1726 }, { "epoch": 1.527433628318584, "grad_norm": 1.5083190928645689, "learning_rate": 6.8907187851077026e-06, "loss": 1.3537, "step": 1727 }, { "epoch": 1.5283185840707965, "grad_norm": 2.0014475850260194, "learning_rate": 6.887472428061285e-06, "loss": 1.3608, "step": 1728 }, { "epoch": 1.5292035398230088, "grad_norm": 1.382690238227299, "learning_rate": 6.884225142851729e-06, "loss": 1.072, "step": 1729 }, { "epoch": 1.5300884955752212, "grad_norm": 1.3963098777577863, "learning_rate": 6.880976931075887e-06, "loss": 0.7325, "step": 1730 }, { "epoch": 1.5309734513274336, "grad_norm": 1.7248049129594738, "learning_rate": 6.877727794331063e-06, "loss": 1.2521, "step": 1731 }, { "epoch": 1.5318584070796462, "grad_norm": 1.2738580580236996, "learning_rate": 6.87447773421502e-06, "loss": 1.0975, "step": 1732 }, { "epoch": 1.5327433628318583, "grad_norm": 1.2488391972665274, "learning_rate": 6.8712267523259705e-06, "loss": 1.0389, "step": 1733 }, { "epoch": 1.533628318584071, "grad_norm": 1.2494947880794058, "learning_rate": 6.867974850262582e-06, "loss": 0.8737, "step": 1734 }, { "epoch": 1.534513274336283, "grad_norm": 1.4022893381506154, "learning_rate": 6.864722029623977e-06, "loss": 1.169, "step": 1735 }, { "epoch": 1.5353982300884956, "grad_norm": 1.1977968992508157, "learning_rate": 6.8614682920097265e-06, "loss": 1.0132, "step": 1736 }, { "epoch": 1.536283185840708, "grad_norm": 1.4572715965915193, "learning_rate": 6.858213639019853e-06, "loss": 1.3557, "step": 1737 }, { "epoch": 1.5371681415929204, "grad_norm": 1.5074465664990782, "learning_rate": 6.8549580722548315e-06, "loss": 1.2965, "step": 1738 }, { "epoch": 1.5380530973451327, "grad_norm": 1.5336826378364723, "learning_rate": 6.851701593315581e-06, "loss": 1.3998, "step": 1739 }, { "epoch": 1.538938053097345, "grad_norm": 1.379248647458368, "learning_rate": 6.848444203803476e-06, "loss": 1.0257, "step": 1740 }, { "epoch": 1.5398230088495575, "grad_norm": 2.0348940227323014, "learning_rate": 6.845185905320333e-06, "loss": 1.8584, "step": 1741 }, { "epoch": 1.5407079646017698, "grad_norm": 1.3468316833562146, "learning_rate": 6.8419266994684194e-06, "loss": 1.0442, "step": 1742 }, { "epoch": 1.5415929203539824, "grad_norm": 1.5974622812684782, "learning_rate": 6.838666587850447e-06, "loss": 1.4398, "step": 1743 }, { "epoch": 1.5424778761061946, "grad_norm": 1.3694336719632414, "learning_rate": 6.835405572069572e-06, "loss": 1.088, "step": 1744 }, { "epoch": 1.5433628318584072, "grad_norm": 1.2976284155745827, "learning_rate": 6.832143653729397e-06, "loss": 1.1141, "step": 1745 }, { "epoch": 1.5442477876106193, "grad_norm": 1.2779692182621043, "learning_rate": 6.828880834433969e-06, "loss": 0.9887, "step": 1746 }, { "epoch": 1.545132743362832, "grad_norm": 1.5224402731010447, "learning_rate": 6.825617115787777e-06, "loss": 1.1437, "step": 1747 }, { "epoch": 1.5460176991150443, "grad_norm": 1.5791648663310127, "learning_rate": 6.822352499395751e-06, "loss": 1.2633, "step": 1748 }, { "epoch": 1.5469026548672566, "grad_norm": 1.3874441972730338, "learning_rate": 6.81908698686326e-06, "loss": 0.9843, "step": 1749 }, { "epoch": 1.547787610619469, "grad_norm": 1.190426114843867, "learning_rate": 6.8158205797961265e-06, "loss": 1.0389, "step": 1750 }, { "epoch": 1.5486725663716814, "grad_norm": 1.5098095063470784, "learning_rate": 6.812553279800595e-06, "loss": 1.3026, "step": 1751 }, { "epoch": 1.549557522123894, "grad_norm": 1.520931684942825, "learning_rate": 6.809285088483361e-06, "loss": 1.0424, "step": 1752 }, { "epoch": 1.5504424778761061, "grad_norm": 1.3858371220192116, "learning_rate": 6.8060160074515565e-06, "loss": 1.1679, "step": 1753 }, { "epoch": 1.5513274336283187, "grad_norm": 1.1639726583456287, "learning_rate": 6.802746038312749e-06, "loss": 0.9122, "step": 1754 }, { "epoch": 1.5522123893805309, "grad_norm": 1.4988189238737717, "learning_rate": 6.799475182674942e-06, "loss": 0.8462, "step": 1755 }, { "epoch": 1.5530973451327434, "grad_norm": 1.5852442685535666, "learning_rate": 6.796203442146576e-06, "loss": 0.9857, "step": 1756 }, { "epoch": 1.5539823008849556, "grad_norm": 1.4980594724217684, "learning_rate": 6.792930818336529e-06, "loss": 1.02, "step": 1757 }, { "epoch": 1.5548672566371682, "grad_norm": 1.4109685837788946, "learning_rate": 6.78965731285411e-06, "loss": 1.072, "step": 1758 }, { "epoch": 1.5557522123893806, "grad_norm": 1.2163064916952713, "learning_rate": 6.786382927309064e-06, "loss": 1.0803, "step": 1759 }, { "epoch": 1.556637168141593, "grad_norm": 1.4075713862303119, "learning_rate": 6.783107663311566e-06, "loss": 0.9902, "step": 1760 }, { "epoch": 1.5575221238938053, "grad_norm": 1.5602318394628658, "learning_rate": 6.779831522472226e-06, "loss": 1.0225, "step": 1761 }, { "epoch": 1.5584070796460177, "grad_norm": 1.9713498013180557, "learning_rate": 6.776554506402081e-06, "loss": 1.141, "step": 1762 }, { "epoch": 1.5592920353982302, "grad_norm": 1.8898299129691374, "learning_rate": 6.773276616712605e-06, "loss": 1.2706, "step": 1763 }, { "epoch": 1.5601769911504424, "grad_norm": 1.5258752627251475, "learning_rate": 6.7699978550156954e-06, "loss": 1.3903, "step": 1764 }, { "epoch": 1.561061946902655, "grad_norm": 1.1926210748066364, "learning_rate": 6.76671822292368e-06, "loss": 1.0642, "step": 1765 }, { "epoch": 1.5619469026548671, "grad_norm": 1.252702214850997, "learning_rate": 6.76343772204932e-06, "loss": 1.1811, "step": 1766 }, { "epoch": 1.5628318584070797, "grad_norm": 2.493394926308201, "learning_rate": 6.760156354005794e-06, "loss": 1.0612, "step": 1767 }, { "epoch": 1.563716814159292, "grad_norm": 1.4895962890403447, "learning_rate": 6.7568741204067145e-06, "loss": 1.071, "step": 1768 }, { "epoch": 1.5646017699115045, "grad_norm": 1.5526393612284135, "learning_rate": 6.753591022866117e-06, "loss": 1.4348, "step": 1769 }, { "epoch": 1.5654867256637168, "grad_norm": 1.1816809511866864, "learning_rate": 6.750307062998462e-06, "loss": 0.827, "step": 1770 }, { "epoch": 1.5663716814159292, "grad_norm": 1.2722643749701006, "learning_rate": 6.747022242418636e-06, "loss": 1.0983, "step": 1771 }, { "epoch": 1.5672566371681416, "grad_norm": 3.3738691370038434, "learning_rate": 6.743736562741944e-06, "loss": 1.2208, "step": 1772 }, { "epoch": 1.568141592920354, "grad_norm": 1.2824991729056807, "learning_rate": 6.74045002558412e-06, "loss": 1.1306, "step": 1773 }, { "epoch": 1.5690265486725665, "grad_norm": 1.3059130587594978, "learning_rate": 6.737162632561311e-06, "loss": 1.1084, "step": 1774 }, { "epoch": 1.5699115044247787, "grad_norm": 1.1283184892514588, "learning_rate": 6.733874385290097e-06, "loss": 0.9672, "step": 1775 }, { "epoch": 1.5707964601769913, "grad_norm": 1.427658212433811, "learning_rate": 6.730585285387465e-06, "loss": 1.416, "step": 1776 }, { "epoch": 1.5716814159292034, "grad_norm": 1.5583547389469175, "learning_rate": 6.727295334470831e-06, "loss": 1.0764, "step": 1777 }, { "epoch": 1.572566371681416, "grad_norm": 1.5717339778631823, "learning_rate": 6.724004534158025e-06, "loss": 1.0914, "step": 1778 }, { "epoch": 1.5734513274336284, "grad_norm": 1.5127867473856897, "learning_rate": 6.720712886067295e-06, "loss": 1.2501, "step": 1779 }, { "epoch": 1.5743362831858407, "grad_norm": 1.4030650225197043, "learning_rate": 6.717420391817306e-06, "loss": 1.2275, "step": 1780 }, { "epoch": 1.575221238938053, "grad_norm": 1.2310919519026537, "learning_rate": 6.714127053027142e-06, "loss": 1.01, "step": 1781 }, { "epoch": 1.5761061946902655, "grad_norm": 1.2538046992039225, "learning_rate": 6.710832871316295e-06, "loss": 1.1068, "step": 1782 }, { "epoch": 1.5769911504424778, "grad_norm": 1.3690682148570656, "learning_rate": 6.707537848304682e-06, "loss": 1.0801, "step": 1783 }, { "epoch": 1.5778761061946902, "grad_norm": 1.7812277865458357, "learning_rate": 6.704241985612625e-06, "loss": 1.2772, "step": 1784 }, { "epoch": 1.5787610619469028, "grad_norm": 1.6747541744845102, "learning_rate": 6.7009452848608625e-06, "loss": 1.0625, "step": 1785 }, { "epoch": 1.579646017699115, "grad_norm": 1.4405770737507324, "learning_rate": 6.697647747670545e-06, "loss": 1.2766, "step": 1786 }, { "epoch": 1.5805309734513275, "grad_norm": 1.3504184255286809, "learning_rate": 6.694349375663234e-06, "loss": 1.1038, "step": 1787 }, { "epoch": 1.5814159292035397, "grad_norm": 1.2608187764008019, "learning_rate": 6.691050170460899e-06, "loss": 1.1362, "step": 1788 }, { "epoch": 1.5823008849557523, "grad_norm": 2.2501380465186163, "learning_rate": 6.6877501336859264e-06, "loss": 1.0812, "step": 1789 }, { "epoch": 1.5831858407079646, "grad_norm": 1.3991827299652442, "learning_rate": 6.684449266961101e-06, "loss": 0.9801, "step": 1790 }, { "epoch": 1.584070796460177, "grad_norm": 1.6200730683242919, "learning_rate": 6.6811475719096255e-06, "loss": 1.3921, "step": 1791 }, { "epoch": 1.5849557522123894, "grad_norm": 1.4743757411457485, "learning_rate": 6.6778450501551065e-06, "loss": 1.017, "step": 1792 }, { "epoch": 1.5858407079646017, "grad_norm": 1.569226190784024, "learning_rate": 6.674541703321553e-06, "loss": 1.1321, "step": 1793 }, { "epoch": 1.5867256637168141, "grad_norm": 1.559309826831299, "learning_rate": 6.671237533033388e-06, "loss": 1.0452, "step": 1794 }, { "epoch": 1.5876106194690265, "grad_norm": 5.913319211863944, "learning_rate": 6.667932540915429e-06, "loss": 1.3213, "step": 1795 }, { "epoch": 1.588495575221239, "grad_norm": 1.4467219045344322, "learning_rate": 6.664626728592909e-06, "loss": 1.2875, "step": 1796 }, { "epoch": 1.5893805309734512, "grad_norm": 1.5659419857433576, "learning_rate": 6.661320097691454e-06, "loss": 0.9731, "step": 1797 }, { "epoch": 1.5902654867256638, "grad_norm": 1.923798483268291, "learning_rate": 6.6580126498371016e-06, "loss": 1.0851, "step": 1798 }, { "epoch": 1.591150442477876, "grad_norm": 1.7966858919519586, "learning_rate": 6.654704386656283e-06, "loss": 1.1241, "step": 1799 }, { "epoch": 1.5920353982300885, "grad_norm": 1.3502254062320123, "learning_rate": 6.651395309775837e-06, "loss": 1.1904, "step": 1800 }, { "epoch": 1.592920353982301, "grad_norm": 1.4330089824051528, "learning_rate": 6.6480854208229975e-06, "loss": 1.2199, "step": 1801 }, { "epoch": 1.5938053097345133, "grad_norm": 4.52712692117055, "learning_rate": 6.6447747214254e-06, "loss": 1.008, "step": 1802 }, { "epoch": 1.5946902654867257, "grad_norm": 1.4700018620674002, "learning_rate": 6.641463213211079e-06, "loss": 1.1571, "step": 1803 }, { "epoch": 1.595575221238938, "grad_norm": 1.4234695659240832, "learning_rate": 6.638150897808469e-06, "loss": 1.2463, "step": 1804 }, { "epoch": 1.5964601769911504, "grad_norm": 1.6052475564497741, "learning_rate": 6.634837776846394e-06, "loss": 1.1914, "step": 1805 }, { "epoch": 1.5973451327433628, "grad_norm": 1.741414998563443, "learning_rate": 6.63152385195408e-06, "loss": 1.0745, "step": 1806 }, { "epoch": 1.5982300884955754, "grad_norm": 2.4250440369436275, "learning_rate": 6.628209124761149e-06, "loss": 1.3213, "step": 1807 }, { "epoch": 1.5991150442477875, "grad_norm": 2.2463503776374285, "learning_rate": 6.6248935968976135e-06, "loss": 1.2083, "step": 1808 }, { "epoch": 1.6, "grad_norm": 1.3889113576861785, "learning_rate": 6.621577269993883e-06, "loss": 1.0174, "step": 1809 }, { "epoch": 1.6008849557522122, "grad_norm": 1.4141223871649877, "learning_rate": 6.618260145680758e-06, "loss": 1.0706, "step": 1810 }, { "epoch": 1.6017699115044248, "grad_norm": 1.5845384583726303, "learning_rate": 6.614942225589432e-06, "loss": 1.1757, "step": 1811 }, { "epoch": 1.6026548672566372, "grad_norm": 2.4219987867381305, "learning_rate": 6.611623511351491e-06, "loss": 1.4157, "step": 1812 }, { "epoch": 1.6035398230088496, "grad_norm": 1.3562409066817174, "learning_rate": 6.608304004598908e-06, "loss": 1.0937, "step": 1813 }, { "epoch": 1.604424778761062, "grad_norm": 1.5951578411479876, "learning_rate": 6.60498370696405e-06, "loss": 1.3926, "step": 1814 }, { "epoch": 1.6053097345132743, "grad_norm": 1.3049110776956958, "learning_rate": 6.601662620079669e-06, "loss": 1.0079, "step": 1815 }, { "epoch": 1.606194690265487, "grad_norm": 2.093576652872968, "learning_rate": 6.598340745578908e-06, "loss": 1.0935, "step": 1816 }, { "epoch": 1.607079646017699, "grad_norm": 1.1911008748094758, "learning_rate": 6.595018085095296e-06, "loss": 1.0846, "step": 1817 }, { "epoch": 1.6079646017699116, "grad_norm": 1.4789342201190185, "learning_rate": 6.591694640262749e-06, "loss": 1.3815, "step": 1818 }, { "epoch": 1.6088495575221238, "grad_norm": 1.2507624242132278, "learning_rate": 6.588370412715569e-06, "loss": 0.9775, "step": 1819 }, { "epoch": 1.6097345132743364, "grad_norm": 1.7506716402154565, "learning_rate": 6.585045404088442e-06, "loss": 1.028, "step": 1820 }, { "epoch": 1.6106194690265485, "grad_norm": 1.5087154569500776, "learning_rate": 6.581719616016437e-06, "loss": 1.2152, "step": 1821 }, { "epoch": 1.611504424778761, "grad_norm": 1.2647084894954743, "learning_rate": 6.57839305013501e-06, "loss": 0.974, "step": 1822 }, { "epoch": 1.6123893805309735, "grad_norm": 1.1688436151089654, "learning_rate": 6.575065708079995e-06, "loss": 0.9126, "step": 1823 }, { "epoch": 1.6132743362831858, "grad_norm": 1.473519933499021, "learning_rate": 6.571737591487611e-06, "loss": 1.079, "step": 1824 }, { "epoch": 1.6141592920353982, "grad_norm": 1.4716997576522184, "learning_rate": 6.568408701994459e-06, "loss": 1.1505, "step": 1825 }, { "epoch": 1.6150442477876106, "grad_norm": 1.2724677311061274, "learning_rate": 6.565079041237513e-06, "loss": 0.8965, "step": 1826 }, { "epoch": 1.6159292035398232, "grad_norm": 1.6469106083668825, "learning_rate": 6.561748610854137e-06, "loss": 1.2691, "step": 1827 }, { "epoch": 1.6168141592920353, "grad_norm": 1.8835333204065337, "learning_rate": 6.558417412482062e-06, "loss": 1.2836, "step": 1828 }, { "epoch": 1.617699115044248, "grad_norm": 1.4776444942366334, "learning_rate": 6.555085447759406e-06, "loss": 1.1283, "step": 1829 }, { "epoch": 1.61858407079646, "grad_norm": 1.5904110980357644, "learning_rate": 6.55175271832466e-06, "loss": 1.2448, "step": 1830 }, { "epoch": 1.6194690265486726, "grad_norm": 1.7023564854246196, "learning_rate": 6.548419225816689e-06, "loss": 1.7171, "step": 1831 }, { "epoch": 1.620353982300885, "grad_norm": 1.5655146126092034, "learning_rate": 6.545084971874738e-06, "loss": 1.0751, "step": 1832 }, { "epoch": 1.6212389380530974, "grad_norm": 3.8561474040961636, "learning_rate": 6.541749958138421e-06, "loss": 1.0865, "step": 1833 }, { "epoch": 1.6221238938053097, "grad_norm": 1.343121857436066, "learning_rate": 6.53841418624773e-06, "loss": 0.9326, "step": 1834 }, { "epoch": 1.6230088495575221, "grad_norm": 1.3099798820012654, "learning_rate": 6.53507765784303e-06, "loss": 0.8518, "step": 1835 }, { "epoch": 1.6238938053097345, "grad_norm": 1.211709051582145, "learning_rate": 6.531740374565053e-06, "loss": 1.0201, "step": 1836 }, { "epoch": 1.6247787610619469, "grad_norm": 1.9142402206896911, "learning_rate": 6.528402338054909e-06, "loss": 1.177, "step": 1837 }, { "epoch": 1.6256637168141594, "grad_norm": 1.9179627894220406, "learning_rate": 6.52506354995407e-06, "loss": 1.2805, "step": 1838 }, { "epoch": 1.6265486725663716, "grad_norm": 1.4073485954353435, "learning_rate": 6.521724011904387e-06, "loss": 1.1833, "step": 1839 }, { "epoch": 1.6274336283185842, "grad_norm": 1.8854843519450655, "learning_rate": 6.518383725548074e-06, "loss": 1.228, "step": 1840 }, { "epoch": 1.6283185840707963, "grad_norm": 1.2711375698684586, "learning_rate": 6.515042692527714e-06, "loss": 1.0393, "step": 1841 }, { "epoch": 1.629203539823009, "grad_norm": 1.2943774384901534, "learning_rate": 6.511700914486258e-06, "loss": 1.2912, "step": 1842 }, { "epoch": 1.6300884955752213, "grad_norm": 2.9479790357561133, "learning_rate": 6.508358393067023e-06, "loss": 1.1726, "step": 1843 }, { "epoch": 1.6309734513274337, "grad_norm": 1.3442767420992237, "learning_rate": 6.505015129913689e-06, "loss": 1.0902, "step": 1844 }, { "epoch": 1.631858407079646, "grad_norm": 1.3313160379574194, "learning_rate": 6.501671126670307e-06, "loss": 1.1681, "step": 1845 }, { "epoch": 1.6327433628318584, "grad_norm": 1.3489036669308714, "learning_rate": 6.4983263849812835e-06, "loss": 1.074, "step": 1846 }, { "epoch": 1.6336283185840708, "grad_norm": 1.4394766298657153, "learning_rate": 6.494980906491397e-06, "loss": 1.1803, "step": 1847 }, { "epoch": 1.6345132743362831, "grad_norm": 1.7318139303246782, "learning_rate": 6.491634692845781e-06, "loss": 1.4344, "step": 1848 }, { "epoch": 1.6353982300884957, "grad_norm": 3.2054922093329146, "learning_rate": 6.488287745689936e-06, "loss": 1.2719, "step": 1849 }, { "epoch": 1.6362831858407079, "grad_norm": 2.1684446771475394, "learning_rate": 6.484940066669718e-06, "loss": 1.2247, "step": 1850 }, { "epoch": 1.6371681415929205, "grad_norm": 1.3102105766074434, "learning_rate": 6.481591657431349e-06, "loss": 1.072, "step": 1851 }, { "epoch": 1.6380530973451326, "grad_norm": 1.5194460562841186, "learning_rate": 6.478242519621403e-06, "loss": 1.0821, "step": 1852 }, { "epoch": 1.6389380530973452, "grad_norm": 1.3740178498701168, "learning_rate": 6.474892654886819e-06, "loss": 1.2535, "step": 1853 }, { "epoch": 1.6398230088495576, "grad_norm": 1.2520059994850548, "learning_rate": 6.4715420648748875e-06, "loss": 1.0707, "step": 1854 }, { "epoch": 1.64070796460177, "grad_norm": 1.443917638728632, "learning_rate": 6.468190751233262e-06, "loss": 1.1524, "step": 1855 }, { "epoch": 1.6415929203539823, "grad_norm": 1.2640297960598332, "learning_rate": 6.464838715609945e-06, "loss": 0.9478, "step": 1856 }, { "epoch": 1.6424778761061947, "grad_norm": 1.5809516229261724, "learning_rate": 6.4614859596533016e-06, "loss": 1.0488, "step": 1857 }, { "epoch": 1.643362831858407, "grad_norm": 1.5558596922977843, "learning_rate": 6.458132485012043e-06, "loss": 1.2005, "step": 1858 }, { "epoch": 1.6442477876106194, "grad_norm": 1.3078076540035928, "learning_rate": 6.45477829333524e-06, "loss": 1.1804, "step": 1859 }, { "epoch": 1.645132743362832, "grad_norm": 2.9137933310196025, "learning_rate": 6.451423386272312e-06, "loss": 1.0654, "step": 1860 }, { "epoch": 1.6460176991150441, "grad_norm": 1.6721165081627767, "learning_rate": 6.448067765473034e-06, "loss": 1.1315, "step": 1861 }, { "epoch": 1.6469026548672567, "grad_norm": 1.6970234798764796, "learning_rate": 6.44471143258753e-06, "loss": 1.2004, "step": 1862 }, { "epoch": 1.6477876106194689, "grad_norm": 1.3075794865693855, "learning_rate": 6.441354389266274e-06, "loss": 1.1217, "step": 1863 }, { "epoch": 1.6486725663716815, "grad_norm": 1.2407257781937164, "learning_rate": 6.437996637160086e-06, "loss": 0.948, "step": 1864 }, { "epoch": 1.6495575221238938, "grad_norm": 1.2058068312683896, "learning_rate": 6.434638177920144e-06, "loss": 0.7935, "step": 1865 }, { "epoch": 1.6504424778761062, "grad_norm": 1.3997481556075566, "learning_rate": 6.431279013197964e-06, "loss": 1.0496, "step": 1866 }, { "epoch": 1.6513274336283186, "grad_norm": 1.452159421369976, "learning_rate": 6.427919144645411e-06, "loss": 1.0119, "step": 1867 }, { "epoch": 1.652212389380531, "grad_norm": 1.3533471651150706, "learning_rate": 6.424558573914704e-06, "loss": 1.2938, "step": 1868 }, { "epoch": 1.6530973451327433, "grad_norm": 1.4826916189641086, "learning_rate": 6.4211973026583955e-06, "loss": 1.2362, "step": 1869 }, { "epoch": 1.6539823008849557, "grad_norm": 1.2699888499151613, "learning_rate": 6.417835332529389e-06, "loss": 0.9622, "step": 1870 }, { "epoch": 1.6548672566371683, "grad_norm": 1.4234610965358907, "learning_rate": 6.4144726651809334e-06, "loss": 1.2269, "step": 1871 }, { "epoch": 1.6557522123893804, "grad_norm": 1.4932823885498083, "learning_rate": 6.411109302266616e-06, "loss": 0.9654, "step": 1872 }, { "epoch": 1.656637168141593, "grad_norm": 1.2804708055160354, "learning_rate": 6.4077452454403675e-06, "loss": 0.8433, "step": 1873 }, { "epoch": 1.6575221238938052, "grad_norm": 1.3277863712159683, "learning_rate": 6.4043804963564616e-06, "loss": 1.0782, "step": 1874 }, { "epoch": 1.6584070796460177, "grad_norm": 1.1750181267266764, "learning_rate": 6.401015056669508e-06, "loss": 1.0484, "step": 1875 }, { "epoch": 1.6592920353982301, "grad_norm": 1.3254324333798198, "learning_rate": 6.397648928034466e-06, "loss": 1.0073, "step": 1876 }, { "epoch": 1.6601769911504425, "grad_norm": 1.1534460665787294, "learning_rate": 6.394282112106619e-06, "loss": 0.9876, "step": 1877 }, { "epoch": 1.6610619469026549, "grad_norm": 1.2478577534815047, "learning_rate": 6.3909146105416e-06, "loss": 1.1389, "step": 1878 }, { "epoch": 1.6619469026548672, "grad_norm": 1.3029923648217427, "learning_rate": 6.387546424995376e-06, "loss": 0.901, "step": 1879 }, { "epoch": 1.6628318584070798, "grad_norm": 1.4507010805703466, "learning_rate": 6.384177557124247e-06, "loss": 1.1828, "step": 1880 }, { "epoch": 1.663716814159292, "grad_norm": 1.2525058486776584, "learning_rate": 6.3808080085848544e-06, "loss": 1.13, "step": 1881 }, { "epoch": 1.6646017699115045, "grad_norm": 1.4037638512596984, "learning_rate": 6.377437781034169e-06, "loss": 0.9937, "step": 1882 }, { "epoch": 1.6654867256637167, "grad_norm": 1.4480123466775268, "learning_rate": 6.374066876129496e-06, "loss": 1.1145, "step": 1883 }, { "epoch": 1.6663716814159293, "grad_norm": 1.273087209191234, "learning_rate": 6.370695295528482e-06, "loss": 1.1187, "step": 1884 }, { "epoch": 1.6672566371681414, "grad_norm": 1.2284996223616844, "learning_rate": 6.36732304088909e-06, "loss": 1.0916, "step": 1885 }, { "epoch": 1.668141592920354, "grad_norm": 1.304926452783493, "learning_rate": 6.363950113869634e-06, "loss": 1.1446, "step": 1886 }, { "epoch": 1.6690265486725664, "grad_norm": 1.0707199596598358, "learning_rate": 6.360576516128738e-06, "loss": 0.8559, "step": 1887 }, { "epoch": 1.6699115044247788, "grad_norm": 1.8571006168339648, "learning_rate": 6.3572022493253715e-06, "loss": 0.9581, "step": 1888 }, { "epoch": 1.6707964601769911, "grad_norm": 1.3050448068139715, "learning_rate": 6.353827315118828e-06, "loss": 1.2642, "step": 1889 }, { "epoch": 1.6716814159292035, "grad_norm": 1.3961022415693363, "learning_rate": 6.350451715168728e-06, "loss": 1.2079, "step": 1890 }, { "epoch": 1.672566371681416, "grad_norm": 1.321499642020295, "learning_rate": 6.347075451135019e-06, "loss": 1.0368, "step": 1891 }, { "epoch": 1.6734513274336282, "grad_norm": 1.1478786698733205, "learning_rate": 6.343698524677979e-06, "loss": 1.0384, "step": 1892 }, { "epoch": 1.6743362831858408, "grad_norm": 1.1942298916532608, "learning_rate": 6.340320937458205e-06, "loss": 1.1402, "step": 1893 }, { "epoch": 1.675221238938053, "grad_norm": 1.415546333328266, "learning_rate": 6.3369426911366275e-06, "loss": 1.1226, "step": 1894 }, { "epoch": 1.6761061946902656, "grad_norm": 1.714160179401008, "learning_rate": 6.333563787374493e-06, "loss": 1.0526, "step": 1895 }, { "epoch": 1.676991150442478, "grad_norm": 1.2135594995938834, "learning_rate": 6.330184227833376e-06, "loss": 0.9296, "step": 1896 }, { "epoch": 1.6778761061946903, "grad_norm": 1.3526774888088589, "learning_rate": 6.326804014175174e-06, "loss": 1.1063, "step": 1897 }, { "epoch": 1.6787610619469027, "grad_norm": 1.4826211557971467, "learning_rate": 6.323423148062101e-06, "loss": 0.9883, "step": 1898 }, { "epoch": 1.679646017699115, "grad_norm": 1.6066589770416433, "learning_rate": 6.320041631156699e-06, "loss": 1.4459, "step": 1899 }, { "epoch": 1.6805309734513274, "grad_norm": 1.402793999145189, "learning_rate": 6.3166594651218235e-06, "loss": 0.907, "step": 1900 }, { "epoch": 1.6814159292035398, "grad_norm": 1.323900112920338, "learning_rate": 6.313276651620655e-06, "loss": 1.0786, "step": 1901 }, { "epoch": 1.6823008849557524, "grad_norm": 1.531459033333727, "learning_rate": 6.309893192316687e-06, "loss": 1.3009, "step": 1902 }, { "epoch": 1.6831858407079645, "grad_norm": 3.66863714797542, "learning_rate": 6.306509088873736e-06, "loss": 1.1575, "step": 1903 }, { "epoch": 1.684070796460177, "grad_norm": 1.6878842941144534, "learning_rate": 6.303124342955928e-06, "loss": 0.9069, "step": 1904 }, { "epoch": 1.6849557522123892, "grad_norm": 1.5343538854652559, "learning_rate": 6.299738956227712e-06, "loss": 1.2029, "step": 1905 }, { "epoch": 1.6858407079646018, "grad_norm": 1.6621155265589302, "learning_rate": 6.296352930353848e-06, "loss": 1.3828, "step": 1906 }, { "epoch": 1.6867256637168142, "grad_norm": 1.4089519837707096, "learning_rate": 6.292966266999414e-06, "loss": 1.0421, "step": 1907 }, { "epoch": 1.6876106194690266, "grad_norm": 1.3590176363975837, "learning_rate": 6.289578967829796e-06, "loss": 1.2296, "step": 1908 }, { "epoch": 1.688495575221239, "grad_norm": 1.5098629526879495, "learning_rate": 6.2861910345107e-06, "loss": 1.1373, "step": 1909 }, { "epoch": 1.6893805309734513, "grad_norm": 1.3665924526610589, "learning_rate": 6.282802468708133e-06, "loss": 1.0097, "step": 1910 }, { "epoch": 1.6902654867256637, "grad_norm": 2.7990750706818273, "learning_rate": 6.279413272088427e-06, "loss": 1.1768, "step": 1911 }, { "epoch": 1.691150442477876, "grad_norm": 3.151543498150245, "learning_rate": 6.276023446318214e-06, "loss": 1.1366, "step": 1912 }, { "epoch": 1.6920353982300886, "grad_norm": 2.750422081081138, "learning_rate": 6.272632993064436e-06, "loss": 1.1185, "step": 1913 }, { "epoch": 1.6929203539823008, "grad_norm": 1.3156819605181354, "learning_rate": 6.269241913994348e-06, "loss": 1.0623, "step": 1914 }, { "epoch": 1.6938053097345134, "grad_norm": 1.1382232873604632, "learning_rate": 6.26585021077551e-06, "loss": 0.9791, "step": 1915 }, { "epoch": 1.6946902654867255, "grad_norm": 1.3634768235827626, "learning_rate": 6.26245788507579e-06, "loss": 1.1544, "step": 1916 }, { "epoch": 1.6955752212389381, "grad_norm": 1.388223382408758, "learning_rate": 6.259064938563363e-06, "loss": 1.1677, "step": 1917 }, { "epoch": 1.6964601769911505, "grad_norm": 1.4997298664719934, "learning_rate": 6.255671372906704e-06, "loss": 1.3819, "step": 1918 }, { "epoch": 1.6973451327433628, "grad_norm": 1.3285738827291105, "learning_rate": 6.252277189774599e-06, "loss": 1.0083, "step": 1919 }, { "epoch": 1.6982300884955752, "grad_norm": 1.7468034694792802, "learning_rate": 6.248882390836135e-06, "loss": 1.1744, "step": 1920 }, { "epoch": 1.6991150442477876, "grad_norm": 1.1879312652750522, "learning_rate": 6.245486977760702e-06, "loss": 1.052, "step": 1921 }, { "epoch": 1.7, "grad_norm": 2.3070904695728824, "learning_rate": 6.242090952217989e-06, "loss": 1.097, "step": 1922 }, { "epoch": 1.7008849557522123, "grad_norm": 1.5597545160887962, "learning_rate": 6.238694315877994e-06, "loss": 1.4605, "step": 1923 }, { "epoch": 1.701769911504425, "grad_norm": 1.443904262867291, "learning_rate": 6.235297070411004e-06, "loss": 1.1464, "step": 1924 }, { "epoch": 1.702654867256637, "grad_norm": 2.166792977149365, "learning_rate": 6.2318992174876195e-06, "loss": 1.423, "step": 1925 }, { "epoch": 1.7035398230088497, "grad_norm": 1.8740093094404386, "learning_rate": 6.228500758778724e-06, "loss": 1.13, "step": 1926 }, { "epoch": 1.7044247787610618, "grad_norm": 1.9090853610168603, "learning_rate": 6.225101695955513e-06, "loss": 1.3589, "step": 1927 }, { "epoch": 1.7053097345132744, "grad_norm": 1.862428436069772, "learning_rate": 6.2217020306894705e-06, "loss": 1.0841, "step": 1928 }, { "epoch": 1.7061946902654868, "grad_norm": 1.351531508383671, "learning_rate": 6.21830176465238e-06, "loss": 1.0348, "step": 1929 }, { "epoch": 1.7070796460176991, "grad_norm": 1.5728885622289188, "learning_rate": 6.21490089951632e-06, "loss": 1.1623, "step": 1930 }, { "epoch": 1.7079646017699115, "grad_norm": 1.2346605018835892, "learning_rate": 6.2114994369536615e-06, "loss": 0.9699, "step": 1931 }, { "epoch": 1.7088495575221239, "grad_norm": 1.3118617904587833, "learning_rate": 6.208097378637076e-06, "loss": 1.1835, "step": 1932 }, { "epoch": 1.7097345132743362, "grad_norm": 1.2595858527109864, "learning_rate": 6.204694726239516e-06, "loss": 1.0363, "step": 1933 }, { "epoch": 1.7106194690265486, "grad_norm": 1.3387276596303066, "learning_rate": 6.201291481434238e-06, "loss": 1.1117, "step": 1934 }, { "epoch": 1.7115044247787612, "grad_norm": 1.302585574163565, "learning_rate": 6.197887645894785e-06, "loss": 0.9032, "step": 1935 }, { "epoch": 1.7123893805309733, "grad_norm": 1.3335558182896452, "learning_rate": 6.194483221294989e-06, "loss": 1.261, "step": 1936 }, { "epoch": 1.713274336283186, "grad_norm": 1.3547661134458633, "learning_rate": 6.191078209308974e-06, "loss": 1.0742, "step": 1937 }, { "epoch": 1.714159292035398, "grad_norm": 2.5406269634458925, "learning_rate": 6.187672611611152e-06, "loss": 1.2151, "step": 1938 }, { "epoch": 1.7150442477876107, "grad_norm": 1.3156432472583326, "learning_rate": 6.184266429876221e-06, "loss": 1.0789, "step": 1939 }, { "epoch": 1.715929203539823, "grad_norm": 1.7736213447905849, "learning_rate": 6.180859665779173e-06, "loss": 1.3524, "step": 1940 }, { "epoch": 1.7168141592920354, "grad_norm": 1.4867228802960677, "learning_rate": 6.177452320995276e-06, "loss": 1.4205, "step": 1941 }, { "epoch": 1.7176991150442478, "grad_norm": 1.7669487529824002, "learning_rate": 6.174044397200094e-06, "loss": 0.9505, "step": 1942 }, { "epoch": 1.7185840707964601, "grad_norm": 1.344281787257617, "learning_rate": 6.170635896069468e-06, "loss": 1.1387, "step": 1943 }, { "epoch": 1.7194690265486727, "grad_norm": 1.4068558339729154, "learning_rate": 6.1672268192795285e-06, "loss": 1.1209, "step": 1944 }, { "epoch": 1.7203539823008849, "grad_norm": 1.9934259650751533, "learning_rate": 6.163817168506683e-06, "loss": 1.2156, "step": 1945 }, { "epoch": 1.7212389380530975, "grad_norm": 2.0090246950791766, "learning_rate": 6.1604069454276276e-06, "loss": 1.2488, "step": 1946 }, { "epoch": 1.7221238938053096, "grad_norm": 1.3783382238682218, "learning_rate": 6.156996151719334e-06, "loss": 1.0385, "step": 1947 }, { "epoch": 1.7230088495575222, "grad_norm": 1.1186518374143497, "learning_rate": 6.1535847890590615e-06, "loss": 0.6944, "step": 1948 }, { "epoch": 1.7238938053097344, "grad_norm": 2.616361466393032, "learning_rate": 6.150172859124342e-06, "loss": 1.105, "step": 1949 }, { "epoch": 1.724778761061947, "grad_norm": 1.123606138439773, "learning_rate": 6.14676036359299e-06, "loss": 0.8579, "step": 1950 }, { "epoch": 1.7256637168141593, "grad_norm": 1.2566631435156734, "learning_rate": 6.143347304143098e-06, "loss": 1.164, "step": 1951 }, { "epoch": 1.7265486725663717, "grad_norm": 1.7021871484120528, "learning_rate": 6.139933682453035e-06, "loss": 1.1265, "step": 1952 }, { "epoch": 1.727433628318584, "grad_norm": 1.3250619920697342, "learning_rate": 6.136519500201448e-06, "loss": 0.8247, "step": 1953 }, { "epoch": 1.7283185840707964, "grad_norm": 1.3250222847958806, "learning_rate": 6.133104759067257e-06, "loss": 1.2056, "step": 1954 }, { "epoch": 1.729203539823009, "grad_norm": 1.2296885552622316, "learning_rate": 6.12968946072966e-06, "loss": 1.0303, "step": 1955 }, { "epoch": 1.7300884955752212, "grad_norm": 2.3468401378258634, "learning_rate": 6.126273606868125e-06, "loss": 0.9746, "step": 1956 }, { "epoch": 1.7309734513274337, "grad_norm": 1.4626378647562943, "learning_rate": 6.122857199162396e-06, "loss": 1.1039, "step": 1957 }, { "epoch": 1.731858407079646, "grad_norm": 1.3884256743460635, "learning_rate": 6.119440239292493e-06, "loss": 0.999, "step": 1958 }, { "epoch": 1.7327433628318585, "grad_norm": 1.2381645243711203, "learning_rate": 6.116022728938699e-06, "loss": 0.8944, "step": 1959 }, { "epoch": 1.7336283185840708, "grad_norm": 1.237422568618946, "learning_rate": 6.112604669781572e-06, "loss": 1.0222, "step": 1960 }, { "epoch": 1.7345132743362832, "grad_norm": 1.9412518090998376, "learning_rate": 6.109186063501944e-06, "loss": 1.1483, "step": 1961 }, { "epoch": 1.7353982300884956, "grad_norm": 1.5343914970061991, "learning_rate": 6.1057669117809085e-06, "loss": 1.0745, "step": 1962 }, { "epoch": 1.736283185840708, "grad_norm": 1.3770567833980074, "learning_rate": 6.102347216299831e-06, "loss": 1.1854, "step": 1963 }, { "epoch": 1.7371681415929203, "grad_norm": 1.3727435676202975, "learning_rate": 6.0989269787403445e-06, "loss": 1.1644, "step": 1964 }, { "epoch": 1.7380530973451327, "grad_norm": 1.4068009717030776, "learning_rate": 6.095506200784349e-06, "loss": 1.0571, "step": 1965 }, { "epoch": 1.7389380530973453, "grad_norm": 1.3274066679494794, "learning_rate": 6.09208488411401e-06, "loss": 1.103, "step": 1966 }, { "epoch": 1.7398230088495574, "grad_norm": 1.2077110057046003, "learning_rate": 6.088663030411757e-06, "loss": 1.0316, "step": 1967 }, { "epoch": 1.74070796460177, "grad_norm": 1.2704788906318427, "learning_rate": 6.085240641360281e-06, "loss": 1.1166, "step": 1968 }, { "epoch": 1.7415929203539822, "grad_norm": 1.1081960924594296, "learning_rate": 6.081817718642544e-06, "loss": 0.8862, "step": 1969 }, { "epoch": 1.7424778761061948, "grad_norm": 1.1366572889592819, "learning_rate": 6.078394263941762e-06, "loss": 0.8927, "step": 1970 }, { "epoch": 1.7433628318584071, "grad_norm": 1.3455056031685806, "learning_rate": 6.074970278941419e-06, "loss": 1.0737, "step": 1971 }, { "epoch": 1.7442477876106195, "grad_norm": 1.5606401651716553, "learning_rate": 6.071545765325254e-06, "loss": 1.2154, "step": 1972 }, { "epoch": 1.7451327433628319, "grad_norm": 1.802826332595244, "learning_rate": 6.068120724777271e-06, "loss": 0.9895, "step": 1973 }, { "epoch": 1.7460176991150442, "grad_norm": 2.5880397897010097, "learning_rate": 6.064695158981732e-06, "loss": 1.3371, "step": 1974 }, { "epoch": 1.7469026548672566, "grad_norm": 1.3353133631250746, "learning_rate": 6.061269069623154e-06, "loss": 1.1084, "step": 1975 }, { "epoch": 1.747787610619469, "grad_norm": 1.3319387856276685, "learning_rate": 6.057842458386315e-06, "loss": 1.1553, "step": 1976 }, { "epoch": 1.7486725663716816, "grad_norm": 1.197788786290059, "learning_rate": 6.054415326956249e-06, "loss": 0.9994, "step": 1977 }, { "epoch": 1.7495575221238937, "grad_norm": 1.295676580426587, "learning_rate": 6.0509876770182445e-06, "loss": 1.1565, "step": 1978 }, { "epoch": 1.7504424778761063, "grad_norm": 2.1299792525094023, "learning_rate": 6.0475595102578455e-06, "loss": 1.2302, "step": 1979 }, { "epoch": 1.7513274336283184, "grad_norm": 1.402962868686464, "learning_rate": 6.04413082836085e-06, "loss": 1.2245, "step": 1980 }, { "epoch": 1.752212389380531, "grad_norm": 1.2560407509179239, "learning_rate": 6.040701633013313e-06, "loss": 1.0118, "step": 1981 }, { "epoch": 1.7530973451327434, "grad_norm": 1.8265210186780048, "learning_rate": 6.037271925901533e-06, "loss": 1.1241, "step": 1982 }, { "epoch": 1.7539823008849558, "grad_norm": 1.271218638071204, "learning_rate": 6.033841708712072e-06, "loss": 1.0682, "step": 1983 }, { "epoch": 1.7548672566371681, "grad_norm": 1.35644604059254, "learning_rate": 6.030410983131733e-06, "loss": 1.0642, "step": 1984 }, { "epoch": 1.7557522123893805, "grad_norm": 1.247334815296304, "learning_rate": 6.026979750847572e-06, "loss": 1.073, "step": 1985 }, { "epoch": 1.7566371681415929, "grad_norm": 4.543122731607081, "learning_rate": 6.023548013546899e-06, "loss": 1.4003, "step": 1986 }, { "epoch": 1.7575221238938052, "grad_norm": 1.3397722279135045, "learning_rate": 6.020115772917266e-06, "loss": 1.0657, "step": 1987 }, { "epoch": 1.7584070796460178, "grad_norm": 1.3504273240023115, "learning_rate": 6.016683030646471e-06, "loss": 1.0735, "step": 1988 }, { "epoch": 1.75929203539823, "grad_norm": 1.2691971653414413, "learning_rate": 6.0132497884225696e-06, "loss": 1.0583, "step": 1989 }, { "epoch": 1.7601769911504426, "grad_norm": 1.5100597516961622, "learning_rate": 6.009816047933849e-06, "loss": 1.2756, "step": 1990 }, { "epoch": 1.7610619469026547, "grad_norm": 1.455956363781396, "learning_rate": 6.006381810868853e-06, "loss": 1.2111, "step": 1991 }, { "epoch": 1.7619469026548673, "grad_norm": 1.0762976511054356, "learning_rate": 6.002947078916365e-06, "loss": 0.856, "step": 1992 }, { "epoch": 1.7628318584070797, "grad_norm": 2.714543425713841, "learning_rate": 5.99951185376541e-06, "loss": 1.1639, "step": 1993 }, { "epoch": 1.763716814159292, "grad_norm": 1.2743123773281753, "learning_rate": 5.996076137105258e-06, "loss": 1.1488, "step": 1994 }, { "epoch": 1.7646017699115044, "grad_norm": 1.447330529756102, "learning_rate": 5.992639930625419e-06, "loss": 0.9665, "step": 1995 }, { "epoch": 1.7654867256637168, "grad_norm": 1.2846890670302769, "learning_rate": 5.989203236015647e-06, "loss": 0.9829, "step": 1996 }, { "epoch": 1.7663716814159292, "grad_norm": 1.3131528010945597, "learning_rate": 5.985766054965934e-06, "loss": 1.0192, "step": 1997 }, { "epoch": 1.7672566371681415, "grad_norm": 1.4675670218462338, "learning_rate": 5.982328389166509e-06, "loss": 1.1114, "step": 1998 }, { "epoch": 1.768141592920354, "grad_norm": 1.8012927196601123, "learning_rate": 5.978890240307843e-06, "loss": 1.3107, "step": 1999 }, { "epoch": 1.7690265486725663, "grad_norm": 7.4520777601368655, "learning_rate": 5.975451610080643e-06, "loss": 1.1966, "step": 2000 }, { "epoch": 1.7699115044247788, "grad_norm": 1.251928224271009, "learning_rate": 5.97201250017585e-06, "loss": 0.8161, "step": 2001 }, { "epoch": 1.770796460176991, "grad_norm": 1.714533741841932, "learning_rate": 5.96857291228465e-06, "loss": 1.2229, "step": 2002 }, { "epoch": 1.7716814159292036, "grad_norm": 1.3528483081330802, "learning_rate": 5.965132848098453e-06, "loss": 1.0487, "step": 2003 }, { "epoch": 1.772566371681416, "grad_norm": 1.8766356829228719, "learning_rate": 5.9616923093089095e-06, "loss": 1.2369, "step": 2004 }, { "epoch": 1.7734513274336283, "grad_norm": 1.292276091399248, "learning_rate": 5.9582512976078995e-06, "loss": 0.9788, "step": 2005 }, { "epoch": 1.7743362831858407, "grad_norm": 1.3451380958042802, "learning_rate": 5.954809814687541e-06, "loss": 1.1638, "step": 2006 }, { "epoch": 1.775221238938053, "grad_norm": 1.4232781592132715, "learning_rate": 5.95136786224018e-06, "loss": 1.1518, "step": 2007 }, { "epoch": 1.7761061946902656, "grad_norm": 1.3141255070409357, "learning_rate": 5.947925441958393e-06, "loss": 1.0559, "step": 2008 }, { "epoch": 1.7769911504424778, "grad_norm": 1.2142903646683132, "learning_rate": 5.9444825555349885e-06, "loss": 0.9955, "step": 2009 }, { "epoch": 1.7778761061946904, "grad_norm": 1.442258817065768, "learning_rate": 5.941039204663001e-06, "loss": 1.294, "step": 2010 }, { "epoch": 1.7787610619469025, "grad_norm": 1.1663973260867875, "learning_rate": 5.937595391035699e-06, "loss": 0.9539, "step": 2011 }, { "epoch": 1.7796460176991151, "grad_norm": 1.4223980739556839, "learning_rate": 5.934151116346574e-06, "loss": 1.1598, "step": 2012 }, { "epoch": 1.7805309734513273, "grad_norm": 1.3756658822230905, "learning_rate": 5.9307063822893425e-06, "loss": 1.1065, "step": 2013 }, { "epoch": 1.7814159292035399, "grad_norm": 1.4720961452054355, "learning_rate": 5.927261190557955e-06, "loss": 1.199, "step": 2014 }, { "epoch": 1.7823008849557522, "grad_norm": 1.4732895353880187, "learning_rate": 5.9238155428465796e-06, "loss": 1.073, "step": 2015 }, { "epoch": 1.7831858407079646, "grad_norm": 1.323160951413285, "learning_rate": 5.920369440849609e-06, "loss": 1.162, "step": 2016 }, { "epoch": 1.784070796460177, "grad_norm": 1.476306359254701, "learning_rate": 5.916922886261665e-06, "loss": 0.9746, "step": 2017 }, { "epoch": 1.7849557522123893, "grad_norm": 1.7310846875312504, "learning_rate": 5.913475880777585e-06, "loss": 1.1656, "step": 2018 }, { "epoch": 1.785840707964602, "grad_norm": 1.3174861724288793, "learning_rate": 5.910028426092432e-06, "loss": 1.128, "step": 2019 }, { "epoch": 1.786725663716814, "grad_norm": 1.5485445060703085, "learning_rate": 5.906580523901493e-06, "loss": 1.0982, "step": 2020 }, { "epoch": 1.7876106194690267, "grad_norm": 1.2800680752004143, "learning_rate": 5.903132175900264e-06, "loss": 1.0845, "step": 2021 }, { "epoch": 1.7884955752212388, "grad_norm": 1.5080753911169362, "learning_rate": 5.899683383784474e-06, "loss": 1.1171, "step": 2022 }, { "epoch": 1.7893805309734514, "grad_norm": 1.4596292356772023, "learning_rate": 5.896234149250061e-06, "loss": 1.0853, "step": 2023 }, { "epoch": 1.7902654867256638, "grad_norm": 1.5329667542413037, "learning_rate": 5.892784473993184e-06, "loss": 1.0914, "step": 2024 }, { "epoch": 1.7911504424778761, "grad_norm": 1.7751926872560178, "learning_rate": 5.889334359710218e-06, "loss": 1.2138, "step": 2025 }, { "epoch": 1.7920353982300885, "grad_norm": 1.3523600609454725, "learning_rate": 5.885883808097756e-06, "loss": 1.1341, "step": 2026 }, { "epoch": 1.7929203539823009, "grad_norm": 1.3159691760532117, "learning_rate": 5.8824328208525995e-06, "loss": 1.167, "step": 2027 }, { "epoch": 1.7938053097345132, "grad_norm": 1.5177700588078065, "learning_rate": 5.878981399671774e-06, "loss": 1.1228, "step": 2028 }, { "epoch": 1.7946902654867256, "grad_norm": 1.5573193437464115, "learning_rate": 5.87552954625251e-06, "loss": 1.1248, "step": 2029 }, { "epoch": 1.7955752212389382, "grad_norm": 1.7649590469972558, "learning_rate": 5.872077262292255e-06, "loss": 1.2519, "step": 2030 }, { "epoch": 1.7964601769911503, "grad_norm": 1.4124448574895805, "learning_rate": 5.868624549488666e-06, "loss": 0.7633, "step": 2031 }, { "epoch": 1.797345132743363, "grad_norm": 2.128498707477577, "learning_rate": 5.865171409539614e-06, "loss": 1.0327, "step": 2032 }, { "epoch": 1.798230088495575, "grad_norm": 1.281225182714856, "learning_rate": 5.861717844143177e-06, "loss": 1.0229, "step": 2033 }, { "epoch": 1.7991150442477877, "grad_norm": 1.2465720690236883, "learning_rate": 5.858263854997642e-06, "loss": 1.0338, "step": 2034 }, { "epoch": 1.8, "grad_norm": 1.2708527656294437, "learning_rate": 5.8548094438015065e-06, "loss": 1.2575, "step": 2035 }, { "epoch": 1.8008849557522124, "grad_norm": 1.761437795061028, "learning_rate": 5.851354612253475e-06, "loss": 1.2158, "step": 2036 }, { "epoch": 1.8017699115044248, "grad_norm": 1.7295891400443042, "learning_rate": 5.847899362052457e-06, "loss": 1.3864, "step": 2037 }, { "epoch": 1.8026548672566372, "grad_norm": 1.4104354507027206, "learning_rate": 5.844443694897571e-06, "loss": 0.9042, "step": 2038 }, { "epoch": 1.8035398230088495, "grad_norm": 1.268929051383329, "learning_rate": 5.840987612488138e-06, "loss": 1.2156, "step": 2039 }, { "epoch": 1.8044247787610619, "grad_norm": 1.3965612854548335, "learning_rate": 5.837531116523683e-06, "loss": 1.0037, "step": 2040 }, { "epoch": 1.8053097345132745, "grad_norm": 1.2908636209773559, "learning_rate": 5.834074208703936e-06, "loss": 0.9532, "step": 2041 }, { "epoch": 1.8061946902654866, "grad_norm": 1.312712191345331, "learning_rate": 5.830616890728828e-06, "loss": 1.2767, "step": 2042 }, { "epoch": 1.8070796460176992, "grad_norm": 1.7054897814219838, "learning_rate": 5.827159164298495e-06, "loss": 1.2516, "step": 2043 }, { "epoch": 1.8079646017699114, "grad_norm": 1.6214499674500389, "learning_rate": 5.823701031113267e-06, "loss": 1.2654, "step": 2044 }, { "epoch": 1.808849557522124, "grad_norm": 1.2216135901998593, "learning_rate": 5.8202424928736825e-06, "loss": 1.0671, "step": 2045 }, { "epoch": 1.8097345132743363, "grad_norm": 1.2158321926387636, "learning_rate": 5.816783551280473e-06, "loss": 1.0918, "step": 2046 }, { "epoch": 1.8106194690265487, "grad_norm": 2.3667529437618127, "learning_rate": 5.813324208034571e-06, "loss": 1.2146, "step": 2047 }, { "epoch": 1.811504424778761, "grad_norm": 1.4110448304466994, "learning_rate": 5.809864464837105e-06, "loss": 1.1755, "step": 2048 }, { "epoch": 1.8123893805309734, "grad_norm": 1.6530789362971015, "learning_rate": 5.806404323389403e-06, "loss": 1.2504, "step": 2049 }, { "epoch": 1.8132743362831858, "grad_norm": 1.4989159891697548, "learning_rate": 5.802943785392986e-06, "loss": 0.9392, "step": 2050 }, { "epoch": 1.8141592920353982, "grad_norm": 1.4918913308289068, "learning_rate": 5.79948285254957e-06, "loss": 1.0227, "step": 2051 }, { "epoch": 1.8150442477876108, "grad_norm": 1.3316981413885374, "learning_rate": 5.796021526561067e-06, "loss": 1.1611, "step": 2052 }, { "epoch": 1.815929203539823, "grad_norm": 1.5822604178154727, "learning_rate": 5.792559809129582e-06, "loss": 1.3481, "step": 2053 }, { "epoch": 1.8168141592920355, "grad_norm": 1.4642624999733977, "learning_rate": 5.78909770195741e-06, "loss": 1.2046, "step": 2054 }, { "epoch": 1.8176991150442476, "grad_norm": 1.6824220078385879, "learning_rate": 5.785635206747041e-06, "loss": 1.2914, "step": 2055 }, { "epoch": 1.8185840707964602, "grad_norm": 2.345726012652845, "learning_rate": 5.782172325201155e-06, "loss": 1.1288, "step": 2056 }, { "epoch": 1.8194690265486726, "grad_norm": 1.5867476722771647, "learning_rate": 5.77870905902262e-06, "loss": 0.8971, "step": 2057 }, { "epoch": 1.820353982300885, "grad_norm": 1.257363134874452, "learning_rate": 5.775245409914496e-06, "loss": 1.0813, "step": 2058 }, { "epoch": 1.8212389380530973, "grad_norm": 1.4608166285682478, "learning_rate": 5.7717813795800285e-06, "loss": 1.1725, "step": 2059 }, { "epoch": 1.8221238938053097, "grad_norm": 1.3287903753663202, "learning_rate": 5.768316969722651e-06, "loss": 0.9708, "step": 2060 }, { "epoch": 1.823008849557522, "grad_norm": 1.296361952873894, "learning_rate": 5.764852182045988e-06, "loss": 1.1037, "step": 2061 }, { "epoch": 1.8238938053097344, "grad_norm": 1.519406907519977, "learning_rate": 5.761387018253841e-06, "loss": 1.2023, "step": 2062 }, { "epoch": 1.824778761061947, "grad_norm": 1.6500429887859196, "learning_rate": 5.757921480050206e-06, "loss": 1.0133, "step": 2063 }, { "epoch": 1.8256637168141592, "grad_norm": 1.710199026536398, "learning_rate": 5.754455569139258e-06, "loss": 1.1415, "step": 2064 }, { "epoch": 1.8265486725663718, "grad_norm": 1.4384678850505979, "learning_rate": 5.750989287225355e-06, "loss": 1.1183, "step": 2065 }, { "epoch": 1.827433628318584, "grad_norm": 1.3765916323898943, "learning_rate": 5.747522636013038e-06, "loss": 1.1325, "step": 2066 }, { "epoch": 1.8283185840707965, "grad_norm": 1.2144478473782225, "learning_rate": 5.744055617207032e-06, "loss": 1.1012, "step": 2067 }, { "epoch": 1.8292035398230089, "grad_norm": 1.3925415382372566, "learning_rate": 5.740588232512238e-06, "loss": 1.023, "step": 2068 }, { "epoch": 1.8300884955752212, "grad_norm": 1.3826850442487626, "learning_rate": 5.7371204836337445e-06, "loss": 1.3447, "step": 2069 }, { "epoch": 1.8309734513274336, "grad_norm": 1.4381110786702513, "learning_rate": 5.733652372276809e-06, "loss": 1.1194, "step": 2070 }, { "epoch": 1.831858407079646, "grad_norm": 1.3196187911348438, "learning_rate": 5.730183900146877e-06, "loss": 0.8977, "step": 2071 }, { "epoch": 1.8327433628318586, "grad_norm": 1.3328326106856434, "learning_rate": 5.726715068949564e-06, "loss": 1.1984, "step": 2072 }, { "epoch": 1.8336283185840707, "grad_norm": 1.5065910367296687, "learning_rate": 5.723245880390668e-06, "loss": 1.132, "step": 2073 }, { "epoch": 1.8345132743362833, "grad_norm": 1.3801228126090408, "learning_rate": 5.719776336176156e-06, "loss": 1.1908, "step": 2074 }, { "epoch": 1.8353982300884955, "grad_norm": 1.4245415263056709, "learning_rate": 5.716306438012176e-06, "loss": 1.1005, "step": 2075 }, { "epoch": 1.836283185840708, "grad_norm": 1.9169568654159652, "learning_rate": 5.71283618760505e-06, "loss": 1.0692, "step": 2076 }, { "epoch": 1.8371681415929202, "grad_norm": 1.3257402503155022, "learning_rate": 5.709365586661266e-06, "loss": 1.0511, "step": 2077 }, { "epoch": 1.8380530973451328, "grad_norm": 2.326488547900033, "learning_rate": 5.705894636887494e-06, "loss": 1.161, "step": 2078 }, { "epoch": 1.8389380530973451, "grad_norm": 1.3447842286142753, "learning_rate": 5.702423339990569e-06, "loss": 1.1565, "step": 2079 }, { "epoch": 1.8398230088495575, "grad_norm": 1.1435331946121146, "learning_rate": 5.698951697677498e-06, "loss": 0.905, "step": 2080 }, { "epoch": 1.8407079646017699, "grad_norm": 1.5331194478558663, "learning_rate": 5.695479711655459e-06, "loss": 1.067, "step": 2081 }, { "epoch": 1.8415929203539823, "grad_norm": 1.3191728038779813, "learning_rate": 5.692007383631799e-06, "loss": 1.1635, "step": 2082 }, { "epoch": 1.8424778761061948, "grad_norm": 1.852395355587854, "learning_rate": 5.688534715314031e-06, "loss": 1.2738, "step": 2083 }, { "epoch": 1.843362831858407, "grad_norm": 1.3912003530522392, "learning_rate": 5.6850617084098416e-06, "loss": 1.0203, "step": 2084 }, { "epoch": 1.8442477876106196, "grad_norm": 1.7517958661420532, "learning_rate": 5.681588364627073e-06, "loss": 0.7404, "step": 2085 }, { "epoch": 1.8451327433628317, "grad_norm": 1.2555951810605814, "learning_rate": 5.678114685673743e-06, "loss": 1.0497, "step": 2086 }, { "epoch": 1.8460176991150443, "grad_norm": 1.2937823682332747, "learning_rate": 5.67464067325803e-06, "loss": 1.0855, "step": 2087 }, { "epoch": 1.8469026548672567, "grad_norm": 1.375779918752008, "learning_rate": 5.671166329088278e-06, "loss": 0.9559, "step": 2088 }, { "epoch": 1.847787610619469, "grad_norm": 1.398728479762893, "learning_rate": 5.667691654872991e-06, "loss": 1.119, "step": 2089 }, { "epoch": 1.8486725663716814, "grad_norm": 1.3717131297391338, "learning_rate": 5.66421665232084e-06, "loss": 0.9037, "step": 2090 }, { "epoch": 1.8495575221238938, "grad_norm": 1.5409237240587457, "learning_rate": 5.660741323140651e-06, "loss": 1.315, "step": 2091 }, { "epoch": 1.8504424778761062, "grad_norm": 1.325628110926165, "learning_rate": 5.65726566904142e-06, "loss": 1.0188, "step": 2092 }, { "epoch": 1.8513274336283185, "grad_norm": 1.128575319802486, "learning_rate": 5.65378969173229e-06, "loss": 0.9906, "step": 2093 }, { "epoch": 1.8522123893805311, "grad_norm": 1.7678899377692525, "learning_rate": 5.650313392922578e-06, "loss": 1.0216, "step": 2094 }, { "epoch": 1.8530973451327433, "grad_norm": 1.2499212671408935, "learning_rate": 5.646836774321746e-06, "loss": 1.2989, "step": 2095 }, { "epoch": 1.8539823008849559, "grad_norm": 1.3085511416655526, "learning_rate": 5.643359837639419e-06, "loss": 1.1802, "step": 2096 }, { "epoch": 1.854867256637168, "grad_norm": 1.235645144621095, "learning_rate": 5.6398825845853814e-06, "loss": 0.8819, "step": 2097 }, { "epoch": 1.8557522123893806, "grad_norm": 1.65708614208853, "learning_rate": 5.636405016869567e-06, "loss": 1.0764, "step": 2098 }, { "epoch": 1.856637168141593, "grad_norm": 1.0774956039691719, "learning_rate": 5.632927136202067e-06, "loss": 0.9649, "step": 2099 }, { "epoch": 1.8575221238938053, "grad_norm": 1.367972589697275, "learning_rate": 5.629448944293128e-06, "loss": 1.3915, "step": 2100 }, { "epoch": 1.8584070796460177, "grad_norm": 1.5016287582739973, "learning_rate": 5.625970442853146e-06, "loss": 1.0077, "step": 2101 }, { "epoch": 1.85929203539823, "grad_norm": 1.4216642533857544, "learning_rate": 5.622491633592675e-06, "loss": 0.9336, "step": 2102 }, { "epoch": 1.8601769911504424, "grad_norm": 1.2631394957117996, "learning_rate": 5.619012518222413e-06, "loss": 1.2, "step": 2103 }, { "epoch": 1.8610619469026548, "grad_norm": 1.1651489992403476, "learning_rate": 5.615533098453215e-06, "loss": 1.0753, "step": 2104 }, { "epoch": 1.8619469026548674, "grad_norm": 1.384329796897256, "learning_rate": 5.612053375996082e-06, "loss": 0.9526, "step": 2105 }, { "epoch": 1.8628318584070795, "grad_norm": 1.4379899857148997, "learning_rate": 5.608573352562163e-06, "loss": 1.1488, "step": 2106 }, { "epoch": 1.8637168141592921, "grad_norm": 1.2303577705497681, "learning_rate": 5.605093029862762e-06, "loss": 1.0738, "step": 2107 }, { "epoch": 1.8646017699115043, "grad_norm": 1.5133498925438107, "learning_rate": 5.60161240960932e-06, "loss": 1.1812, "step": 2108 }, { "epoch": 1.8654867256637169, "grad_norm": 1.2427742748834012, "learning_rate": 5.598131493513432e-06, "loss": 1.051, "step": 2109 }, { "epoch": 1.8663716814159292, "grad_norm": 1.4277891793943693, "learning_rate": 5.594650283286835e-06, "loss": 1.1615, "step": 2110 }, { "epoch": 1.8672566371681416, "grad_norm": 1.648691309701542, "learning_rate": 5.591168780641412e-06, "loss": 1.1227, "step": 2111 }, { "epoch": 1.868141592920354, "grad_norm": 1.4222491717734913, "learning_rate": 5.587686987289189e-06, "loss": 1.27, "step": 2112 }, { "epoch": 1.8690265486725663, "grad_norm": 1.489581312301154, "learning_rate": 5.584204904942335e-06, "loss": 1.1282, "step": 2113 }, { "epoch": 1.8699115044247787, "grad_norm": 1.4943354142509655, "learning_rate": 5.58072253531316e-06, "loss": 1.2974, "step": 2114 }, { "epoch": 1.870796460176991, "grad_norm": 1.238657830628297, "learning_rate": 5.577239880114121e-06, "loss": 1.0062, "step": 2115 }, { "epoch": 1.8716814159292037, "grad_norm": 1.692131816808058, "learning_rate": 5.573756941057805e-06, "loss": 1.3909, "step": 2116 }, { "epoch": 1.8725663716814158, "grad_norm": 1.3729769374403749, "learning_rate": 5.570273719856952e-06, "loss": 1.1221, "step": 2117 }, { "epoch": 1.8734513274336284, "grad_norm": 1.2103173488245498, "learning_rate": 5.566790218224425e-06, "loss": 1.118, "step": 2118 }, { "epoch": 1.8743362831858406, "grad_norm": 1.4401265154123515, "learning_rate": 5.563306437873239e-06, "loss": 1.0282, "step": 2119 }, { "epoch": 1.8752212389380531, "grad_norm": 1.2741244624720807, "learning_rate": 5.559822380516539e-06, "loss": 1.0819, "step": 2120 }, { "epoch": 1.8761061946902655, "grad_norm": 1.7536369598078672, "learning_rate": 5.556338047867609e-06, "loss": 1.1958, "step": 2121 }, { "epoch": 1.8769911504424779, "grad_norm": 1.4143296001696495, "learning_rate": 5.552853441639864e-06, "loss": 1.012, "step": 2122 }, { "epoch": 1.8778761061946903, "grad_norm": 1.3758356916170125, "learning_rate": 5.549368563546857e-06, "loss": 1.128, "step": 2123 }, { "epoch": 1.8787610619469026, "grad_norm": 1.9463986636735071, "learning_rate": 5.545883415302276e-06, "loss": 1.2882, "step": 2124 }, { "epoch": 1.879646017699115, "grad_norm": 1.2679523980199043, "learning_rate": 5.54239799861994e-06, "loss": 1.0682, "step": 2125 }, { "epoch": 1.8805309734513274, "grad_norm": 1.3312067944937251, "learning_rate": 5.5389123152137965e-06, "loss": 1.3816, "step": 2126 }, { "epoch": 1.88141592920354, "grad_norm": 1.413581452144499, "learning_rate": 5.535426366797933e-06, "loss": 1.0066, "step": 2127 }, { "epoch": 1.882300884955752, "grad_norm": 1.5510494255857619, "learning_rate": 5.531940155086557e-06, "loss": 1.1744, "step": 2128 }, { "epoch": 1.8831858407079647, "grad_norm": 1.1599439783546182, "learning_rate": 5.528453681794015e-06, "loss": 1.1041, "step": 2129 }, { "epoch": 1.8840707964601768, "grad_norm": 1.2623385526304258, "learning_rate": 5.524966948634774e-06, "loss": 1.1911, "step": 2130 }, { "epoch": 1.8849557522123894, "grad_norm": 1.2630596960905076, "learning_rate": 5.521479957323433e-06, "loss": 1.0473, "step": 2131 }, { "epoch": 1.8858407079646018, "grad_norm": 1.5073512893830292, "learning_rate": 5.517992709574718e-06, "loss": 1.0354, "step": 2132 }, { "epoch": 1.8867256637168142, "grad_norm": 1.3120611309213113, "learning_rate": 5.514505207103482e-06, "loss": 1.2422, "step": 2133 }, { "epoch": 1.8876106194690265, "grad_norm": 1.2121967641285976, "learning_rate": 5.511017451624698e-06, "loss": 1.0982, "step": 2134 }, { "epoch": 1.888495575221239, "grad_norm": 1.5936527486602317, "learning_rate": 5.507529444853471e-06, "loss": 1.1748, "step": 2135 }, { "epoch": 1.8893805309734515, "grad_norm": 1.3250230069625972, "learning_rate": 5.504041188505022e-06, "loss": 0.8724, "step": 2136 }, { "epoch": 1.8902654867256636, "grad_norm": 1.240416076157097, "learning_rate": 5.500552684294703e-06, "loss": 0.9247, "step": 2137 }, { "epoch": 1.8911504424778762, "grad_norm": 1.3689310485588821, "learning_rate": 5.4970639339379795e-06, "loss": 1.1358, "step": 2138 }, { "epoch": 1.8920353982300884, "grad_norm": 1.4799602744955884, "learning_rate": 5.493574939150443e-06, "loss": 0.906, "step": 2139 }, { "epoch": 1.892920353982301, "grad_norm": 1.5045715377848934, "learning_rate": 5.490085701647805e-06, "loss": 1.2466, "step": 2140 }, { "epoch": 1.893805309734513, "grad_norm": 1.2851751387941301, "learning_rate": 5.486596223145892e-06, "loss": 1.017, "step": 2141 }, { "epoch": 1.8946902654867257, "grad_norm": 1.2473342590200605, "learning_rate": 5.483106505360656e-06, "loss": 1.1457, "step": 2142 }, { "epoch": 1.895575221238938, "grad_norm": 1.5072400643775017, "learning_rate": 5.479616550008162e-06, "loss": 1.1953, "step": 2143 }, { "epoch": 1.8964601769911504, "grad_norm": 2.61504780212126, "learning_rate": 5.476126358804594e-06, "loss": 1.2165, "step": 2144 }, { "epoch": 1.8973451327433628, "grad_norm": 1.4770495935095034, "learning_rate": 5.472635933466248e-06, "loss": 1.2256, "step": 2145 }, { "epoch": 1.8982300884955752, "grad_norm": 2.5085635274656495, "learning_rate": 5.469145275709541e-06, "loss": 1.0737, "step": 2146 }, { "epoch": 1.8991150442477878, "grad_norm": 1.286453090166021, "learning_rate": 5.4656543872509994e-06, "loss": 1.0416, "step": 2147 }, { "epoch": 1.9, "grad_norm": 1.4060990636149595, "learning_rate": 5.462163269807267e-06, "loss": 1.039, "step": 2148 }, { "epoch": 1.9008849557522125, "grad_norm": 1.436090656671484, "learning_rate": 5.4586719250950935e-06, "loss": 0.8877, "step": 2149 }, { "epoch": 1.9017699115044246, "grad_norm": 1.298267445905039, "learning_rate": 5.4551803548313505e-06, "loss": 1.0029, "step": 2150 }, { "epoch": 1.9026548672566372, "grad_norm": 1.4995990504592511, "learning_rate": 5.451688560733014e-06, "loss": 0.9029, "step": 2151 }, { "epoch": 1.9035398230088496, "grad_norm": 1.229231683858257, "learning_rate": 5.448196544517168e-06, "loss": 1.1749, "step": 2152 }, { "epoch": 1.904424778761062, "grad_norm": 1.2994687815605153, "learning_rate": 5.444704307901012e-06, "loss": 1.2384, "step": 2153 }, { "epoch": 1.9053097345132743, "grad_norm": 1.3559996275201194, "learning_rate": 5.441211852601849e-06, "loss": 1.2257, "step": 2154 }, { "epoch": 1.9061946902654867, "grad_norm": 1.4741414511895554, "learning_rate": 5.43771918033709e-06, "loss": 1.2656, "step": 2155 }, { "epoch": 1.907079646017699, "grad_norm": 1.180426503654494, "learning_rate": 5.4342262928242584e-06, "loss": 1.175, "step": 2156 }, { "epoch": 1.9079646017699115, "grad_norm": 1.1896737240530795, "learning_rate": 5.430733191780974e-06, "loss": 0.8702, "step": 2157 }, { "epoch": 1.908849557522124, "grad_norm": 1.5913226228780109, "learning_rate": 5.4272398789249705e-06, "loss": 1.2621, "step": 2158 }, { "epoch": 1.9097345132743362, "grad_norm": 1.6328482982952555, "learning_rate": 5.4237463559740785e-06, "loss": 1.1103, "step": 2159 }, { "epoch": 1.9106194690265488, "grad_norm": 1.2782903876120422, "learning_rate": 5.420252624646238e-06, "loss": 1.1071, "step": 2160 }, { "epoch": 1.911504424778761, "grad_norm": 1.3589112524617974, "learning_rate": 5.416758686659488e-06, "loss": 1.0925, "step": 2161 }, { "epoch": 1.9123893805309735, "grad_norm": 1.2332566786205608, "learning_rate": 5.41326454373197e-06, "loss": 0.9693, "step": 2162 }, { "epoch": 1.9132743362831859, "grad_norm": 1.5187709973578958, "learning_rate": 5.409770197581923e-06, "loss": 1.3479, "step": 2163 }, { "epoch": 1.9141592920353983, "grad_norm": 1.176664022460812, "learning_rate": 5.4062756499276945e-06, "loss": 1.0361, "step": 2164 }, { "epoch": 1.9150442477876106, "grad_norm": 1.4259204180740666, "learning_rate": 5.402780902487721e-06, "loss": 0.9675, "step": 2165 }, { "epoch": 1.915929203539823, "grad_norm": 2.7285461487190887, "learning_rate": 5.399285956980547e-06, "loss": 1.0905, "step": 2166 }, { "epoch": 1.9168141592920354, "grad_norm": 1.4007538072174195, "learning_rate": 5.395790815124802e-06, "loss": 1.1859, "step": 2167 }, { "epoch": 1.9176991150442477, "grad_norm": 1.5081335300662917, "learning_rate": 5.392295478639226e-06, "loss": 1.0175, "step": 2168 }, { "epoch": 1.9185840707964603, "grad_norm": 1.1224292402644385, "learning_rate": 5.388799949242645e-06, "loss": 0.8891, "step": 2169 }, { "epoch": 1.9194690265486725, "grad_norm": 1.1153918007800838, "learning_rate": 5.385304228653983e-06, "loss": 1.0164, "step": 2170 }, { "epoch": 1.920353982300885, "grad_norm": 1.2787015818806091, "learning_rate": 5.381808318592259e-06, "loss": 0.9626, "step": 2171 }, { "epoch": 1.9212389380530972, "grad_norm": 1.299546312171085, "learning_rate": 5.378312220776584e-06, "loss": 1.0527, "step": 2172 }, { "epoch": 1.9221238938053098, "grad_norm": 1.2127416374080573, "learning_rate": 5.3748159369261585e-06, "loss": 1.0582, "step": 2173 }, { "epoch": 1.9230088495575222, "grad_norm": 1.2248072623381807, "learning_rate": 5.371319468760283e-06, "loss": 1.2349, "step": 2174 }, { "epoch": 1.9238938053097345, "grad_norm": 1.2503675324698922, "learning_rate": 5.367822817998338e-06, "loss": 0.833, "step": 2175 }, { "epoch": 1.924778761061947, "grad_norm": 1.3229184031213435, "learning_rate": 5.3643259863598015e-06, "loss": 1.2359, "step": 2176 }, { "epoch": 1.9256637168141593, "grad_norm": 1.3289819957045614, "learning_rate": 5.360828975564238e-06, "loss": 1.0717, "step": 2177 }, { "epoch": 1.9265486725663716, "grad_norm": 1.996037336552399, "learning_rate": 5.357331787331297e-06, "loss": 1.3036, "step": 2178 }, { "epoch": 1.927433628318584, "grad_norm": 1.4356461808983798, "learning_rate": 5.353834423380723e-06, "loss": 1.3166, "step": 2179 }, { "epoch": 1.9283185840707966, "grad_norm": 1.3633497614796528, "learning_rate": 5.350336885432337e-06, "loss": 1.3305, "step": 2180 }, { "epoch": 1.9292035398230087, "grad_norm": 1.5424678431107786, "learning_rate": 5.346839175206053e-06, "loss": 0.972, "step": 2181 }, { "epoch": 1.9300884955752213, "grad_norm": 1.2262129773561625, "learning_rate": 5.343341294421868e-06, "loss": 0.9855, "step": 2182 }, { "epoch": 1.9309734513274335, "grad_norm": 1.2933188485479619, "learning_rate": 5.339843244799862e-06, "loss": 1.0876, "step": 2183 }, { "epoch": 1.931858407079646, "grad_norm": 1.1829361809593648, "learning_rate": 5.336345028060199e-06, "loss": 0.987, "step": 2184 }, { "epoch": 1.9327433628318584, "grad_norm": 3.1593382905094254, "learning_rate": 5.3328466459231225e-06, "loss": 1.1206, "step": 2185 }, { "epoch": 1.9336283185840708, "grad_norm": 1.4445237441129912, "learning_rate": 5.329348100108958e-06, "loss": 1.1831, "step": 2186 }, { "epoch": 1.9345132743362832, "grad_norm": 1.5014775118698873, "learning_rate": 5.325849392338117e-06, "loss": 1.3119, "step": 2187 }, { "epoch": 1.9353982300884955, "grad_norm": 1.362428781750617, "learning_rate": 5.322350524331082e-06, "loss": 1.1816, "step": 2188 }, { "epoch": 1.936283185840708, "grad_norm": 1.7374546562190736, "learning_rate": 5.318851497808424e-06, "loss": 1.3063, "step": 2189 }, { "epoch": 1.9371681415929203, "grad_norm": 1.720173144426914, "learning_rate": 5.315352314490781e-06, "loss": 1.1134, "step": 2190 }, { "epoch": 1.9380530973451329, "grad_norm": 1.1892205235126962, "learning_rate": 5.311852976098877e-06, "loss": 0.8633, "step": 2191 }, { "epoch": 1.938938053097345, "grad_norm": 1.593538991655569, "learning_rate": 5.308353484353508e-06, "loss": 1.0771, "step": 2192 }, { "epoch": 1.9398230088495576, "grad_norm": 1.3510568829430696, "learning_rate": 5.3048538409755466e-06, "loss": 1.282, "step": 2193 }, { "epoch": 1.9407079646017698, "grad_norm": 1.3005672326823907, "learning_rate": 5.3013540476859404e-06, "loss": 1.0199, "step": 2194 }, { "epoch": 1.9415929203539823, "grad_norm": 1.2305919535424765, "learning_rate": 5.297854106205709e-06, "loss": 1.0669, "step": 2195 }, { "epoch": 1.9424778761061947, "grad_norm": 1.5074326140483425, "learning_rate": 5.294354018255945e-06, "loss": 1.0793, "step": 2196 }, { "epoch": 1.943362831858407, "grad_norm": 1.4042783993155286, "learning_rate": 5.290853785557819e-06, "loss": 1.155, "step": 2197 }, { "epoch": 1.9442477876106194, "grad_norm": 1.3887439165240254, "learning_rate": 5.287353409832561e-06, "loss": 1.2621, "step": 2198 }, { "epoch": 1.9451327433628318, "grad_norm": 1.3340411423582348, "learning_rate": 5.283852892801483e-06, "loss": 1.2077, "step": 2199 }, { "epoch": 1.9460176991150444, "grad_norm": 1.4271411187680334, "learning_rate": 5.2803522361859596e-06, "loss": 1.1056, "step": 2200 }, { "epoch": 1.9469026548672566, "grad_norm": 1.2354483103441225, "learning_rate": 5.276851441707437e-06, "loss": 1.02, "step": 2201 }, { "epoch": 1.9477876106194691, "grad_norm": 1.3890407281874302, "learning_rate": 5.273350511087427e-06, "loss": 1.1486, "step": 2202 }, { "epoch": 1.9486725663716813, "grad_norm": 1.7227902000757684, "learning_rate": 5.26984944604751e-06, "loss": 0.9898, "step": 2203 }, { "epoch": 1.9495575221238939, "grad_norm": 1.1194354548258425, "learning_rate": 5.266348248309332e-06, "loss": 1.1863, "step": 2204 }, { "epoch": 1.950442477876106, "grad_norm": 1.6498717181582447, "learning_rate": 5.262846919594607e-06, "loss": 1.0978, "step": 2205 }, { "epoch": 1.9513274336283186, "grad_norm": 1.421918707237554, "learning_rate": 5.259345461625106e-06, "loss": 1.1772, "step": 2206 }, { "epoch": 1.952212389380531, "grad_norm": 1.5651538841481896, "learning_rate": 5.255843876122672e-06, "loss": 1.122, "step": 2207 }, { "epoch": 1.9530973451327434, "grad_norm": 1.289692069085551, "learning_rate": 5.252342164809204e-06, "loss": 0.9267, "step": 2208 }, { "epoch": 1.9539823008849557, "grad_norm": 1.1063283947918756, "learning_rate": 5.2488403294066695e-06, "loss": 0.9123, "step": 2209 }, { "epoch": 1.954867256637168, "grad_norm": 1.5148273226402595, "learning_rate": 5.245338371637091e-06, "loss": 1.2431, "step": 2210 }, { "epoch": 1.9557522123893807, "grad_norm": 1.3922768822121132, "learning_rate": 5.2418362932225534e-06, "loss": 1.1327, "step": 2211 }, { "epoch": 1.9566371681415928, "grad_norm": 1.4189612744874238, "learning_rate": 5.238334095885204e-06, "loss": 1.187, "step": 2212 }, { "epoch": 1.9575221238938054, "grad_norm": 1.232081864535495, "learning_rate": 5.2348317813472406e-06, "loss": 1.2125, "step": 2213 }, { "epoch": 1.9584070796460176, "grad_norm": 1.5826072138156833, "learning_rate": 5.231329351330927e-06, "loss": 0.9515, "step": 2214 }, { "epoch": 1.9592920353982302, "grad_norm": 1.2813634525209876, "learning_rate": 5.2278268075585815e-06, "loss": 1.0854, "step": 2215 }, { "epoch": 1.9601769911504425, "grad_norm": 1.6204366325417103, "learning_rate": 5.224324151752575e-06, "loss": 1.3266, "step": 2216 }, { "epoch": 1.961061946902655, "grad_norm": 1.317973549235033, "learning_rate": 5.220821385635337e-06, "loss": 0.9633, "step": 2217 }, { "epoch": 1.9619469026548673, "grad_norm": 1.3268689589751568, "learning_rate": 5.2173185109293514e-06, "loss": 1.081, "step": 2218 }, { "epoch": 1.9628318584070796, "grad_norm": 1.2186601501273202, "learning_rate": 5.2138155293571504e-06, "loss": 1.1558, "step": 2219 }, { "epoch": 1.963716814159292, "grad_norm": 2.4482122942308306, "learning_rate": 5.210312442641327e-06, "loss": 1.3368, "step": 2220 }, { "epoch": 1.9646017699115044, "grad_norm": 1.4028477121761636, "learning_rate": 5.2068092525045165e-06, "loss": 1.1744, "step": 2221 }, { "epoch": 1.965486725663717, "grad_norm": 1.2102200169820598, "learning_rate": 5.203305960669415e-06, "loss": 1.1833, "step": 2222 }, { "epoch": 1.966371681415929, "grad_norm": 2.0297624491536532, "learning_rate": 5.19980256885876e-06, "loss": 0.6997, "step": 2223 }, { "epoch": 1.9672566371681417, "grad_norm": 3.209000151734565, "learning_rate": 5.1962990787953436e-06, "loss": 1.424, "step": 2224 }, { "epoch": 1.9681415929203538, "grad_norm": 1.3379667869789702, "learning_rate": 5.1927954922020045e-06, "loss": 1.1369, "step": 2225 }, { "epoch": 1.9690265486725664, "grad_norm": 1.3572926142980188, "learning_rate": 5.189291810801628e-06, "loss": 1.0093, "step": 2226 }, { "epoch": 1.9699115044247788, "grad_norm": 1.1938835991760617, "learning_rate": 5.185788036317145e-06, "loss": 1.1042, "step": 2227 }, { "epoch": 1.9707964601769912, "grad_norm": 1.3359043290576476, "learning_rate": 5.1822841704715385e-06, "loss": 1.1757, "step": 2228 }, { "epoch": 1.9716814159292035, "grad_norm": 1.1748649194502028, "learning_rate": 5.1787802149878275e-06, "loss": 1.0086, "step": 2229 }, { "epoch": 1.972566371681416, "grad_norm": 1.1915281774388604, "learning_rate": 5.175276171589082e-06, "loss": 0.8779, "step": 2230 }, { "epoch": 1.9734513274336283, "grad_norm": 1.4302217997975126, "learning_rate": 5.171772041998412e-06, "loss": 1.324, "step": 2231 }, { "epoch": 1.9743362831858406, "grad_norm": 1.3600161310046035, "learning_rate": 5.168267827938971e-06, "loss": 1.0223, "step": 2232 }, { "epoch": 1.9752212389380532, "grad_norm": 1.0599709871495875, "learning_rate": 5.1647635311339515e-06, "loss": 0.9601, "step": 2233 }, { "epoch": 1.9761061946902654, "grad_norm": 1.9957969598569822, "learning_rate": 5.161259153306592e-06, "loss": 0.9853, "step": 2234 }, { "epoch": 1.976991150442478, "grad_norm": 1.2883837732995849, "learning_rate": 5.157754696180165e-06, "loss": 1.1252, "step": 2235 }, { "epoch": 1.9778761061946901, "grad_norm": 1.2766368522246458, "learning_rate": 5.154250161477986e-06, "loss": 0.9147, "step": 2236 }, { "epoch": 1.9787610619469027, "grad_norm": 1.416218925492958, "learning_rate": 5.1507455509234065e-06, "loss": 1.0596, "step": 2237 }, { "epoch": 1.979646017699115, "grad_norm": 1.3562144517319472, "learning_rate": 5.147240866239817e-06, "loss": 1.1155, "step": 2238 }, { "epoch": 1.9805309734513274, "grad_norm": 1.363183162725099, "learning_rate": 5.143736109150642e-06, "loss": 0.9816, "step": 2239 }, { "epoch": 1.9814159292035398, "grad_norm": 1.3556007521545095, "learning_rate": 5.140231281379345e-06, "loss": 0.9806, "step": 2240 }, { "epoch": 1.9823008849557522, "grad_norm": 1.3881120301242489, "learning_rate": 5.136726384649422e-06, "loss": 1.2573, "step": 2241 }, { "epoch": 1.9831858407079646, "grad_norm": 1.3254890109914634, "learning_rate": 5.133221420684403e-06, "loss": 1.1079, "step": 2242 }, { "epoch": 1.984070796460177, "grad_norm": 1.341685395323354, "learning_rate": 5.129716391207851e-06, "loss": 1.0566, "step": 2243 }, { "epoch": 1.9849557522123895, "grad_norm": 1.4486406282920226, "learning_rate": 5.126211297943362e-06, "loss": 1.0627, "step": 2244 }, { "epoch": 1.9858407079646017, "grad_norm": 1.3751349926603111, "learning_rate": 5.122706142614562e-06, "loss": 0.9599, "step": 2245 }, { "epoch": 1.9867256637168142, "grad_norm": 1.3934617743261408, "learning_rate": 5.1192009269451094e-06, "loss": 1.0531, "step": 2246 }, { "epoch": 1.9876106194690264, "grad_norm": 1.3678376835581372, "learning_rate": 5.115695652658692e-06, "loss": 1.1957, "step": 2247 }, { "epoch": 1.988495575221239, "grad_norm": 1.300164254349502, "learning_rate": 5.112190321479026e-06, "loss": 1.1401, "step": 2248 }, { "epoch": 1.9893805309734514, "grad_norm": 1.3129924720762496, "learning_rate": 5.108684935129853e-06, "loss": 1.1165, "step": 2249 }, { "epoch": 1.9902654867256637, "grad_norm": 1.826829360890637, "learning_rate": 5.1051794953349445e-06, "loss": 1.1129, "step": 2250 }, { "epoch": 1.991150442477876, "grad_norm": 1.3247405066041273, "learning_rate": 5.101674003818101e-06, "loss": 1.173, "step": 2251 }, { "epoch": 1.9920353982300885, "grad_norm": 1.2631634032305223, "learning_rate": 5.098168462303141e-06, "loss": 0.9976, "step": 2252 }, { "epoch": 1.9929203539823008, "grad_norm": 1.4120587206665292, "learning_rate": 5.094662872513916e-06, "loss": 0.9927, "step": 2253 }, { "epoch": 1.9938053097345132, "grad_norm": 1.4147784243270225, "learning_rate": 5.091157236174295e-06, "loss": 1.1136, "step": 2254 }, { "epoch": 1.9946902654867258, "grad_norm": 1.357024048922821, "learning_rate": 5.0876515550081715e-06, "loss": 1.2615, "step": 2255 }, { "epoch": 1.995575221238938, "grad_norm": 1.8632461957866868, "learning_rate": 5.084145830739462e-06, "loss": 1.2338, "step": 2256 }, { "epoch": 1.9964601769911505, "grad_norm": 1.1623893440167299, "learning_rate": 5.080640065092105e-06, "loss": 1.0755, "step": 2257 }, { "epoch": 1.9973451327433627, "grad_norm": 1.335200527807935, "learning_rate": 5.077134259790056e-06, "loss": 1.1244, "step": 2258 }, { "epoch": 1.9982300884955753, "grad_norm": 1.2059491878961823, "learning_rate": 5.073628416557293e-06, "loss": 1.1187, "step": 2259 }, { "epoch": 1.9991150442477876, "grad_norm": 1.5262989051138232, "learning_rate": 5.070122537117812e-06, "loss": 1.1874, "step": 2260 } ], "logging_steps": 1, "max_steps": 4520, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1130, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 552822688972800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }