{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4434, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006765899864682003, "grad_norm": 12.109418480944685, "learning_rate": 2.2522522522522524e-08, "loss": 0.2806, "step": 1 }, { "epoch": 0.0013531799729364006, "grad_norm": 11.05123491573913, "learning_rate": 4.504504504504505e-08, "loss": 0.2493, "step": 2 }, { "epoch": 0.0020297699594046007, "grad_norm": 11.568626476471813, "learning_rate": 6.756756756756757e-08, "loss": 0.2796, "step": 3 }, { "epoch": 0.0027063599458728013, "grad_norm": 15.330906430777253, "learning_rate": 9.00900900900901e-08, "loss": 0.3237, "step": 4 }, { "epoch": 0.0033829499323410014, "grad_norm": 11.80893118713706, "learning_rate": 1.1261261261261262e-07, "loss": 0.2768, "step": 5 }, { "epoch": 0.0040595399188092015, "grad_norm": 12.288565128357796, "learning_rate": 1.3513513513513515e-07, "loss": 0.278, "step": 6 }, { "epoch": 0.004736129905277402, "grad_norm": 11.056614459254297, "learning_rate": 1.5765765765765766e-07, "loss": 0.2614, "step": 7 }, { "epoch": 0.005412719891745603, "grad_norm": 11.644727224551765, "learning_rate": 1.801801801801802e-07, "loss": 0.2835, "step": 8 }, { "epoch": 0.006089309878213802, "grad_norm": 10.897140250186142, "learning_rate": 2.0270270270270273e-07, "loss": 0.2622, "step": 9 }, { "epoch": 0.006765899864682003, "grad_norm": 12.757274251949218, "learning_rate": 2.2522522522522524e-07, "loss": 0.3088, "step": 10 }, { "epoch": 0.007442489851150203, "grad_norm": 11.771844271594402, "learning_rate": 2.477477477477478e-07, "loss": 0.2771, "step": 11 }, { "epoch": 0.008119079837618403, "grad_norm": 10.462073634910523, "learning_rate": 2.702702702702703e-07, "loss": 0.2459, "step": 12 }, { "epoch": 0.008795669824086604, "grad_norm": 11.330093340150974, "learning_rate": 2.927927927927928e-07, "loss": 0.2485, "step": 13 }, { "epoch": 0.009472259810554804, "grad_norm": 11.045487141403331, "learning_rate": 3.153153153153153e-07, "loss": 0.2496, "step": 14 }, { "epoch": 0.010148849797023005, "grad_norm": 9.006747510455144, "learning_rate": 3.378378378378379e-07, "loss": 0.2169, "step": 15 }, { "epoch": 0.010825439783491205, "grad_norm": 8.513474092378251, "learning_rate": 3.603603603603604e-07, "loss": 0.1941, "step": 16 }, { "epoch": 0.011502029769959404, "grad_norm": 8.985361626419866, "learning_rate": 3.828828828828829e-07, "loss": 0.2209, "step": 17 }, { "epoch": 0.012178619756427604, "grad_norm": 7.4644108769734885, "learning_rate": 4.0540540540540546e-07, "loss": 0.2173, "step": 18 }, { "epoch": 0.012855209742895805, "grad_norm": 8.138660456856027, "learning_rate": 4.27927927927928e-07, "loss": 0.1994, "step": 19 }, { "epoch": 0.013531799729364006, "grad_norm": 6.463475276121953, "learning_rate": 4.504504504504505e-07, "loss": 0.1617, "step": 20 }, { "epoch": 0.014208389715832206, "grad_norm": 4.67755763633634, "learning_rate": 4.7297297297297305e-07, "loss": 0.1528, "step": 21 }, { "epoch": 0.014884979702300407, "grad_norm": 5.102687813043612, "learning_rate": 4.954954954954956e-07, "loss": 0.1559, "step": 22 }, { "epoch": 0.015561569688768605, "grad_norm": 4.450782219252275, "learning_rate": 5.180180180180181e-07, "loss": 0.1432, "step": 23 }, { "epoch": 0.016238159675236806, "grad_norm": 3.8628913682582757, "learning_rate": 5.405405405405406e-07, "loss": 0.1566, "step": 24 }, { "epoch": 0.016914749661705007, "grad_norm": 3.765429189029888, "learning_rate": 5.630630630630631e-07, "loss": 0.1539, "step": 25 }, { "epoch": 0.017591339648173207, "grad_norm": 3.4747070187389206, "learning_rate": 5.855855855855856e-07, "loss": 0.156, "step": 26 }, { "epoch": 0.018267929634641408, "grad_norm": 2.7591405300352077, "learning_rate": 6.081081081081082e-07, "loss": 0.1504, "step": 27 }, { "epoch": 0.018944519621109608, "grad_norm": 2.5110411566324258, "learning_rate": 6.306306306306306e-07, "loss": 0.1226, "step": 28 }, { "epoch": 0.01962110960757781, "grad_norm": 2.147016347481153, "learning_rate": 6.531531531531532e-07, "loss": 0.0942, "step": 29 }, { "epoch": 0.02029769959404601, "grad_norm": 1.8191888485653491, "learning_rate": 6.756756756756758e-07, "loss": 0.1346, "step": 30 }, { "epoch": 0.02097428958051421, "grad_norm": 2.4427111665888233, "learning_rate": 6.981981981981982e-07, "loss": 0.1226, "step": 31 }, { "epoch": 0.02165087956698241, "grad_norm": 1.515634001454101, "learning_rate": 7.207207207207208e-07, "loss": 0.1013, "step": 32 }, { "epoch": 0.022327469553450607, "grad_norm": 1.4537092544960724, "learning_rate": 7.432432432432434e-07, "loss": 0.11, "step": 33 }, { "epoch": 0.023004059539918808, "grad_norm": 2.478946984359515, "learning_rate": 7.657657657657658e-07, "loss": 0.1188, "step": 34 }, { "epoch": 0.02368064952638701, "grad_norm": 1.6796456542643035, "learning_rate": 7.882882882882883e-07, "loss": 0.1119, "step": 35 }, { "epoch": 0.02435723951285521, "grad_norm": 1.60436577837702, "learning_rate": 8.108108108108109e-07, "loss": 0.1056, "step": 36 }, { "epoch": 0.02503382949932341, "grad_norm": 1.5649369548638523, "learning_rate": 8.333333333333333e-07, "loss": 0.1106, "step": 37 }, { "epoch": 0.02571041948579161, "grad_norm": 2.0561242436687373, "learning_rate": 8.55855855855856e-07, "loss": 0.0859, "step": 38 }, { "epoch": 0.02638700947225981, "grad_norm": 1.8755061475179144, "learning_rate": 8.783783783783785e-07, "loss": 0.1142, "step": 39 }, { "epoch": 0.02706359945872801, "grad_norm": 1.846342223086991, "learning_rate": 9.00900900900901e-07, "loss": 0.0968, "step": 40 }, { "epoch": 0.02774018944519621, "grad_norm": 1.6330756001827755, "learning_rate": 9.234234234234235e-07, "loss": 0.1053, "step": 41 }, { "epoch": 0.028416779431664412, "grad_norm": 1.566099881914172, "learning_rate": 9.459459459459461e-07, "loss": 0.1067, "step": 42 }, { "epoch": 0.029093369418132613, "grad_norm": 1.698567379119881, "learning_rate": 9.684684684684686e-07, "loss": 0.091, "step": 43 }, { "epoch": 0.029769959404600813, "grad_norm": 1.6125714900139525, "learning_rate": 9.909909909909911e-07, "loss": 0.076, "step": 44 }, { "epoch": 0.030446549391069014, "grad_norm": 2.115521381565484, "learning_rate": 1.0135135135135136e-06, "loss": 0.1054, "step": 45 }, { "epoch": 0.03112313937753721, "grad_norm": 2.650889695337764, "learning_rate": 1.0360360360360361e-06, "loss": 0.1047, "step": 46 }, { "epoch": 0.031799729364005415, "grad_norm": 1.389331225829285, "learning_rate": 1.0585585585585587e-06, "loss": 0.0897, "step": 47 }, { "epoch": 0.03247631935047361, "grad_norm": 1.4231867009481125, "learning_rate": 1.0810810810810812e-06, "loss": 0.0995, "step": 48 }, { "epoch": 0.033152909336941816, "grad_norm": 1.2285847771352059, "learning_rate": 1.1036036036036037e-06, "loss": 0.0717, "step": 49 }, { "epoch": 0.03382949932341001, "grad_norm": 1.3024523733871285, "learning_rate": 1.1261261261261262e-06, "loss": 0.0935, "step": 50 }, { "epoch": 0.03450608930987822, "grad_norm": 1.3670113610830439, "learning_rate": 1.148648648648649e-06, "loss": 0.0791, "step": 51 }, { "epoch": 0.035182679296346414, "grad_norm": 1.7510664364205482, "learning_rate": 1.1711711711711712e-06, "loss": 0.1211, "step": 52 }, { "epoch": 0.03585926928281461, "grad_norm": 2.58368446347696, "learning_rate": 1.1936936936936937e-06, "loss": 0.1098, "step": 53 }, { "epoch": 0.036535859269282815, "grad_norm": 1.1546683198894498, "learning_rate": 1.2162162162162164e-06, "loss": 0.0944, "step": 54 }, { "epoch": 0.03721244925575101, "grad_norm": 1.0262303684513803, "learning_rate": 1.2387387387387387e-06, "loss": 0.0697, "step": 55 }, { "epoch": 0.037889039242219216, "grad_norm": 2.0466850231277793, "learning_rate": 1.2612612612612613e-06, "loss": 0.0939, "step": 56 }, { "epoch": 0.03856562922868741, "grad_norm": 2.0547227253053095, "learning_rate": 1.2837837837837838e-06, "loss": 0.0935, "step": 57 }, { "epoch": 0.03924221921515562, "grad_norm": 1.2229081913467617, "learning_rate": 1.3063063063063065e-06, "loss": 0.0994, "step": 58 }, { "epoch": 0.039918809201623814, "grad_norm": 1.3020824069826142, "learning_rate": 1.328828828828829e-06, "loss": 0.0712, "step": 59 }, { "epoch": 0.04059539918809202, "grad_norm": 1.4084130139999327, "learning_rate": 1.3513513513513515e-06, "loss": 0.0828, "step": 60 }, { "epoch": 0.041271989174560215, "grad_norm": 1.3605355132053356, "learning_rate": 1.373873873873874e-06, "loss": 0.0767, "step": 61 }, { "epoch": 0.04194857916102842, "grad_norm": 1.190756668990836, "learning_rate": 1.3963963963963963e-06, "loss": 0.0862, "step": 62 }, { "epoch": 0.04262516914749662, "grad_norm": 1.609471879933888, "learning_rate": 1.418918918918919e-06, "loss": 0.1112, "step": 63 }, { "epoch": 0.04330175913396482, "grad_norm": 1.19760034159579, "learning_rate": 1.4414414414414416e-06, "loss": 0.0988, "step": 64 }, { "epoch": 0.04397834912043302, "grad_norm": 1.2167332141698486, "learning_rate": 1.463963963963964e-06, "loss": 0.0616, "step": 65 }, { "epoch": 0.044654939106901215, "grad_norm": 1.1643059346788907, "learning_rate": 1.4864864864864868e-06, "loss": 0.0749, "step": 66 }, { "epoch": 0.04533152909336942, "grad_norm": 1.756367232451425, "learning_rate": 1.5090090090090093e-06, "loss": 0.0883, "step": 67 }, { "epoch": 0.046008119079837616, "grad_norm": 1.1816077677665917, "learning_rate": 1.5315315315315316e-06, "loss": 0.0925, "step": 68 }, { "epoch": 0.04668470906630582, "grad_norm": 1.3497747965548728, "learning_rate": 1.5540540540540541e-06, "loss": 0.104, "step": 69 }, { "epoch": 0.04736129905277402, "grad_norm": 1.3274120607510551, "learning_rate": 1.5765765765765766e-06, "loss": 0.0849, "step": 70 }, { "epoch": 0.04803788903924222, "grad_norm": 1.549606581695767, "learning_rate": 1.5990990990990993e-06, "loss": 0.0889, "step": 71 }, { "epoch": 0.04871447902571042, "grad_norm": 1.4493395352159697, "learning_rate": 1.6216216216216219e-06, "loss": 0.0944, "step": 72 }, { "epoch": 0.04939106901217862, "grad_norm": 1.0994319556111298, "learning_rate": 1.6441441441441444e-06, "loss": 0.0759, "step": 73 }, { "epoch": 0.05006765899864682, "grad_norm": 1.2837499508543788, "learning_rate": 1.6666666666666667e-06, "loss": 0.0698, "step": 74 }, { "epoch": 0.05074424898511502, "grad_norm": 1.1341275770781738, "learning_rate": 1.6891891891891894e-06, "loss": 0.107, "step": 75 }, { "epoch": 0.05142083897158322, "grad_norm": 1.0646003246593085, "learning_rate": 1.711711711711712e-06, "loss": 0.0877, "step": 76 }, { "epoch": 0.052097428958051424, "grad_norm": 1.0705597995328733, "learning_rate": 1.7342342342342344e-06, "loss": 0.0938, "step": 77 }, { "epoch": 0.05277401894451962, "grad_norm": 1.8324753250262114, "learning_rate": 1.756756756756757e-06, "loss": 0.098, "step": 78 }, { "epoch": 0.05345060893098782, "grad_norm": 1.6052106663983605, "learning_rate": 1.7792792792792792e-06, "loss": 0.0724, "step": 79 }, { "epoch": 0.05412719891745602, "grad_norm": 1.0519773994845274, "learning_rate": 1.801801801801802e-06, "loss": 0.0643, "step": 80 }, { "epoch": 0.05480378890392422, "grad_norm": 1.2333488233321814, "learning_rate": 1.8243243243243245e-06, "loss": 0.0835, "step": 81 }, { "epoch": 0.05548037889039242, "grad_norm": 1.4998932144723107, "learning_rate": 1.846846846846847e-06, "loss": 0.0748, "step": 82 }, { "epoch": 0.05615696887686062, "grad_norm": 1.389620534415091, "learning_rate": 1.8693693693693697e-06, "loss": 0.0819, "step": 83 }, { "epoch": 0.056833558863328824, "grad_norm": 1.2388704868409917, "learning_rate": 1.8918918918918922e-06, "loss": 0.0648, "step": 84 }, { "epoch": 0.05751014884979702, "grad_norm": 0.8506120195949848, "learning_rate": 1.9144144144144145e-06, "loss": 0.0576, "step": 85 }, { "epoch": 0.058186738836265225, "grad_norm": 1.2780645850137915, "learning_rate": 1.9369369369369372e-06, "loss": 0.0796, "step": 86 }, { "epoch": 0.05886332882273342, "grad_norm": 1.132956937818961, "learning_rate": 1.9594594594594595e-06, "loss": 0.0786, "step": 87 }, { "epoch": 0.05953991880920163, "grad_norm": 1.4862959937967373, "learning_rate": 1.9819819819819822e-06, "loss": 0.0655, "step": 88 }, { "epoch": 0.060216508795669824, "grad_norm": 1.5069955438493359, "learning_rate": 2.0045045045045045e-06, "loss": 0.0789, "step": 89 }, { "epoch": 0.06089309878213803, "grad_norm": 1.1028032292446677, "learning_rate": 2.0270270270270273e-06, "loss": 0.0663, "step": 90 }, { "epoch": 0.061569688768606225, "grad_norm": 1.853101169509201, "learning_rate": 2.0495495495495496e-06, "loss": 0.1001, "step": 91 }, { "epoch": 0.06224627875507442, "grad_norm": 1.1764131955694261, "learning_rate": 2.0720720720720723e-06, "loss": 0.0677, "step": 92 }, { "epoch": 0.06292286874154263, "grad_norm": 1.270501762567475, "learning_rate": 2.0945945945945946e-06, "loss": 0.0809, "step": 93 }, { "epoch": 0.06359945872801083, "grad_norm": 1.08651500253901, "learning_rate": 2.1171171171171173e-06, "loss": 0.0701, "step": 94 }, { "epoch": 0.06427604871447902, "grad_norm": 1.171577214767396, "learning_rate": 2.13963963963964e-06, "loss": 0.0658, "step": 95 }, { "epoch": 0.06495263870094722, "grad_norm": 0.9993389865692777, "learning_rate": 2.1621621621621623e-06, "loss": 0.0697, "step": 96 }, { "epoch": 0.06562922868741543, "grad_norm": 1.1647667270803757, "learning_rate": 2.1846846846846846e-06, "loss": 0.068, "step": 97 }, { "epoch": 0.06630581867388363, "grad_norm": 1.0815909952207134, "learning_rate": 2.2072072072072073e-06, "loss": 0.0586, "step": 98 }, { "epoch": 0.06698240866035182, "grad_norm": 1.4432815661843537, "learning_rate": 2.22972972972973e-06, "loss": 0.0849, "step": 99 }, { "epoch": 0.06765899864682003, "grad_norm": 1.2850237284528643, "learning_rate": 2.2522522522522524e-06, "loss": 0.0803, "step": 100 }, { "epoch": 0.06833558863328823, "grad_norm": 1.3086998007860184, "learning_rate": 2.274774774774775e-06, "loss": 0.0893, "step": 101 }, { "epoch": 0.06901217861975643, "grad_norm": 1.3811854632746974, "learning_rate": 2.297297297297298e-06, "loss": 0.0787, "step": 102 }, { "epoch": 0.06968876860622462, "grad_norm": 0.8496140325256104, "learning_rate": 2.31981981981982e-06, "loss": 0.0552, "step": 103 }, { "epoch": 0.07036535859269283, "grad_norm": 1.1250516677643785, "learning_rate": 2.3423423423423424e-06, "loss": 0.0674, "step": 104 }, { "epoch": 0.07104194857916103, "grad_norm": 1.5188591494569172, "learning_rate": 2.364864864864865e-06, "loss": 0.0853, "step": 105 }, { "epoch": 0.07171853856562922, "grad_norm": 0.9166081492102163, "learning_rate": 2.3873873873873874e-06, "loss": 0.0691, "step": 106 }, { "epoch": 0.07239512855209743, "grad_norm": 1.4406911354239935, "learning_rate": 2.40990990990991e-06, "loss": 0.0764, "step": 107 }, { "epoch": 0.07307171853856563, "grad_norm": 1.2837903616374027, "learning_rate": 2.432432432432433e-06, "loss": 0.0686, "step": 108 }, { "epoch": 0.07374830852503383, "grad_norm": 1.1827381468628524, "learning_rate": 2.454954954954955e-06, "loss": 0.0835, "step": 109 }, { "epoch": 0.07442489851150202, "grad_norm": 1.0842278836488644, "learning_rate": 2.4774774774774775e-06, "loss": 0.0847, "step": 110 }, { "epoch": 0.07510148849797023, "grad_norm": 0.7826209232029521, "learning_rate": 2.5e-06, "loss": 0.0611, "step": 111 }, { "epoch": 0.07577807848443843, "grad_norm": 1.0514720604523498, "learning_rate": 2.5225225225225225e-06, "loss": 0.0662, "step": 112 }, { "epoch": 0.07645466847090664, "grad_norm": 1.1210491354297492, "learning_rate": 2.5450450450450452e-06, "loss": 0.0668, "step": 113 }, { "epoch": 0.07713125845737483, "grad_norm": 1.081267730408898, "learning_rate": 2.5675675675675675e-06, "loss": 0.0799, "step": 114 }, { "epoch": 0.07780784844384303, "grad_norm": 1.404599973743778, "learning_rate": 2.5900900900900907e-06, "loss": 0.0726, "step": 115 }, { "epoch": 0.07848443843031123, "grad_norm": 0.796717754084093, "learning_rate": 2.612612612612613e-06, "loss": 0.0514, "step": 116 }, { "epoch": 0.07916102841677942, "grad_norm": 1.2692775386997248, "learning_rate": 2.6351351351351353e-06, "loss": 0.0764, "step": 117 }, { "epoch": 0.07983761840324763, "grad_norm": 0.8178369672005279, "learning_rate": 2.657657657657658e-06, "loss": 0.052, "step": 118 }, { "epoch": 0.08051420838971583, "grad_norm": 1.1124558159045197, "learning_rate": 2.6801801801801803e-06, "loss": 0.0699, "step": 119 }, { "epoch": 0.08119079837618404, "grad_norm": 1.8258037622705916, "learning_rate": 2.702702702702703e-06, "loss": 0.086, "step": 120 }, { "epoch": 0.08186738836265223, "grad_norm": 0.9911638914541022, "learning_rate": 2.7252252252252253e-06, "loss": 0.0716, "step": 121 }, { "epoch": 0.08254397834912043, "grad_norm": 0.7981140741449456, "learning_rate": 2.747747747747748e-06, "loss": 0.0653, "step": 122 }, { "epoch": 0.08322056833558863, "grad_norm": 0.9891583556512792, "learning_rate": 2.7702702702702703e-06, "loss": 0.0626, "step": 123 }, { "epoch": 0.08389715832205684, "grad_norm": 0.9311177301850705, "learning_rate": 2.7927927927927926e-06, "loss": 0.0676, "step": 124 }, { "epoch": 0.08457374830852503, "grad_norm": 0.8213289961419631, "learning_rate": 2.8153153153153158e-06, "loss": 0.065, "step": 125 }, { "epoch": 0.08525033829499323, "grad_norm": 1.4753798779127683, "learning_rate": 2.837837837837838e-06, "loss": 0.0847, "step": 126 }, { "epoch": 0.08592692828146144, "grad_norm": 1.018664096046297, "learning_rate": 2.860360360360361e-06, "loss": 0.0689, "step": 127 }, { "epoch": 0.08660351826792964, "grad_norm": 0.885299703826496, "learning_rate": 2.882882882882883e-06, "loss": 0.0513, "step": 128 }, { "epoch": 0.08728010825439783, "grad_norm": 1.2654835363026304, "learning_rate": 2.9054054054054054e-06, "loss": 0.0762, "step": 129 }, { "epoch": 0.08795669824086604, "grad_norm": 1.3878112144001065, "learning_rate": 2.927927927927928e-06, "loss": 0.118, "step": 130 }, { "epoch": 0.08863328822733424, "grad_norm": 1.6327600361262127, "learning_rate": 2.9504504504504504e-06, "loss": 0.077, "step": 131 }, { "epoch": 0.08930987821380243, "grad_norm": 1.42128938356837, "learning_rate": 2.9729729729729736e-06, "loss": 0.0674, "step": 132 }, { "epoch": 0.08998646820027063, "grad_norm": 1.2993447107485385, "learning_rate": 2.995495495495496e-06, "loss": 0.0678, "step": 133 }, { "epoch": 0.09066305818673884, "grad_norm": 0.9176485862076669, "learning_rate": 3.0180180180180186e-06, "loss": 0.0528, "step": 134 }, { "epoch": 0.09133964817320704, "grad_norm": 1.2537634334042256, "learning_rate": 3.040540540540541e-06, "loss": 0.0772, "step": 135 }, { "epoch": 0.09201623815967523, "grad_norm": 0.8837089615432548, "learning_rate": 3.063063063063063e-06, "loss": 0.0565, "step": 136 }, { "epoch": 0.09269282814614344, "grad_norm": 0.949361425538162, "learning_rate": 3.085585585585586e-06, "loss": 0.0637, "step": 137 }, { "epoch": 0.09336941813261164, "grad_norm": 0.7767836874416418, "learning_rate": 3.1081081081081082e-06, "loss": 0.0537, "step": 138 }, { "epoch": 0.09404600811907984, "grad_norm": 1.0620072007006445, "learning_rate": 3.130630630630631e-06, "loss": 0.0821, "step": 139 }, { "epoch": 0.09472259810554803, "grad_norm": 0.8330908041619831, "learning_rate": 3.1531531531531532e-06, "loss": 0.0672, "step": 140 }, { "epoch": 0.09539918809201624, "grad_norm": 0.8709290571325103, "learning_rate": 3.1756756756756755e-06, "loss": 0.0582, "step": 141 }, { "epoch": 0.09607577807848444, "grad_norm": 0.9576748527480916, "learning_rate": 3.1981981981981987e-06, "loss": 0.0804, "step": 142 }, { "epoch": 0.09675236806495263, "grad_norm": 2.139408740627896, "learning_rate": 3.220720720720721e-06, "loss": 0.0788, "step": 143 }, { "epoch": 0.09742895805142084, "grad_norm": 0.92133608056852, "learning_rate": 3.2432432432432437e-06, "loss": 0.0595, "step": 144 }, { "epoch": 0.09810554803788904, "grad_norm": 0.9925385605152662, "learning_rate": 3.265765765765766e-06, "loss": 0.0603, "step": 145 }, { "epoch": 0.09878213802435724, "grad_norm": 1.0763586483912095, "learning_rate": 3.2882882882882887e-06, "loss": 0.0804, "step": 146 }, { "epoch": 0.09945872801082543, "grad_norm": 1.7871971536480755, "learning_rate": 3.310810810810811e-06, "loss": 0.0841, "step": 147 }, { "epoch": 0.10013531799729364, "grad_norm": 1.202388521210232, "learning_rate": 3.3333333333333333e-06, "loss": 0.0912, "step": 148 }, { "epoch": 0.10081190798376184, "grad_norm": 0.9232872486608801, "learning_rate": 3.3558558558558565e-06, "loss": 0.0874, "step": 149 }, { "epoch": 0.10148849797023005, "grad_norm": 1.562181686800407, "learning_rate": 3.3783783783783788e-06, "loss": 0.0796, "step": 150 }, { "epoch": 0.10216508795669824, "grad_norm": 1.0582165709303402, "learning_rate": 3.4009009009009015e-06, "loss": 0.0611, "step": 151 }, { "epoch": 0.10284167794316644, "grad_norm": 1.1869252064389257, "learning_rate": 3.423423423423424e-06, "loss": 0.0778, "step": 152 }, { "epoch": 0.10351826792963464, "grad_norm": 1.0882063117012855, "learning_rate": 3.445945945945946e-06, "loss": 0.0704, "step": 153 }, { "epoch": 0.10419485791610285, "grad_norm": 1.684218554074011, "learning_rate": 3.468468468468469e-06, "loss": 0.0739, "step": 154 }, { "epoch": 0.10487144790257104, "grad_norm": 1.2879493411494656, "learning_rate": 3.490990990990991e-06, "loss": 0.0706, "step": 155 }, { "epoch": 0.10554803788903924, "grad_norm": 1.180667462102325, "learning_rate": 3.513513513513514e-06, "loss": 0.0516, "step": 156 }, { "epoch": 0.10622462787550745, "grad_norm": 0.9173666672150224, "learning_rate": 3.536036036036036e-06, "loss": 0.0597, "step": 157 }, { "epoch": 0.10690121786197564, "grad_norm": 0.9824869826881204, "learning_rate": 3.5585585585585584e-06, "loss": 0.0618, "step": 158 }, { "epoch": 0.10757780784844384, "grad_norm": 0.8372326102844767, "learning_rate": 3.5810810810810816e-06, "loss": 0.055, "step": 159 }, { "epoch": 0.10825439783491204, "grad_norm": 0.6863999562308667, "learning_rate": 3.603603603603604e-06, "loss": 0.0453, "step": 160 }, { "epoch": 0.10893098782138025, "grad_norm": 1.7300411271145275, "learning_rate": 3.6261261261261266e-06, "loss": 0.078, "step": 161 }, { "epoch": 0.10960757780784844, "grad_norm": 0.8323956752494743, "learning_rate": 3.648648648648649e-06, "loss": 0.0671, "step": 162 }, { "epoch": 0.11028416779431664, "grad_norm": 0.8816106059707829, "learning_rate": 3.6711711711711716e-06, "loss": 0.0677, "step": 163 }, { "epoch": 0.11096075778078485, "grad_norm": 1.291090966278352, "learning_rate": 3.693693693693694e-06, "loss": 0.0703, "step": 164 }, { "epoch": 0.11163734776725305, "grad_norm": 1.4380603085453538, "learning_rate": 3.7162162162162162e-06, "loss": 0.081, "step": 165 }, { "epoch": 0.11231393775372124, "grad_norm": 1.1279173560250253, "learning_rate": 3.7387387387387394e-06, "loss": 0.0626, "step": 166 }, { "epoch": 0.11299052774018944, "grad_norm": 1.1190351240265553, "learning_rate": 3.7612612612612612e-06, "loss": 0.0803, "step": 167 }, { "epoch": 0.11366711772665765, "grad_norm": 1.0861817886530531, "learning_rate": 3.7837837837837844e-06, "loss": 0.0531, "step": 168 }, { "epoch": 0.11434370771312584, "grad_norm": 1.1450975829292371, "learning_rate": 3.8063063063063067e-06, "loss": 0.0657, "step": 169 }, { "epoch": 0.11502029769959404, "grad_norm": 1.1837341458804667, "learning_rate": 3.828828828828829e-06, "loss": 0.0779, "step": 170 }, { "epoch": 0.11569688768606225, "grad_norm": 0.9705664284135157, "learning_rate": 3.851351351351352e-06, "loss": 0.0717, "step": 171 }, { "epoch": 0.11637347767253045, "grad_norm": 0.9067677836299134, "learning_rate": 3.8738738738738744e-06, "loss": 0.0641, "step": 172 }, { "epoch": 0.11705006765899864, "grad_norm": 0.7795707970035339, "learning_rate": 3.896396396396397e-06, "loss": 0.0539, "step": 173 }, { "epoch": 0.11772665764546685, "grad_norm": 1.1543725315639926, "learning_rate": 3.918918918918919e-06, "loss": 0.07, "step": 174 }, { "epoch": 0.11840324763193505, "grad_norm": 0.8574092898862513, "learning_rate": 3.941441441441442e-06, "loss": 0.056, "step": 175 }, { "epoch": 0.11907983761840325, "grad_norm": 1.1116892070356739, "learning_rate": 3.9639639639639645e-06, "loss": 0.0667, "step": 176 }, { "epoch": 0.11975642760487144, "grad_norm": 1.0421056323865376, "learning_rate": 3.986486486486487e-06, "loss": 0.0699, "step": 177 }, { "epoch": 0.12043301759133965, "grad_norm": 1.0065865415244708, "learning_rate": 4.009009009009009e-06, "loss": 0.0973, "step": 178 }, { "epoch": 0.12110960757780785, "grad_norm": 0.8365657611328879, "learning_rate": 4.031531531531531e-06, "loss": 0.0617, "step": 179 }, { "epoch": 0.12178619756427606, "grad_norm": 0.9221990808220503, "learning_rate": 4.0540540540540545e-06, "loss": 0.0623, "step": 180 }, { "epoch": 0.12246278755074425, "grad_norm": 0.7072566756504902, "learning_rate": 4.076576576576577e-06, "loss": 0.0507, "step": 181 }, { "epoch": 0.12313937753721245, "grad_norm": 1.0923690184288475, "learning_rate": 4.099099099099099e-06, "loss": 0.0611, "step": 182 }, { "epoch": 0.12381596752368065, "grad_norm": 0.8634520196498353, "learning_rate": 4.121621621621622e-06, "loss": 0.0649, "step": 183 }, { "epoch": 0.12449255751014884, "grad_norm": 0.7815477364864121, "learning_rate": 4.1441441441441446e-06, "loss": 0.0686, "step": 184 }, { "epoch": 0.12516914749661706, "grad_norm": 1.034553826316958, "learning_rate": 4.166666666666667e-06, "loss": 0.0761, "step": 185 }, { "epoch": 0.12584573748308525, "grad_norm": 0.8790600434907246, "learning_rate": 4.189189189189189e-06, "loss": 0.0541, "step": 186 }, { "epoch": 0.12652232746955344, "grad_norm": 0.6312394412630871, "learning_rate": 4.2117117117117115e-06, "loss": 0.048, "step": 187 }, { "epoch": 0.12719891745602166, "grad_norm": 1.0236694297248756, "learning_rate": 4.234234234234235e-06, "loss": 0.0624, "step": 188 }, { "epoch": 0.12787550744248985, "grad_norm": 1.2415095048856977, "learning_rate": 4.256756756756757e-06, "loss": 0.076, "step": 189 }, { "epoch": 0.12855209742895804, "grad_norm": 1.0440729129155921, "learning_rate": 4.27927927927928e-06, "loss": 0.0721, "step": 190 }, { "epoch": 0.12922868741542626, "grad_norm": 0.9272072797636888, "learning_rate": 4.301801801801802e-06, "loss": 0.0636, "step": 191 }, { "epoch": 0.12990527740189445, "grad_norm": 0.8399664093238411, "learning_rate": 4.324324324324325e-06, "loss": 0.0605, "step": 192 }, { "epoch": 0.13058186738836267, "grad_norm": 0.8169119514173009, "learning_rate": 4.346846846846847e-06, "loss": 0.0464, "step": 193 }, { "epoch": 0.13125845737483086, "grad_norm": 1.7056399951235797, "learning_rate": 4.369369369369369e-06, "loss": 0.0842, "step": 194 }, { "epoch": 0.13193504736129905, "grad_norm": 0.8963540535989036, "learning_rate": 4.391891891891892e-06, "loss": 0.0631, "step": 195 }, { "epoch": 0.13261163734776726, "grad_norm": 1.1297665202656213, "learning_rate": 4.414414414414415e-06, "loss": 0.0666, "step": 196 }, { "epoch": 0.13328822733423545, "grad_norm": 1.2222656454260337, "learning_rate": 4.436936936936938e-06, "loss": 0.0742, "step": 197 }, { "epoch": 0.13396481732070364, "grad_norm": 0.7830411618814591, "learning_rate": 4.45945945945946e-06, "loss": 0.0643, "step": 198 }, { "epoch": 0.13464140730717186, "grad_norm": 0.9248966716591538, "learning_rate": 4.4819819819819824e-06, "loss": 0.056, "step": 199 }, { "epoch": 0.13531799729364005, "grad_norm": 1.0226482316768566, "learning_rate": 4.504504504504505e-06, "loss": 0.0513, "step": 200 }, { "epoch": 0.13599458728010824, "grad_norm": 2.796924711877252, "learning_rate": 4.527027027027027e-06, "loss": 0.0719, "step": 201 }, { "epoch": 0.13667117726657646, "grad_norm": 1.3548336859142396, "learning_rate": 4.54954954954955e-06, "loss": 0.0695, "step": 202 }, { "epoch": 0.13734776725304465, "grad_norm": 0.8975021051323848, "learning_rate": 4.5720720720720725e-06, "loss": 0.0503, "step": 203 }, { "epoch": 0.13802435723951287, "grad_norm": 1.1704431625814344, "learning_rate": 4.594594594594596e-06, "loss": 0.0693, "step": 204 }, { "epoch": 0.13870094722598106, "grad_norm": 1.1095813348157724, "learning_rate": 4.617117117117118e-06, "loss": 0.0624, "step": 205 }, { "epoch": 0.13937753721244925, "grad_norm": 1.0994100355897811, "learning_rate": 4.63963963963964e-06, "loss": 0.0814, "step": 206 }, { "epoch": 0.14005412719891747, "grad_norm": 0.9955265603973159, "learning_rate": 4.6621621621621625e-06, "loss": 0.0582, "step": 207 }, { "epoch": 0.14073071718538566, "grad_norm": 0.7673711112426891, "learning_rate": 4.684684684684685e-06, "loss": 0.079, "step": 208 }, { "epoch": 0.14140730717185385, "grad_norm": 1.1997536882951019, "learning_rate": 4.707207207207208e-06, "loss": 0.0758, "step": 209 }, { "epoch": 0.14208389715832206, "grad_norm": 1.066850922079127, "learning_rate": 4.72972972972973e-06, "loss": 0.0673, "step": 210 }, { "epoch": 0.14276048714479025, "grad_norm": 0.6981303735219595, "learning_rate": 4.7522522522522526e-06, "loss": 0.0515, "step": 211 }, { "epoch": 0.14343707713125844, "grad_norm": 0.7702057119766513, "learning_rate": 4.774774774774775e-06, "loss": 0.0558, "step": 212 }, { "epoch": 0.14411366711772666, "grad_norm": 0.8179829158571619, "learning_rate": 4.797297297297297e-06, "loss": 0.059, "step": 213 }, { "epoch": 0.14479025710419485, "grad_norm": 0.841601120489142, "learning_rate": 4.81981981981982e-06, "loss": 0.0667, "step": 214 }, { "epoch": 0.14546684709066307, "grad_norm": 0.7577151328092167, "learning_rate": 4.842342342342343e-06, "loss": 0.064, "step": 215 }, { "epoch": 0.14614343707713126, "grad_norm": 0.7229776371738366, "learning_rate": 4.864864864864866e-06, "loss": 0.0551, "step": 216 }, { "epoch": 0.14682002706359945, "grad_norm": 0.7548280052358195, "learning_rate": 4.887387387387388e-06, "loss": 0.0742, "step": 217 }, { "epoch": 0.14749661705006767, "grad_norm": 0.7832374227672186, "learning_rate": 4.90990990990991e-06, "loss": 0.0585, "step": 218 }, { "epoch": 0.14817320703653586, "grad_norm": 0.7468947916494266, "learning_rate": 4.932432432432433e-06, "loss": 0.0597, "step": 219 }, { "epoch": 0.14884979702300405, "grad_norm": 0.7515844395435481, "learning_rate": 4.954954954954955e-06, "loss": 0.0477, "step": 220 }, { "epoch": 0.14952638700947227, "grad_norm": 0.8072211331751639, "learning_rate": 4.977477477477478e-06, "loss": 0.0546, "step": 221 }, { "epoch": 0.15020297699594046, "grad_norm": 0.8185940219640517, "learning_rate": 5e-06, "loss": 0.0576, "step": 222 }, { "epoch": 0.15087956698240865, "grad_norm": 0.8997670516926375, "learning_rate": 5.022522522522523e-06, "loss": 0.0787, "step": 223 }, { "epoch": 0.15155615696887687, "grad_norm": 0.812997747689852, "learning_rate": 5.045045045045045e-06, "loss": 0.0771, "step": 224 }, { "epoch": 0.15223274695534506, "grad_norm": 1.6748254663318023, "learning_rate": 5.067567567567568e-06, "loss": 0.0783, "step": 225 }, { "epoch": 0.15290933694181327, "grad_norm": 1.2271152074751945, "learning_rate": 5.0900900900900905e-06, "loss": 0.0738, "step": 226 }, { "epoch": 0.15358592692828146, "grad_norm": 0.6945605892310552, "learning_rate": 5.112612612612613e-06, "loss": 0.0579, "step": 227 }, { "epoch": 0.15426251691474965, "grad_norm": 0.8426999080994071, "learning_rate": 5.135135135135135e-06, "loss": 0.0557, "step": 228 }, { "epoch": 0.15493910690121787, "grad_norm": 0.8652992733951973, "learning_rate": 5.157657657657657e-06, "loss": 0.0669, "step": 229 }, { "epoch": 0.15561569688768606, "grad_norm": 0.7837956357894128, "learning_rate": 5.180180180180181e-06, "loss": 0.0552, "step": 230 }, { "epoch": 0.15629228687415425, "grad_norm": 0.8690750422186453, "learning_rate": 5.202702702702704e-06, "loss": 0.058, "step": 231 }, { "epoch": 0.15696887686062247, "grad_norm": 1.6243365225413129, "learning_rate": 5.225225225225226e-06, "loss": 0.0702, "step": 232 }, { "epoch": 0.15764546684709066, "grad_norm": 0.9135053302800554, "learning_rate": 5.247747747747748e-06, "loss": 0.0557, "step": 233 }, { "epoch": 0.15832205683355885, "grad_norm": 1.1007878772849908, "learning_rate": 5.2702702702702705e-06, "loss": 0.079, "step": 234 }, { "epoch": 0.15899864682002707, "grad_norm": 0.6184680312641445, "learning_rate": 5.292792792792794e-06, "loss": 0.0545, "step": 235 }, { "epoch": 0.15967523680649526, "grad_norm": 1.0609934975650073, "learning_rate": 5.315315315315316e-06, "loss": 0.072, "step": 236 }, { "epoch": 0.16035182679296348, "grad_norm": 0.8713490461647541, "learning_rate": 5.337837837837838e-06, "loss": 0.067, "step": 237 }, { "epoch": 0.16102841677943167, "grad_norm": 0.5902219845508051, "learning_rate": 5.360360360360361e-06, "loss": 0.0446, "step": 238 }, { "epoch": 0.16170500676589986, "grad_norm": 1.649731021518868, "learning_rate": 5.382882882882884e-06, "loss": 0.0663, "step": 239 }, { "epoch": 0.16238159675236807, "grad_norm": 1.0906319565432787, "learning_rate": 5.405405405405406e-06, "loss": 0.0658, "step": 240 }, { "epoch": 0.16305818673883626, "grad_norm": 0.6666684555012528, "learning_rate": 5.427927927927928e-06, "loss": 0.0599, "step": 241 }, { "epoch": 0.16373477672530445, "grad_norm": 1.0927066523354083, "learning_rate": 5.450450450450451e-06, "loss": 0.0865, "step": 242 }, { "epoch": 0.16441136671177267, "grad_norm": 0.831763258236507, "learning_rate": 5.472972972972973e-06, "loss": 0.0586, "step": 243 }, { "epoch": 0.16508795669824086, "grad_norm": 0.8704003641819794, "learning_rate": 5.495495495495496e-06, "loss": 0.0531, "step": 244 }, { "epoch": 0.16576454668470908, "grad_norm": 1.0765777401013956, "learning_rate": 5.518018018018018e-06, "loss": 0.0623, "step": 245 }, { "epoch": 0.16644113667117727, "grad_norm": 0.8411725489707547, "learning_rate": 5.540540540540541e-06, "loss": 0.0662, "step": 246 }, { "epoch": 0.16711772665764546, "grad_norm": 0.8517356933328273, "learning_rate": 5.563063063063063e-06, "loss": 0.0545, "step": 247 }, { "epoch": 0.16779431664411368, "grad_norm": 0.7124648740025268, "learning_rate": 5.585585585585585e-06, "loss": 0.0581, "step": 248 }, { "epoch": 0.16847090663058187, "grad_norm": 0.6663886160451271, "learning_rate": 5.608108108108109e-06, "loss": 0.0595, "step": 249 }, { "epoch": 0.16914749661705006, "grad_norm": 1.0397462129786121, "learning_rate": 5.6306306306306316e-06, "loss": 0.0588, "step": 250 }, { "epoch": 0.16982408660351828, "grad_norm": 1.5997174616428276, "learning_rate": 5.653153153153154e-06, "loss": 0.0829, "step": 251 }, { "epoch": 0.17050067658998647, "grad_norm": 0.7506698509283649, "learning_rate": 5.675675675675676e-06, "loss": 0.065, "step": 252 }, { "epoch": 0.17117726657645466, "grad_norm": 9.512088246340134, "learning_rate": 5.6981981981981985e-06, "loss": 0.0999, "step": 253 }, { "epoch": 0.17185385656292287, "grad_norm": 0.8094898813339155, "learning_rate": 5.720720720720722e-06, "loss": 0.064, "step": 254 }, { "epoch": 0.17253044654939106, "grad_norm": 0.9073780252224255, "learning_rate": 5.743243243243244e-06, "loss": 0.0547, "step": 255 }, { "epoch": 0.17320703653585928, "grad_norm": 0.9363636629622993, "learning_rate": 5.765765765765766e-06, "loss": 0.0729, "step": 256 }, { "epoch": 0.17388362652232747, "grad_norm": 1.1763667241015792, "learning_rate": 5.7882882882882885e-06, "loss": 0.0623, "step": 257 }, { "epoch": 0.17456021650879566, "grad_norm": 0.7735422761562635, "learning_rate": 5.810810810810811e-06, "loss": 0.0549, "step": 258 }, { "epoch": 0.17523680649526388, "grad_norm": 1.1842795603184852, "learning_rate": 5.833333333333334e-06, "loss": 0.0678, "step": 259 }, { "epoch": 0.17591339648173207, "grad_norm": 0.9880273332974672, "learning_rate": 5.855855855855856e-06, "loss": 0.0616, "step": 260 }, { "epoch": 0.17658998646820026, "grad_norm": 0.9946326645978719, "learning_rate": 5.8783783783783786e-06, "loss": 0.067, "step": 261 }, { "epoch": 0.17726657645466848, "grad_norm": 0.8436445730122438, "learning_rate": 5.900900900900901e-06, "loss": 0.0507, "step": 262 }, { "epoch": 0.17794316644113667, "grad_norm": 1.128724554221957, "learning_rate": 5.923423423423423e-06, "loss": 0.0637, "step": 263 }, { "epoch": 0.17861975642760486, "grad_norm": 0.8228539499031123, "learning_rate": 5.945945945945947e-06, "loss": 0.0505, "step": 264 }, { "epoch": 0.17929634641407308, "grad_norm": 1.4221352867966524, "learning_rate": 5.9684684684684694e-06, "loss": 0.0669, "step": 265 }, { "epoch": 0.17997293640054127, "grad_norm": 0.6266405186580584, "learning_rate": 5.990990990990992e-06, "loss": 0.0557, "step": 266 }, { "epoch": 0.18064952638700948, "grad_norm": 1.016148575810402, "learning_rate": 6.013513513513514e-06, "loss": 0.0575, "step": 267 }, { "epoch": 0.18132611637347767, "grad_norm": 1.0689203740691198, "learning_rate": 6.036036036036037e-06, "loss": 0.0619, "step": 268 }, { "epoch": 0.18200270635994586, "grad_norm": 1.000862602449387, "learning_rate": 6.0585585585585595e-06, "loss": 0.067, "step": 269 }, { "epoch": 0.18267929634641408, "grad_norm": 1.110717610117604, "learning_rate": 6.081081081081082e-06, "loss": 0.0807, "step": 270 }, { "epoch": 0.18335588633288227, "grad_norm": 0.8781204597287579, "learning_rate": 6.103603603603604e-06, "loss": 0.0603, "step": 271 }, { "epoch": 0.18403247631935046, "grad_norm": 0.8582142107483495, "learning_rate": 6.126126126126126e-06, "loss": 0.0625, "step": 272 }, { "epoch": 0.18470906630581868, "grad_norm": 0.9650867483607015, "learning_rate": 6.1486486486486495e-06, "loss": 0.0617, "step": 273 }, { "epoch": 0.18538565629228687, "grad_norm": 0.848479702118272, "learning_rate": 6.171171171171172e-06, "loss": 0.0755, "step": 274 }, { "epoch": 0.18606224627875506, "grad_norm": 1.026330058910402, "learning_rate": 6.193693693693694e-06, "loss": 0.0764, "step": 275 }, { "epoch": 0.18673883626522328, "grad_norm": 0.7526362586186429, "learning_rate": 6.2162162162162164e-06, "loss": 0.0463, "step": 276 }, { "epoch": 0.18741542625169147, "grad_norm": 1.0045694906631195, "learning_rate": 6.238738738738739e-06, "loss": 0.0812, "step": 277 }, { "epoch": 0.1880920162381597, "grad_norm": 0.828489825167721, "learning_rate": 6.261261261261262e-06, "loss": 0.0615, "step": 278 }, { "epoch": 0.18876860622462788, "grad_norm": 0.8573465514600265, "learning_rate": 6.283783783783784e-06, "loss": 0.0721, "step": 279 }, { "epoch": 0.18944519621109607, "grad_norm": 0.6617401623803025, "learning_rate": 6.3063063063063065e-06, "loss": 0.0645, "step": 280 }, { "epoch": 0.19012178619756429, "grad_norm": 0.8634458607540112, "learning_rate": 6.328828828828829e-06, "loss": 0.0477, "step": 281 }, { "epoch": 0.19079837618403248, "grad_norm": 1.1310375061446956, "learning_rate": 6.351351351351351e-06, "loss": 0.0682, "step": 282 }, { "epoch": 0.19147496617050067, "grad_norm": 0.8308007684531133, "learning_rate": 6.373873873873875e-06, "loss": 0.0743, "step": 283 }, { "epoch": 0.19215155615696888, "grad_norm": 0.9582211460617213, "learning_rate": 6.396396396396397e-06, "loss": 0.0841, "step": 284 }, { "epoch": 0.19282814614343707, "grad_norm": 0.6703558818024842, "learning_rate": 6.41891891891892e-06, "loss": 0.0572, "step": 285 }, { "epoch": 0.19350473612990526, "grad_norm": 1.1896877966727823, "learning_rate": 6.441441441441442e-06, "loss": 0.0676, "step": 286 }, { "epoch": 0.19418132611637348, "grad_norm": 0.9324453447728346, "learning_rate": 6.463963963963964e-06, "loss": 0.0538, "step": 287 }, { "epoch": 0.19485791610284167, "grad_norm": 0.7451682540900739, "learning_rate": 6.486486486486487e-06, "loss": 0.061, "step": 288 }, { "epoch": 0.1955345060893099, "grad_norm": 1.00741044422082, "learning_rate": 6.50900900900901e-06, "loss": 0.0621, "step": 289 }, { "epoch": 0.19621109607577808, "grad_norm": 0.7827295409555299, "learning_rate": 6.531531531531532e-06, "loss": 0.0607, "step": 290 }, { "epoch": 0.19688768606224627, "grad_norm": 1.1092699412455538, "learning_rate": 6.554054054054054e-06, "loss": 0.066, "step": 291 }, { "epoch": 0.1975642760487145, "grad_norm": 0.7778480185787826, "learning_rate": 6.5765765765765775e-06, "loss": 0.0626, "step": 292 }, { "epoch": 0.19824086603518268, "grad_norm": 0.9415418209335837, "learning_rate": 6.5990990990991e-06, "loss": 0.0702, "step": 293 }, { "epoch": 0.19891745602165087, "grad_norm": 0.7712422958541395, "learning_rate": 6.621621621621622e-06, "loss": 0.077, "step": 294 }, { "epoch": 0.19959404600811909, "grad_norm": 0.7502680597526424, "learning_rate": 6.644144144144144e-06, "loss": 0.0464, "step": 295 }, { "epoch": 0.20027063599458728, "grad_norm": 1.5825005457319432, "learning_rate": 6.666666666666667e-06, "loss": 0.0696, "step": 296 }, { "epoch": 0.2009472259810555, "grad_norm": 0.6160769472482266, "learning_rate": 6.689189189189191e-06, "loss": 0.0489, "step": 297 }, { "epoch": 0.20162381596752368, "grad_norm": 1.0163485751161636, "learning_rate": 6.711711711711713e-06, "loss": 0.0503, "step": 298 }, { "epoch": 0.20230040595399187, "grad_norm": 0.8415280386144876, "learning_rate": 6.734234234234235e-06, "loss": 0.0555, "step": 299 }, { "epoch": 0.2029769959404601, "grad_norm": 0.7533756964827386, "learning_rate": 6.7567567567567575e-06, "loss": 0.0553, "step": 300 }, { "epoch": 0.20365358592692828, "grad_norm": 0.8567571847440155, "learning_rate": 6.77927927927928e-06, "loss": 0.0609, "step": 301 }, { "epoch": 0.20433017591339647, "grad_norm": 0.6840058445177892, "learning_rate": 6.801801801801803e-06, "loss": 0.0484, "step": 302 }, { "epoch": 0.2050067658998647, "grad_norm": 1.20849999437776, "learning_rate": 6.824324324324325e-06, "loss": 0.0755, "step": 303 }, { "epoch": 0.20568335588633288, "grad_norm": 0.7179879977264311, "learning_rate": 6.846846846846848e-06, "loss": 0.0645, "step": 304 }, { "epoch": 0.20635994587280107, "grad_norm": 0.7978287341375102, "learning_rate": 6.86936936936937e-06, "loss": 0.0606, "step": 305 }, { "epoch": 0.2070365358592693, "grad_norm": 0.6223542351284302, "learning_rate": 6.891891891891892e-06, "loss": 0.0534, "step": 306 }, { "epoch": 0.20771312584573748, "grad_norm": 0.708227113767345, "learning_rate": 6.914414414414415e-06, "loss": 0.0647, "step": 307 }, { "epoch": 0.2083897158322057, "grad_norm": 0.766161531560619, "learning_rate": 6.936936936936938e-06, "loss": 0.0536, "step": 308 }, { "epoch": 0.2090663058186739, "grad_norm": 0.6538609527519975, "learning_rate": 6.95945945945946e-06, "loss": 0.053, "step": 309 }, { "epoch": 0.20974289580514208, "grad_norm": 0.5192620602709809, "learning_rate": 6.981981981981982e-06, "loss": 0.0521, "step": 310 }, { "epoch": 0.2104194857916103, "grad_norm": 0.8855494776836502, "learning_rate": 7.0045045045045045e-06, "loss": 0.0803, "step": 311 }, { "epoch": 0.21109607577807848, "grad_norm": 0.7105341285767813, "learning_rate": 7.027027027027028e-06, "loss": 0.0552, "step": 312 }, { "epoch": 0.21177266576454667, "grad_norm": 0.760756554196171, "learning_rate": 7.04954954954955e-06, "loss": 0.0523, "step": 313 }, { "epoch": 0.2124492557510149, "grad_norm": 0.6833298158589686, "learning_rate": 7.072072072072072e-06, "loss": 0.0649, "step": 314 }, { "epoch": 0.21312584573748308, "grad_norm": 0.6605226531267566, "learning_rate": 7.0945945945945946e-06, "loss": 0.0562, "step": 315 }, { "epoch": 0.21380243572395127, "grad_norm": 0.7182781506178565, "learning_rate": 7.117117117117117e-06, "loss": 0.0619, "step": 316 }, { "epoch": 0.2144790257104195, "grad_norm": 0.9039606730344303, "learning_rate": 7.139639639639641e-06, "loss": 0.0678, "step": 317 }, { "epoch": 0.21515561569688768, "grad_norm": 1.0672408603590426, "learning_rate": 7.162162162162163e-06, "loss": 0.062, "step": 318 }, { "epoch": 0.2158322056833559, "grad_norm": 0.7825066978725418, "learning_rate": 7.1846846846846855e-06, "loss": 0.0631, "step": 319 }, { "epoch": 0.2165087956698241, "grad_norm": 0.6986235049014473, "learning_rate": 7.207207207207208e-06, "loss": 0.0522, "step": 320 }, { "epoch": 0.21718538565629228, "grad_norm": 0.7970237099290051, "learning_rate": 7.229729729729731e-06, "loss": 0.0568, "step": 321 }, { "epoch": 0.2178619756427605, "grad_norm": 0.713437897281833, "learning_rate": 7.252252252252253e-06, "loss": 0.0606, "step": 322 }, { "epoch": 0.2185385656292287, "grad_norm": 0.6538301705889707, "learning_rate": 7.2747747747747755e-06, "loss": 0.0592, "step": 323 }, { "epoch": 0.21921515561569688, "grad_norm": 0.7333839996698983, "learning_rate": 7.297297297297298e-06, "loss": 0.0506, "step": 324 }, { "epoch": 0.2198917456021651, "grad_norm": 0.7219854833615129, "learning_rate": 7.31981981981982e-06, "loss": 0.0575, "step": 325 }, { "epoch": 0.22056833558863329, "grad_norm": 0.5823539428141844, "learning_rate": 7.342342342342343e-06, "loss": 0.0452, "step": 326 }, { "epoch": 0.22124492557510148, "grad_norm": 0.7073237029424214, "learning_rate": 7.3648648648648655e-06, "loss": 0.0692, "step": 327 }, { "epoch": 0.2219215155615697, "grad_norm": 0.5163380360026765, "learning_rate": 7.387387387387388e-06, "loss": 0.0542, "step": 328 }, { "epoch": 0.22259810554803788, "grad_norm": 0.8531706542539876, "learning_rate": 7.40990990990991e-06, "loss": 0.0643, "step": 329 }, { "epoch": 0.2232746955345061, "grad_norm": 0.5916938857991, "learning_rate": 7.4324324324324324e-06, "loss": 0.056, "step": 330 }, { "epoch": 0.2239512855209743, "grad_norm": 0.7178043475648989, "learning_rate": 7.4549549549549564e-06, "loss": 0.0586, "step": 331 }, { "epoch": 0.22462787550744248, "grad_norm": 1.1038858368742637, "learning_rate": 7.477477477477479e-06, "loss": 0.087, "step": 332 }, { "epoch": 0.2253044654939107, "grad_norm": 1.0433511825553878, "learning_rate": 7.500000000000001e-06, "loss": 0.0709, "step": 333 }, { "epoch": 0.2259810554803789, "grad_norm": 0.7610852262597415, "learning_rate": 7.5225225225225225e-06, "loss": 0.0632, "step": 334 }, { "epoch": 0.22665764546684708, "grad_norm": 0.9203107705922047, "learning_rate": 7.545045045045045e-06, "loss": 0.0625, "step": 335 }, { "epoch": 0.2273342354533153, "grad_norm": 0.7007805814883743, "learning_rate": 7.567567567567569e-06, "loss": 0.0609, "step": 336 }, { "epoch": 0.2280108254397835, "grad_norm": 0.7470133530462661, "learning_rate": 7.590090090090091e-06, "loss": 0.0546, "step": 337 }, { "epoch": 0.22868741542625168, "grad_norm": 0.9299383482954492, "learning_rate": 7.612612612612613e-06, "loss": 0.0714, "step": 338 }, { "epoch": 0.2293640054127199, "grad_norm": 0.7919735328109295, "learning_rate": 7.635135135135135e-06, "loss": 0.0674, "step": 339 }, { "epoch": 0.23004059539918809, "grad_norm": 1.214035621679479, "learning_rate": 7.657657657657658e-06, "loss": 0.0879, "step": 340 }, { "epoch": 0.2307171853856563, "grad_norm": 0.7465558095516062, "learning_rate": 7.680180180180181e-06, "loss": 0.0394, "step": 341 }, { "epoch": 0.2313937753721245, "grad_norm": 0.7374121088630315, "learning_rate": 7.702702702702704e-06, "loss": 0.0547, "step": 342 }, { "epoch": 0.23207036535859268, "grad_norm": 0.6071010914405474, "learning_rate": 7.725225225225226e-06, "loss": 0.0591, "step": 343 }, { "epoch": 0.2327469553450609, "grad_norm": 0.9006190654260152, "learning_rate": 7.747747747747749e-06, "loss": 0.0835, "step": 344 }, { "epoch": 0.2334235453315291, "grad_norm": 0.6486327204147487, "learning_rate": 7.77027027027027e-06, "loss": 0.0479, "step": 345 }, { "epoch": 0.23410013531799728, "grad_norm": 0.7953218164634385, "learning_rate": 7.792792792792793e-06, "loss": 0.0753, "step": 346 }, { "epoch": 0.2347767253044655, "grad_norm": 0.7327000299018703, "learning_rate": 7.815315315315317e-06, "loss": 0.0589, "step": 347 }, { "epoch": 0.2354533152909337, "grad_norm": 0.6166326590182014, "learning_rate": 7.837837837837838e-06, "loss": 0.0451, "step": 348 }, { "epoch": 0.2361299052774019, "grad_norm": 0.6922868013628761, "learning_rate": 7.860360360360361e-06, "loss": 0.0608, "step": 349 }, { "epoch": 0.2368064952638701, "grad_norm": 0.7841237536383574, "learning_rate": 7.882882882882884e-06, "loss": 0.0659, "step": 350 }, { "epoch": 0.2374830852503383, "grad_norm": 0.5213615675912151, "learning_rate": 7.905405405405406e-06, "loss": 0.0456, "step": 351 }, { "epoch": 0.2381596752368065, "grad_norm": 0.6972976703434131, "learning_rate": 7.927927927927929e-06, "loss": 0.0533, "step": 352 }, { "epoch": 0.2388362652232747, "grad_norm": 0.6850099785760475, "learning_rate": 7.95045045045045e-06, "loss": 0.0487, "step": 353 }, { "epoch": 0.2395128552097429, "grad_norm": 0.6034297504099073, "learning_rate": 7.972972972972974e-06, "loss": 0.0448, "step": 354 }, { "epoch": 0.2401894451962111, "grad_norm": 0.8666472731967332, "learning_rate": 7.995495495495497e-06, "loss": 0.0508, "step": 355 }, { "epoch": 0.2408660351826793, "grad_norm": 0.5232908189061367, "learning_rate": 8.018018018018018e-06, "loss": 0.0433, "step": 356 }, { "epoch": 0.24154262516914748, "grad_norm": 0.7162558221507196, "learning_rate": 8.040540540540541e-06, "loss": 0.0635, "step": 357 }, { "epoch": 0.2422192151556157, "grad_norm": 0.5463869132160125, "learning_rate": 8.063063063063063e-06, "loss": 0.0608, "step": 358 }, { "epoch": 0.2428958051420839, "grad_norm": 0.6698741157825209, "learning_rate": 8.085585585585586e-06, "loss": 0.0478, "step": 359 }, { "epoch": 0.2435723951285521, "grad_norm": 1.0667149170417045, "learning_rate": 8.108108108108109e-06, "loss": 0.0623, "step": 360 }, { "epoch": 0.2442489851150203, "grad_norm": 0.8322199906613101, "learning_rate": 8.130630630630632e-06, "loss": 0.0632, "step": 361 }, { "epoch": 0.2449255751014885, "grad_norm": 1.1527891020800387, "learning_rate": 8.153153153153154e-06, "loss": 0.0918, "step": 362 }, { "epoch": 0.2456021650879567, "grad_norm": 1.0015311731112426, "learning_rate": 8.175675675675677e-06, "loss": 0.0649, "step": 363 }, { "epoch": 0.2462787550744249, "grad_norm": 1.2822270189403457, "learning_rate": 8.198198198198198e-06, "loss": 0.0615, "step": 364 }, { "epoch": 0.2469553450608931, "grad_norm": 0.7170123128195778, "learning_rate": 8.220720720720721e-06, "loss": 0.0637, "step": 365 }, { "epoch": 0.2476319350473613, "grad_norm": 0.7113945589505706, "learning_rate": 8.243243243243245e-06, "loss": 0.0488, "step": 366 }, { "epoch": 0.2483085250338295, "grad_norm": 1.1129075587429147, "learning_rate": 8.265765765765766e-06, "loss": 0.0721, "step": 367 }, { "epoch": 0.2489851150202977, "grad_norm": 0.7052680620974655, "learning_rate": 8.288288288288289e-06, "loss": 0.07, "step": 368 }, { "epoch": 0.2496617050067659, "grad_norm": 0.6560055965647756, "learning_rate": 8.31081081081081e-06, "loss": 0.0403, "step": 369 }, { "epoch": 0.2503382949932341, "grad_norm": 0.5676362590161815, "learning_rate": 8.333333333333334e-06, "loss": 0.0603, "step": 370 }, { "epoch": 0.2510148849797023, "grad_norm": 0.5097872222057397, "learning_rate": 8.355855855855857e-06, "loss": 0.0429, "step": 371 }, { "epoch": 0.2516914749661705, "grad_norm": 0.9286563571647155, "learning_rate": 8.378378378378378e-06, "loss": 0.0763, "step": 372 }, { "epoch": 0.2523680649526387, "grad_norm": 0.6214443636461204, "learning_rate": 8.400900900900901e-06, "loss": 0.0628, "step": 373 }, { "epoch": 0.2530446549391069, "grad_norm": 0.6171283354529087, "learning_rate": 8.423423423423423e-06, "loss": 0.0521, "step": 374 }, { "epoch": 0.25372124492557513, "grad_norm": 1.2968833059895006, "learning_rate": 8.445945945945948e-06, "loss": 0.0658, "step": 375 }, { "epoch": 0.2543978349120433, "grad_norm": 0.6343349368864437, "learning_rate": 8.46846846846847e-06, "loss": 0.0563, "step": 376 }, { "epoch": 0.2550744248985115, "grad_norm": 0.9583680612830061, "learning_rate": 8.490990990990992e-06, "loss": 0.0611, "step": 377 }, { "epoch": 0.2557510148849797, "grad_norm": 0.5591574769792025, "learning_rate": 8.513513513513514e-06, "loss": 0.0436, "step": 378 }, { "epoch": 0.2564276048714479, "grad_norm": 0.8927505996517491, "learning_rate": 8.536036036036037e-06, "loss": 0.0595, "step": 379 }, { "epoch": 0.2571041948579161, "grad_norm": 0.6036231696789058, "learning_rate": 8.55855855855856e-06, "loss": 0.067, "step": 380 }, { "epoch": 0.2577807848443843, "grad_norm": 0.674926700874754, "learning_rate": 8.581081081081082e-06, "loss": 0.071, "step": 381 }, { "epoch": 0.2584573748308525, "grad_norm": 0.630725065216595, "learning_rate": 8.603603603603605e-06, "loss": 0.0689, "step": 382 }, { "epoch": 0.2591339648173207, "grad_norm": 0.6901017830788927, "learning_rate": 8.626126126126126e-06, "loss": 0.0494, "step": 383 }, { "epoch": 0.2598105548037889, "grad_norm": 0.5403322347987723, "learning_rate": 8.64864864864865e-06, "loss": 0.0654, "step": 384 }, { "epoch": 0.2604871447902571, "grad_norm": 0.6416339260999229, "learning_rate": 8.671171171171172e-06, "loss": 0.0461, "step": 385 }, { "epoch": 0.26116373477672533, "grad_norm": 0.6916472928225982, "learning_rate": 8.693693693693694e-06, "loss": 0.051, "step": 386 }, { "epoch": 0.2618403247631935, "grad_norm": 0.5240431354656332, "learning_rate": 8.716216216216217e-06, "loss": 0.0461, "step": 387 }, { "epoch": 0.2625169147496617, "grad_norm": 0.6085975030951926, "learning_rate": 8.738738738738739e-06, "loss": 0.0474, "step": 388 }, { "epoch": 0.2631935047361299, "grad_norm": 1.000054446566517, "learning_rate": 8.761261261261262e-06, "loss": 0.052, "step": 389 }, { "epoch": 0.2638700947225981, "grad_norm": 0.9140253728090463, "learning_rate": 8.783783783783785e-06, "loss": 0.0491, "step": 390 }, { "epoch": 0.2645466847090663, "grad_norm": 1.0777677397593741, "learning_rate": 8.806306306306306e-06, "loss": 0.0769, "step": 391 }, { "epoch": 0.2652232746955345, "grad_norm": 0.6860601884142381, "learning_rate": 8.82882882882883e-06, "loss": 0.0612, "step": 392 }, { "epoch": 0.2658998646820027, "grad_norm": 0.7034780660251052, "learning_rate": 8.851351351351351e-06, "loss": 0.0533, "step": 393 }, { "epoch": 0.2665764546684709, "grad_norm": 0.7354148731129575, "learning_rate": 8.873873873873876e-06, "loss": 0.0508, "step": 394 }, { "epoch": 0.2672530446549391, "grad_norm": 0.4680850279518108, "learning_rate": 8.896396396396397e-06, "loss": 0.0501, "step": 395 }, { "epoch": 0.2679296346414073, "grad_norm": 1.188509234752549, "learning_rate": 8.91891891891892e-06, "loss": 0.0783, "step": 396 }, { "epoch": 0.26860622462787553, "grad_norm": 0.5583428530563981, "learning_rate": 8.941441441441442e-06, "loss": 0.0575, "step": 397 }, { "epoch": 0.2692828146143437, "grad_norm": 1.0423898739451198, "learning_rate": 8.963963963963965e-06, "loss": 0.0549, "step": 398 }, { "epoch": 0.2699594046008119, "grad_norm": 0.7311091222443428, "learning_rate": 8.986486486486488e-06, "loss": 0.0608, "step": 399 }, { "epoch": 0.2706359945872801, "grad_norm": 0.5225547351097861, "learning_rate": 9.00900900900901e-06, "loss": 0.0477, "step": 400 }, { "epoch": 0.2713125845737483, "grad_norm": 0.8567720081871087, "learning_rate": 9.031531531531533e-06, "loss": 0.0521, "step": 401 }, { "epoch": 0.2719891745602165, "grad_norm": 0.8179635254015833, "learning_rate": 9.054054054054054e-06, "loss": 0.047, "step": 402 }, { "epoch": 0.27266576454668473, "grad_norm": 0.6192623432238964, "learning_rate": 9.076576576576577e-06, "loss": 0.0526, "step": 403 }, { "epoch": 0.2733423545331529, "grad_norm": 0.9529732419140194, "learning_rate": 9.0990990990991e-06, "loss": 0.0672, "step": 404 }, { "epoch": 0.2740189445196211, "grad_norm": 0.8832817880863134, "learning_rate": 9.121621621621622e-06, "loss": 0.0644, "step": 405 }, { "epoch": 0.2746955345060893, "grad_norm": 0.9390119367020981, "learning_rate": 9.144144144144145e-06, "loss": 0.0716, "step": 406 }, { "epoch": 0.2753721244925575, "grad_norm": 0.5580452118490072, "learning_rate": 9.166666666666666e-06, "loss": 0.0499, "step": 407 }, { "epoch": 0.27604871447902574, "grad_norm": 0.6095275666530106, "learning_rate": 9.189189189189191e-06, "loss": 0.054, "step": 408 }, { "epoch": 0.2767253044654939, "grad_norm": 0.5924692533944188, "learning_rate": 9.211711711711713e-06, "loss": 0.0585, "step": 409 }, { "epoch": 0.2774018944519621, "grad_norm": 0.6491461741649488, "learning_rate": 9.234234234234236e-06, "loss": 0.0578, "step": 410 }, { "epoch": 0.2780784844384303, "grad_norm": 0.6490685083671708, "learning_rate": 9.256756756756757e-06, "loss": 0.0811, "step": 411 }, { "epoch": 0.2787550744248985, "grad_norm": 0.7765080161117859, "learning_rate": 9.27927927927928e-06, "loss": 0.0758, "step": 412 }, { "epoch": 0.2794316644113667, "grad_norm": 0.8952768170207692, "learning_rate": 9.301801801801804e-06, "loss": 0.1027, "step": 413 }, { "epoch": 0.28010825439783493, "grad_norm": 0.5544435611505211, "learning_rate": 9.324324324324325e-06, "loss": 0.0663, "step": 414 }, { "epoch": 0.2807848443843031, "grad_norm": 0.8014924169525837, "learning_rate": 9.346846846846848e-06, "loss": 0.0473, "step": 415 }, { "epoch": 0.2814614343707713, "grad_norm": 0.5692425054934582, "learning_rate": 9.36936936936937e-06, "loss": 0.0508, "step": 416 }, { "epoch": 0.2821380243572395, "grad_norm": 0.59770195978138, "learning_rate": 9.391891891891893e-06, "loss": 0.0472, "step": 417 }, { "epoch": 0.2828146143437077, "grad_norm": 0.5740539152201671, "learning_rate": 9.414414414414416e-06, "loss": 0.0592, "step": 418 }, { "epoch": 0.28349120433017594, "grad_norm": 0.9301350592935055, "learning_rate": 9.436936936936937e-06, "loss": 0.0547, "step": 419 }, { "epoch": 0.28416779431664413, "grad_norm": 0.6512576926687861, "learning_rate": 9.45945945945946e-06, "loss": 0.0595, "step": 420 }, { "epoch": 0.2848443843031123, "grad_norm": 0.739452317492861, "learning_rate": 9.481981981981982e-06, "loss": 0.0633, "step": 421 }, { "epoch": 0.2855209742895805, "grad_norm": 0.7547290147847666, "learning_rate": 9.504504504504505e-06, "loss": 0.0589, "step": 422 }, { "epoch": 0.2861975642760487, "grad_norm": 1.006531849063146, "learning_rate": 9.527027027027028e-06, "loss": 0.0705, "step": 423 }, { "epoch": 0.2868741542625169, "grad_norm": 1.2347826160840838, "learning_rate": 9.54954954954955e-06, "loss": 0.0714, "step": 424 }, { "epoch": 0.28755074424898514, "grad_norm": 0.5834190454325607, "learning_rate": 9.572072072072073e-06, "loss": 0.0741, "step": 425 }, { "epoch": 0.2882273342354533, "grad_norm": 1.0523461913659622, "learning_rate": 9.594594594594594e-06, "loss": 0.0713, "step": 426 }, { "epoch": 0.2889039242219215, "grad_norm": 0.4741085533713739, "learning_rate": 9.617117117117117e-06, "loss": 0.0448, "step": 427 }, { "epoch": 0.2895805142083897, "grad_norm": 0.5880007748012538, "learning_rate": 9.63963963963964e-06, "loss": 0.0708, "step": 428 }, { "epoch": 0.2902571041948579, "grad_norm": 0.6889454460244109, "learning_rate": 9.662162162162164e-06, "loss": 0.0518, "step": 429 }, { "epoch": 0.29093369418132614, "grad_norm": 0.6959729962192035, "learning_rate": 9.684684684684685e-06, "loss": 0.0574, "step": 430 }, { "epoch": 0.29161028416779433, "grad_norm": 0.827473475654896, "learning_rate": 9.707207207207208e-06, "loss": 0.0663, "step": 431 }, { "epoch": 0.2922868741542625, "grad_norm": 0.6763086308781805, "learning_rate": 9.729729729729732e-06, "loss": 0.0535, "step": 432 }, { "epoch": 0.2929634641407307, "grad_norm": 0.8721703546435019, "learning_rate": 9.752252252252253e-06, "loss": 0.0605, "step": 433 }, { "epoch": 0.2936400541271989, "grad_norm": 1.0347900345944163, "learning_rate": 9.774774774774776e-06, "loss": 0.0435, "step": 434 }, { "epoch": 0.2943166441136671, "grad_norm": 0.7533663188355433, "learning_rate": 9.797297297297298e-06, "loss": 0.0668, "step": 435 }, { "epoch": 0.29499323410013534, "grad_norm": 0.7903165280972534, "learning_rate": 9.81981981981982e-06, "loss": 0.0699, "step": 436 }, { "epoch": 0.2956698240866035, "grad_norm": 0.6550871582375518, "learning_rate": 9.842342342342344e-06, "loss": 0.0638, "step": 437 }, { "epoch": 0.2963464140730717, "grad_norm": 0.6040331258438423, "learning_rate": 9.864864864864865e-06, "loss": 0.0514, "step": 438 }, { "epoch": 0.2970230040595399, "grad_norm": 0.515235499537733, "learning_rate": 9.887387387387388e-06, "loss": 0.0517, "step": 439 }, { "epoch": 0.2976995940460081, "grad_norm": 0.5780813865787422, "learning_rate": 9.90990990990991e-06, "loss": 0.0491, "step": 440 }, { "epoch": 0.29837618403247634, "grad_norm": 0.611062653460649, "learning_rate": 9.932432432432433e-06, "loss": 0.0514, "step": 441 }, { "epoch": 0.29905277401894453, "grad_norm": 0.5573070083642604, "learning_rate": 9.954954954954956e-06, "loss": 0.05, "step": 442 }, { "epoch": 0.2997293640054127, "grad_norm": 0.7764390254784838, "learning_rate": 9.97747747747748e-06, "loss": 0.0573, "step": 443 }, { "epoch": 0.3004059539918809, "grad_norm": 0.63941645256914, "learning_rate": 1e-05, "loss": 0.0708, "step": 444 }, { "epoch": 0.3010825439783491, "grad_norm": 0.8471778599360368, "learning_rate": 9.999998450134754e-06, "loss": 0.0528, "step": 445 }, { "epoch": 0.3017591339648173, "grad_norm": 0.8450451817270667, "learning_rate": 9.999993800539971e-06, "loss": 0.0572, "step": 446 }, { "epoch": 0.30243572395128554, "grad_norm": 0.8844146226217167, "learning_rate": 9.999986051218538e-06, "loss": 0.0644, "step": 447 }, { "epoch": 0.30311231393775373, "grad_norm": 0.5754466363792542, "learning_rate": 9.999975202175256e-06, "loss": 0.0527, "step": 448 }, { "epoch": 0.3037889039242219, "grad_norm": 0.5959925989880287, "learning_rate": 9.999961253416853e-06, "loss": 0.0492, "step": 449 }, { "epoch": 0.3044654939106901, "grad_norm": 0.8591321055153385, "learning_rate": 9.999944204951974e-06, "loss": 0.075, "step": 450 }, { "epoch": 0.3051420838971583, "grad_norm": 0.7071183665853937, "learning_rate": 9.999924056791192e-06, "loss": 0.069, "step": 451 }, { "epoch": 0.30581867388362655, "grad_norm": 0.7677079769031657, "learning_rate": 9.999900808946996e-06, "loss": 0.0768, "step": 452 }, { "epoch": 0.30649526387009474, "grad_norm": 0.6931293022348536, "learning_rate": 9.999874461433796e-06, "loss": 0.0495, "step": 453 }, { "epoch": 0.3071718538565629, "grad_norm": 0.6525562001842464, "learning_rate": 9.999845014267928e-06, "loss": 0.0707, "step": 454 }, { "epoch": 0.3078484438430311, "grad_norm": 0.6032130470460988, "learning_rate": 9.99981246746765e-06, "loss": 0.0512, "step": 455 }, { "epoch": 0.3085250338294993, "grad_norm": 0.66181144485413, "learning_rate": 9.999776821053134e-06, "loss": 0.0545, "step": 456 }, { "epoch": 0.3092016238159675, "grad_norm": 0.6466649751991335, "learning_rate": 9.999738075046483e-06, "loss": 0.0628, "step": 457 }, { "epoch": 0.30987821380243574, "grad_norm": 0.7485951892575824, "learning_rate": 9.999696229471716e-06, "loss": 0.086, "step": 458 }, { "epoch": 0.31055480378890393, "grad_norm": 0.856119408797913, "learning_rate": 9.999651284354774e-06, "loss": 0.0697, "step": 459 }, { "epoch": 0.3112313937753721, "grad_norm": 0.6434548587724919, "learning_rate": 9.999603239723524e-06, "loss": 0.0638, "step": 460 }, { "epoch": 0.3119079837618403, "grad_norm": 0.5999827338946243, "learning_rate": 9.999552095607748e-06, "loss": 0.0447, "step": 461 }, { "epoch": 0.3125845737483085, "grad_norm": 1.3006434285410413, "learning_rate": 9.999497852039152e-06, "loss": 0.0622, "step": 462 }, { "epoch": 0.31326116373477675, "grad_norm": 1.059372351387485, "learning_rate": 9.999440509051367e-06, "loss": 0.0779, "step": 463 }, { "epoch": 0.31393775372124494, "grad_norm": 0.8847374912809629, "learning_rate": 9.999380066679943e-06, "loss": 0.0622, "step": 464 }, { "epoch": 0.31461434370771313, "grad_norm": 0.4015954011331638, "learning_rate": 9.999316524962347e-06, "loss": 0.0451, "step": 465 }, { "epoch": 0.3152909336941813, "grad_norm": 0.5801258642444339, "learning_rate": 9.999249883937971e-06, "loss": 0.0407, "step": 466 }, { "epoch": 0.3159675236806495, "grad_norm": 0.6643566861194543, "learning_rate": 9.999180143648136e-06, "loss": 0.0476, "step": 467 }, { "epoch": 0.3166441136671177, "grad_norm": 1.519649764447325, "learning_rate": 9.999107304136068e-06, "loss": 0.0967, "step": 468 }, { "epoch": 0.31732070365358594, "grad_norm": 0.7379888857729946, "learning_rate": 9.999031365446932e-06, "loss": 0.0675, "step": 469 }, { "epoch": 0.31799729364005414, "grad_norm": 0.6511620429858112, "learning_rate": 9.9989523276278e-06, "loss": 0.0511, "step": 470 }, { "epoch": 0.3186738836265223, "grad_norm": 0.7407486316989249, "learning_rate": 9.998870190727674e-06, "loss": 0.0468, "step": 471 }, { "epoch": 0.3193504736129905, "grad_norm": 0.9120240661496066, "learning_rate": 9.998784954797474e-06, "loss": 0.0605, "step": 472 }, { "epoch": 0.3200270635994587, "grad_norm": 0.6768945458494859, "learning_rate": 9.99869661989004e-06, "loss": 0.0495, "step": 473 }, { "epoch": 0.32070365358592695, "grad_norm": 0.7273592444942851, "learning_rate": 9.998605186060138e-06, "loss": 0.0523, "step": 474 }, { "epoch": 0.32138024357239514, "grad_norm": 0.9893725326416111, "learning_rate": 9.998510653364449e-06, "loss": 0.0595, "step": 475 }, { "epoch": 0.32205683355886333, "grad_norm": 0.5372243596154718, "learning_rate": 9.998413021861581e-06, "loss": 0.0534, "step": 476 }, { "epoch": 0.3227334235453315, "grad_norm": 0.6620807382992, "learning_rate": 9.998312291612056e-06, "loss": 0.0581, "step": 477 }, { "epoch": 0.3234100135317997, "grad_norm": 0.5581779223413607, "learning_rate": 9.998208462678328e-06, "loss": 0.0521, "step": 478 }, { "epoch": 0.32408660351826796, "grad_norm": 0.6358486018642957, "learning_rate": 9.998101535124758e-06, "loss": 0.0568, "step": 479 }, { "epoch": 0.32476319350473615, "grad_norm": 0.6098598965509763, "learning_rate": 9.99799150901764e-06, "loss": 0.0573, "step": 480 }, { "epoch": 0.32543978349120434, "grad_norm": 0.6430211419986778, "learning_rate": 9.997878384425183e-06, "loss": 0.0483, "step": 481 }, { "epoch": 0.3261163734776725, "grad_norm": 0.9958565461685056, "learning_rate": 9.997762161417517e-06, "loss": 0.0561, "step": 482 }, { "epoch": 0.3267929634641407, "grad_norm": 0.5285385755194703, "learning_rate": 9.997642840066696e-06, "loss": 0.0443, "step": 483 }, { "epoch": 0.3274695534506089, "grad_norm": 0.7769394701457077, "learning_rate": 9.997520420446694e-06, "loss": 0.0939, "step": 484 }, { "epoch": 0.32814614343707715, "grad_norm": 0.6003044728349175, "learning_rate": 9.9973949026334e-06, "loss": 0.0491, "step": 485 }, { "epoch": 0.32882273342354534, "grad_norm": 0.9504360667818255, "learning_rate": 9.99726628670463e-06, "loss": 0.0696, "step": 486 }, { "epoch": 0.32949932341001353, "grad_norm": 0.5647015380923237, "learning_rate": 9.997134572740122e-06, "loss": 0.0552, "step": 487 }, { "epoch": 0.3301759133964817, "grad_norm": 0.6115624905232845, "learning_rate": 9.996999760821529e-06, "loss": 0.0472, "step": 488 }, { "epoch": 0.3308525033829499, "grad_norm": 0.9453936271559629, "learning_rate": 9.996861851032426e-06, "loss": 0.0621, "step": 489 }, { "epoch": 0.33152909336941816, "grad_norm": 0.496098418094311, "learning_rate": 9.996720843458312e-06, "loss": 0.055, "step": 490 }, { "epoch": 0.33220568335588635, "grad_norm": 0.5709611582565106, "learning_rate": 9.996576738186602e-06, "loss": 0.0589, "step": 491 }, { "epoch": 0.33288227334235454, "grad_norm": 0.6279453336841245, "learning_rate": 9.996429535306638e-06, "loss": 0.0506, "step": 492 }, { "epoch": 0.33355886332882273, "grad_norm": 0.9607027447963572, "learning_rate": 9.996279234909672e-06, "loss": 0.076, "step": 493 }, { "epoch": 0.3342354533152909, "grad_norm": 0.7312108895179549, "learning_rate": 9.996125837088883e-06, "loss": 0.0565, "step": 494 }, { "epoch": 0.3349120433017591, "grad_norm": 0.6090789167815116, "learning_rate": 9.995969341939373e-06, "loss": 0.0627, "step": 495 }, { "epoch": 0.33558863328822736, "grad_norm": 0.6909338373764786, "learning_rate": 9.995809749558159e-06, "loss": 0.0462, "step": 496 }, { "epoch": 0.33626522327469555, "grad_norm": 0.7476438356921422, "learning_rate": 9.995647060044178e-06, "loss": 0.0566, "step": 497 }, { "epoch": 0.33694181326116374, "grad_norm": 0.5753897410171199, "learning_rate": 9.995481273498291e-06, "loss": 0.052, "step": 498 }, { "epoch": 0.3376184032476319, "grad_norm": 0.9847889262945785, "learning_rate": 9.995312390023275e-06, "loss": 0.0742, "step": 499 }, { "epoch": 0.3382949932341001, "grad_norm": 0.8251720827079871, "learning_rate": 9.995140409723831e-06, "loss": 0.0558, "step": 500 }, { "epoch": 0.33897158322056836, "grad_norm": 0.6173311918543511, "learning_rate": 9.994965332706574e-06, "loss": 0.0638, "step": 501 }, { "epoch": 0.33964817320703655, "grad_norm": 0.5220884176150119, "learning_rate": 9.994787159080046e-06, "loss": 0.0815, "step": 502 }, { "epoch": 0.34032476319350474, "grad_norm": 0.867115659658843, "learning_rate": 9.994605888954701e-06, "loss": 0.0758, "step": 503 }, { "epoch": 0.34100135317997293, "grad_norm": 0.6005910959010601, "learning_rate": 9.99442152244292e-06, "loss": 0.0688, "step": 504 }, { "epoch": 0.3416779431664411, "grad_norm": 0.7257293085903247, "learning_rate": 9.994234059658998e-06, "loss": 0.062, "step": 505 }, { "epoch": 0.3423545331529093, "grad_norm": 0.42476969791052893, "learning_rate": 9.994043500719155e-06, "loss": 0.0559, "step": 506 }, { "epoch": 0.34303112313937756, "grad_norm": 0.7131862947849292, "learning_rate": 9.993849845741525e-06, "loss": 0.0593, "step": 507 }, { "epoch": 0.34370771312584575, "grad_norm": 0.5472938298249831, "learning_rate": 9.993653094846162e-06, "loss": 0.0423, "step": 508 }, { "epoch": 0.34438430311231394, "grad_norm": 0.5948620000109195, "learning_rate": 9.993453248155044e-06, "loss": 0.0497, "step": 509 }, { "epoch": 0.34506089309878213, "grad_norm": 1.093733533838249, "learning_rate": 9.993250305792067e-06, "loss": 0.0746, "step": 510 }, { "epoch": 0.3457374830852503, "grad_norm": 0.5333163745309386, "learning_rate": 9.993044267883039e-06, "loss": 0.0423, "step": 511 }, { "epoch": 0.34641407307171856, "grad_norm": 0.7020673460133552, "learning_rate": 9.992835134555694e-06, "loss": 0.0535, "step": 512 }, { "epoch": 0.34709066305818675, "grad_norm": 0.6343550734219309, "learning_rate": 9.992622905939686e-06, "loss": 0.0696, "step": 513 }, { "epoch": 0.34776725304465494, "grad_norm": 0.6693806970304675, "learning_rate": 9.992407582166582e-06, "loss": 0.0642, "step": 514 }, { "epoch": 0.34844384303112313, "grad_norm": 0.6158128059524903, "learning_rate": 9.992189163369873e-06, "loss": 0.0482, "step": 515 }, { "epoch": 0.3491204330175913, "grad_norm": 0.7487588584286908, "learning_rate": 9.991967649684967e-06, "loss": 0.0673, "step": 516 }, { "epoch": 0.3497970230040595, "grad_norm": 0.6862433264965782, "learning_rate": 9.99174304124919e-06, "loss": 0.0567, "step": 517 }, { "epoch": 0.35047361299052776, "grad_norm": 0.6580399913848498, "learning_rate": 9.991515338201787e-06, "loss": 0.0609, "step": 518 }, { "epoch": 0.35115020297699595, "grad_norm": 0.5920159943928762, "learning_rate": 9.991284540683922e-06, "loss": 0.0552, "step": 519 }, { "epoch": 0.35182679296346414, "grad_norm": 0.7122440429081579, "learning_rate": 9.991050648838676e-06, "loss": 0.0858, "step": 520 }, { "epoch": 0.35250338294993233, "grad_norm": 0.5295783013499992, "learning_rate": 9.990813662811052e-06, "loss": 0.0684, "step": 521 }, { "epoch": 0.3531799729364005, "grad_norm": 0.6434007100386325, "learning_rate": 9.990573582747965e-06, "loss": 0.0694, "step": 522 }, { "epoch": 0.35385656292286877, "grad_norm": 0.5522910666589854, "learning_rate": 9.990330408798255e-06, "loss": 0.0563, "step": 523 }, { "epoch": 0.35453315290933696, "grad_norm": 0.5355815417338708, "learning_rate": 9.990084141112674e-06, "loss": 0.0474, "step": 524 }, { "epoch": 0.35520974289580515, "grad_norm": 0.5872240034067444, "learning_rate": 9.989834779843895e-06, "loss": 0.067, "step": 525 }, { "epoch": 0.35588633288227334, "grad_norm": 0.5044518811534828, "learning_rate": 9.989582325146511e-06, "loss": 0.0507, "step": 526 }, { "epoch": 0.3565629228687415, "grad_norm": 0.6290030734871784, "learning_rate": 9.98932677717703e-06, "loss": 0.0594, "step": 527 }, { "epoch": 0.3572395128552097, "grad_norm": 0.6863719680908231, "learning_rate": 9.989068136093873e-06, "loss": 0.0493, "step": 528 }, { "epoch": 0.35791610284167796, "grad_norm": 0.6009540790788344, "learning_rate": 9.98880640205739e-06, "loss": 0.0589, "step": 529 }, { "epoch": 0.35859269282814615, "grad_norm": 0.6084327497198343, "learning_rate": 9.988541575229837e-06, "loss": 0.0518, "step": 530 }, { "epoch": 0.35926928281461434, "grad_norm": 0.5273232471322782, "learning_rate": 9.988273655775398e-06, "loss": 0.0462, "step": 531 }, { "epoch": 0.35994587280108253, "grad_norm": 0.5680931468656651, "learning_rate": 9.988002643860162e-06, "loss": 0.0446, "step": 532 }, { "epoch": 0.3606224627875507, "grad_norm": 0.504296532636251, "learning_rate": 9.987728539652145e-06, "loss": 0.0492, "step": 533 }, { "epoch": 0.36129905277401897, "grad_norm": 0.7850147931450985, "learning_rate": 9.98745134332128e-06, "loss": 0.0595, "step": 534 }, { "epoch": 0.36197564276048716, "grad_norm": 0.6883218200291387, "learning_rate": 9.987171055039409e-06, "loss": 0.0679, "step": 535 }, { "epoch": 0.36265223274695535, "grad_norm": 0.6217248635294631, "learning_rate": 9.986887674980297e-06, "loss": 0.0744, "step": 536 }, { "epoch": 0.36332882273342354, "grad_norm": 0.8984877191976386, "learning_rate": 9.986601203319623e-06, "loss": 0.0611, "step": 537 }, { "epoch": 0.36400541271989173, "grad_norm": 0.5200286611594177, "learning_rate": 9.986311640234988e-06, "loss": 0.047, "step": 538 }, { "epoch": 0.3646820027063599, "grad_norm": 0.5393876296161556, "learning_rate": 9.986018985905901e-06, "loss": 0.0573, "step": 539 }, { "epoch": 0.36535859269282817, "grad_norm": 0.9610999640309457, "learning_rate": 9.985723240513795e-06, "loss": 0.0688, "step": 540 }, { "epoch": 0.36603518267929636, "grad_norm": 0.6022215074653418, "learning_rate": 9.985424404242015e-06, "loss": 0.0706, "step": 541 }, { "epoch": 0.36671177266576455, "grad_norm": 0.7613605462105187, "learning_rate": 9.985122477275824e-06, "loss": 0.0567, "step": 542 }, { "epoch": 0.36738836265223274, "grad_norm": 0.4665944640080223, "learning_rate": 9.9848174598024e-06, "loss": 0.0426, "step": 543 }, { "epoch": 0.3680649526387009, "grad_norm": 0.43200689111944535, "learning_rate": 9.984509352010839e-06, "loss": 0.0427, "step": 544 }, { "epoch": 0.36874154262516917, "grad_norm": 0.546995121085685, "learning_rate": 9.984198154092147e-06, "loss": 0.0417, "step": 545 }, { "epoch": 0.36941813261163736, "grad_norm": 0.5950822901216501, "learning_rate": 9.983883866239253e-06, "loss": 0.0454, "step": 546 }, { "epoch": 0.37009472259810555, "grad_norm": 0.6285822947918005, "learning_rate": 9.983566488647e-06, "loss": 0.0489, "step": 547 }, { "epoch": 0.37077131258457374, "grad_norm": 0.7708988381155685, "learning_rate": 9.98324602151214e-06, "loss": 0.0546, "step": 548 }, { "epoch": 0.37144790257104193, "grad_norm": 0.6385878593932224, "learning_rate": 9.98292246503335e-06, "loss": 0.0787, "step": 549 }, { "epoch": 0.3721244925575101, "grad_norm": 0.4767906982846207, "learning_rate": 9.982595819411216e-06, "loss": 0.0499, "step": 550 }, { "epoch": 0.37280108254397837, "grad_norm": 0.5981540167028362, "learning_rate": 9.98226608484824e-06, "loss": 0.042, "step": 551 }, { "epoch": 0.37347767253044656, "grad_norm": 1.0562171077633178, "learning_rate": 9.981933261548841e-06, "loss": 0.0549, "step": 552 }, { "epoch": 0.37415426251691475, "grad_norm": 0.5806835877422644, "learning_rate": 9.981597349719351e-06, "loss": 0.0574, "step": 553 }, { "epoch": 0.37483085250338294, "grad_norm": 0.4004814302679841, "learning_rate": 9.981258349568018e-06, "loss": 0.0396, "step": 554 }, { "epoch": 0.37550744248985113, "grad_norm": 0.6074327356572229, "learning_rate": 9.980916261305002e-06, "loss": 0.0585, "step": 555 }, { "epoch": 0.3761840324763194, "grad_norm": 0.6224260203576697, "learning_rate": 9.980571085142381e-06, "loss": 0.0612, "step": 556 }, { "epoch": 0.37686062246278756, "grad_norm": 0.7966370089812861, "learning_rate": 9.980222821294143e-06, "loss": 0.046, "step": 557 }, { "epoch": 0.37753721244925575, "grad_norm": 0.4410488171312846, "learning_rate": 9.979871469976197e-06, "loss": 0.0481, "step": 558 }, { "epoch": 0.37821380243572394, "grad_norm": 0.3848865222745807, "learning_rate": 9.979517031406357e-06, "loss": 0.0333, "step": 559 }, { "epoch": 0.37889039242219213, "grad_norm": 0.6351468438149818, "learning_rate": 9.97915950580436e-06, "loss": 0.0692, "step": 560 }, { "epoch": 0.3795669824086603, "grad_norm": 0.4988612014196045, "learning_rate": 9.97879889339185e-06, "loss": 0.0564, "step": 561 }, { "epoch": 0.38024357239512857, "grad_norm": 0.5769445700376538, "learning_rate": 9.97843519439239e-06, "loss": 0.0635, "step": 562 }, { "epoch": 0.38092016238159676, "grad_norm": 0.480116832601754, "learning_rate": 9.978068409031449e-06, "loss": 0.0478, "step": 563 }, { "epoch": 0.38159675236806495, "grad_norm": 0.4701133209134182, "learning_rate": 9.97769853753642e-06, "loss": 0.055, "step": 564 }, { "epoch": 0.38227334235453314, "grad_norm": 0.5503363763619515, "learning_rate": 9.977325580136598e-06, "loss": 0.0467, "step": 565 }, { "epoch": 0.38294993234100133, "grad_norm": 0.5989246732859047, "learning_rate": 9.9769495370632e-06, "loss": 0.0451, "step": 566 }, { "epoch": 0.3836265223274696, "grad_norm": 0.47256811628541845, "learning_rate": 9.97657040854935e-06, "loss": 0.0443, "step": 567 }, { "epoch": 0.38430311231393777, "grad_norm": 0.8915362833553975, "learning_rate": 9.976188194830092e-06, "loss": 0.0586, "step": 568 }, { "epoch": 0.38497970230040596, "grad_norm": 0.553155207515097, "learning_rate": 9.975802896142373e-06, "loss": 0.0625, "step": 569 }, { "epoch": 0.38565629228687415, "grad_norm": 0.5458082769709284, "learning_rate": 9.975414512725058e-06, "loss": 0.0499, "step": 570 }, { "epoch": 0.38633288227334234, "grad_norm": 0.4556060115243897, "learning_rate": 9.975023044818925e-06, "loss": 0.0392, "step": 571 }, { "epoch": 0.3870094722598105, "grad_norm": 1.0506396789722947, "learning_rate": 9.974628492666664e-06, "loss": 0.0584, "step": 572 }, { "epoch": 0.3876860622462788, "grad_norm": 0.5710503312265792, "learning_rate": 9.974230856512874e-06, "loss": 0.0527, "step": 573 }, { "epoch": 0.38836265223274696, "grad_norm": 0.5477787984347363, "learning_rate": 9.973830136604068e-06, "loss": 0.0544, "step": 574 }, { "epoch": 0.38903924221921515, "grad_norm": 0.5591499111479803, "learning_rate": 9.973426333188673e-06, "loss": 0.0544, "step": 575 }, { "epoch": 0.38971583220568334, "grad_norm": 0.6805144445945421, "learning_rate": 9.973019446517023e-06, "loss": 0.0743, "step": 576 }, { "epoch": 0.39039242219215153, "grad_norm": 0.5357256318419202, "learning_rate": 9.972609476841368e-06, "loss": 0.0492, "step": 577 }, { "epoch": 0.3910690121786198, "grad_norm": 0.6563328943248335, "learning_rate": 9.972196424415865e-06, "loss": 0.0515, "step": 578 }, { "epoch": 0.39174560216508797, "grad_norm": 0.6683875299057613, "learning_rate": 9.971780289496585e-06, "loss": 0.0596, "step": 579 }, { "epoch": 0.39242219215155616, "grad_norm": 0.6203356217668405, "learning_rate": 9.971361072341509e-06, "loss": 0.048, "step": 580 }, { "epoch": 0.39309878213802435, "grad_norm": 0.7130829657996903, "learning_rate": 9.97093877321053e-06, "loss": 0.0625, "step": 581 }, { "epoch": 0.39377537212449254, "grad_norm": 0.7118642885536309, "learning_rate": 9.970513392365449e-06, "loss": 0.0554, "step": 582 }, { "epoch": 0.3944519621109608, "grad_norm": 0.6293851578067391, "learning_rate": 9.970084930069982e-06, "loss": 0.0623, "step": 583 }, { "epoch": 0.395128552097429, "grad_norm": 0.4917487400725403, "learning_rate": 9.969653386589749e-06, "loss": 0.0343, "step": 584 }, { "epoch": 0.39580514208389717, "grad_norm": 1.223119525422176, "learning_rate": 9.969218762192286e-06, "loss": 0.0651, "step": 585 }, { "epoch": 0.39648173207036536, "grad_norm": 0.4287599437591492, "learning_rate": 9.968781057147036e-06, "loss": 0.0451, "step": 586 }, { "epoch": 0.39715832205683355, "grad_norm": 0.9774368132953997, "learning_rate": 9.968340271725352e-06, "loss": 0.0545, "step": 587 }, { "epoch": 0.39783491204330174, "grad_norm": 0.8968389920777119, "learning_rate": 9.967896406200498e-06, "loss": 0.0545, "step": 588 }, { "epoch": 0.39851150202977, "grad_norm": 0.3490632538825462, "learning_rate": 9.967449460847648e-06, "loss": 0.0406, "step": 589 }, { "epoch": 0.39918809201623817, "grad_norm": 0.7494514761875918, "learning_rate": 9.966999435943882e-06, "loss": 0.0462, "step": 590 }, { "epoch": 0.39986468200270636, "grad_norm": 1.1060014645599572, "learning_rate": 9.966546331768192e-06, "loss": 0.0643, "step": 591 }, { "epoch": 0.40054127198917455, "grad_norm": 0.5514912544682866, "learning_rate": 9.966090148601477e-06, "loss": 0.0539, "step": 592 }, { "epoch": 0.40121786197564274, "grad_norm": 0.7534435027607882, "learning_rate": 9.965630886726548e-06, "loss": 0.0511, "step": 593 }, { "epoch": 0.401894451962111, "grad_norm": 0.48360363761511427, "learning_rate": 9.965168546428122e-06, "loss": 0.047, "step": 594 }, { "epoch": 0.4025710419485792, "grad_norm": 0.5494374284358066, "learning_rate": 9.964703127992822e-06, "loss": 0.0564, "step": 595 }, { "epoch": 0.40324763193504737, "grad_norm": 0.47579309611111265, "learning_rate": 9.964234631709188e-06, "loss": 0.0546, "step": 596 }, { "epoch": 0.40392422192151556, "grad_norm": 0.4530189543942543, "learning_rate": 9.963763057867658e-06, "loss": 0.0415, "step": 597 }, { "epoch": 0.40460081190798375, "grad_norm": 0.5883162500958534, "learning_rate": 9.963288406760584e-06, "loss": 0.0542, "step": 598 }, { "epoch": 0.40527740189445194, "grad_norm": 0.8472842059393131, "learning_rate": 9.962810678682223e-06, "loss": 0.0648, "step": 599 }, { "epoch": 0.4059539918809202, "grad_norm": 0.476752183272619, "learning_rate": 9.962329873928743e-06, "loss": 0.0476, "step": 600 }, { "epoch": 0.4066305818673884, "grad_norm": 0.4620893294233114, "learning_rate": 9.961845992798213e-06, "loss": 0.043, "step": 601 }, { "epoch": 0.40730717185385656, "grad_norm": 0.5102829518810936, "learning_rate": 9.961359035590619e-06, "loss": 0.0342, "step": 602 }, { "epoch": 0.40798376184032475, "grad_norm": 0.6386683967685269, "learning_rate": 9.960869002607843e-06, "loss": 0.0591, "step": 603 }, { "epoch": 0.40866035182679294, "grad_norm": 0.47902977709344424, "learning_rate": 9.960375894153682e-06, "loss": 0.0616, "step": 604 }, { "epoch": 0.4093369418132612, "grad_norm": 0.6046102465325948, "learning_rate": 9.959879710533835e-06, "loss": 0.0729, "step": 605 }, { "epoch": 0.4100135317997294, "grad_norm": 0.5270699471361276, "learning_rate": 9.959380452055909e-06, "loss": 0.0623, "step": 606 }, { "epoch": 0.41069012178619757, "grad_norm": 0.621065800085645, "learning_rate": 9.958878119029419e-06, "loss": 0.0395, "step": 607 }, { "epoch": 0.41136671177266576, "grad_norm": 0.36218114056526846, "learning_rate": 9.958372711765785e-06, "loss": 0.0433, "step": 608 }, { "epoch": 0.41204330175913395, "grad_norm": 0.5685957527884379, "learning_rate": 9.95786423057833e-06, "loss": 0.0625, "step": 609 }, { "epoch": 0.41271989174560214, "grad_norm": 0.8964199923627175, "learning_rate": 9.957352675782283e-06, "loss": 0.0603, "step": 610 }, { "epoch": 0.4133964817320704, "grad_norm": 0.731964784299447, "learning_rate": 9.956838047694785e-06, "loss": 0.0715, "step": 611 }, { "epoch": 0.4140730717185386, "grad_norm": 0.6211147042778331, "learning_rate": 9.956320346634877e-06, "loss": 0.0428, "step": 612 }, { "epoch": 0.41474966170500677, "grad_norm": 0.6737382967821821, "learning_rate": 9.955799572923503e-06, "loss": 0.0596, "step": 613 }, { "epoch": 0.41542625169147496, "grad_norm": 0.5050139257203307, "learning_rate": 9.955275726883517e-06, "loss": 0.0416, "step": 614 }, { "epoch": 0.41610284167794315, "grad_norm": 0.5149927453539003, "learning_rate": 9.954748808839675e-06, "loss": 0.0404, "step": 615 }, { "epoch": 0.4167794316644114, "grad_norm": 1.1872072320176672, "learning_rate": 9.954218819118636e-06, "loss": 0.0715, "step": 616 }, { "epoch": 0.4174560216508796, "grad_norm": 0.6146032125426063, "learning_rate": 9.953685758048968e-06, "loss": 0.0516, "step": 617 }, { "epoch": 0.4181326116373478, "grad_norm": 0.533819097044187, "learning_rate": 9.953149625961136e-06, "loss": 0.0508, "step": 618 }, { "epoch": 0.41880920162381596, "grad_norm": 0.6712374248895266, "learning_rate": 9.952610423187516e-06, "loss": 0.0624, "step": 619 }, { "epoch": 0.41948579161028415, "grad_norm": 0.6291747872540906, "learning_rate": 9.952068150062386e-06, "loss": 0.0503, "step": 620 }, { "epoch": 0.42016238159675234, "grad_norm": 0.5191015273620253, "learning_rate": 9.951522806921922e-06, "loss": 0.0489, "step": 621 }, { "epoch": 0.4208389715832206, "grad_norm": 0.4112482372183207, "learning_rate": 9.95097439410421e-06, "loss": 0.0519, "step": 622 }, { "epoch": 0.4215155615696888, "grad_norm": 0.6218172837637318, "learning_rate": 9.950422911949238e-06, "loss": 0.0643, "step": 623 }, { "epoch": 0.42219215155615697, "grad_norm": 0.7148700144926305, "learning_rate": 9.949868360798893e-06, "loss": 0.063, "step": 624 }, { "epoch": 0.42286874154262516, "grad_norm": 0.5722947833285671, "learning_rate": 9.949310740996964e-06, "loss": 0.0632, "step": 625 }, { "epoch": 0.42354533152909335, "grad_norm": 0.729292484266292, "learning_rate": 9.94875005288915e-06, "loss": 0.0485, "step": 626 }, { "epoch": 0.4242219215155616, "grad_norm": 0.5468712068033238, "learning_rate": 9.948186296823048e-06, "loss": 0.0453, "step": 627 }, { "epoch": 0.4248985115020298, "grad_norm": 0.5379532134952075, "learning_rate": 9.947619473148152e-06, "loss": 0.0551, "step": 628 }, { "epoch": 0.425575101488498, "grad_norm": 0.6838294705325765, "learning_rate": 9.947049582215862e-06, "loss": 0.067, "step": 629 }, { "epoch": 0.42625169147496617, "grad_norm": 0.44702169819272947, "learning_rate": 9.946476624379485e-06, "loss": 0.0577, "step": 630 }, { "epoch": 0.42692828146143436, "grad_norm": 0.7107904020354667, "learning_rate": 9.945900599994219e-06, "loss": 0.0451, "step": 631 }, { "epoch": 0.42760487144790255, "grad_norm": 0.44741982038100586, "learning_rate": 9.94532150941717e-06, "loss": 0.036, "step": 632 }, { "epoch": 0.4282814614343708, "grad_norm": 0.620847654640831, "learning_rate": 9.944739353007344e-06, "loss": 0.0593, "step": 633 }, { "epoch": 0.428958051420839, "grad_norm": 0.6466303946835565, "learning_rate": 9.944154131125643e-06, "loss": 0.0537, "step": 634 }, { "epoch": 0.42963464140730717, "grad_norm": 0.6605384758118511, "learning_rate": 9.943565844134877e-06, "loss": 0.0563, "step": 635 }, { "epoch": 0.43031123139377536, "grad_norm": 0.6279944033337461, "learning_rate": 9.942974492399751e-06, "loss": 0.0517, "step": 636 }, { "epoch": 0.43098782138024355, "grad_norm": 0.799163664852329, "learning_rate": 9.94238007628687e-06, "loss": 0.065, "step": 637 }, { "epoch": 0.4316644113667118, "grad_norm": 0.6831934664432242, "learning_rate": 9.94178259616474e-06, "loss": 0.0603, "step": 638 }, { "epoch": 0.43234100135318, "grad_norm": 0.44395458651519953, "learning_rate": 9.941182052403768e-06, "loss": 0.0456, "step": 639 }, { "epoch": 0.4330175913396482, "grad_norm": 0.7882901427214065, "learning_rate": 9.940578445376259e-06, "loss": 0.0493, "step": 640 }, { "epoch": 0.43369418132611637, "grad_norm": 0.8695595713041687, "learning_rate": 9.939971775456416e-06, "loss": 0.0643, "step": 641 }, { "epoch": 0.43437077131258456, "grad_norm": 0.8732608626098697, "learning_rate": 9.93936204302034e-06, "loss": 0.0739, "step": 642 }, { "epoch": 0.43504736129905275, "grad_norm": 0.5239275121018045, "learning_rate": 9.938749248446033e-06, "loss": 0.0448, "step": 643 }, { "epoch": 0.435723951285521, "grad_norm": 0.7753422232990571, "learning_rate": 9.938133392113399e-06, "loss": 0.0635, "step": 644 }, { "epoch": 0.4364005412719892, "grad_norm": 0.6273548559703079, "learning_rate": 9.937514474404229e-06, "loss": 0.0535, "step": 645 }, { "epoch": 0.4370771312584574, "grad_norm": 0.7674442628416059, "learning_rate": 9.936892495702222e-06, "loss": 0.0578, "step": 646 }, { "epoch": 0.43775372124492556, "grad_norm": 0.4808136940151125, "learning_rate": 9.936267456392971e-06, "loss": 0.0523, "step": 647 }, { "epoch": 0.43843031123139375, "grad_norm": 0.6077370418485973, "learning_rate": 9.935639356863966e-06, "loss": 0.0644, "step": 648 }, { "epoch": 0.439106901217862, "grad_norm": 0.8564838789435368, "learning_rate": 9.935008197504596e-06, "loss": 0.0526, "step": 649 }, { "epoch": 0.4397834912043302, "grad_norm": 0.7480733069773909, "learning_rate": 9.934373978706147e-06, "loss": 0.0795, "step": 650 }, { "epoch": 0.4404600811907984, "grad_norm": 0.4289437402578333, "learning_rate": 9.933736700861798e-06, "loss": 0.041, "step": 651 }, { "epoch": 0.44113667117726657, "grad_norm": 0.520375773258376, "learning_rate": 9.933096364366625e-06, "loss": 0.0677, "step": 652 }, { "epoch": 0.44181326116373476, "grad_norm": 0.5815794825346166, "learning_rate": 9.932452969617607e-06, "loss": 0.0495, "step": 653 }, { "epoch": 0.44248985115020295, "grad_norm": 0.5821123973883511, "learning_rate": 9.931806517013612e-06, "loss": 0.0565, "step": 654 }, { "epoch": 0.4431664411366712, "grad_norm": 0.5056099241481415, "learning_rate": 9.931157006955406e-06, "loss": 0.0471, "step": 655 }, { "epoch": 0.4438430311231394, "grad_norm": 0.6414689039348711, "learning_rate": 9.93050443984565e-06, "loss": 0.0543, "step": 656 }, { "epoch": 0.4445196211096076, "grad_norm": 0.5480756679690673, "learning_rate": 9.929848816088898e-06, "loss": 0.0634, "step": 657 }, { "epoch": 0.44519621109607577, "grad_norm": 0.7174926108633242, "learning_rate": 9.929190136091604e-06, "loss": 0.0616, "step": 658 }, { "epoch": 0.44587280108254396, "grad_norm": 0.5686244495481496, "learning_rate": 9.928528400262116e-06, "loss": 0.0453, "step": 659 }, { "epoch": 0.4465493910690122, "grad_norm": 0.4914095498933844, "learning_rate": 9.92786360901067e-06, "loss": 0.0563, "step": 660 }, { "epoch": 0.4472259810554804, "grad_norm": 0.6090759602899615, "learning_rate": 9.927195762749405e-06, "loss": 0.0517, "step": 661 }, { "epoch": 0.4479025710419486, "grad_norm": 0.6161988737908256, "learning_rate": 9.926524861892346e-06, "loss": 0.0547, "step": 662 }, { "epoch": 0.4485791610284168, "grad_norm": 0.3292183584682544, "learning_rate": 9.925850906855419e-06, "loss": 0.0425, "step": 663 }, { "epoch": 0.44925575101488496, "grad_norm": 0.42276768491687367, "learning_rate": 9.925173898056436e-06, "loss": 0.0405, "step": 664 }, { "epoch": 0.44993234100135315, "grad_norm": 0.8380544193813583, "learning_rate": 9.924493835915108e-06, "loss": 0.0568, "step": 665 }, { "epoch": 0.4506089309878214, "grad_norm": 0.6999964194444989, "learning_rate": 9.923810720853038e-06, "loss": 0.0605, "step": 666 }, { "epoch": 0.4512855209742896, "grad_norm": 0.4837652325939674, "learning_rate": 9.923124553293718e-06, "loss": 0.0444, "step": 667 }, { "epoch": 0.4519621109607578, "grad_norm": 0.6216585201930606, "learning_rate": 9.922435333662537e-06, "loss": 0.0585, "step": 668 }, { "epoch": 0.45263870094722597, "grad_norm": 0.5996384717459816, "learning_rate": 9.921743062386773e-06, "loss": 0.0561, "step": 669 }, { "epoch": 0.45331529093369416, "grad_norm": 0.5326204102007049, "learning_rate": 9.921047739895596e-06, "loss": 0.042, "step": 670 }, { "epoch": 0.4539918809201624, "grad_norm": 0.49562577862563084, "learning_rate": 9.92034936662007e-06, "loss": 0.0507, "step": 671 }, { "epoch": 0.4546684709066306, "grad_norm": 0.730328018389376, "learning_rate": 9.91964794299315e-06, "loss": 0.0536, "step": 672 }, { "epoch": 0.4553450608930988, "grad_norm": 0.38455552302941354, "learning_rate": 9.918943469449676e-06, "loss": 0.049, "step": 673 }, { "epoch": 0.456021650879567, "grad_norm": 0.5401055015676005, "learning_rate": 9.918235946426389e-06, "loss": 0.0457, "step": 674 }, { "epoch": 0.45669824086603517, "grad_norm": 0.6353008301105469, "learning_rate": 9.917525374361913e-06, "loss": 0.0518, "step": 675 }, { "epoch": 0.45737483085250336, "grad_norm": 0.5267061996841854, "learning_rate": 9.916811753696764e-06, "loss": 0.0469, "step": 676 }, { "epoch": 0.4580514208389716, "grad_norm": 0.6084663569530226, "learning_rate": 9.916095084873348e-06, "loss": 0.0534, "step": 677 }, { "epoch": 0.4587280108254398, "grad_norm": 0.8048621834043865, "learning_rate": 9.915375368335962e-06, "loss": 0.0579, "step": 678 }, { "epoch": 0.459404600811908, "grad_norm": 0.5935252094474919, "learning_rate": 9.91465260453079e-06, "loss": 0.0497, "step": 679 }, { "epoch": 0.46008119079837617, "grad_norm": 0.6484070730343198, "learning_rate": 9.913926793905909e-06, "loss": 0.0501, "step": 680 }, { "epoch": 0.46075778078484436, "grad_norm": 0.7664190173161035, "learning_rate": 9.91319793691128e-06, "loss": 0.0518, "step": 681 }, { "epoch": 0.4614343707713126, "grad_norm": 0.5004208463200237, "learning_rate": 9.912466033998758e-06, "loss": 0.0527, "step": 682 }, { "epoch": 0.4621109607577808, "grad_norm": 0.746120463933263, "learning_rate": 9.91173108562208e-06, "loss": 0.0716, "step": 683 }, { "epoch": 0.462787550744249, "grad_norm": 0.7118564064495075, "learning_rate": 9.910993092236878e-06, "loss": 0.0683, "step": 684 }, { "epoch": 0.4634641407307172, "grad_norm": 0.3665810805358901, "learning_rate": 9.910252054300664e-06, "loss": 0.0379, "step": 685 }, { "epoch": 0.46414073071718537, "grad_norm": 0.6464876277176455, "learning_rate": 9.909507972272845e-06, "loss": 0.0708, "step": 686 }, { "epoch": 0.4648173207036536, "grad_norm": 0.9433879986761604, "learning_rate": 9.90876084661471e-06, "loss": 0.0714, "step": 687 }, { "epoch": 0.4654939106901218, "grad_norm": 0.5715423921258398, "learning_rate": 9.908010677789437e-06, "loss": 0.0439, "step": 688 }, { "epoch": 0.46617050067659, "grad_norm": 0.4660682565688531, "learning_rate": 9.90725746626209e-06, "loss": 0.0498, "step": 689 }, { "epoch": 0.4668470906630582, "grad_norm": 0.46524053689266187, "learning_rate": 9.90650121249962e-06, "loss": 0.0543, "step": 690 }, { "epoch": 0.4675236806495264, "grad_norm": 0.46365328983571014, "learning_rate": 9.905741916970863e-06, "loss": 0.0416, "step": 691 }, { "epoch": 0.46820027063599456, "grad_norm": 0.36787289617199853, "learning_rate": 9.904979580146544e-06, "loss": 0.0562, "step": 692 }, { "epoch": 0.4688768606224628, "grad_norm": 0.6294200557803371, "learning_rate": 9.904214202499266e-06, "loss": 0.0525, "step": 693 }, { "epoch": 0.469553450608931, "grad_norm": 0.5515365908364331, "learning_rate": 9.903445784503525e-06, "loss": 0.0448, "step": 694 }, { "epoch": 0.4702300405953992, "grad_norm": 0.4889466923384764, "learning_rate": 9.902674326635698e-06, "loss": 0.0516, "step": 695 }, { "epoch": 0.4709066305818674, "grad_norm": 0.4265080784712957, "learning_rate": 9.901899829374048e-06, "loss": 0.0542, "step": 696 }, { "epoch": 0.47158322056833557, "grad_norm": 0.7795121815798032, "learning_rate": 9.90112229319872e-06, "loss": 0.0524, "step": 697 }, { "epoch": 0.4722598105548038, "grad_norm": 0.6477501865570612, "learning_rate": 9.900341718591746e-06, "loss": 0.0668, "step": 698 }, { "epoch": 0.472936400541272, "grad_norm": 0.6093290599919889, "learning_rate": 9.899558106037039e-06, "loss": 0.0478, "step": 699 }, { "epoch": 0.4736129905277402, "grad_norm": 0.6437924066156706, "learning_rate": 9.898771456020397e-06, "loss": 0.0454, "step": 700 }, { "epoch": 0.4742895805142084, "grad_norm": 0.41372214037147886, "learning_rate": 9.897981769029504e-06, "loss": 0.0554, "step": 701 }, { "epoch": 0.4749661705006766, "grad_norm": 0.406804966119598, "learning_rate": 9.897189045553917e-06, "loss": 0.0381, "step": 702 }, { "epoch": 0.47564276048714477, "grad_norm": 0.785719969880346, "learning_rate": 9.896393286085085e-06, "loss": 0.0608, "step": 703 }, { "epoch": 0.476319350473613, "grad_norm": 0.6757989837633964, "learning_rate": 9.895594491116336e-06, "loss": 0.0637, "step": 704 }, { "epoch": 0.4769959404600812, "grad_norm": 0.54434553685572, "learning_rate": 9.89479266114288e-06, "loss": 0.0517, "step": 705 }, { "epoch": 0.4776725304465494, "grad_norm": 0.556258617178189, "learning_rate": 9.893987796661809e-06, "loss": 0.0464, "step": 706 }, { "epoch": 0.4783491204330176, "grad_norm": 0.5443569208148896, "learning_rate": 9.893179898172095e-06, "loss": 0.0478, "step": 707 }, { "epoch": 0.4790257104194858, "grad_norm": 0.3782871653886809, "learning_rate": 9.89236896617459e-06, "loss": 0.0344, "step": 708 }, { "epoch": 0.479702300405954, "grad_norm": 0.5601099762544572, "learning_rate": 9.891555001172032e-06, "loss": 0.0494, "step": 709 }, { "epoch": 0.4803788903924222, "grad_norm": 0.5597549361352787, "learning_rate": 9.890738003669029e-06, "loss": 0.0485, "step": 710 }, { "epoch": 0.4810554803788904, "grad_norm": 0.5294133072336844, "learning_rate": 9.88991797417208e-06, "loss": 0.0536, "step": 711 }, { "epoch": 0.4817320703653586, "grad_norm": 0.9265159910409836, "learning_rate": 9.889094913189561e-06, "loss": 0.0788, "step": 712 }, { "epoch": 0.4824086603518268, "grad_norm": 0.43730031477379844, "learning_rate": 9.888268821231721e-06, "loss": 0.0507, "step": 713 }, { "epoch": 0.48308525033829497, "grad_norm": 0.5567250065444755, "learning_rate": 9.887439698810694e-06, "loss": 0.0699, "step": 714 }, { "epoch": 0.4837618403247632, "grad_norm": 0.4096922412310759, "learning_rate": 9.886607546440492e-06, "loss": 0.0434, "step": 715 }, { "epoch": 0.4844384303112314, "grad_norm": 0.4636339848328735, "learning_rate": 9.885772364637002e-06, "loss": 0.0439, "step": 716 }, { "epoch": 0.4851150202976996, "grad_norm": 0.4081535798118912, "learning_rate": 9.884934153917998e-06, "loss": 0.0374, "step": 717 }, { "epoch": 0.4857916102841678, "grad_norm": 0.36006897850211084, "learning_rate": 9.884092914803119e-06, "loss": 0.0438, "step": 718 }, { "epoch": 0.486468200270636, "grad_norm": 0.6180958830932193, "learning_rate": 9.88324864781389e-06, "loss": 0.0463, "step": 719 }, { "epoch": 0.4871447902571042, "grad_norm": 0.5632508848694299, "learning_rate": 9.882401353473711e-06, "loss": 0.0483, "step": 720 }, { "epoch": 0.4878213802435724, "grad_norm": 0.5793828620228911, "learning_rate": 9.881551032307859e-06, "loss": 0.0652, "step": 721 }, { "epoch": 0.4884979702300406, "grad_norm": 0.448107907002612, "learning_rate": 9.880697684843487e-06, "loss": 0.0479, "step": 722 }, { "epoch": 0.4891745602165088, "grad_norm": 0.4391741741594667, "learning_rate": 9.879841311609625e-06, "loss": 0.0531, "step": 723 }, { "epoch": 0.489851150202977, "grad_norm": 0.5181556645111783, "learning_rate": 9.878981913137178e-06, "loss": 0.0602, "step": 724 }, { "epoch": 0.49052774018944517, "grad_norm": 0.5384630165805743, "learning_rate": 9.878119489958929e-06, "loss": 0.0688, "step": 725 }, { "epoch": 0.4912043301759134, "grad_norm": 0.5378313548014063, "learning_rate": 9.877254042609529e-06, "loss": 0.0439, "step": 726 }, { "epoch": 0.4918809201623816, "grad_norm": 0.3892701405664777, "learning_rate": 9.87638557162551e-06, "loss": 0.0398, "step": 727 }, { "epoch": 0.4925575101488498, "grad_norm": 0.4259128758039286, "learning_rate": 9.875514077545282e-06, "loss": 0.0417, "step": 728 }, { "epoch": 0.493234100135318, "grad_norm": 0.47040481180464283, "learning_rate": 9.874639560909118e-06, "loss": 0.0394, "step": 729 }, { "epoch": 0.4939106901217862, "grad_norm": 0.36087360236407345, "learning_rate": 9.873762022259177e-06, "loss": 0.0287, "step": 730 }, { "epoch": 0.4945872801082544, "grad_norm": 0.70693194587442, "learning_rate": 9.87288146213948e-06, "loss": 0.0597, "step": 731 }, { "epoch": 0.4952638700947226, "grad_norm": 0.39665457047176367, "learning_rate": 9.87199788109593e-06, "loss": 0.0387, "step": 732 }, { "epoch": 0.4959404600811908, "grad_norm": 0.48451934239788397, "learning_rate": 9.8711112796763e-06, "loss": 0.0505, "step": 733 }, { "epoch": 0.496617050067659, "grad_norm": 0.5368848334988352, "learning_rate": 9.870221658430233e-06, "loss": 0.0467, "step": 734 }, { "epoch": 0.4972936400541272, "grad_norm": 0.5674198214012114, "learning_rate": 9.869329017909248e-06, "loss": 0.042, "step": 735 }, { "epoch": 0.4979702300405954, "grad_norm": 0.5721578103687843, "learning_rate": 9.868433358666734e-06, "loss": 0.0636, "step": 736 }, { "epoch": 0.4986468200270636, "grad_norm": 0.9677416335813325, "learning_rate": 9.86753468125795e-06, "loss": 0.0587, "step": 737 }, { "epoch": 0.4993234100135318, "grad_norm": 0.40180822511425857, "learning_rate": 9.86663298624003e-06, "loss": 0.0481, "step": 738 }, { "epoch": 0.5, "grad_norm": 0.361135441012983, "learning_rate": 9.865728274171972e-06, "loss": 0.0406, "step": 739 }, { "epoch": 0.5006765899864682, "grad_norm": 0.8479454545639886, "learning_rate": 9.864820545614656e-06, "loss": 0.0598, "step": 740 }, { "epoch": 0.5013531799729364, "grad_norm": 0.5106368822232472, "learning_rate": 9.863909801130816e-06, "loss": 0.053, "step": 741 }, { "epoch": 0.5020297699594046, "grad_norm": 0.4756226065532691, "learning_rate": 9.862996041285071e-06, "loss": 0.0483, "step": 742 }, { "epoch": 0.5027063599458728, "grad_norm": 0.6219060655877254, "learning_rate": 9.862079266643899e-06, "loss": 0.0502, "step": 743 }, { "epoch": 0.503382949932341, "grad_norm": 0.4845490822586034, "learning_rate": 9.861159477775653e-06, "loss": 0.0488, "step": 744 }, { "epoch": 0.5040595399188093, "grad_norm": 0.6812021084222101, "learning_rate": 9.860236675250553e-06, "loss": 0.0554, "step": 745 }, { "epoch": 0.5047361299052774, "grad_norm": 0.5590569632794635, "learning_rate": 9.859310859640685e-06, "loss": 0.0491, "step": 746 }, { "epoch": 0.5054127198917456, "grad_norm": 0.46212555175343245, "learning_rate": 9.858382031520005e-06, "loss": 0.0539, "step": 747 }, { "epoch": 0.5060893098782138, "grad_norm": 0.40800380331762076, "learning_rate": 9.857450191464337e-06, "loss": 0.0466, "step": 748 }, { "epoch": 0.506765899864682, "grad_norm": 0.6091738784482054, "learning_rate": 9.856515340051374e-06, "loss": 0.05, "step": 749 }, { "epoch": 0.5074424898511503, "grad_norm": 0.7055256422888375, "learning_rate": 9.855577477860669e-06, "loss": 0.0434, "step": 750 }, { "epoch": 0.5081190798376184, "grad_norm": 0.9539674471910841, "learning_rate": 9.854636605473647e-06, "loss": 0.0528, "step": 751 }, { "epoch": 0.5087956698240866, "grad_norm": 0.47633172733382284, "learning_rate": 9.8536927234736e-06, "loss": 0.0497, "step": 752 }, { "epoch": 0.5094722598105548, "grad_norm": 0.7532584333033749, "learning_rate": 9.852745832445684e-06, "loss": 0.0501, "step": 753 }, { "epoch": 0.510148849797023, "grad_norm": 0.5304042345746277, "learning_rate": 9.851795932976919e-06, "loss": 0.046, "step": 754 }, { "epoch": 0.5108254397834912, "grad_norm": 0.44485606906848746, "learning_rate": 9.850843025656194e-06, "loss": 0.0549, "step": 755 }, { "epoch": 0.5115020297699594, "grad_norm": 0.5465848222940929, "learning_rate": 9.849887111074256e-06, "loss": 0.0547, "step": 756 }, { "epoch": 0.5121786197564276, "grad_norm": 0.5418310103747346, "learning_rate": 9.848928189823724e-06, "loss": 0.0441, "step": 757 }, { "epoch": 0.5128552097428958, "grad_norm": 0.7031623884866932, "learning_rate": 9.847966262499073e-06, "loss": 0.0692, "step": 758 }, { "epoch": 0.513531799729364, "grad_norm": 0.45568690818784974, "learning_rate": 9.847001329696653e-06, "loss": 0.0473, "step": 759 }, { "epoch": 0.5142083897158322, "grad_norm": 0.6068664057424306, "learning_rate": 9.846033392014665e-06, "loss": 0.0539, "step": 760 }, { "epoch": 0.5148849797023004, "grad_norm": 0.4017234045118912, "learning_rate": 9.84506245005318e-06, "loss": 0.04, "step": 761 }, { "epoch": 0.5155615696887687, "grad_norm": 0.4686458728542905, "learning_rate": 9.84408850441413e-06, "loss": 0.0478, "step": 762 }, { "epoch": 0.5162381596752368, "grad_norm": 0.45832931873341604, "learning_rate": 9.843111555701307e-06, "loss": 0.0367, "step": 763 }, { "epoch": 0.516914749661705, "grad_norm": 0.5584097887482565, "learning_rate": 9.84213160452037e-06, "loss": 0.0497, "step": 764 }, { "epoch": 0.5175913396481732, "grad_norm": 0.4431597984514597, "learning_rate": 9.841148651478833e-06, "loss": 0.0513, "step": 765 }, { "epoch": 0.5182679296346414, "grad_norm": 0.27012020329095626, "learning_rate": 9.840162697186075e-06, "loss": 0.0349, "step": 766 }, { "epoch": 0.5189445196211097, "grad_norm": 0.45363845667223135, "learning_rate": 9.839173742253334e-06, "loss": 0.041, "step": 767 }, { "epoch": 0.5196211096075778, "grad_norm": 0.50469702558176, "learning_rate": 9.838181787293707e-06, "loss": 0.0582, "step": 768 }, { "epoch": 0.520297699594046, "grad_norm": 0.4578431401614713, "learning_rate": 9.837186832922157e-06, "loss": 0.0476, "step": 769 }, { "epoch": 0.5209742895805142, "grad_norm": 0.6613480257508604, "learning_rate": 9.8361888797555e-06, "loss": 0.0527, "step": 770 }, { "epoch": 0.5216508795669824, "grad_norm": 0.7676793309014901, "learning_rate": 9.835187928412412e-06, "loss": 0.0514, "step": 771 }, { "epoch": 0.5223274695534507, "grad_norm": 0.512508020266984, "learning_rate": 9.834183979513427e-06, "loss": 0.0591, "step": 772 }, { "epoch": 0.5230040595399188, "grad_norm": 0.51084222634459, "learning_rate": 9.833177033680945e-06, "loss": 0.0423, "step": 773 }, { "epoch": 0.523680649526387, "grad_norm": 0.6413724330584915, "learning_rate": 9.832167091539215e-06, "loss": 0.0418, "step": 774 }, { "epoch": 0.5243572395128552, "grad_norm": 0.7460443994099318, "learning_rate": 9.831154153714344e-06, "loss": 0.0541, "step": 775 }, { "epoch": 0.5250338294993234, "grad_norm": 0.6734291747835667, "learning_rate": 9.830138220834305e-06, "loss": 0.0513, "step": 776 }, { "epoch": 0.5257104194857916, "grad_norm": 0.5738298507102398, "learning_rate": 9.829119293528916e-06, "loss": 0.0403, "step": 777 }, { "epoch": 0.5263870094722598, "grad_norm": 0.43068434135473127, "learning_rate": 9.82809737242986e-06, "loss": 0.0485, "step": 778 }, { "epoch": 0.527063599458728, "grad_norm": 0.8668327646717695, "learning_rate": 9.827072458170673e-06, "loss": 0.0727, "step": 779 }, { "epoch": 0.5277401894451962, "grad_norm": 0.42508437715024205, "learning_rate": 9.826044551386743e-06, "loss": 0.0352, "step": 780 }, { "epoch": 0.5284167794316644, "grad_norm": 0.5472949883047366, "learning_rate": 9.825013652715323e-06, "loss": 0.0395, "step": 781 }, { "epoch": 0.5290933694181326, "grad_norm": 0.8154528193183785, "learning_rate": 9.82397976279551e-06, "loss": 0.0446, "step": 782 }, { "epoch": 0.5297699594046008, "grad_norm": 0.5338044803942171, "learning_rate": 9.822942882268261e-06, "loss": 0.0408, "step": 783 }, { "epoch": 0.530446549391069, "grad_norm": 0.5827835289743865, "learning_rate": 9.821903011776385e-06, "loss": 0.0441, "step": 784 }, { "epoch": 0.5311231393775372, "grad_norm": 0.8365956151736382, "learning_rate": 9.820860151964548e-06, "loss": 0.0488, "step": 785 }, { "epoch": 0.5317997293640054, "grad_norm": 0.5749035683313815, "learning_rate": 9.819814303479268e-06, "loss": 0.0553, "step": 786 }, { "epoch": 0.5324763193504736, "grad_norm": 0.5212962411059993, "learning_rate": 9.818765466968909e-06, "loss": 0.0406, "step": 787 }, { "epoch": 0.5331529093369418, "grad_norm": 0.7427578140330747, "learning_rate": 9.8177136430837e-06, "loss": 0.0538, "step": 788 }, { "epoch": 0.5338294993234101, "grad_norm": 0.5977082799800159, "learning_rate": 9.816658832475709e-06, "loss": 0.0601, "step": 789 }, { "epoch": 0.5345060893098782, "grad_norm": 1.0394597388074733, "learning_rate": 9.815601035798866e-06, "loss": 0.0779, "step": 790 }, { "epoch": 0.5351826792963464, "grad_norm": 0.8065819456719235, "learning_rate": 9.814540253708945e-06, "loss": 0.0533, "step": 791 }, { "epoch": 0.5358592692828146, "grad_norm": 0.6025659459066489, "learning_rate": 9.813476486863575e-06, "loss": 0.0441, "step": 792 }, { "epoch": 0.5365358592692828, "grad_norm": 1.017224752493771, "learning_rate": 9.812409735922236e-06, "loss": 0.0668, "step": 793 }, { "epoch": 0.5372124492557511, "grad_norm": 0.4502355249080863, "learning_rate": 9.811340001546252e-06, "loss": 0.0407, "step": 794 }, { "epoch": 0.5378890392422192, "grad_norm": 0.5251804211501145, "learning_rate": 9.810267284398805e-06, "loss": 0.0654, "step": 795 }, { "epoch": 0.5385656292286874, "grad_norm": 1.433502261944195, "learning_rate": 9.80919158514492e-06, "loss": 0.0832, "step": 796 }, { "epoch": 0.5392422192151556, "grad_norm": 0.48935290039798657, "learning_rate": 9.80811290445147e-06, "loss": 0.0549, "step": 797 }, { "epoch": 0.5399188092016238, "grad_norm": 0.9396984260311254, "learning_rate": 9.807031242987182e-06, "loss": 0.0498, "step": 798 }, { "epoch": 0.540595399188092, "grad_norm": 0.6802177887060812, "learning_rate": 9.805946601422628e-06, "loss": 0.0795, "step": 799 }, { "epoch": 0.5412719891745602, "grad_norm": 0.5269895591335515, "learning_rate": 9.804858980430225e-06, "loss": 0.0784, "step": 800 }, { "epoch": 0.5419485791610285, "grad_norm": 0.5091073912024127, "learning_rate": 9.803768380684242e-06, "loss": 0.0743, "step": 801 }, { "epoch": 0.5426251691474966, "grad_norm": 0.44128852573354854, "learning_rate": 9.80267480286079e-06, "loss": 0.0514, "step": 802 }, { "epoch": 0.5433017591339648, "grad_norm": 0.7140554103607668, "learning_rate": 9.801578247637828e-06, "loss": 0.053, "step": 803 }, { "epoch": 0.543978349120433, "grad_norm": 0.5944336586055404, "learning_rate": 9.800478715695165e-06, "loss": 0.0443, "step": 804 }, { "epoch": 0.5446549391069012, "grad_norm": 0.4363773630726969, "learning_rate": 9.799376207714446e-06, "loss": 0.0438, "step": 805 }, { "epoch": 0.5453315290933695, "grad_norm": 0.4663745215765508, "learning_rate": 9.79827072437917e-06, "loss": 0.0373, "step": 806 }, { "epoch": 0.5460081190798376, "grad_norm": 0.872105611260014, "learning_rate": 9.797162266374677e-06, "loss": 0.0502, "step": 807 }, { "epoch": 0.5466847090663058, "grad_norm": 0.3824510105734749, "learning_rate": 9.79605083438815e-06, "loss": 0.0381, "step": 808 }, { "epoch": 0.547361299052774, "grad_norm": 0.5380356410057527, "learning_rate": 9.794936429108617e-06, "loss": 0.0465, "step": 809 }, { "epoch": 0.5480378890392422, "grad_norm": 0.8585674677229587, "learning_rate": 9.79381905122695e-06, "loss": 0.0632, "step": 810 }, { "epoch": 0.5487144790257105, "grad_norm": 0.4813256618801878, "learning_rate": 9.792698701435863e-06, "loss": 0.0392, "step": 811 }, { "epoch": 0.5493910690121786, "grad_norm": 0.5944200029999508, "learning_rate": 9.791575380429911e-06, "loss": 0.0556, "step": 812 }, { "epoch": 0.5500676589986468, "grad_norm": 0.5427250539898076, "learning_rate": 9.790449088905496e-06, "loss": 0.0415, "step": 813 }, { "epoch": 0.550744248985115, "grad_norm": 0.606873245670459, "learning_rate": 9.789319827560854e-06, "loss": 0.0618, "step": 814 }, { "epoch": 0.5514208389715832, "grad_norm": 0.4616053148817582, "learning_rate": 9.78818759709607e-06, "loss": 0.0347, "step": 815 }, { "epoch": 0.5520974289580515, "grad_norm": 0.6603760242553907, "learning_rate": 9.787052398213062e-06, "loss": 0.049, "step": 816 }, { "epoch": 0.5527740189445196, "grad_norm": 0.5037802895061197, "learning_rate": 9.785914231615595e-06, "loss": 0.0547, "step": 817 }, { "epoch": 0.5534506089309879, "grad_norm": 0.4696939062638266, "learning_rate": 9.784773098009269e-06, "loss": 0.045, "step": 818 }, { "epoch": 0.554127198917456, "grad_norm": 0.5578286839070521, "learning_rate": 9.783628998101525e-06, "loss": 0.0509, "step": 819 }, { "epoch": 0.5548037889039242, "grad_norm": 0.41249886508663836, "learning_rate": 9.782481932601643e-06, "loss": 0.0504, "step": 820 }, { "epoch": 0.5554803788903924, "grad_norm": 0.6001426424741404, "learning_rate": 9.781331902220748e-06, "loss": 0.0408, "step": 821 }, { "epoch": 0.5561569688768606, "grad_norm": 0.8508634044858778, "learning_rate": 9.780178907671788e-06, "loss": 0.0555, "step": 822 }, { "epoch": 0.5568335588633289, "grad_norm": 0.4411759060670162, "learning_rate": 9.779022949669565e-06, "loss": 0.0354, "step": 823 }, { "epoch": 0.557510148849797, "grad_norm": 0.6172703878800332, "learning_rate": 9.777864028930705e-06, "loss": 0.0508, "step": 824 }, { "epoch": 0.5581867388362652, "grad_norm": 0.7608138913298033, "learning_rate": 9.776702146173678e-06, "loss": 0.0708, "step": 825 }, { "epoch": 0.5588633288227334, "grad_norm": 0.623352152631987, "learning_rate": 9.775537302118791e-06, "loss": 0.0859, "step": 826 }, { "epoch": 0.5595399188092016, "grad_norm": 0.407783288556159, "learning_rate": 9.77436949748818e-06, "loss": 0.043, "step": 827 }, { "epoch": 0.5602165087956699, "grad_norm": 0.46535537137874305, "learning_rate": 9.773198733005827e-06, "loss": 0.0497, "step": 828 }, { "epoch": 0.560893098782138, "grad_norm": 0.45065579390232596, "learning_rate": 9.772025009397538e-06, "loss": 0.0527, "step": 829 }, { "epoch": 0.5615696887686062, "grad_norm": 0.46231610036598364, "learning_rate": 9.770848327390961e-06, "loss": 0.0517, "step": 830 }, { "epoch": 0.5622462787550744, "grad_norm": 1.048157851355278, "learning_rate": 9.769668687715572e-06, "loss": 0.0843, "step": 831 }, { "epoch": 0.5629228687415426, "grad_norm": 0.43874424106315724, "learning_rate": 9.76848609110269e-06, "loss": 0.0492, "step": 832 }, { "epoch": 0.5635994587280109, "grad_norm": 0.46382084948494207, "learning_rate": 9.767300538285454e-06, "loss": 0.049, "step": 833 }, { "epoch": 0.564276048714479, "grad_norm": 0.472999564690522, "learning_rate": 9.766112029998847e-06, "loss": 0.0453, "step": 834 }, { "epoch": 0.5649526387009473, "grad_norm": 0.7025502042943563, "learning_rate": 9.76492056697968e-06, "loss": 0.0518, "step": 835 }, { "epoch": 0.5656292286874154, "grad_norm": 0.49652552955740603, "learning_rate": 9.763726149966596e-06, "loss": 0.074, "step": 836 }, { "epoch": 0.5663058186738836, "grad_norm": 0.569183317519104, "learning_rate": 9.762528779700067e-06, "loss": 0.0383, "step": 837 }, { "epoch": 0.5669824086603519, "grad_norm": 0.4867420729747149, "learning_rate": 9.7613284569224e-06, "loss": 0.0461, "step": 838 }, { "epoch": 0.56765899864682, "grad_norm": 0.6620579961253371, "learning_rate": 9.760125182377732e-06, "loss": 0.0344, "step": 839 }, { "epoch": 0.5683355886332883, "grad_norm": 0.5999869012264301, "learning_rate": 9.758918956812024e-06, "loss": 0.0482, "step": 840 }, { "epoch": 0.5690121786197564, "grad_norm": 0.5816447336458729, "learning_rate": 9.757709780973074e-06, "loss": 0.0797, "step": 841 }, { "epoch": 0.5696887686062246, "grad_norm": 0.5392546672467222, "learning_rate": 9.756497655610503e-06, "loss": 0.0482, "step": 842 }, { "epoch": 0.5703653585926928, "grad_norm": 0.38395903568969386, "learning_rate": 9.755282581475769e-06, "loss": 0.0428, "step": 843 }, { "epoch": 0.571041948579161, "grad_norm": 0.350845540377357, "learning_rate": 9.754064559322147e-06, "loss": 0.048, "step": 844 }, { "epoch": 0.5717185385656293, "grad_norm": 0.426006391790128, "learning_rate": 9.752843589904746e-06, "loss": 0.0378, "step": 845 }, { "epoch": 0.5723951285520974, "grad_norm": 0.5506082651480366, "learning_rate": 9.751619673980503e-06, "loss": 0.0578, "step": 846 }, { "epoch": 0.5730717185385656, "grad_norm": 0.5607556280880193, "learning_rate": 9.75039281230818e-06, "loss": 0.0582, "step": 847 }, { "epoch": 0.5737483085250338, "grad_norm": 0.4361753389211241, "learning_rate": 9.749163005648362e-06, "loss": 0.0516, "step": 848 }, { "epoch": 0.574424898511502, "grad_norm": 0.311642821351553, "learning_rate": 9.747930254763467e-06, "loss": 0.0456, "step": 849 }, { "epoch": 0.5751014884979703, "grad_norm": 0.4218841410849098, "learning_rate": 9.746694560417731e-06, "loss": 0.0493, "step": 850 }, { "epoch": 0.5757780784844384, "grad_norm": 0.49112066955428146, "learning_rate": 9.745455923377218e-06, "loss": 0.0508, "step": 851 }, { "epoch": 0.5764546684709067, "grad_norm": 0.8830925732758667, "learning_rate": 9.74421434440982e-06, "loss": 0.1176, "step": 852 }, { "epoch": 0.5771312584573748, "grad_norm": 0.41140583556266247, "learning_rate": 9.742969824285244e-06, "loss": 0.0507, "step": 853 }, { "epoch": 0.577807848443843, "grad_norm": 0.6883009934084277, "learning_rate": 9.741722363775029e-06, "loss": 0.0518, "step": 854 }, { "epoch": 0.5784844384303113, "grad_norm": 0.4537184728374772, "learning_rate": 9.74047196365253e-06, "loss": 0.0385, "step": 855 }, { "epoch": 0.5791610284167794, "grad_norm": 0.5762945675549866, "learning_rate": 9.73921862469293e-06, "loss": 0.0505, "step": 856 }, { "epoch": 0.5798376184032477, "grad_norm": 0.37253773754660935, "learning_rate": 9.737962347673232e-06, "loss": 0.0417, "step": 857 }, { "epoch": 0.5805142083897158, "grad_norm": 0.9010191810911216, "learning_rate": 9.736703133372259e-06, "loss": 0.0417, "step": 858 }, { "epoch": 0.581190798376184, "grad_norm": 0.3508686515086484, "learning_rate": 9.735440982570656e-06, "loss": 0.0394, "step": 859 }, { "epoch": 0.5818673883626523, "grad_norm": 0.41766463733115045, "learning_rate": 9.734175896050889e-06, "loss": 0.0378, "step": 860 }, { "epoch": 0.5825439783491204, "grad_norm": 0.558033890428094, "learning_rate": 9.732907874597241e-06, "loss": 0.0494, "step": 861 }, { "epoch": 0.5832205683355887, "grad_norm": 0.6661422303593342, "learning_rate": 9.731636918995821e-06, "loss": 0.0464, "step": 862 }, { "epoch": 0.5838971583220568, "grad_norm": 0.4894819614699645, "learning_rate": 9.730363030034551e-06, "loss": 0.044, "step": 863 }, { "epoch": 0.584573748308525, "grad_norm": 0.6611212441695493, "learning_rate": 9.729086208503174e-06, "loss": 0.0694, "step": 864 }, { "epoch": 0.5852503382949933, "grad_norm": 0.35893723110631065, "learning_rate": 9.72780645519325e-06, "loss": 0.041, "step": 865 }, { "epoch": 0.5859269282814614, "grad_norm": 0.5450738557142877, "learning_rate": 9.726523770898157e-06, "loss": 0.0399, "step": 866 }, { "epoch": 0.5866035182679297, "grad_norm": 0.8876228563217334, "learning_rate": 9.725238156413089e-06, "loss": 0.0448, "step": 867 }, { "epoch": 0.5872801082543978, "grad_norm": 0.6286748245367716, "learning_rate": 9.72394961253506e-06, "loss": 0.0426, "step": 868 }, { "epoch": 0.587956698240866, "grad_norm": 0.547776339613492, "learning_rate": 9.722658140062898e-06, "loss": 0.0412, "step": 869 }, { "epoch": 0.5886332882273342, "grad_norm": 0.4480973214312458, "learning_rate": 9.721363739797243e-06, "loss": 0.0435, "step": 870 }, { "epoch": 0.5893098782138024, "grad_norm": 0.5784135916573524, "learning_rate": 9.720066412540554e-06, "loss": 0.0425, "step": 871 }, { "epoch": 0.5899864682002707, "grad_norm": 0.4819523267521132, "learning_rate": 9.718766159097109e-06, "loss": 0.0495, "step": 872 }, { "epoch": 0.5906630581867388, "grad_norm": 0.5462720785705018, "learning_rate": 9.717462980272989e-06, "loss": 0.0479, "step": 873 }, { "epoch": 0.591339648173207, "grad_norm": 0.5753746704368786, "learning_rate": 9.716156876876096e-06, "loss": 0.0531, "step": 874 }, { "epoch": 0.5920162381596752, "grad_norm": 0.40828434914038086, "learning_rate": 9.714847849716149e-06, "loss": 0.0477, "step": 875 }, { "epoch": 0.5926928281461434, "grad_norm": 0.48445222346652345, "learning_rate": 9.713535899604667e-06, "loss": 0.0425, "step": 876 }, { "epoch": 0.5933694181326117, "grad_norm": 0.6683247339589322, "learning_rate": 9.71222102735499e-06, "loss": 0.0386, "step": 877 }, { "epoch": 0.5940460081190798, "grad_norm": 0.42909292375202934, "learning_rate": 9.710903233782273e-06, "loss": 0.046, "step": 878 }, { "epoch": 0.5947225981055481, "grad_norm": 0.7303995269588103, "learning_rate": 9.70958251970347e-06, "loss": 0.0508, "step": 879 }, { "epoch": 0.5953991880920162, "grad_norm": 0.6128204947378759, "learning_rate": 9.708258885937359e-06, "loss": 0.0461, "step": 880 }, { "epoch": 0.5960757780784844, "grad_norm": 0.5653008366891901, "learning_rate": 9.706932333304518e-06, "loss": 0.0344, "step": 881 }, { "epoch": 0.5967523680649527, "grad_norm": 0.5798904476841784, "learning_rate": 9.705602862627335e-06, "loss": 0.0394, "step": 882 }, { "epoch": 0.5974289580514208, "grad_norm": 0.4386755015091282, "learning_rate": 9.704270474730018e-06, "loss": 0.0384, "step": 883 }, { "epoch": 0.5981055480378891, "grad_norm": 0.3270568512910185, "learning_rate": 9.70293517043857e-06, "loss": 0.0349, "step": 884 }, { "epoch": 0.5987821380243572, "grad_norm": 0.5416745768346694, "learning_rate": 9.701596950580807e-06, "loss": 0.0555, "step": 885 }, { "epoch": 0.5994587280108254, "grad_norm": 0.542606637133406, "learning_rate": 9.700255815986357e-06, "loss": 0.0458, "step": 886 }, { "epoch": 0.6001353179972937, "grad_norm": 0.40012553877358126, "learning_rate": 9.69891176748665e-06, "loss": 0.0312, "step": 887 }, { "epoch": 0.6008119079837618, "grad_norm": 0.5943951286176301, "learning_rate": 9.697564805914922e-06, "loss": 0.053, "step": 888 }, { "epoch": 0.6014884979702301, "grad_norm": 0.36145698367203055, "learning_rate": 9.696214932106218e-06, "loss": 0.0377, "step": 889 }, { "epoch": 0.6021650879566982, "grad_norm": 0.3645486931235761, "learning_rate": 9.694862146897385e-06, "loss": 0.0368, "step": 890 }, { "epoch": 0.6028416779431665, "grad_norm": 0.443536129736776, "learning_rate": 9.693506451127082e-06, "loss": 0.0354, "step": 891 }, { "epoch": 0.6035182679296346, "grad_norm": 0.4546288869488424, "learning_rate": 9.692147845635761e-06, "loss": 0.0387, "step": 892 }, { "epoch": 0.6041948579161028, "grad_norm": 0.5615972553394489, "learning_rate": 9.690786331265687e-06, "loss": 0.0595, "step": 893 }, { "epoch": 0.6048714479025711, "grad_norm": 0.3955607243296082, "learning_rate": 9.689421908860928e-06, "loss": 0.0356, "step": 894 }, { "epoch": 0.6055480378890392, "grad_norm": 0.3432982231897223, "learning_rate": 9.688054579267347e-06, "loss": 0.0377, "step": 895 }, { "epoch": 0.6062246278755075, "grad_norm": 0.4640843848808698, "learning_rate": 9.68668434333262e-06, "loss": 0.0469, "step": 896 }, { "epoch": 0.6069012178619756, "grad_norm": 0.5060279217404642, "learning_rate": 9.685311201906216e-06, "loss": 0.0505, "step": 897 }, { "epoch": 0.6075778078484438, "grad_norm": 0.4382657736866923, "learning_rate": 9.683935155839408e-06, "loss": 0.0439, "step": 898 }, { "epoch": 0.6082543978349121, "grad_norm": 0.761356984884096, "learning_rate": 9.682556205985274e-06, "loss": 0.0418, "step": 899 }, { "epoch": 0.6089309878213802, "grad_norm": 0.36611036135917924, "learning_rate": 9.681174353198687e-06, "loss": 0.0449, "step": 900 }, { "epoch": 0.6096075778078485, "grad_norm": 0.7103148905704162, "learning_rate": 9.67978959833632e-06, "loss": 0.0518, "step": 901 }, { "epoch": 0.6102841677943166, "grad_norm": 0.42861527819236733, "learning_rate": 9.678401942256648e-06, "loss": 0.0435, "step": 902 }, { "epoch": 0.6109607577807848, "grad_norm": 0.41156509646753364, "learning_rate": 9.67701138581994e-06, "loss": 0.0349, "step": 903 }, { "epoch": 0.6116373477672531, "grad_norm": 0.42902342479749483, "learning_rate": 9.675617929888271e-06, "loss": 0.0446, "step": 904 }, { "epoch": 0.6123139377537212, "grad_norm": 0.3475314164339951, "learning_rate": 9.674221575325503e-06, "loss": 0.0428, "step": 905 }, { "epoch": 0.6129905277401895, "grad_norm": 0.47446237263781843, "learning_rate": 9.672822322997305e-06, "loss": 0.0435, "step": 906 }, { "epoch": 0.6136671177266576, "grad_norm": 0.6151965921405438, "learning_rate": 9.671420173771135e-06, "loss": 0.0493, "step": 907 }, { "epoch": 0.6143437077131259, "grad_norm": 0.38357859953913104, "learning_rate": 9.670015128516253e-06, "loss": 0.0359, "step": 908 }, { "epoch": 0.6150202976995941, "grad_norm": 0.507570983315249, "learning_rate": 9.668607188103708e-06, "loss": 0.0515, "step": 909 }, { "epoch": 0.6156968876860622, "grad_norm": 0.3057624219178951, "learning_rate": 9.667196353406352e-06, "loss": 0.034, "step": 910 }, { "epoch": 0.6163734776725305, "grad_norm": 0.6047675298642423, "learning_rate": 9.665782625298821e-06, "loss": 0.0483, "step": 911 }, { "epoch": 0.6170500676589986, "grad_norm": 0.429809230360215, "learning_rate": 9.664366004657553e-06, "loss": 0.0479, "step": 912 }, { "epoch": 0.6177266576454669, "grad_norm": 0.33850749588389023, "learning_rate": 9.662946492360777e-06, "loss": 0.0372, "step": 913 }, { "epoch": 0.618403247631935, "grad_norm": 0.4383026714391823, "learning_rate": 9.66152408928851e-06, "loss": 0.0464, "step": 914 }, { "epoch": 0.6190798376184032, "grad_norm": 0.3389190885921122, "learning_rate": 9.66009879632257e-06, "loss": 0.0387, "step": 915 }, { "epoch": 0.6197564276048715, "grad_norm": 0.5288823141822944, "learning_rate": 9.65867061434656e-06, "loss": 0.0434, "step": 916 }, { "epoch": 0.6204330175913396, "grad_norm": 0.46263646372460493, "learning_rate": 9.657239544245877e-06, "loss": 0.0365, "step": 917 }, { "epoch": 0.6211096075778079, "grad_norm": 0.3710608847499482, "learning_rate": 9.655805586907705e-06, "loss": 0.045, "step": 918 }, { "epoch": 0.621786197564276, "grad_norm": 0.4579363079680455, "learning_rate": 9.654368743221022e-06, "loss": 0.0374, "step": 919 }, { "epoch": 0.6224627875507442, "grad_norm": 0.44294044719332765, "learning_rate": 9.652929014076593e-06, "loss": 0.0474, "step": 920 }, { "epoch": 0.6231393775372125, "grad_norm": 0.34199808247768276, "learning_rate": 9.651486400366972e-06, "loss": 0.0322, "step": 921 }, { "epoch": 0.6238159675236806, "grad_norm": 0.48767698423175115, "learning_rate": 9.650040902986504e-06, "loss": 0.0485, "step": 922 }, { "epoch": 0.6244925575101489, "grad_norm": 0.48358834244712573, "learning_rate": 9.648592522831316e-06, "loss": 0.0424, "step": 923 }, { "epoch": 0.625169147496617, "grad_norm": 0.5172813697337071, "learning_rate": 9.64714126079933e-06, "loss": 0.0591, "step": 924 }, { "epoch": 0.6258457374830853, "grad_norm": 0.38027197375093347, "learning_rate": 9.645687117790246e-06, "loss": 0.0392, "step": 925 }, { "epoch": 0.6265223274695535, "grad_norm": 0.326766981389363, "learning_rate": 9.644230094705555e-06, "loss": 0.0417, "step": 926 }, { "epoch": 0.6271989174560216, "grad_norm": 0.37975622243619345, "learning_rate": 9.642770192448537e-06, "loss": 0.0355, "step": 927 }, { "epoch": 0.6278755074424899, "grad_norm": 0.40318220005365135, "learning_rate": 9.641307411924246e-06, "loss": 0.0455, "step": 928 }, { "epoch": 0.628552097428958, "grad_norm": 0.5167687457949933, "learning_rate": 9.639841754039534e-06, "loss": 0.0407, "step": 929 }, { "epoch": 0.6292286874154263, "grad_norm": 0.4885812957255087, "learning_rate": 9.638373219703023e-06, "loss": 0.0561, "step": 930 }, { "epoch": 0.6299052774018945, "grad_norm": 0.48013557815758745, "learning_rate": 9.63690180982513e-06, "loss": 0.0442, "step": 931 }, { "epoch": 0.6305818673883626, "grad_norm": 0.4199012352919531, "learning_rate": 9.635427525318048e-06, "loss": 0.0327, "step": 932 }, { "epoch": 0.6312584573748309, "grad_norm": 0.34720279637819, "learning_rate": 9.633950367095758e-06, "loss": 0.0322, "step": 933 }, { "epoch": 0.631935047361299, "grad_norm": 0.451015570434906, "learning_rate": 9.632470336074009e-06, "loss": 0.0486, "step": 934 }, { "epoch": 0.6326116373477673, "grad_norm": 0.42015740109231275, "learning_rate": 9.63098743317035e-06, "loss": 0.035, "step": 935 }, { "epoch": 0.6332882273342354, "grad_norm": 0.6411163507899782, "learning_rate": 9.629501659304096e-06, "loss": 0.0553, "step": 936 }, { "epoch": 0.6339648173207036, "grad_norm": 0.3538309776081996, "learning_rate": 9.628013015396347e-06, "loss": 0.0337, "step": 937 }, { "epoch": 0.6346414073071719, "grad_norm": 0.5236845273971029, "learning_rate": 9.626521502369984e-06, "loss": 0.056, "step": 938 }, { "epoch": 0.63531799729364, "grad_norm": 0.41809020137958025, "learning_rate": 9.625027121149665e-06, "loss": 0.0476, "step": 939 }, { "epoch": 0.6359945872801083, "grad_norm": 0.5060195307147896, "learning_rate": 9.623529872661821e-06, "loss": 0.0418, "step": 940 }, { "epoch": 0.6366711772665764, "grad_norm": 0.44072874382524213, "learning_rate": 9.62202975783467e-06, "loss": 0.0441, "step": 941 }, { "epoch": 0.6373477672530447, "grad_norm": 0.43222278345904935, "learning_rate": 9.620526777598202e-06, "loss": 0.0554, "step": 942 }, { "epoch": 0.6380243572395129, "grad_norm": 0.582429766652759, "learning_rate": 9.619020932884182e-06, "loss": 0.0803, "step": 943 }, { "epoch": 0.638700947225981, "grad_norm": 0.3994284470354665, "learning_rate": 9.617512224626153e-06, "loss": 0.0355, "step": 944 }, { "epoch": 0.6393775372124493, "grad_norm": 0.3791513274555608, "learning_rate": 9.616000653759435e-06, "loss": 0.0321, "step": 945 }, { "epoch": 0.6400541271989174, "grad_norm": 0.4528994898732974, "learning_rate": 9.614486221221115e-06, "loss": 0.0396, "step": 946 }, { "epoch": 0.6407307171853857, "grad_norm": 0.4837314205327873, "learning_rate": 9.612968927950066e-06, "loss": 0.0436, "step": 947 }, { "epoch": 0.6414073071718539, "grad_norm": 0.5347640164457272, "learning_rate": 9.611448774886925e-06, "loss": 0.0513, "step": 948 }, { "epoch": 0.642083897158322, "grad_norm": 0.5042218258862791, "learning_rate": 9.609925762974103e-06, "loss": 0.0475, "step": 949 }, { "epoch": 0.6427604871447903, "grad_norm": 0.4264989507780004, "learning_rate": 9.60839989315579e-06, "loss": 0.0357, "step": 950 }, { "epoch": 0.6434370771312584, "grad_norm": 0.5392493535467197, "learning_rate": 9.606871166377939e-06, "loss": 0.052, "step": 951 }, { "epoch": 0.6441136671177267, "grad_norm": 0.3908082376550831, "learning_rate": 9.60533958358828e-06, "loss": 0.0426, "step": 952 }, { "epoch": 0.6447902571041949, "grad_norm": 0.7627853739424385, "learning_rate": 9.603805145736311e-06, "loss": 0.0558, "step": 953 }, { "epoch": 0.645466847090663, "grad_norm": 0.5672781531325081, "learning_rate": 9.602267853773301e-06, "loss": 0.0504, "step": 954 }, { "epoch": 0.6461434370771313, "grad_norm": 0.45194462663889134, "learning_rate": 9.60072770865229e-06, "loss": 0.0428, "step": 955 }, { "epoch": 0.6468200270635994, "grad_norm": 0.5404875667227371, "learning_rate": 9.599184711328082e-06, "loss": 0.0321, "step": 956 }, { "epoch": 0.6474966170500677, "grad_norm": 0.7264194143274479, "learning_rate": 9.597638862757255e-06, "loss": 0.0576, "step": 957 }, { "epoch": 0.6481732070365359, "grad_norm": 0.43277733284387404, "learning_rate": 9.596090163898148e-06, "loss": 0.0427, "step": 958 }, { "epoch": 0.648849797023004, "grad_norm": 0.43668803468184053, "learning_rate": 9.594538615710875e-06, "loss": 0.0408, "step": 959 }, { "epoch": 0.6495263870094723, "grad_norm": 0.6373441415174744, "learning_rate": 9.59298421915731e-06, "loss": 0.0493, "step": 960 }, { "epoch": 0.6502029769959404, "grad_norm": 0.5039542204065095, "learning_rate": 9.591426975201093e-06, "loss": 0.0505, "step": 961 }, { "epoch": 0.6508795669824087, "grad_norm": 0.5401177598769642, "learning_rate": 9.589866884807637e-06, "loss": 0.0604, "step": 962 }, { "epoch": 0.6515561569688768, "grad_norm": 0.45949039645671613, "learning_rate": 9.588303948944109e-06, "loss": 0.0442, "step": 963 }, { "epoch": 0.652232746955345, "grad_norm": 0.48521941125554213, "learning_rate": 9.586738168579446e-06, "loss": 0.036, "step": 964 }, { "epoch": 0.6529093369418133, "grad_norm": 0.4598983065811199, "learning_rate": 9.58516954468435e-06, "loss": 0.0433, "step": 965 }, { "epoch": 0.6535859269282814, "grad_norm": 0.3718353137593582, "learning_rate": 9.58359807823128e-06, "loss": 0.0465, "step": 966 }, { "epoch": 0.6542625169147497, "grad_norm": 0.3543182839474053, "learning_rate": 9.582023770194462e-06, "loss": 0.036, "step": 967 }, { "epoch": 0.6549391069012178, "grad_norm": 0.5577019363935712, "learning_rate": 9.580446621549883e-06, "loss": 0.0414, "step": 968 }, { "epoch": 0.6556156968876861, "grad_norm": 0.40349930365154446, "learning_rate": 9.578866633275289e-06, "loss": 0.0508, "step": 969 }, { "epoch": 0.6562922868741543, "grad_norm": 0.49495689312928554, "learning_rate": 9.577283806350186e-06, "loss": 0.0379, "step": 970 }, { "epoch": 0.6569688768606224, "grad_norm": 0.5897893067553056, "learning_rate": 9.575698141755844e-06, "loss": 0.0598, "step": 971 }, { "epoch": 0.6576454668470907, "grad_norm": 0.40963069354089227, "learning_rate": 9.57410964047529e-06, "loss": 0.0406, "step": 972 }, { "epoch": 0.6583220568335588, "grad_norm": 0.33723745271737793, "learning_rate": 9.572518303493305e-06, "loss": 0.0345, "step": 973 }, { "epoch": 0.6589986468200271, "grad_norm": 0.5830671994151339, "learning_rate": 9.570924131796437e-06, "loss": 0.051, "step": 974 }, { "epoch": 0.6596752368064953, "grad_norm": 0.563938532943089, "learning_rate": 9.569327126372985e-06, "loss": 0.0465, "step": 975 }, { "epoch": 0.6603518267929634, "grad_norm": 0.4110509351619063, "learning_rate": 9.567727288213005e-06, "loss": 0.0508, "step": 976 }, { "epoch": 0.6610284167794317, "grad_norm": 0.5101530319858956, "learning_rate": 9.566124618308312e-06, "loss": 0.0469, "step": 977 }, { "epoch": 0.6617050067658998, "grad_norm": 0.5446245789300718, "learning_rate": 9.564519117652473e-06, "loss": 0.0475, "step": 978 }, { "epoch": 0.6623815967523681, "grad_norm": 0.38532615768990897, "learning_rate": 9.562910787240814e-06, "loss": 0.0371, "step": 979 }, { "epoch": 0.6630581867388363, "grad_norm": 0.5452581963795065, "learning_rate": 9.56129962807041e-06, "loss": 0.0463, "step": 980 }, { "epoch": 0.6637347767253045, "grad_norm": 0.493897519037664, "learning_rate": 9.559685641140098e-06, "loss": 0.046, "step": 981 }, { "epoch": 0.6644113667117727, "grad_norm": 0.49120411649237344, "learning_rate": 9.55806882745046e-06, "loss": 0.038, "step": 982 }, { "epoch": 0.6650879566982408, "grad_norm": 0.37594809722236106, "learning_rate": 9.556449188003831e-06, "loss": 0.0332, "step": 983 }, { "epoch": 0.6657645466847091, "grad_norm": 0.5689483506911673, "learning_rate": 9.554826723804304e-06, "loss": 0.0428, "step": 984 }, { "epoch": 0.6664411366711772, "grad_norm": 0.47459352359813967, "learning_rate": 9.553201435857718e-06, "loss": 0.048, "step": 985 }, { "epoch": 0.6671177266576455, "grad_norm": 0.4175841295594717, "learning_rate": 9.551573325171662e-06, "loss": 0.0433, "step": 986 }, { "epoch": 0.6677943166441137, "grad_norm": 0.4703368986441674, "learning_rate": 9.54994239275548e-06, "loss": 0.045, "step": 987 }, { "epoch": 0.6684709066305818, "grad_norm": 0.5174768810420528, "learning_rate": 9.54830863962026e-06, "loss": 0.0477, "step": 988 }, { "epoch": 0.6691474966170501, "grad_norm": 0.45031078391738333, "learning_rate": 9.546672066778842e-06, "loss": 0.0453, "step": 989 }, { "epoch": 0.6698240866035182, "grad_norm": 0.4661069434329015, "learning_rate": 9.545032675245814e-06, "loss": 0.0361, "step": 990 }, { "epoch": 0.6705006765899865, "grad_norm": 0.6263373512558729, "learning_rate": 9.543390466037507e-06, "loss": 0.0482, "step": 991 }, { "epoch": 0.6711772665764547, "grad_norm": 0.6198243720105686, "learning_rate": 9.541745440172006e-06, "loss": 0.0463, "step": 992 }, { "epoch": 0.6718538565629228, "grad_norm": 0.41016128196243706, "learning_rate": 9.540097598669135e-06, "loss": 0.0417, "step": 993 }, { "epoch": 0.6725304465493911, "grad_norm": 0.39800918436000615, "learning_rate": 9.538446942550468e-06, "loss": 0.0415, "step": 994 }, { "epoch": 0.6732070365358592, "grad_norm": 0.5543222813132193, "learning_rate": 9.536793472839325e-06, "loss": 0.0401, "step": 995 }, { "epoch": 0.6738836265223275, "grad_norm": 0.5940809739304163, "learning_rate": 9.535137190560765e-06, "loss": 0.0442, "step": 996 }, { "epoch": 0.6745602165087957, "grad_norm": 0.6673629427097577, "learning_rate": 9.533478096741597e-06, "loss": 0.0496, "step": 997 }, { "epoch": 0.6752368064952639, "grad_norm": 0.6404194200639447, "learning_rate": 9.531816192410366e-06, "loss": 0.05, "step": 998 }, { "epoch": 0.6759133964817321, "grad_norm": 0.40213167814173456, "learning_rate": 9.530151478597366e-06, "loss": 0.0303, "step": 999 }, { "epoch": 0.6765899864682002, "grad_norm": 0.8361044332179497, "learning_rate": 9.528483956334628e-06, "loss": 0.0402, "step": 1000 }, { "epoch": 0.6772665764546685, "grad_norm": 0.6371984225608404, "learning_rate": 9.526813626655929e-06, "loss": 0.0487, "step": 1001 }, { "epoch": 0.6779431664411367, "grad_norm": 0.3529968456644681, "learning_rate": 9.525140490596778e-06, "loss": 0.0457, "step": 1002 }, { "epoch": 0.6786197564276049, "grad_norm": 0.6890677686012265, "learning_rate": 9.523464549194434e-06, "loss": 0.0434, "step": 1003 }, { "epoch": 0.6792963464140731, "grad_norm": 0.4976579938373486, "learning_rate": 9.521785803487888e-06, "loss": 0.0364, "step": 1004 }, { "epoch": 0.6799729364005412, "grad_norm": 0.32376422130205423, "learning_rate": 9.520104254517873e-06, "loss": 0.037, "step": 1005 }, { "epoch": 0.6806495263870095, "grad_norm": 0.6068602119994349, "learning_rate": 9.518419903326859e-06, "loss": 0.0474, "step": 1006 }, { "epoch": 0.6813261163734776, "grad_norm": 0.5220480755210147, "learning_rate": 9.51673275095905e-06, "loss": 0.0484, "step": 1007 }, { "epoch": 0.6820027063599459, "grad_norm": 0.4482989240924032, "learning_rate": 9.515042798460393e-06, "loss": 0.0369, "step": 1008 }, { "epoch": 0.6826792963464141, "grad_norm": 0.5301825487940978, "learning_rate": 9.513350046878565e-06, "loss": 0.0502, "step": 1009 }, { "epoch": 0.6833558863328822, "grad_norm": 0.5303576531468821, "learning_rate": 9.511654497262984e-06, "loss": 0.0642, "step": 1010 }, { "epoch": 0.6840324763193505, "grad_norm": 0.5349687200733244, "learning_rate": 9.509956150664796e-06, "loss": 0.0522, "step": 1011 }, { "epoch": 0.6847090663058186, "grad_norm": 0.6886510654722846, "learning_rate": 9.508255008136885e-06, "loss": 0.047, "step": 1012 }, { "epoch": 0.6853856562922869, "grad_norm": 0.6089451596108506, "learning_rate": 9.506551070733869e-06, "loss": 0.0422, "step": 1013 }, { "epoch": 0.6860622462787551, "grad_norm": 0.62732429197919, "learning_rate": 9.504844339512096e-06, "loss": 0.0563, "step": 1014 }, { "epoch": 0.6867388362652233, "grad_norm": 0.5271058132726802, "learning_rate": 9.50313481552965e-06, "loss": 0.0419, "step": 1015 }, { "epoch": 0.6874154262516915, "grad_norm": 0.40746987014719194, "learning_rate": 9.501422499846338e-06, "loss": 0.0374, "step": 1016 }, { "epoch": 0.6880920162381596, "grad_norm": 0.4185099148099006, "learning_rate": 9.49970739352371e-06, "loss": 0.0446, "step": 1017 }, { "epoch": 0.6887686062246279, "grad_norm": 0.5619760070106918, "learning_rate": 9.497989497625036e-06, "loss": 0.0398, "step": 1018 }, { "epoch": 0.6894451962110961, "grad_norm": 0.3937333405315312, "learning_rate": 9.49626881321532e-06, "loss": 0.0361, "step": 1019 }, { "epoch": 0.6901217861975643, "grad_norm": 0.4291750955145607, "learning_rate": 9.494545341361291e-06, "loss": 0.0392, "step": 1020 }, { "epoch": 0.6907983761840325, "grad_norm": 0.8025857719308748, "learning_rate": 9.492819083131412e-06, "loss": 0.059, "step": 1021 }, { "epoch": 0.6914749661705006, "grad_norm": 0.3449247526874084, "learning_rate": 9.491090039595869e-06, "loss": 0.0386, "step": 1022 }, { "epoch": 0.6921515561569689, "grad_norm": 0.40244666843718724, "learning_rate": 9.489358211826577e-06, "loss": 0.0324, "step": 1023 }, { "epoch": 0.6928281461434371, "grad_norm": 0.8645169898857172, "learning_rate": 9.487623600897172e-06, "loss": 0.07, "step": 1024 }, { "epoch": 0.6935047361299053, "grad_norm": 1.13253333163851, "learning_rate": 9.485886207883022e-06, "loss": 0.0512, "step": 1025 }, { "epoch": 0.6941813261163735, "grad_norm": 0.5641277435023277, "learning_rate": 9.484146033861216e-06, "loss": 0.0437, "step": 1026 }, { "epoch": 0.6948579161028416, "grad_norm": 0.8351590629206309, "learning_rate": 9.482403079910571e-06, "loss": 0.0488, "step": 1027 }, { "epoch": 0.6955345060893099, "grad_norm": 0.5072789648438356, "learning_rate": 9.480657347111621e-06, "loss": 0.0532, "step": 1028 }, { "epoch": 0.696211096075778, "grad_norm": 0.4708085692179905, "learning_rate": 9.478908836546629e-06, "loss": 0.0337, "step": 1029 }, { "epoch": 0.6968876860622463, "grad_norm": 0.623392456528769, "learning_rate": 9.477157549299574e-06, "loss": 0.0431, "step": 1030 }, { "epoch": 0.6975642760487145, "grad_norm": 0.486412673751419, "learning_rate": 9.475403486456162e-06, "loss": 0.0493, "step": 1031 }, { "epoch": 0.6982408660351827, "grad_norm": 0.3619781041039289, "learning_rate": 9.473646649103819e-06, "loss": 0.0359, "step": 1032 }, { "epoch": 0.6989174560216509, "grad_norm": 0.5055199497504502, "learning_rate": 9.471887038331686e-06, "loss": 0.0417, "step": 1033 }, { "epoch": 0.699594046008119, "grad_norm": 0.731027212723068, "learning_rate": 9.470124655230627e-06, "loss": 0.0397, "step": 1034 }, { "epoch": 0.7002706359945873, "grad_norm": 0.6703689044818381, "learning_rate": 9.468359500893227e-06, "loss": 0.0392, "step": 1035 }, { "epoch": 0.7009472259810555, "grad_norm": 0.5332302039854108, "learning_rate": 9.466591576413785e-06, "loss": 0.0621, "step": 1036 }, { "epoch": 0.7016238159675237, "grad_norm": 0.4020929727770793, "learning_rate": 9.464820882888319e-06, "loss": 0.0473, "step": 1037 }, { "epoch": 0.7023004059539919, "grad_norm": 0.5995014402035128, "learning_rate": 9.463047421414564e-06, "loss": 0.049, "step": 1038 }, { "epoch": 0.70297699594046, "grad_norm": 0.41047290020663224, "learning_rate": 9.461271193091971e-06, "loss": 0.0384, "step": 1039 }, { "epoch": 0.7036535859269283, "grad_norm": 0.4631576567759274, "learning_rate": 9.459492199021705e-06, "loss": 0.0462, "step": 1040 }, { "epoch": 0.7043301759133965, "grad_norm": 0.5471261313831118, "learning_rate": 9.457710440306645e-06, "loss": 0.0485, "step": 1041 }, { "epoch": 0.7050067658998647, "grad_norm": 0.7540552681273482, "learning_rate": 9.455925918051388e-06, "loss": 0.0608, "step": 1042 }, { "epoch": 0.7056833558863329, "grad_norm": 0.4657514871110121, "learning_rate": 9.454138633362241e-06, "loss": 0.0456, "step": 1043 }, { "epoch": 0.706359945872801, "grad_norm": 0.5912269204702294, "learning_rate": 9.452348587347224e-06, "loss": 0.0342, "step": 1044 }, { "epoch": 0.7070365358592693, "grad_norm": 0.5858407115011176, "learning_rate": 9.450555781116068e-06, "loss": 0.0387, "step": 1045 }, { "epoch": 0.7077131258457375, "grad_norm": 0.47893427829398627, "learning_rate": 9.448760215780218e-06, "loss": 0.0512, "step": 1046 }, { "epoch": 0.7083897158322057, "grad_norm": 0.6581060020226865, "learning_rate": 9.446961892452824e-06, "loss": 0.053, "step": 1047 }, { "epoch": 0.7090663058186739, "grad_norm": 0.6447452214945267, "learning_rate": 9.445160812248754e-06, "loss": 0.0525, "step": 1048 }, { "epoch": 0.709742895805142, "grad_norm": 0.3340549267704265, "learning_rate": 9.44335697628458e-06, "loss": 0.0326, "step": 1049 }, { "epoch": 0.7104194857916103, "grad_norm": 0.5470335567157704, "learning_rate": 9.44155038567858e-06, "loss": 0.0499, "step": 1050 }, { "epoch": 0.7110960757780784, "grad_norm": 0.3966932054039324, "learning_rate": 9.439741041550745e-06, "loss": 0.032, "step": 1051 }, { "epoch": 0.7117726657645467, "grad_norm": 0.44056447508322627, "learning_rate": 9.437928945022772e-06, "loss": 0.0412, "step": 1052 }, { "epoch": 0.7124492557510149, "grad_norm": 0.3869827577066024, "learning_rate": 9.43611409721806e-06, "loss": 0.0431, "step": 1053 }, { "epoch": 0.713125845737483, "grad_norm": 0.3980613763277345, "learning_rate": 9.434296499261719e-06, "loss": 0.048, "step": 1054 }, { "epoch": 0.7138024357239513, "grad_norm": 0.7811011021579496, "learning_rate": 9.432476152280562e-06, "loss": 0.0514, "step": 1055 }, { "epoch": 0.7144790257104194, "grad_norm": 0.37486212234087307, "learning_rate": 9.430653057403105e-06, "loss": 0.0389, "step": 1056 }, { "epoch": 0.7151556156968877, "grad_norm": 0.4317908795764949, "learning_rate": 9.428827215759569e-06, "loss": 0.0439, "step": 1057 }, { "epoch": 0.7158322056833559, "grad_norm": 0.3836328577522653, "learning_rate": 9.426998628481876e-06, "loss": 0.038, "step": 1058 }, { "epoch": 0.7165087956698241, "grad_norm": 0.47017815847762695, "learning_rate": 9.425167296703655e-06, "loss": 0.0392, "step": 1059 }, { "epoch": 0.7171853856562923, "grad_norm": 0.43806505261570117, "learning_rate": 9.42333322156023e-06, "loss": 0.051, "step": 1060 }, { "epoch": 0.7178619756427604, "grad_norm": 0.5105023036196973, "learning_rate": 9.42149640418863e-06, "loss": 0.0426, "step": 1061 }, { "epoch": 0.7185385656292287, "grad_norm": 0.4775796931237403, "learning_rate": 9.419656845727582e-06, "loss": 0.041, "step": 1062 }, { "epoch": 0.7192151556156969, "grad_norm": 0.46565718426011904, "learning_rate": 9.417814547317513e-06, "loss": 0.0415, "step": 1063 }, { "epoch": 0.7198917456021651, "grad_norm": 0.3718909598609268, "learning_rate": 9.415969510100549e-06, "loss": 0.0306, "step": 1064 }, { "epoch": 0.7205683355886333, "grad_norm": 0.4842662981078468, "learning_rate": 9.414121735220513e-06, "loss": 0.0445, "step": 1065 }, { "epoch": 0.7212449255751014, "grad_norm": 0.47420840949599175, "learning_rate": 9.412271223822929e-06, "loss": 0.0486, "step": 1066 }, { "epoch": 0.7219215155615697, "grad_norm": 0.48895220057621547, "learning_rate": 9.41041797705501e-06, "loss": 0.0393, "step": 1067 }, { "epoch": 0.7225981055480379, "grad_norm": 0.4875400791808188, "learning_rate": 9.408561996065672e-06, "loss": 0.0519, "step": 1068 }, { "epoch": 0.7232746955345061, "grad_norm": 0.6144523783870842, "learning_rate": 9.406703282005523e-06, "loss": 0.0442, "step": 1069 }, { "epoch": 0.7239512855209743, "grad_norm": 0.4754043228344461, "learning_rate": 9.404841836026863e-06, "loss": 0.0595, "step": 1070 }, { "epoch": 0.7246278755074425, "grad_norm": 0.47642322745592036, "learning_rate": 9.40297765928369e-06, "loss": 0.0501, "step": 1071 }, { "epoch": 0.7253044654939107, "grad_norm": 0.32558905339326694, "learning_rate": 9.401110752931694e-06, "loss": 0.0339, "step": 1072 }, { "epoch": 0.725981055480379, "grad_norm": 0.45351688092518283, "learning_rate": 9.399241118128255e-06, "loss": 0.039, "step": 1073 }, { "epoch": 0.7266576454668471, "grad_norm": 0.2500361418411447, "learning_rate": 9.397368756032445e-06, "loss": 0.0274, "step": 1074 }, { "epoch": 0.7273342354533153, "grad_norm": 0.41487142952439504, "learning_rate": 9.395493667805032e-06, "loss": 0.0448, "step": 1075 }, { "epoch": 0.7280108254397835, "grad_norm": 0.3163011214155268, "learning_rate": 9.393615854608461e-06, "loss": 0.0309, "step": 1076 }, { "epoch": 0.7286874154262517, "grad_norm": 0.4334865050484572, "learning_rate": 9.391735317606885e-06, "loss": 0.0419, "step": 1077 }, { "epoch": 0.7293640054127198, "grad_norm": 0.4769385480050723, "learning_rate": 9.389852057966129e-06, "loss": 0.0506, "step": 1078 }, { "epoch": 0.7300405953991881, "grad_norm": 0.35996837272589777, "learning_rate": 9.387966076853714e-06, "loss": 0.039, "step": 1079 }, { "epoch": 0.7307171853856563, "grad_norm": 0.3256486674181158, "learning_rate": 9.386077375438848e-06, "loss": 0.0336, "step": 1080 }, { "epoch": 0.7313937753721245, "grad_norm": 0.6222509033563155, "learning_rate": 9.384185954892423e-06, "loss": 0.0522, "step": 1081 }, { "epoch": 0.7320703653585927, "grad_norm": 0.4082078879056643, "learning_rate": 9.382291816387018e-06, "loss": 0.0375, "step": 1082 }, { "epoch": 0.7327469553450608, "grad_norm": 0.33543824603862954, "learning_rate": 9.380394961096895e-06, "loss": 0.0352, "step": 1083 }, { "epoch": 0.7334235453315291, "grad_norm": 0.5060874797939505, "learning_rate": 9.378495390198005e-06, "loss": 0.0379, "step": 1084 }, { "epoch": 0.7341001353179973, "grad_norm": 0.41743487319063965, "learning_rate": 9.376593104867976e-06, "loss": 0.0426, "step": 1085 }, { "epoch": 0.7347767253044655, "grad_norm": 0.5184910031839326, "learning_rate": 9.374688106286127e-06, "loss": 0.0489, "step": 1086 }, { "epoch": 0.7354533152909337, "grad_norm": 0.4084470647844144, "learning_rate": 9.372780395633451e-06, "loss": 0.0486, "step": 1087 }, { "epoch": 0.7361299052774019, "grad_norm": 0.47999994192707224, "learning_rate": 9.370869974092628e-06, "loss": 0.0439, "step": 1088 }, { "epoch": 0.7368064952638701, "grad_norm": 0.3813360371555213, "learning_rate": 9.368956842848014e-06, "loss": 0.0309, "step": 1089 }, { "epoch": 0.7374830852503383, "grad_norm": 0.4503744066776666, "learning_rate": 9.36704100308565e-06, "loss": 0.0473, "step": 1090 }, { "epoch": 0.7381596752368065, "grad_norm": 0.35717092121021204, "learning_rate": 9.36512245599325e-06, "loss": 0.0412, "step": 1091 }, { "epoch": 0.7388362652232747, "grad_norm": 0.5764849679099431, "learning_rate": 9.363201202760212e-06, "loss": 0.0474, "step": 1092 }, { "epoch": 0.7395128552097429, "grad_norm": 0.2875285228362852, "learning_rate": 9.36127724457761e-06, "loss": 0.0284, "step": 1093 }, { "epoch": 0.7401894451962111, "grad_norm": 0.43016349621660316, "learning_rate": 9.359350582638193e-06, "loss": 0.0434, "step": 1094 }, { "epoch": 0.7408660351826793, "grad_norm": 0.45620787154940123, "learning_rate": 9.357421218136387e-06, "loss": 0.0533, "step": 1095 }, { "epoch": 0.7415426251691475, "grad_norm": 0.8166639455518038, "learning_rate": 9.355489152268296e-06, "loss": 0.0948, "step": 1096 }, { "epoch": 0.7422192151556157, "grad_norm": 0.3144810238083724, "learning_rate": 9.353554386231697e-06, "loss": 0.0408, "step": 1097 }, { "epoch": 0.7428958051420839, "grad_norm": 0.36361057224729754, "learning_rate": 9.351616921226036e-06, "loss": 0.0501, "step": 1098 }, { "epoch": 0.7435723951285521, "grad_norm": 0.5326846996900816, "learning_rate": 9.349676758452441e-06, "loss": 0.0462, "step": 1099 }, { "epoch": 0.7442489851150202, "grad_norm": 0.29471165072913635, "learning_rate": 9.347733899113709e-06, "loss": 0.0307, "step": 1100 }, { "epoch": 0.7449255751014885, "grad_norm": 0.6901682512249298, "learning_rate": 9.345788344414306e-06, "loss": 0.0606, "step": 1101 }, { "epoch": 0.7456021650879567, "grad_norm": 0.367005328692303, "learning_rate": 9.343840095560373e-06, "loss": 0.0393, "step": 1102 }, { "epoch": 0.7462787550744249, "grad_norm": 0.5773480553617552, "learning_rate": 9.341889153759715e-06, "loss": 0.0455, "step": 1103 }, { "epoch": 0.7469553450608931, "grad_norm": 0.5822139256764285, "learning_rate": 9.339935520221816e-06, "loss": 0.0546, "step": 1104 }, { "epoch": 0.7476319350473613, "grad_norm": 0.4303231690692278, "learning_rate": 9.33797919615782e-06, "loss": 0.0321, "step": 1105 }, { "epoch": 0.7483085250338295, "grad_norm": 0.5164958197463455, "learning_rate": 9.336020182780545e-06, "loss": 0.0371, "step": 1106 }, { "epoch": 0.7489851150202977, "grad_norm": 0.5840950520153051, "learning_rate": 9.33405848130447e-06, "loss": 0.0363, "step": 1107 }, { "epoch": 0.7496617050067659, "grad_norm": 0.8580538083904401, "learning_rate": 9.332094092945749e-06, "loss": 0.0593, "step": 1108 }, { "epoch": 0.7503382949932341, "grad_norm": 0.45669196915044574, "learning_rate": 9.330127018922195e-06, "loss": 0.0419, "step": 1109 }, { "epoch": 0.7510148849797023, "grad_norm": 0.6311970925803472, "learning_rate": 9.328157260453286e-06, "loss": 0.0431, "step": 1110 }, { "epoch": 0.7516914749661705, "grad_norm": 0.5072034301545328, "learning_rate": 9.326184818760167e-06, "loss": 0.047, "step": 1111 }, { "epoch": 0.7523680649526387, "grad_norm": 0.4469160704046604, "learning_rate": 9.324209695065644e-06, "loss": 0.0451, "step": 1112 }, { "epoch": 0.7530446549391069, "grad_norm": 0.4704715800688946, "learning_rate": 9.322231890594193e-06, "loss": 0.0419, "step": 1113 }, { "epoch": 0.7537212449255751, "grad_norm": 0.5600133859764858, "learning_rate": 9.32025140657194e-06, "loss": 0.0505, "step": 1114 }, { "epoch": 0.7543978349120433, "grad_norm": 0.4038967147652209, "learning_rate": 9.318268244226681e-06, "loss": 0.0479, "step": 1115 }, { "epoch": 0.7550744248985115, "grad_norm": 0.47394683668025994, "learning_rate": 9.31628240478787e-06, "loss": 0.0427, "step": 1116 }, { "epoch": 0.7557510148849798, "grad_norm": 0.4363282100835522, "learning_rate": 9.31429388948662e-06, "loss": 0.0409, "step": 1117 }, { "epoch": 0.7564276048714479, "grad_norm": 0.6631345517105665, "learning_rate": 9.312302699555701e-06, "loss": 0.0526, "step": 1118 }, { "epoch": 0.7571041948579161, "grad_norm": 0.4251997861259301, "learning_rate": 9.310308836229548e-06, "loss": 0.0458, "step": 1119 }, { "epoch": 0.7577807848443843, "grad_norm": 0.4375017741111722, "learning_rate": 9.308312300744247e-06, "loss": 0.0388, "step": 1120 }, { "epoch": 0.7584573748308525, "grad_norm": 0.5247235800165496, "learning_rate": 9.306313094337539e-06, "loss": 0.0443, "step": 1121 }, { "epoch": 0.7591339648173207, "grad_norm": 0.584204461473707, "learning_rate": 9.304311218248828e-06, "loss": 0.0555, "step": 1122 }, { "epoch": 0.7598105548037889, "grad_norm": 0.34492025134679155, "learning_rate": 9.30230667371917e-06, "loss": 0.0353, "step": 1123 }, { "epoch": 0.7604871447902571, "grad_norm": 0.39682716944671537, "learning_rate": 9.30029946199127e-06, "loss": 0.042, "step": 1124 }, { "epoch": 0.7611637347767253, "grad_norm": 0.32624179514132623, "learning_rate": 9.298289584309496e-06, "loss": 0.0352, "step": 1125 }, { "epoch": 0.7618403247631935, "grad_norm": 0.6122427027969475, "learning_rate": 9.29627704191986e-06, "loss": 0.0427, "step": 1126 }, { "epoch": 0.7625169147496617, "grad_norm": 0.43218148320586536, "learning_rate": 9.294261836070034e-06, "loss": 0.0468, "step": 1127 }, { "epoch": 0.7631935047361299, "grad_norm": 0.43508867127579565, "learning_rate": 9.292243968009332e-06, "loss": 0.0397, "step": 1128 }, { "epoch": 0.7638700947225981, "grad_norm": 0.3605762409954845, "learning_rate": 9.290223438988726e-06, "loss": 0.0387, "step": 1129 }, { "epoch": 0.7645466847090663, "grad_norm": 0.44807011054996954, "learning_rate": 9.288200250260836e-06, "loss": 0.0456, "step": 1130 }, { "epoch": 0.7652232746955345, "grad_norm": 0.3486712359694568, "learning_rate": 9.286174403079928e-06, "loss": 0.0436, "step": 1131 }, { "epoch": 0.7658998646820027, "grad_norm": 0.39675791530677357, "learning_rate": 9.284145898701921e-06, "loss": 0.0368, "step": 1132 }, { "epoch": 0.7665764546684709, "grad_norm": 0.5133506378319832, "learning_rate": 9.282114738384375e-06, "loss": 0.0491, "step": 1133 }, { "epoch": 0.7672530446549392, "grad_norm": 0.5956950097378654, "learning_rate": 9.280080923386501e-06, "loss": 0.0587, "step": 1134 }, { "epoch": 0.7679296346414073, "grad_norm": 0.3113555182626179, "learning_rate": 9.278044454969157e-06, "loss": 0.0327, "step": 1135 }, { "epoch": 0.7686062246278755, "grad_norm": 0.42301234160024176, "learning_rate": 9.27600533439484e-06, "loss": 0.0417, "step": 1136 }, { "epoch": 0.7692828146143437, "grad_norm": 0.41994481653635907, "learning_rate": 9.273963562927695e-06, "loss": 0.0337, "step": 1137 }, { "epoch": 0.7699594046008119, "grad_norm": 0.4158291408145277, "learning_rate": 9.271919141833514e-06, "loss": 0.0359, "step": 1138 }, { "epoch": 0.7706359945872802, "grad_norm": 0.5260878684426418, "learning_rate": 9.269872072379725e-06, "loss": 0.055, "step": 1139 }, { "epoch": 0.7713125845737483, "grad_norm": 0.6764803636872951, "learning_rate": 9.267822355835402e-06, "loss": 0.0582, "step": 1140 }, { "epoch": 0.7719891745602165, "grad_norm": 0.5363223867689032, "learning_rate": 9.265769993471258e-06, "loss": 0.0382, "step": 1141 }, { "epoch": 0.7726657645466847, "grad_norm": 0.4661861760992129, "learning_rate": 9.263714986559647e-06, "loss": 0.0367, "step": 1142 }, { "epoch": 0.7733423545331529, "grad_norm": 0.3728301812931657, "learning_rate": 9.261657336374561e-06, "loss": 0.0542, "step": 1143 }, { "epoch": 0.774018944519621, "grad_norm": 0.7420033962125824, "learning_rate": 9.259597044191635e-06, "loss": 0.0556, "step": 1144 }, { "epoch": 0.7746955345060893, "grad_norm": 0.6254152152879247, "learning_rate": 9.25753411128814e-06, "loss": 0.0526, "step": 1145 }, { "epoch": 0.7753721244925575, "grad_norm": 0.32286689466872553, "learning_rate": 9.25546853894298e-06, "loss": 0.0363, "step": 1146 }, { "epoch": 0.7760487144790257, "grad_norm": 0.758545351167433, "learning_rate": 9.253400328436699e-06, "loss": 0.0566, "step": 1147 }, { "epoch": 0.7767253044654939, "grad_norm": 0.5172544507113069, "learning_rate": 9.251329481051476e-06, "loss": 0.0418, "step": 1148 }, { "epoch": 0.7774018944519621, "grad_norm": 0.4944990830258745, "learning_rate": 9.249255998071127e-06, "loss": 0.0479, "step": 1149 }, { "epoch": 0.7780784844384303, "grad_norm": 0.4680289187890781, "learning_rate": 9.247179880781099e-06, "loss": 0.0434, "step": 1150 }, { "epoch": 0.7787550744248986, "grad_norm": 0.37933941688250344, "learning_rate": 9.24510113046847e-06, "loss": 0.0381, "step": 1151 }, { "epoch": 0.7794316644113667, "grad_norm": 0.5580678117796845, "learning_rate": 9.243019748421956e-06, "loss": 0.0411, "step": 1152 }, { "epoch": 0.7801082543978349, "grad_norm": 0.3792863592199863, "learning_rate": 9.2409357359319e-06, "loss": 0.0387, "step": 1153 }, { "epoch": 0.7807848443843031, "grad_norm": 0.49151909894050444, "learning_rate": 9.238849094290279e-06, "loss": 0.0475, "step": 1154 }, { "epoch": 0.7814614343707713, "grad_norm": 0.4150200341713875, "learning_rate": 9.236759824790698e-06, "loss": 0.0374, "step": 1155 }, { "epoch": 0.7821380243572396, "grad_norm": 0.3328406752636634, "learning_rate": 9.234667928728392e-06, "loss": 0.0321, "step": 1156 }, { "epoch": 0.7828146143437077, "grad_norm": 0.9741272621124953, "learning_rate": 9.23257340740022e-06, "loss": 0.062, "step": 1157 }, { "epoch": 0.7834912043301759, "grad_norm": 0.5466469275195904, "learning_rate": 9.230476262104678e-06, "loss": 0.0561, "step": 1158 }, { "epoch": 0.7841677943166441, "grad_norm": 0.38619283655988507, "learning_rate": 9.22837649414188e-06, "loss": 0.0339, "step": 1159 }, { "epoch": 0.7848443843031123, "grad_norm": 0.44722381263042815, "learning_rate": 9.226274104813567e-06, "loss": 0.0449, "step": 1160 }, { "epoch": 0.7855209742895806, "grad_norm": 0.3507827327034681, "learning_rate": 9.22416909542311e-06, "loss": 0.0373, "step": 1161 }, { "epoch": 0.7861975642760487, "grad_norm": 0.4603881135888454, "learning_rate": 9.222061467275503e-06, "loss": 0.0572, "step": 1162 }, { "epoch": 0.786874154262517, "grad_norm": 0.4085902778519741, "learning_rate": 9.219951221677356e-06, "loss": 0.0377, "step": 1163 }, { "epoch": 0.7875507442489851, "grad_norm": 0.46066320087189744, "learning_rate": 9.217838359936914e-06, "loss": 0.0388, "step": 1164 }, { "epoch": 0.7882273342354533, "grad_norm": 0.37053625701059334, "learning_rate": 9.215722883364033e-06, "loss": 0.0395, "step": 1165 }, { "epoch": 0.7889039242219216, "grad_norm": 0.38791731905276344, "learning_rate": 9.213604793270196e-06, "loss": 0.0522, "step": 1166 }, { "epoch": 0.7895805142083897, "grad_norm": 0.486347306914698, "learning_rate": 9.211484090968505e-06, "loss": 0.0383, "step": 1167 }, { "epoch": 0.790257104194858, "grad_norm": 0.5309241482023681, "learning_rate": 9.20936077777368e-06, "loss": 0.0604, "step": 1168 }, { "epoch": 0.7909336941813261, "grad_norm": 0.3936513120655959, "learning_rate": 9.207234855002062e-06, "loss": 0.0439, "step": 1169 }, { "epoch": 0.7916102841677943, "grad_norm": 0.3905053801028906, "learning_rate": 9.205106323971607e-06, "loss": 0.039, "step": 1170 }, { "epoch": 0.7922868741542625, "grad_norm": 0.43833901476768505, "learning_rate": 9.202975186001892e-06, "loss": 0.0482, "step": 1171 }, { "epoch": 0.7929634641407307, "grad_norm": 0.7516556302472751, "learning_rate": 9.200841442414106e-06, "loss": 0.056, "step": 1172 }, { "epoch": 0.793640054127199, "grad_norm": 0.3819234864029351, "learning_rate": 9.198705094531053e-06, "loss": 0.0439, "step": 1173 }, { "epoch": 0.7943166441136671, "grad_norm": 0.6120124177063407, "learning_rate": 9.196566143677157e-06, "loss": 0.059, "step": 1174 }, { "epoch": 0.7949932341001353, "grad_norm": 0.4910293294970947, "learning_rate": 9.19442459117845e-06, "loss": 0.0456, "step": 1175 }, { "epoch": 0.7956698240866035, "grad_norm": 0.5328215559905269, "learning_rate": 9.192280438362581e-06, "loss": 0.0611, "step": 1176 }, { "epoch": 0.7963464140730717, "grad_norm": 0.35797326030579124, "learning_rate": 9.190133686558809e-06, "loss": 0.0384, "step": 1177 }, { "epoch": 0.79702300405954, "grad_norm": 0.3188990333573404, "learning_rate": 9.187984337098002e-06, "loss": 0.0322, "step": 1178 }, { "epoch": 0.7976995940460081, "grad_norm": 0.3794249733690347, "learning_rate": 9.185832391312644e-06, "loss": 0.0398, "step": 1179 }, { "epoch": 0.7983761840324763, "grad_norm": 0.3822287588253073, "learning_rate": 9.183677850536823e-06, "loss": 0.0392, "step": 1180 }, { "epoch": 0.7990527740189445, "grad_norm": 0.35000444773849926, "learning_rate": 9.181520716106238e-06, "loss": 0.0328, "step": 1181 }, { "epoch": 0.7997293640054127, "grad_norm": 0.3199665504575698, "learning_rate": 9.179360989358199e-06, "loss": 0.033, "step": 1182 }, { "epoch": 0.800405953991881, "grad_norm": 0.40110028777129825, "learning_rate": 9.177198671631616e-06, "loss": 0.0507, "step": 1183 }, { "epoch": 0.8010825439783491, "grad_norm": 0.3605079309777011, "learning_rate": 9.175033764267013e-06, "loss": 0.0388, "step": 1184 }, { "epoch": 0.8017591339648173, "grad_norm": 0.48857922904966433, "learning_rate": 9.172866268606514e-06, "loss": 0.0471, "step": 1185 }, { "epoch": 0.8024357239512855, "grad_norm": 0.3356889780868183, "learning_rate": 9.17069618599385e-06, "loss": 0.0374, "step": 1186 }, { "epoch": 0.8031123139377537, "grad_norm": 0.5677033185722352, "learning_rate": 9.168523517774356e-06, "loss": 0.0399, "step": 1187 }, { "epoch": 0.803788903924222, "grad_norm": 0.44601535886712806, "learning_rate": 9.166348265294968e-06, "loss": 0.0317, "step": 1188 }, { "epoch": 0.8044654939106901, "grad_norm": 0.28908970148597557, "learning_rate": 9.164170429904224e-06, "loss": 0.0253, "step": 1189 }, { "epoch": 0.8051420838971584, "grad_norm": 0.37936933561101305, "learning_rate": 9.16199001295227e-06, "loss": 0.0333, "step": 1190 }, { "epoch": 0.8058186738836265, "grad_norm": 0.5378799769789874, "learning_rate": 9.15980701579084e-06, "loss": 0.0463, "step": 1191 }, { "epoch": 0.8064952638700947, "grad_norm": 0.3664821531951507, "learning_rate": 9.157621439773278e-06, "loss": 0.0416, "step": 1192 }, { "epoch": 0.8071718538565629, "grad_norm": 0.5992640475325052, "learning_rate": 9.155433286254524e-06, "loss": 0.0465, "step": 1193 }, { "epoch": 0.8078484438430311, "grad_norm": 0.44246404429673586, "learning_rate": 9.153242556591115e-06, "loss": 0.0476, "step": 1194 }, { "epoch": 0.8085250338294994, "grad_norm": 0.49868840269074993, "learning_rate": 9.151049252141185e-06, "loss": 0.0666, "step": 1195 }, { "epoch": 0.8092016238159675, "grad_norm": 0.4199962889037978, "learning_rate": 9.148853374264463e-06, "loss": 0.0412, "step": 1196 }, { "epoch": 0.8098782138024357, "grad_norm": 0.6412453213760052, "learning_rate": 9.146654924322277e-06, "loss": 0.038, "step": 1197 }, { "epoch": 0.8105548037889039, "grad_norm": 0.364908387543322, "learning_rate": 9.144453903677546e-06, "loss": 0.0571, "step": 1198 }, { "epoch": 0.8112313937753721, "grad_norm": 0.44181588815410566, "learning_rate": 9.142250313694785e-06, "loss": 0.0338, "step": 1199 }, { "epoch": 0.8119079837618404, "grad_norm": 0.2652282884689182, "learning_rate": 9.140044155740102e-06, "loss": 0.0301, "step": 1200 }, { "epoch": 0.8125845737483085, "grad_norm": 0.5075050392163715, "learning_rate": 9.137835431181192e-06, "loss": 0.0498, "step": 1201 }, { "epoch": 0.8132611637347767, "grad_norm": 0.45580087964926747, "learning_rate": 9.13562414138735e-06, "loss": 0.0322, "step": 1202 }, { "epoch": 0.8139377537212449, "grad_norm": 0.586251118417022, "learning_rate": 9.133410287729454e-06, "loss": 0.0544, "step": 1203 }, { "epoch": 0.8146143437077131, "grad_norm": 0.47252601204366695, "learning_rate": 9.131193871579975e-06, "loss": 0.0441, "step": 1204 }, { "epoch": 0.8152909336941814, "grad_norm": 1.0376002865967784, "learning_rate": 9.12897489431297e-06, "loss": 0.064, "step": 1205 }, { "epoch": 0.8159675236806495, "grad_norm": 0.362957751058306, "learning_rate": 9.126753357304088e-06, "loss": 0.0384, "step": 1206 }, { "epoch": 0.8166441136671178, "grad_norm": 0.3195252023339085, "learning_rate": 9.12452926193056e-06, "loss": 0.0393, "step": 1207 }, { "epoch": 0.8173207036535859, "grad_norm": 0.3753364178537597, "learning_rate": 9.122302609571204e-06, "loss": 0.0272, "step": 1208 }, { "epoch": 0.8179972936400541, "grad_norm": 0.3709876630054977, "learning_rate": 9.120073401606427e-06, "loss": 0.0375, "step": 1209 }, { "epoch": 0.8186738836265224, "grad_norm": 0.3918475527915412, "learning_rate": 9.117841639418218e-06, "loss": 0.0373, "step": 1210 }, { "epoch": 0.8193504736129905, "grad_norm": 0.7699736605312884, "learning_rate": 9.115607324390146e-06, "loss": 0.0571, "step": 1211 }, { "epoch": 0.8200270635994588, "grad_norm": 0.7354881317235208, "learning_rate": 9.11337045790737e-06, "loss": 0.0431, "step": 1212 }, { "epoch": 0.8207036535859269, "grad_norm": 0.4607374635589529, "learning_rate": 9.111131041356624e-06, "loss": 0.0374, "step": 1213 }, { "epoch": 0.8213802435723951, "grad_norm": 0.4527689501272563, "learning_rate": 9.108889076126226e-06, "loss": 0.0368, "step": 1214 }, { "epoch": 0.8220568335588633, "grad_norm": 0.9913737981840453, "learning_rate": 9.106644563606076e-06, "loss": 0.0406, "step": 1215 }, { "epoch": 0.8227334235453315, "grad_norm": 0.5240453942563007, "learning_rate": 9.104397505187645e-06, "loss": 0.0513, "step": 1216 }, { "epoch": 0.8234100135317998, "grad_norm": 0.6220759777033223, "learning_rate": 9.102147902263994e-06, "loss": 0.0452, "step": 1217 }, { "epoch": 0.8240866035182679, "grad_norm": 0.3221077414204484, "learning_rate": 9.099895756229754e-06, "loss": 0.0371, "step": 1218 }, { "epoch": 0.8247631935047361, "grad_norm": 0.6362524400354698, "learning_rate": 9.097641068481133e-06, "loss": 0.0587, "step": 1219 }, { "epoch": 0.8254397834912043, "grad_norm": 0.4386780555869206, "learning_rate": 9.095383840415915e-06, "loss": 0.0475, "step": 1220 }, { "epoch": 0.8261163734776725, "grad_norm": 0.49091159815368024, "learning_rate": 9.093124073433464e-06, "loss": 0.0306, "step": 1221 }, { "epoch": 0.8267929634641408, "grad_norm": 0.37397811856107593, "learning_rate": 9.090861768934708e-06, "loss": 0.0368, "step": 1222 }, { "epoch": 0.8274695534506089, "grad_norm": 0.4195135915086271, "learning_rate": 9.088596928322158e-06, "loss": 0.0326, "step": 1223 }, { "epoch": 0.8281461434370772, "grad_norm": 0.44335281138776866, "learning_rate": 9.08632955299989e-06, "loss": 0.0313, "step": 1224 }, { "epoch": 0.8288227334235453, "grad_norm": 0.45103277842907036, "learning_rate": 9.084059644373558e-06, "loss": 0.0421, "step": 1225 }, { "epoch": 0.8294993234100135, "grad_norm": 0.2989663277016761, "learning_rate": 9.08178720385038e-06, "loss": 0.0313, "step": 1226 }, { "epoch": 0.8301759133964818, "grad_norm": 0.3421564851969261, "learning_rate": 9.07951223283915e-06, "loss": 0.0338, "step": 1227 }, { "epoch": 0.8308525033829499, "grad_norm": 0.6700826368288783, "learning_rate": 9.077234732750223e-06, "loss": 0.0539, "step": 1228 }, { "epoch": 0.8315290933694182, "grad_norm": 0.42400569896634155, "learning_rate": 9.074954704995532e-06, "loss": 0.0429, "step": 1229 }, { "epoch": 0.8322056833558863, "grad_norm": 0.43217022714275255, "learning_rate": 9.072672150988563e-06, "loss": 0.0531, "step": 1230 }, { "epoch": 0.8328822733423545, "grad_norm": 0.33547200954628376, "learning_rate": 9.070387072144386e-06, "loss": 0.0427, "step": 1231 }, { "epoch": 0.8335588633288228, "grad_norm": 0.42985392965408503, "learning_rate": 9.06809946987962e-06, "loss": 0.0489, "step": 1232 }, { "epoch": 0.8342354533152909, "grad_norm": 0.3897977124645228, "learning_rate": 9.065809345612458e-06, "loss": 0.0383, "step": 1233 }, { "epoch": 0.8349120433017592, "grad_norm": 0.5109827851236731, "learning_rate": 9.06351670076265e-06, "loss": 0.0545, "step": 1234 }, { "epoch": 0.8355886332882273, "grad_norm": 0.4915371656148965, "learning_rate": 9.061221536751517e-06, "loss": 0.0466, "step": 1235 }, { "epoch": 0.8362652232746955, "grad_norm": 0.35946300440613643, "learning_rate": 9.058923855001935e-06, "loss": 0.0428, "step": 1236 }, { "epoch": 0.8369418132611637, "grad_norm": 0.332874472838773, "learning_rate": 9.056623656938344e-06, "loss": 0.0447, "step": 1237 }, { "epoch": 0.8376184032476319, "grad_norm": 0.49876938075197785, "learning_rate": 9.05432094398674e-06, "loss": 0.0407, "step": 1238 }, { "epoch": 0.8382949932341002, "grad_norm": 0.3299883868490397, "learning_rate": 9.052015717574683e-06, "loss": 0.0385, "step": 1239 }, { "epoch": 0.8389715832205683, "grad_norm": 0.4304477923193939, "learning_rate": 9.049707979131288e-06, "loss": 0.051, "step": 1240 }, { "epoch": 0.8396481732070366, "grad_norm": 0.4018257282137082, "learning_rate": 9.04739773008723e-06, "loss": 0.0328, "step": 1241 }, { "epoch": 0.8403247631935047, "grad_norm": 0.5139290067371771, "learning_rate": 9.045084971874738e-06, "loss": 0.0545, "step": 1242 }, { "epoch": 0.8410013531799729, "grad_norm": 0.3030113463432619, "learning_rate": 9.042769705927597e-06, "loss": 0.0278, "step": 1243 }, { "epoch": 0.8416779431664412, "grad_norm": 0.9482400717391877, "learning_rate": 9.040451933681148e-06, "loss": 0.0589, "step": 1244 }, { "epoch": 0.8423545331529093, "grad_norm": 0.4483159760178228, "learning_rate": 9.038131656572284e-06, "loss": 0.0419, "step": 1245 }, { "epoch": 0.8430311231393776, "grad_norm": 0.40578292251140413, "learning_rate": 9.035808876039451e-06, "loss": 0.0436, "step": 1246 }, { "epoch": 0.8437077131258457, "grad_norm": 0.3694519035719774, "learning_rate": 9.033483593522652e-06, "loss": 0.0384, "step": 1247 }, { "epoch": 0.8443843031123139, "grad_norm": 0.4777745163070217, "learning_rate": 9.03115581046343e-06, "loss": 0.039, "step": 1248 }, { "epoch": 0.8450608930987822, "grad_norm": 0.4213671177327557, "learning_rate": 9.028825528304892e-06, "loss": 0.0467, "step": 1249 }, { "epoch": 0.8457374830852503, "grad_norm": 0.5393334161933487, "learning_rate": 9.026492748491683e-06, "loss": 0.042, "step": 1250 }, { "epoch": 0.8464140730717186, "grad_norm": 0.4617021484576158, "learning_rate": 9.02415747247e-06, "loss": 0.0479, "step": 1251 }, { "epoch": 0.8470906630581867, "grad_norm": 0.44235254226389215, "learning_rate": 9.02181970168759e-06, "loss": 0.044, "step": 1252 }, { "epoch": 0.847767253044655, "grad_norm": 0.6653049698515878, "learning_rate": 9.019479437593748e-06, "loss": 0.0456, "step": 1253 }, { "epoch": 0.8484438430311232, "grad_norm": 0.4178732931904288, "learning_rate": 9.017136681639307e-06, "loss": 0.0347, "step": 1254 }, { "epoch": 0.8491204330175913, "grad_norm": 0.3861007999326273, "learning_rate": 9.014791435276651e-06, "loss": 0.04, "step": 1255 }, { "epoch": 0.8497970230040596, "grad_norm": 0.40929829348577457, "learning_rate": 9.012443699959706e-06, "loss": 0.0394, "step": 1256 }, { "epoch": 0.8504736129905277, "grad_norm": 0.811155314670333, "learning_rate": 9.010093477143942e-06, "loss": 0.0666, "step": 1257 }, { "epoch": 0.851150202976996, "grad_norm": 0.41445007834888836, "learning_rate": 9.007740768286369e-06, "loss": 0.0444, "step": 1258 }, { "epoch": 0.8518267929634641, "grad_norm": 0.3234418608942231, "learning_rate": 9.005385574845543e-06, "loss": 0.0373, "step": 1259 }, { "epoch": 0.8525033829499323, "grad_norm": 0.6188106431766026, "learning_rate": 9.003027898281551e-06, "loss": 0.0427, "step": 1260 }, { "epoch": 0.8531799729364006, "grad_norm": 0.6533740575115408, "learning_rate": 9.000667740056033e-06, "loss": 0.0534, "step": 1261 }, { "epoch": 0.8538565629228687, "grad_norm": 0.4189806216269285, "learning_rate": 8.998305101632155e-06, "loss": 0.0507, "step": 1262 }, { "epoch": 0.854533152909337, "grad_norm": 0.27350968064232517, "learning_rate": 8.995939984474624e-06, "loss": 0.0287, "step": 1263 }, { "epoch": 0.8552097428958051, "grad_norm": 0.42523468953669913, "learning_rate": 8.99357239004969e-06, "loss": 0.0402, "step": 1264 }, { "epoch": 0.8558863328822733, "grad_norm": 0.5369799132951332, "learning_rate": 8.991202319825131e-06, "loss": 0.0549, "step": 1265 }, { "epoch": 0.8565629228687416, "grad_norm": 0.42772143241216515, "learning_rate": 8.988829775270265e-06, "loss": 0.0382, "step": 1266 }, { "epoch": 0.8572395128552097, "grad_norm": 0.2987472351107439, "learning_rate": 8.986454757855938e-06, "loss": 0.0349, "step": 1267 }, { "epoch": 0.857916102841678, "grad_norm": 0.3961898182156086, "learning_rate": 8.984077269054535e-06, "loss": 0.0381, "step": 1268 }, { "epoch": 0.8585926928281461, "grad_norm": 0.34654081148722105, "learning_rate": 8.981697310339972e-06, "loss": 0.0411, "step": 1269 }, { "epoch": 0.8592692828146143, "grad_norm": 0.41463106570300323, "learning_rate": 8.979314883187694e-06, "loss": 0.0365, "step": 1270 }, { "epoch": 0.8599458728010826, "grad_norm": 0.3310488399216415, "learning_rate": 8.976929989074677e-06, "loss": 0.0324, "step": 1271 }, { "epoch": 0.8606224627875507, "grad_norm": 0.5261414363189244, "learning_rate": 8.974542629479426e-06, "loss": 0.0421, "step": 1272 }, { "epoch": 0.861299052774019, "grad_norm": 0.4342571413552553, "learning_rate": 8.972152805881978e-06, "loss": 0.0382, "step": 1273 }, { "epoch": 0.8619756427604871, "grad_norm": 0.4835208110976876, "learning_rate": 8.969760519763891e-06, "loss": 0.0487, "step": 1274 }, { "epoch": 0.8626522327469553, "grad_norm": 0.4274045983340695, "learning_rate": 8.967365772608258e-06, "loss": 0.0488, "step": 1275 }, { "epoch": 0.8633288227334236, "grad_norm": 0.3572140106942213, "learning_rate": 8.96496856589969e-06, "loss": 0.0357, "step": 1276 }, { "epoch": 0.8640054127198917, "grad_norm": 0.3389369988565005, "learning_rate": 8.962568901124326e-06, "loss": 0.0379, "step": 1277 }, { "epoch": 0.86468200270636, "grad_norm": 0.4252598709924321, "learning_rate": 8.96016677976983e-06, "loss": 0.0552, "step": 1278 }, { "epoch": 0.8653585926928281, "grad_norm": 0.4274585234906251, "learning_rate": 8.957762203325389e-06, "loss": 0.0608, "step": 1279 }, { "epoch": 0.8660351826792964, "grad_norm": 0.413955169996957, "learning_rate": 8.955355173281709e-06, "loss": 0.0391, "step": 1280 }, { "epoch": 0.8667117726657646, "grad_norm": 0.3632041768440433, "learning_rate": 8.952945691131016e-06, "loss": 0.0322, "step": 1281 }, { "epoch": 0.8673883626522327, "grad_norm": 0.4207387415918705, "learning_rate": 8.950533758367063e-06, "loss": 0.0385, "step": 1282 }, { "epoch": 0.868064952638701, "grad_norm": 0.3097737464929893, "learning_rate": 8.948119376485119e-06, "loss": 0.0366, "step": 1283 }, { "epoch": 0.8687415426251691, "grad_norm": 0.5002017558211296, "learning_rate": 8.94570254698197e-06, "loss": 0.0361, "step": 1284 }, { "epoch": 0.8694181326116374, "grad_norm": 0.41760825960748166, "learning_rate": 8.943283271355915e-06, "loss": 0.0513, "step": 1285 }, { "epoch": 0.8700947225981055, "grad_norm": 0.3735950997932303, "learning_rate": 8.940861551106784e-06, "loss": 0.0348, "step": 1286 }, { "epoch": 0.8707713125845737, "grad_norm": 0.49506876431359564, "learning_rate": 8.938437387735903e-06, "loss": 0.0543, "step": 1287 }, { "epoch": 0.871447902571042, "grad_norm": 0.39553789847793236, "learning_rate": 8.93601078274613e-06, "loss": 0.0358, "step": 1288 }, { "epoch": 0.8721244925575101, "grad_norm": 0.28824367101976933, "learning_rate": 8.933581737641824e-06, "loss": 0.0344, "step": 1289 }, { "epoch": 0.8728010825439784, "grad_norm": 0.29995877856501596, "learning_rate": 8.931150253928866e-06, "loss": 0.0324, "step": 1290 }, { "epoch": 0.8734776725304465, "grad_norm": 0.34421727276338043, "learning_rate": 8.928716333114643e-06, "loss": 0.0424, "step": 1291 }, { "epoch": 0.8741542625169147, "grad_norm": 0.42456032358958895, "learning_rate": 8.926279976708056e-06, "loss": 0.0397, "step": 1292 }, { "epoch": 0.874830852503383, "grad_norm": 0.4052196496131863, "learning_rate": 8.923841186219512e-06, "loss": 0.04, "step": 1293 }, { "epoch": 0.8755074424898511, "grad_norm": 0.51106078651104, "learning_rate": 8.921399963160934e-06, "loss": 0.0383, "step": 1294 }, { "epoch": 0.8761840324763194, "grad_norm": 1.0246571675765568, "learning_rate": 8.918956309045743e-06, "loss": 0.0419, "step": 1295 }, { "epoch": 0.8768606224627875, "grad_norm": 0.489503464874003, "learning_rate": 8.916510225388878e-06, "loss": 0.0553, "step": 1296 }, { "epoch": 0.8775372124492558, "grad_norm": 0.5868667513212799, "learning_rate": 8.914061713706776e-06, "loss": 0.0375, "step": 1297 }, { "epoch": 0.878213802435724, "grad_norm": 0.568524914043775, "learning_rate": 8.911610775517383e-06, "loss": 0.0316, "step": 1298 }, { "epoch": 0.8788903924221921, "grad_norm": 0.4303128515684782, "learning_rate": 8.90915741234015e-06, "loss": 0.0457, "step": 1299 }, { "epoch": 0.8795669824086604, "grad_norm": 0.4989966037455737, "learning_rate": 8.906701625696028e-06, "loss": 0.0441, "step": 1300 }, { "epoch": 0.8802435723951285, "grad_norm": 0.3963366881165422, "learning_rate": 8.904243417107473e-06, "loss": 0.044, "step": 1301 }, { "epoch": 0.8809201623815968, "grad_norm": 1.1829084709010367, "learning_rate": 8.901782788098442e-06, "loss": 0.0571, "step": 1302 }, { "epoch": 0.881596752368065, "grad_norm": 0.347485401772923, "learning_rate": 8.899319740194391e-06, "loss": 0.0337, "step": 1303 }, { "epoch": 0.8822733423545331, "grad_norm": 0.31723947459588236, "learning_rate": 8.89685427492228e-06, "loss": 0.0306, "step": 1304 }, { "epoch": 0.8829499323410014, "grad_norm": 0.43758053153646603, "learning_rate": 8.894386393810563e-06, "loss": 0.0394, "step": 1305 }, { "epoch": 0.8836265223274695, "grad_norm": 0.5122954040268756, "learning_rate": 8.891916098389193e-06, "loss": 0.0492, "step": 1306 }, { "epoch": 0.8843031123139378, "grad_norm": 0.4291499611534161, "learning_rate": 8.889443390189618e-06, "loss": 0.0501, "step": 1307 }, { "epoch": 0.8849797023004059, "grad_norm": 0.43676737301055296, "learning_rate": 8.886968270744789e-06, "loss": 0.044, "step": 1308 }, { "epoch": 0.8856562922868741, "grad_norm": 0.3464180119501573, "learning_rate": 8.88449074158914e-06, "loss": 0.045, "step": 1309 }, { "epoch": 0.8863328822733424, "grad_norm": 0.3493272905534359, "learning_rate": 8.882010804258612e-06, "loss": 0.0344, "step": 1310 }, { "epoch": 0.8870094722598105, "grad_norm": 0.3295603557387321, "learning_rate": 8.879528460290628e-06, "loss": 0.0373, "step": 1311 }, { "epoch": 0.8876860622462788, "grad_norm": 0.5660882260290939, "learning_rate": 8.877043711224109e-06, "loss": 0.0467, "step": 1312 }, { "epoch": 0.8883626522327469, "grad_norm": 0.4073247970162156, "learning_rate": 8.874556558599465e-06, "loss": 0.0425, "step": 1313 }, { "epoch": 0.8890392422192152, "grad_norm": 0.5009482962458793, "learning_rate": 8.872067003958597e-06, "loss": 0.0585, "step": 1314 }, { "epoch": 0.8897158322056834, "grad_norm": 0.48128534011963076, "learning_rate": 8.869575048844896e-06, "loss": 0.04, "step": 1315 }, { "epoch": 0.8903924221921515, "grad_norm": 0.2963676138282622, "learning_rate": 8.867080694803238e-06, "loss": 0.032, "step": 1316 }, { "epoch": 0.8910690121786198, "grad_norm": 0.42528060953519137, "learning_rate": 8.864583943379987e-06, "loss": 0.0262, "step": 1317 }, { "epoch": 0.8917456021650879, "grad_norm": 0.3732517425386884, "learning_rate": 8.862084796122998e-06, "loss": 0.0362, "step": 1318 }, { "epoch": 0.8924221921515562, "grad_norm": 0.49779303193927454, "learning_rate": 8.859583254581604e-06, "loss": 0.0365, "step": 1319 }, { "epoch": 0.8930987821380244, "grad_norm": 0.33431602169000396, "learning_rate": 8.85707932030663e-06, "loss": 0.0437, "step": 1320 }, { "epoch": 0.8937753721244925, "grad_norm": 0.3337056934295125, "learning_rate": 8.854572994850376e-06, "loss": 0.0361, "step": 1321 }, { "epoch": 0.8944519621109608, "grad_norm": 0.44781149353065697, "learning_rate": 8.85206427976663e-06, "loss": 0.0433, "step": 1322 }, { "epoch": 0.8951285520974289, "grad_norm": 0.3760732383639481, "learning_rate": 8.849553176610661e-06, "loss": 0.0439, "step": 1323 }, { "epoch": 0.8958051420838972, "grad_norm": 0.41732728601814806, "learning_rate": 8.847039686939218e-06, "loss": 0.0402, "step": 1324 }, { "epoch": 0.8964817320703654, "grad_norm": 0.40984679968932614, "learning_rate": 8.844523812310527e-06, "loss": 0.0328, "step": 1325 }, { "epoch": 0.8971583220568335, "grad_norm": 0.3972413846697271, "learning_rate": 8.842005554284296e-06, "loss": 0.0306, "step": 1326 }, { "epoch": 0.8978349120433018, "grad_norm": 0.3877047464990477, "learning_rate": 8.83948491442171e-06, "loss": 0.0363, "step": 1327 }, { "epoch": 0.8985115020297699, "grad_norm": 0.3983312816608221, "learning_rate": 8.836961894285428e-06, "loss": 0.0527, "step": 1328 }, { "epoch": 0.8991880920162382, "grad_norm": 0.44535858494343633, "learning_rate": 8.834436495439588e-06, "loss": 0.0411, "step": 1329 }, { "epoch": 0.8998646820027063, "grad_norm": 0.4155006794569815, "learning_rate": 8.8319087194498e-06, "loss": 0.035, "step": 1330 }, { "epoch": 0.9005412719891746, "grad_norm": 0.5184428865538863, "learning_rate": 8.829378567883152e-06, "loss": 0.0524, "step": 1331 }, { "epoch": 0.9012178619756428, "grad_norm": 0.4257897513356357, "learning_rate": 8.826846042308195e-06, "loss": 0.0419, "step": 1332 }, { "epoch": 0.9018944519621109, "grad_norm": 0.3381178914295803, "learning_rate": 8.824311144294966e-06, "loss": 0.0374, "step": 1333 }, { "epoch": 0.9025710419485792, "grad_norm": 0.4276666299311541, "learning_rate": 8.82177387541496e-06, "loss": 0.0385, "step": 1334 }, { "epoch": 0.9032476319350473, "grad_norm": 0.40618159906521484, "learning_rate": 8.819234237241148e-06, "loss": 0.0368, "step": 1335 }, { "epoch": 0.9039242219215156, "grad_norm": 0.46333943890891277, "learning_rate": 8.816692231347972e-06, "loss": 0.0431, "step": 1336 }, { "epoch": 0.9046008119079838, "grad_norm": 0.35018955285789055, "learning_rate": 8.814147859311333e-06, "loss": 0.0292, "step": 1337 }, { "epoch": 0.9052774018944519, "grad_norm": 0.4055635239042611, "learning_rate": 8.81160112270861e-06, "loss": 0.0416, "step": 1338 }, { "epoch": 0.9059539918809202, "grad_norm": 0.43599525265023237, "learning_rate": 8.809052023118638e-06, "loss": 0.0485, "step": 1339 }, { "epoch": 0.9066305818673883, "grad_norm": 0.3633448138753023, "learning_rate": 8.806500562121724e-06, "loss": 0.0354, "step": 1340 }, { "epoch": 0.9073071718538566, "grad_norm": 0.4194681453206341, "learning_rate": 8.803946741299635e-06, "loss": 0.0348, "step": 1341 }, { "epoch": 0.9079837618403248, "grad_norm": 0.3484340555072282, "learning_rate": 8.801390562235603e-06, "loss": 0.0412, "step": 1342 }, { "epoch": 0.908660351826793, "grad_norm": 0.39180485947097266, "learning_rate": 8.79883202651432e-06, "loss": 0.0344, "step": 1343 }, { "epoch": 0.9093369418132612, "grad_norm": 0.5878957640500705, "learning_rate": 8.796271135721944e-06, "loss": 0.0438, "step": 1344 }, { "epoch": 0.9100135317997293, "grad_norm": 0.3585537157318733, "learning_rate": 8.793707891446086e-06, "loss": 0.0465, "step": 1345 }, { "epoch": 0.9106901217861976, "grad_norm": 0.42682788574677444, "learning_rate": 8.791142295275819e-06, "loss": 0.0419, "step": 1346 }, { "epoch": 0.9113667117726658, "grad_norm": 0.43525000643942857, "learning_rate": 8.788574348801676e-06, "loss": 0.0439, "step": 1347 }, { "epoch": 0.912043301759134, "grad_norm": 0.4271708108608584, "learning_rate": 8.786004053615642e-06, "loss": 0.0408, "step": 1348 }, { "epoch": 0.9127198917456022, "grad_norm": 0.3982985169696188, "learning_rate": 8.783431411311165e-06, "loss": 0.0406, "step": 1349 }, { "epoch": 0.9133964817320703, "grad_norm": 0.4420284914173309, "learning_rate": 8.780856423483145e-06, "loss": 0.049, "step": 1350 }, { "epoch": 0.9140730717185386, "grad_norm": 0.5133876517309015, "learning_rate": 8.778279091727933e-06, "loss": 0.0376, "step": 1351 }, { "epoch": 0.9147496617050067, "grad_norm": 0.4517642045654338, "learning_rate": 8.775699417643337e-06, "loss": 0.036, "step": 1352 }, { "epoch": 0.915426251691475, "grad_norm": 0.4187033930337199, "learning_rate": 8.773117402828618e-06, "loss": 0.0387, "step": 1353 }, { "epoch": 0.9161028416779432, "grad_norm": 0.5689038088698848, "learning_rate": 8.770533048884483e-06, "loss": 0.0457, "step": 1354 }, { "epoch": 0.9167794316644113, "grad_norm": 0.5477824374413572, "learning_rate": 8.767946357413091e-06, "loss": 0.0508, "step": 1355 }, { "epoch": 0.9174560216508796, "grad_norm": 0.432617373887483, "learning_rate": 8.765357330018056e-06, "loss": 0.0512, "step": 1356 }, { "epoch": 0.9181326116373477, "grad_norm": 0.39216575430497996, "learning_rate": 8.76276596830443e-06, "loss": 0.0393, "step": 1357 }, { "epoch": 0.918809201623816, "grad_norm": 0.505404286144978, "learning_rate": 8.760172273878723e-06, "loss": 0.0432, "step": 1358 }, { "epoch": 0.9194857916102842, "grad_norm": 0.41370772948770634, "learning_rate": 8.757576248348883e-06, "loss": 0.0458, "step": 1359 }, { "epoch": 0.9201623815967523, "grad_norm": 0.3533451333059337, "learning_rate": 8.754977893324305e-06, "loss": 0.0371, "step": 1360 }, { "epoch": 0.9208389715832206, "grad_norm": 0.33252775450950467, "learning_rate": 8.75237721041583e-06, "loss": 0.037, "step": 1361 }, { "epoch": 0.9215155615696887, "grad_norm": 0.30753827187988675, "learning_rate": 8.74977420123574e-06, "loss": 0.043, "step": 1362 }, { "epoch": 0.922192151556157, "grad_norm": 0.31934034280234236, "learning_rate": 8.747168867397765e-06, "loss": 0.0295, "step": 1363 }, { "epoch": 0.9228687415426252, "grad_norm": 0.4469463716086972, "learning_rate": 8.744561210517067e-06, "loss": 0.0394, "step": 1364 }, { "epoch": 0.9235453315290933, "grad_norm": 0.5110721075186322, "learning_rate": 8.741951232210254e-06, "loss": 0.0748, "step": 1365 }, { "epoch": 0.9242219215155616, "grad_norm": 0.3587084701697703, "learning_rate": 8.73933893409537e-06, "loss": 0.0356, "step": 1366 }, { "epoch": 0.9248985115020297, "grad_norm": 0.6121188033381618, "learning_rate": 8.736724317791903e-06, "loss": 0.0447, "step": 1367 }, { "epoch": 0.925575101488498, "grad_norm": 0.6167435253465511, "learning_rate": 8.734107384920771e-06, "loss": 0.0342, "step": 1368 }, { "epoch": 0.9262516914749662, "grad_norm": 0.7836362184370608, "learning_rate": 8.731488137104332e-06, "loss": 0.0426, "step": 1369 }, { "epoch": 0.9269282814614344, "grad_norm": 0.3434747402556253, "learning_rate": 8.728866575966379e-06, "loss": 0.0324, "step": 1370 }, { "epoch": 0.9276048714479026, "grad_norm": 0.34921447130412037, "learning_rate": 8.726242703132139e-06, "loss": 0.0336, "step": 1371 }, { "epoch": 0.9282814614343707, "grad_norm": 0.34504458772487184, "learning_rate": 8.72361652022827e-06, "loss": 0.0327, "step": 1372 }, { "epoch": 0.928958051420839, "grad_norm": 0.4475201290757406, "learning_rate": 8.720988028882867e-06, "loss": 0.0436, "step": 1373 }, { "epoch": 0.9296346414073072, "grad_norm": 0.5271961046361782, "learning_rate": 8.71835723072545e-06, "loss": 0.0508, "step": 1374 }, { "epoch": 0.9303112313937754, "grad_norm": 0.5633526363693144, "learning_rate": 8.715724127386971e-06, "loss": 0.0458, "step": 1375 }, { "epoch": 0.9309878213802436, "grad_norm": 0.36374114879754027, "learning_rate": 8.713088720499817e-06, "loss": 0.0315, "step": 1376 }, { "epoch": 0.9316644113667117, "grad_norm": 0.415289551630065, "learning_rate": 8.710451011697794e-06, "loss": 0.0417, "step": 1377 }, { "epoch": 0.93234100135318, "grad_norm": 0.5290341699989562, "learning_rate": 8.70781100261614e-06, "loss": 0.0459, "step": 1378 }, { "epoch": 0.9330175913396481, "grad_norm": 0.3854241345094511, "learning_rate": 8.705168694891522e-06, "loss": 0.0434, "step": 1379 }, { "epoch": 0.9336941813261164, "grad_norm": 0.31857051670522657, "learning_rate": 8.702524090162023e-06, "loss": 0.0328, "step": 1380 }, { "epoch": 0.9343707713125846, "grad_norm": 0.45310218723053053, "learning_rate": 8.699877190067158e-06, "loss": 0.0409, "step": 1381 }, { "epoch": 0.9350473612990527, "grad_norm": 0.40780180266545213, "learning_rate": 8.697227996247861e-06, "loss": 0.0387, "step": 1382 }, { "epoch": 0.935723951285521, "grad_norm": 0.42366079031226367, "learning_rate": 8.694576510346493e-06, "loss": 0.0464, "step": 1383 }, { "epoch": 0.9364005412719891, "grad_norm": 0.5556293597587709, "learning_rate": 8.691922734006828e-06, "loss": 0.0467, "step": 1384 }, { "epoch": 0.9370771312584574, "grad_norm": 0.3072980297939123, "learning_rate": 8.689266668874067e-06, "loss": 0.0344, "step": 1385 }, { "epoch": 0.9377537212449256, "grad_norm": 0.4031713885243129, "learning_rate": 8.686608316594826e-06, "loss": 0.0537, "step": 1386 }, { "epoch": 0.9384303112313938, "grad_norm": 0.2959236664300782, "learning_rate": 8.683947678817139e-06, "loss": 0.0281, "step": 1387 }, { "epoch": 0.939106901217862, "grad_norm": 0.37637415121870216, "learning_rate": 8.681284757190462e-06, "loss": 0.0428, "step": 1388 }, { "epoch": 0.9397834912043301, "grad_norm": 0.33953434939379035, "learning_rate": 8.67861955336566e-06, "loss": 0.0357, "step": 1389 }, { "epoch": 0.9404600811907984, "grad_norm": 0.36545007825329084, "learning_rate": 8.675952068995014e-06, "loss": 0.0466, "step": 1390 }, { "epoch": 0.9411366711772666, "grad_norm": 0.363065405780556, "learning_rate": 8.673282305732225e-06, "loss": 0.0378, "step": 1391 }, { "epoch": 0.9418132611637348, "grad_norm": 0.47084181189146035, "learning_rate": 8.670610265232398e-06, "loss": 0.0438, "step": 1392 }, { "epoch": 0.942489851150203, "grad_norm": 0.6011275614491401, "learning_rate": 8.667935949152057e-06, "loss": 0.0425, "step": 1393 }, { "epoch": 0.9431664411366711, "grad_norm": 0.25204300564609355, "learning_rate": 8.665259359149132e-06, "loss": 0.0291, "step": 1394 }, { "epoch": 0.9438430311231394, "grad_norm": 0.33203799073671036, "learning_rate": 8.662580496882967e-06, "loss": 0.0365, "step": 1395 }, { "epoch": 0.9445196211096076, "grad_norm": 0.5095084735188393, "learning_rate": 8.659899364014309e-06, "loss": 0.0539, "step": 1396 }, { "epoch": 0.9451962110960758, "grad_norm": 0.400212300905144, "learning_rate": 8.657215962205318e-06, "loss": 0.0459, "step": 1397 }, { "epoch": 0.945872801082544, "grad_norm": 0.47524099633985223, "learning_rate": 8.654530293119558e-06, "loss": 0.0426, "step": 1398 }, { "epoch": 0.9465493910690121, "grad_norm": 0.4513709827543037, "learning_rate": 8.651842358421999e-06, "loss": 0.0384, "step": 1399 }, { "epoch": 0.9472259810554804, "grad_norm": 0.42380788435904354, "learning_rate": 8.649152159779015e-06, "loss": 0.0534, "step": 1400 }, { "epoch": 0.9479025710419485, "grad_norm": 0.4746189652862116, "learning_rate": 8.646459698858386e-06, "loss": 0.0496, "step": 1401 }, { "epoch": 0.9485791610284168, "grad_norm": 0.33647820772373627, "learning_rate": 8.64376497732929e-06, "loss": 0.0359, "step": 1402 }, { "epoch": 0.949255751014885, "grad_norm": 0.3826856128010313, "learning_rate": 8.64106799686231e-06, "loss": 0.0379, "step": 1403 }, { "epoch": 0.9499323410013532, "grad_norm": 0.43063802215566566, "learning_rate": 8.638368759129433e-06, "loss": 0.0383, "step": 1404 }, { "epoch": 0.9506089309878214, "grad_norm": 0.4379848528368647, "learning_rate": 8.635667265804034e-06, "loss": 0.0402, "step": 1405 }, { "epoch": 0.9512855209742895, "grad_norm": 0.365501588914398, "learning_rate": 8.632963518560894e-06, "loss": 0.0302, "step": 1406 }, { "epoch": 0.9519621109607578, "grad_norm": 0.5678982451258465, "learning_rate": 8.630257519076196e-06, "loss": 0.0514, "step": 1407 }, { "epoch": 0.952638700947226, "grad_norm": 0.42652816898918783, "learning_rate": 8.627549269027509e-06, "loss": 0.0335, "step": 1408 }, { "epoch": 0.9533152909336942, "grad_norm": 0.4278765027867784, "learning_rate": 8.624838770093805e-06, "loss": 0.052, "step": 1409 }, { "epoch": 0.9539918809201624, "grad_norm": 0.35663373279217603, "learning_rate": 8.622126023955446e-06, "loss": 0.0352, "step": 1410 }, { "epoch": 0.9546684709066305, "grad_norm": 0.2603426735691204, "learning_rate": 8.619411032294187e-06, "loss": 0.0252, "step": 1411 }, { "epoch": 0.9553450608930988, "grad_norm": 0.4082089492484779, "learning_rate": 8.616693796793178e-06, "loss": 0.0371, "step": 1412 }, { "epoch": 0.956021650879567, "grad_norm": 0.4961975359572775, "learning_rate": 8.613974319136959e-06, "loss": 0.0471, "step": 1413 }, { "epoch": 0.9566982408660352, "grad_norm": 0.3054302991764964, "learning_rate": 8.611252601011457e-06, "loss": 0.0301, "step": 1414 }, { "epoch": 0.9573748308525034, "grad_norm": 0.30909263378726926, "learning_rate": 8.608528644103994e-06, "loss": 0.0426, "step": 1415 }, { "epoch": 0.9580514208389715, "grad_norm": 0.36823686981212433, "learning_rate": 8.605802450103276e-06, "loss": 0.0318, "step": 1416 }, { "epoch": 0.9587280108254398, "grad_norm": 0.4674893579677942, "learning_rate": 8.603074020699393e-06, "loss": 0.0393, "step": 1417 }, { "epoch": 0.959404600811908, "grad_norm": 0.8831206562571963, "learning_rate": 8.600343357583826e-06, "loss": 0.0625, "step": 1418 }, { "epoch": 0.9600811907983762, "grad_norm": 0.4818670093680137, "learning_rate": 8.597610462449441e-06, "loss": 0.0409, "step": 1419 }, { "epoch": 0.9607577807848444, "grad_norm": 0.4343179237952745, "learning_rate": 8.594875336990482e-06, "loss": 0.046, "step": 1420 }, { "epoch": 0.9614343707713126, "grad_norm": 0.45361580834281207, "learning_rate": 8.592137982902585e-06, "loss": 0.0376, "step": 1421 }, { "epoch": 0.9621109607577808, "grad_norm": 0.3861893952181174, "learning_rate": 8.589398401882755e-06, "loss": 0.0357, "step": 1422 }, { "epoch": 0.9627875507442489, "grad_norm": 0.49705716116923543, "learning_rate": 8.586656595629387e-06, "loss": 0.0442, "step": 1423 }, { "epoch": 0.9634641407307172, "grad_norm": 2.0948926324352524, "learning_rate": 8.583912565842258e-06, "loss": 0.0437, "step": 1424 }, { "epoch": 0.9641407307171854, "grad_norm": 0.7543514241780067, "learning_rate": 8.581166314222512e-06, "loss": 0.04, "step": 1425 }, { "epoch": 0.9648173207036536, "grad_norm": 0.6287871594271817, "learning_rate": 8.57841784247268e-06, "loss": 0.044, "step": 1426 }, { "epoch": 0.9654939106901218, "grad_norm": 0.2973673246487005, "learning_rate": 8.575667152296666e-06, "loss": 0.0315, "step": 1427 }, { "epoch": 0.9661705006765899, "grad_norm": 0.4947375813489154, "learning_rate": 8.572914245399748e-06, "loss": 0.0413, "step": 1428 }, { "epoch": 0.9668470906630582, "grad_norm": 0.6129574364831808, "learning_rate": 8.570159123488584e-06, "loss": 0.0424, "step": 1429 }, { "epoch": 0.9675236806495264, "grad_norm": 0.4962127020607894, "learning_rate": 8.567401788271195e-06, "loss": 0.037, "step": 1430 }, { "epoch": 0.9682002706359946, "grad_norm": 0.7511633360613846, "learning_rate": 8.564642241456986e-06, "loss": 0.0455, "step": 1431 }, { "epoch": 0.9688768606224628, "grad_norm": 0.3728095573745696, "learning_rate": 8.561880484756726e-06, "loss": 0.0386, "step": 1432 }, { "epoch": 0.969553450608931, "grad_norm": 0.7617079823802402, "learning_rate": 8.559116519882551e-06, "loss": 0.0445, "step": 1433 }, { "epoch": 0.9702300405953992, "grad_norm": 0.5868883031623126, "learning_rate": 8.556350348547978e-06, "loss": 0.0329, "step": 1434 }, { "epoch": 0.9709066305818674, "grad_norm": 0.37583133626818915, "learning_rate": 8.553581972467875e-06, "loss": 0.0332, "step": 1435 }, { "epoch": 0.9715832205683356, "grad_norm": 0.46307090629435255, "learning_rate": 8.550811393358494e-06, "loss": 0.0369, "step": 1436 }, { "epoch": 0.9722598105548038, "grad_norm": 0.8679344247761973, "learning_rate": 8.54803861293744e-06, "loss": 0.0583, "step": 1437 }, { "epoch": 0.972936400541272, "grad_norm": 0.5324883304926782, "learning_rate": 8.545263632923687e-06, "loss": 0.0316, "step": 1438 }, { "epoch": 0.9736129905277402, "grad_norm": 0.3736040104215009, "learning_rate": 8.542486455037578e-06, "loss": 0.0304, "step": 1439 }, { "epoch": 0.9742895805142084, "grad_norm": 0.5557634980946791, "learning_rate": 8.539707081000808e-06, "loss": 0.0353, "step": 1440 }, { "epoch": 0.9749661705006766, "grad_norm": 0.8313826424684858, "learning_rate": 8.536925512536441e-06, "loss": 0.0404, "step": 1441 }, { "epoch": 0.9756427604871448, "grad_norm": 0.4738732342928901, "learning_rate": 8.534141751368901e-06, "loss": 0.0455, "step": 1442 }, { "epoch": 0.976319350473613, "grad_norm": 0.4252280747402319, "learning_rate": 8.531355799223968e-06, "loss": 0.0376, "step": 1443 }, { "epoch": 0.9769959404600812, "grad_norm": 0.5163268090039261, "learning_rate": 8.528567657828785e-06, "loss": 0.0359, "step": 1444 }, { "epoch": 0.9776725304465493, "grad_norm": 0.6482992708520467, "learning_rate": 8.525777328911846e-06, "loss": 0.0402, "step": 1445 }, { "epoch": 0.9783491204330176, "grad_norm": 0.327154709322352, "learning_rate": 8.522984814203006e-06, "loss": 0.0264, "step": 1446 }, { "epoch": 0.9790257104194858, "grad_norm": 0.34617077354022013, "learning_rate": 8.520190115433473e-06, "loss": 0.0351, "step": 1447 }, { "epoch": 0.979702300405954, "grad_norm": 1.0728151682978733, "learning_rate": 8.517393234335812e-06, "loss": 0.0602, "step": 1448 }, { "epoch": 0.9803788903924222, "grad_norm": 0.47186916437704224, "learning_rate": 8.514594172643934e-06, "loss": 0.0535, "step": 1449 }, { "epoch": 0.9810554803788903, "grad_norm": 0.5570069753863448, "learning_rate": 8.51179293209311e-06, "loss": 0.0474, "step": 1450 }, { "epoch": 0.9817320703653586, "grad_norm": 0.414451585522381, "learning_rate": 8.508989514419959e-06, "loss": 0.0327, "step": 1451 }, { "epoch": 0.9824086603518268, "grad_norm": 0.3941767751245504, "learning_rate": 8.506183921362443e-06, "loss": 0.0268, "step": 1452 }, { "epoch": 0.983085250338295, "grad_norm": 0.5328637980848144, "learning_rate": 8.503376154659886e-06, "loss": 0.0512, "step": 1453 }, { "epoch": 0.9837618403247632, "grad_norm": 0.46205090639806845, "learning_rate": 8.500566216052948e-06, "loss": 0.0473, "step": 1454 }, { "epoch": 0.9844384303112313, "grad_norm": 0.4458660847402536, "learning_rate": 8.497754107283637e-06, "loss": 0.0379, "step": 1455 }, { "epoch": 0.9851150202976996, "grad_norm": 0.37550898093481744, "learning_rate": 8.494939830095315e-06, "loss": 0.0484, "step": 1456 }, { "epoch": 0.9857916102841678, "grad_norm": 0.5418968458578318, "learning_rate": 8.492123386232678e-06, "loss": 0.0479, "step": 1457 }, { "epoch": 0.986468200270636, "grad_norm": 0.6605495668597899, "learning_rate": 8.489304777441772e-06, "loss": 0.0337, "step": 1458 }, { "epoch": 0.9871447902571042, "grad_norm": 0.37757453503117505, "learning_rate": 8.486484005469977e-06, "loss": 0.0441, "step": 1459 }, { "epoch": 0.9878213802435724, "grad_norm": 1.6101027995387267, "learning_rate": 8.483661072066027e-06, "loss": 0.0536, "step": 1460 }, { "epoch": 0.9884979702300406, "grad_norm": 0.37741568372068746, "learning_rate": 8.480835978979983e-06, "loss": 0.0332, "step": 1461 }, { "epoch": 0.9891745602165088, "grad_norm": 0.360903326245528, "learning_rate": 8.478008727963253e-06, "loss": 0.0378, "step": 1462 }, { "epoch": 0.989851150202977, "grad_norm": 0.4449941405624222, "learning_rate": 8.475179320768581e-06, "loss": 0.0376, "step": 1463 }, { "epoch": 0.9905277401894452, "grad_norm": 0.3399711778243278, "learning_rate": 8.472347759150044e-06, "loss": 0.0275, "step": 1464 }, { "epoch": 0.9912043301759134, "grad_norm": 0.6851638740987143, "learning_rate": 8.46951404486306e-06, "loss": 0.0419, "step": 1465 }, { "epoch": 0.9918809201623816, "grad_norm": 0.45020867674883097, "learning_rate": 8.466678179664378e-06, "loss": 0.0528, "step": 1466 }, { "epoch": 0.9925575101488497, "grad_norm": 0.5147653396813089, "learning_rate": 8.463840165312083e-06, "loss": 0.0682, "step": 1467 }, { "epoch": 0.993234100135318, "grad_norm": 0.35734575635179644, "learning_rate": 8.461000003565588e-06, "loss": 0.0447, "step": 1468 }, { "epoch": 0.9939106901217862, "grad_norm": 1.1181458457099287, "learning_rate": 8.458157696185643e-06, "loss": 0.0566, "step": 1469 }, { "epoch": 0.9945872801082544, "grad_norm": 0.3770843257846701, "learning_rate": 8.455313244934324e-06, "loss": 0.0412, "step": 1470 }, { "epoch": 0.9952638700947226, "grad_norm": 0.3277284833415035, "learning_rate": 8.452466651575039e-06, "loss": 0.0319, "step": 1471 }, { "epoch": 0.9959404600811907, "grad_norm": 0.4301378623400825, "learning_rate": 8.44961791787252e-06, "loss": 0.0455, "step": 1472 }, { "epoch": 0.996617050067659, "grad_norm": 0.5153539805412881, "learning_rate": 8.446767045592829e-06, "loss": 0.0502, "step": 1473 }, { "epoch": 0.9972936400541272, "grad_norm": 0.4532534249478539, "learning_rate": 8.443914036503356e-06, "loss": 0.0451, "step": 1474 }, { "epoch": 0.9979702300405954, "grad_norm": 0.4550178125850516, "learning_rate": 8.44105889237281e-06, "loss": 0.0492, "step": 1475 }, { "epoch": 0.9986468200270636, "grad_norm": 0.4079338583231294, "learning_rate": 8.438201614971227e-06, "loss": 0.0435, "step": 1476 }, { "epoch": 0.9993234100135318, "grad_norm": 0.37749640154370667, "learning_rate": 8.435342206069965e-06, "loss": 0.0374, "step": 1477 }, { "epoch": 1.0, "grad_norm": 0.5003925380914696, "learning_rate": 8.432480667441703e-06, "loss": 0.0446, "step": 1478 }, { "epoch": 1.0, "eval_loss": 0.043284833431243896, "eval_runtime": 236.1821, "eval_samples_per_second": 42.15, "eval_steps_per_second": 1.321, "step": 1478 }, { "epoch": 1.0006765899864682, "grad_norm": 0.33522796023289275, "learning_rate": 8.429617000860441e-06, "loss": 0.0324, "step": 1479 }, { "epoch": 1.0013531799729365, "grad_norm": 0.33595347771660705, "learning_rate": 8.4267512081015e-06, "loss": 0.0315, "step": 1480 }, { "epoch": 1.0020297699594045, "grad_norm": 0.3800678530622861, "learning_rate": 8.423883290941514e-06, "loss": 0.0462, "step": 1481 }, { "epoch": 1.0027063599458728, "grad_norm": 0.9187984515660299, "learning_rate": 8.421013251158437e-06, "loss": 0.0704, "step": 1482 }, { "epoch": 1.003382949932341, "grad_norm": 0.34154557986940803, "learning_rate": 8.418141090531543e-06, "loss": 0.0369, "step": 1483 }, { "epoch": 1.0040595399188093, "grad_norm": 0.4655237200659949, "learning_rate": 8.415266810841412e-06, "loss": 0.043, "step": 1484 }, { "epoch": 1.0047361299052775, "grad_norm": 0.2820164912654239, "learning_rate": 8.412390413869944e-06, "loss": 0.0246, "step": 1485 }, { "epoch": 1.0054127198917455, "grad_norm": 0.5598470452671843, "learning_rate": 8.409511901400351e-06, "loss": 0.0464, "step": 1486 }, { "epoch": 1.0060893098782138, "grad_norm": 0.4965875342631596, "learning_rate": 8.406631275217156e-06, "loss": 0.0373, "step": 1487 }, { "epoch": 1.006765899864682, "grad_norm": 0.37281908124115976, "learning_rate": 8.40374853710619e-06, "loss": 0.0325, "step": 1488 }, { "epoch": 1.0074424898511503, "grad_norm": 0.35947338122712696, "learning_rate": 8.400863688854598e-06, "loss": 0.03, "step": 1489 }, { "epoch": 1.0081190798376185, "grad_norm": 0.8250910442927936, "learning_rate": 8.397976732250827e-06, "loss": 0.0353, "step": 1490 }, { "epoch": 1.0087956698240865, "grad_norm": 1.089833068210979, "learning_rate": 8.395087669084638e-06, "loss": 0.0387, "step": 1491 }, { "epoch": 1.0094722598105548, "grad_norm": 0.4130176092842279, "learning_rate": 8.392196501147092e-06, "loss": 0.0341, "step": 1492 }, { "epoch": 1.010148849797023, "grad_norm": 0.7927218466842345, "learning_rate": 8.389303230230556e-06, "loss": 0.0465, "step": 1493 }, { "epoch": 1.0108254397834913, "grad_norm": 0.3443406951629904, "learning_rate": 8.386407858128707e-06, "loss": 0.0269, "step": 1494 }, { "epoch": 1.0115020297699595, "grad_norm": 0.5931951717062812, "learning_rate": 8.383510386636516e-06, "loss": 0.0409, "step": 1495 }, { "epoch": 1.0121786197564275, "grad_norm": 0.48089521261360063, "learning_rate": 8.380610817550256e-06, "loss": 0.0305, "step": 1496 }, { "epoch": 1.0128552097428958, "grad_norm": 0.3327952861061012, "learning_rate": 8.377709152667513e-06, "loss": 0.0362, "step": 1497 }, { "epoch": 1.013531799729364, "grad_norm": 0.34476344023970174, "learning_rate": 8.374805393787154e-06, "loss": 0.0317, "step": 1498 }, { "epoch": 1.0142083897158323, "grad_norm": 0.66524767617579, "learning_rate": 8.371899542709355e-06, "loss": 0.0602, "step": 1499 }, { "epoch": 1.0148849797023005, "grad_norm": 0.5081402036360071, "learning_rate": 8.36899160123559e-06, "loss": 0.0485, "step": 1500 }, { "epoch": 1.0155615696887685, "grad_norm": 0.674647947598683, "learning_rate": 8.366081571168625e-06, "loss": 0.0318, "step": 1501 }, { "epoch": 1.0162381596752368, "grad_norm": 0.30597063315759776, "learning_rate": 8.363169454312518e-06, "loss": 0.0312, "step": 1502 }, { "epoch": 1.016914749661705, "grad_norm": 0.44430106872899877, "learning_rate": 8.36025525247263e-06, "loss": 0.0362, "step": 1503 }, { "epoch": 1.0175913396481733, "grad_norm": 0.3661569722846717, "learning_rate": 8.357338967455605e-06, "loss": 0.0287, "step": 1504 }, { "epoch": 1.0182679296346413, "grad_norm": 0.6498882765499547, "learning_rate": 8.354420601069384e-06, "loss": 0.0322, "step": 1505 }, { "epoch": 1.0189445196211095, "grad_norm": 0.5930942256287522, "learning_rate": 8.3515001551232e-06, "loss": 0.0339, "step": 1506 }, { "epoch": 1.0196211096075778, "grad_norm": 0.3884818716396439, "learning_rate": 8.348577631427565e-06, "loss": 0.032, "step": 1507 }, { "epoch": 1.020297699594046, "grad_norm": 0.511039217554216, "learning_rate": 8.345653031794292e-06, "loss": 0.0348, "step": 1508 }, { "epoch": 1.0209742895805143, "grad_norm": 0.46397974067848646, "learning_rate": 8.342726358036473e-06, "loss": 0.0411, "step": 1509 }, { "epoch": 1.0216508795669823, "grad_norm": 0.5205087161344565, "learning_rate": 8.339797611968488e-06, "loss": 0.0484, "step": 1510 }, { "epoch": 1.0223274695534506, "grad_norm": 0.3811280149269098, "learning_rate": 8.336866795406003e-06, "loss": 0.0355, "step": 1511 }, { "epoch": 1.0230040595399188, "grad_norm": 0.3328731343602381, "learning_rate": 8.333933910165964e-06, "loss": 0.0316, "step": 1512 }, { "epoch": 1.023680649526387, "grad_norm": 0.4671503400685048, "learning_rate": 8.3309989580666e-06, "loss": 0.0338, "step": 1513 }, { "epoch": 1.0243572395128553, "grad_norm": 0.4439344594654805, "learning_rate": 8.32806194092743e-06, "loss": 0.0339, "step": 1514 }, { "epoch": 1.0250338294993233, "grad_norm": 0.3248928138089397, "learning_rate": 8.325122860569241e-06, "loss": 0.0306, "step": 1515 }, { "epoch": 1.0257104194857916, "grad_norm": 0.3066152336742344, "learning_rate": 8.322181718814107e-06, "loss": 0.0338, "step": 1516 }, { "epoch": 1.0263870094722598, "grad_norm": 0.39788533595101166, "learning_rate": 8.319238517485376e-06, "loss": 0.0318, "step": 1517 }, { "epoch": 1.027063599458728, "grad_norm": 0.37846270133688437, "learning_rate": 8.316293258407673e-06, "loss": 0.0323, "step": 1518 }, { "epoch": 1.0277401894451963, "grad_norm": 0.33545972275343455, "learning_rate": 8.313345943406903e-06, "loss": 0.031, "step": 1519 }, { "epoch": 1.0284167794316643, "grad_norm": 0.34039697813991265, "learning_rate": 8.310396574310239e-06, "loss": 0.0396, "step": 1520 }, { "epoch": 1.0290933694181326, "grad_norm": 0.42115212717125267, "learning_rate": 8.307445152946133e-06, "loss": 0.0372, "step": 1521 }, { "epoch": 1.0297699594046008, "grad_norm": 0.39351326497939293, "learning_rate": 8.304491681144306e-06, "loss": 0.0423, "step": 1522 }, { "epoch": 1.030446549391069, "grad_norm": 0.38777271173894157, "learning_rate": 8.301536160735752e-06, "loss": 0.0303, "step": 1523 }, { "epoch": 1.0311231393775373, "grad_norm": 0.5419920395095353, "learning_rate": 8.298578593552737e-06, "loss": 0.0339, "step": 1524 }, { "epoch": 1.0317997293640053, "grad_norm": 0.34527420183614954, "learning_rate": 8.295618981428788e-06, "loss": 0.0223, "step": 1525 }, { "epoch": 1.0324763193504736, "grad_norm": 0.34245985834211895, "learning_rate": 8.292657326198707e-06, "loss": 0.0323, "step": 1526 }, { "epoch": 1.0331529093369418, "grad_norm": 0.5092072347718094, "learning_rate": 8.289693629698564e-06, "loss": 0.0387, "step": 1527 }, { "epoch": 1.03382949932341, "grad_norm": 0.4761678063172832, "learning_rate": 8.286727893765687e-06, "loss": 0.0387, "step": 1528 }, { "epoch": 1.0345060893098783, "grad_norm": 0.8543000263590717, "learning_rate": 8.283760120238672e-06, "loss": 0.0475, "step": 1529 }, { "epoch": 1.0351826792963463, "grad_norm": 0.3218258332800442, "learning_rate": 8.280790310957382e-06, "loss": 0.033, "step": 1530 }, { "epoch": 1.0358592692828146, "grad_norm": 0.254089712043321, "learning_rate": 8.277818467762937e-06, "loss": 0.0245, "step": 1531 }, { "epoch": 1.0365358592692828, "grad_norm": 0.4275886427356734, "learning_rate": 8.27484459249772e-06, "loss": 0.0439, "step": 1532 }, { "epoch": 1.037212449255751, "grad_norm": 0.5234314087999413, "learning_rate": 8.271868687005371e-06, "loss": 0.0453, "step": 1533 }, { "epoch": 1.0378890392422193, "grad_norm": 0.3271970526964452, "learning_rate": 8.268890753130794e-06, "loss": 0.0296, "step": 1534 }, { "epoch": 1.0385656292286873, "grad_norm": 0.6884229044748504, "learning_rate": 8.265910792720147e-06, "loss": 0.05, "step": 1535 }, { "epoch": 1.0392422192151556, "grad_norm": 0.4463909883398464, "learning_rate": 8.262928807620843e-06, "loss": 0.0399, "step": 1536 }, { "epoch": 1.0399188092016238, "grad_norm": 0.39159432382126147, "learning_rate": 8.259944799681555e-06, "loss": 0.0323, "step": 1537 }, { "epoch": 1.040595399188092, "grad_norm": 0.4942700529498418, "learning_rate": 8.256958770752203e-06, "loss": 0.0499, "step": 1538 }, { "epoch": 1.0412719891745603, "grad_norm": 0.2945986814302107, "learning_rate": 8.253970722683968e-06, "loss": 0.0261, "step": 1539 }, { "epoch": 1.0419485791610283, "grad_norm": 0.3335683511759464, "learning_rate": 8.250980657329278e-06, "loss": 0.0319, "step": 1540 }, { "epoch": 1.0426251691474966, "grad_norm": 0.31197255403397706, "learning_rate": 8.24798857654181e-06, "loss": 0.0321, "step": 1541 }, { "epoch": 1.0433017591339648, "grad_norm": 0.3053910422157987, "learning_rate": 8.244994482176495e-06, "loss": 0.025, "step": 1542 }, { "epoch": 1.043978349120433, "grad_norm": 0.5682066498563606, "learning_rate": 8.241998376089508e-06, "loss": 0.0357, "step": 1543 }, { "epoch": 1.044654939106901, "grad_norm": 0.5442165459359114, "learning_rate": 8.239000260138277e-06, "loss": 0.0391, "step": 1544 }, { "epoch": 1.0453315290933693, "grad_norm": 0.41921869529424377, "learning_rate": 8.236000136181468e-06, "loss": 0.0433, "step": 1545 }, { "epoch": 1.0460081190798376, "grad_norm": 0.4051124974993968, "learning_rate": 8.232998006078998e-06, "loss": 0.0318, "step": 1546 }, { "epoch": 1.0466847090663058, "grad_norm": 0.510469955338187, "learning_rate": 8.229993871692028e-06, "loss": 0.0378, "step": 1547 }, { "epoch": 1.047361299052774, "grad_norm": 0.45677758549316627, "learning_rate": 8.226987734882956e-06, "loss": 0.0445, "step": 1548 }, { "epoch": 1.048037889039242, "grad_norm": 0.4883730554355881, "learning_rate": 8.223979597515425e-06, "loss": 0.037, "step": 1549 }, { "epoch": 1.0487144790257104, "grad_norm": 0.40831327457707306, "learning_rate": 8.220969461454322e-06, "loss": 0.0438, "step": 1550 }, { "epoch": 1.0493910690121786, "grad_norm": 0.4668162520799239, "learning_rate": 8.217957328565765e-06, "loss": 0.032, "step": 1551 }, { "epoch": 1.0500676589986468, "grad_norm": 0.377538200410174, "learning_rate": 8.214943200717114e-06, "loss": 0.0406, "step": 1552 }, { "epoch": 1.050744248985115, "grad_norm": 0.5971134143439779, "learning_rate": 8.211927079776969e-06, "loss": 0.0303, "step": 1553 }, { "epoch": 1.0514208389715831, "grad_norm": 0.33996140132456804, "learning_rate": 8.208908967615159e-06, "loss": 0.0265, "step": 1554 }, { "epoch": 1.0520974289580514, "grad_norm": 0.4139458638311267, "learning_rate": 8.205888866102753e-06, "loss": 0.0429, "step": 1555 }, { "epoch": 1.0527740189445196, "grad_norm": 0.373063790038525, "learning_rate": 8.202866777112049e-06, "loss": 0.0404, "step": 1556 }, { "epoch": 1.0534506089309879, "grad_norm": 0.4080877429876385, "learning_rate": 8.199842702516584e-06, "loss": 0.0446, "step": 1557 }, { "epoch": 1.054127198917456, "grad_norm": 0.333078946434557, "learning_rate": 8.196816644191116e-06, "loss": 0.0349, "step": 1558 }, { "epoch": 1.0548037889039241, "grad_norm": 0.5599428068564104, "learning_rate": 8.193788604011639e-06, "loss": 0.045, "step": 1559 }, { "epoch": 1.0554803788903924, "grad_norm": 0.38792237910080307, "learning_rate": 8.190758583855379e-06, "loss": 0.0345, "step": 1560 }, { "epoch": 1.0561569688768606, "grad_norm": 0.3910418436826441, "learning_rate": 8.187726585600779e-06, "loss": 0.0399, "step": 1561 }, { "epoch": 1.0568335588633289, "grad_norm": 0.38310512968348226, "learning_rate": 8.18469261112752e-06, "loss": 0.032, "step": 1562 }, { "epoch": 1.057510148849797, "grad_norm": 0.30445322632194627, "learning_rate": 8.181656662316498e-06, "loss": 0.0319, "step": 1563 }, { "epoch": 1.0581867388362651, "grad_norm": 0.3300628513317476, "learning_rate": 8.178618741049841e-06, "loss": 0.024, "step": 1564 }, { "epoch": 1.0588633288227334, "grad_norm": 0.3856795494235353, "learning_rate": 8.175578849210894e-06, "loss": 0.0448, "step": 1565 }, { "epoch": 1.0595399188092016, "grad_norm": 0.5723942162671996, "learning_rate": 8.172536988684227e-06, "loss": 0.046, "step": 1566 }, { "epoch": 1.0602165087956699, "grad_norm": 0.31413445599872286, "learning_rate": 8.169493161355632e-06, "loss": 0.0374, "step": 1567 }, { "epoch": 1.060893098782138, "grad_norm": 0.5475407183176644, "learning_rate": 8.166447369112115e-06, "loss": 0.0489, "step": 1568 }, { "epoch": 1.0615696887686061, "grad_norm": 0.5244671095066844, "learning_rate": 8.163399613841903e-06, "loss": 0.0495, "step": 1569 }, { "epoch": 1.0622462787550744, "grad_norm": 0.3758287293296234, "learning_rate": 8.160349897434441e-06, "loss": 0.0411, "step": 1570 }, { "epoch": 1.0629228687415426, "grad_norm": 0.7654749597629602, "learning_rate": 8.157298221780388e-06, "loss": 0.0366, "step": 1571 }, { "epoch": 1.0635994587280109, "grad_norm": 0.9044094727671382, "learning_rate": 8.15424458877162e-06, "loss": 0.0348, "step": 1572 }, { "epoch": 1.0642760487144791, "grad_norm": 0.37991636890979896, "learning_rate": 8.151189000301223e-06, "loss": 0.0353, "step": 1573 }, { "epoch": 1.0649526387009471, "grad_norm": 0.4254130519529926, "learning_rate": 8.148131458263499e-06, "loss": 0.0344, "step": 1574 }, { "epoch": 1.0656292286874154, "grad_norm": 0.3696155354094528, "learning_rate": 8.145071964553956e-06, "loss": 0.0301, "step": 1575 }, { "epoch": 1.0663058186738836, "grad_norm": 0.43828698159398116, "learning_rate": 8.142010521069319e-06, "loss": 0.0296, "step": 1576 }, { "epoch": 1.0669824086603519, "grad_norm": 0.3898802338702638, "learning_rate": 8.138947129707517e-06, "loss": 0.0464, "step": 1577 }, { "epoch": 1.0676589986468201, "grad_norm": 0.3677185173412872, "learning_rate": 8.135881792367686e-06, "loss": 0.0312, "step": 1578 }, { "epoch": 1.0683355886332881, "grad_norm": 0.3618411532164138, "learning_rate": 8.132814510950172e-06, "loss": 0.0371, "step": 1579 }, { "epoch": 1.0690121786197564, "grad_norm": 0.32566881071724985, "learning_rate": 8.129745287356521e-06, "loss": 0.0293, "step": 1580 }, { "epoch": 1.0696887686062246, "grad_norm": 0.4687197012890747, "learning_rate": 8.12667412348949e-06, "loss": 0.037, "step": 1581 }, { "epoch": 1.0703653585926929, "grad_norm": 0.31449392470467563, "learning_rate": 8.12360102125303e-06, "loss": 0.0321, "step": 1582 }, { "epoch": 1.0710419485791611, "grad_norm": 0.3527388825096024, "learning_rate": 8.120525982552304e-06, "loss": 0.0346, "step": 1583 }, { "epoch": 1.0717185385656292, "grad_norm": 0.5887727003319773, "learning_rate": 8.117449009293668e-06, "loss": 0.0457, "step": 1584 }, { "epoch": 1.0723951285520974, "grad_norm": 0.26688692588913676, "learning_rate": 8.11437010338468e-06, "loss": 0.0219, "step": 1585 }, { "epoch": 1.0730717185385656, "grad_norm": 0.26821326851919136, "learning_rate": 8.111289266734095e-06, "loss": 0.0234, "step": 1586 }, { "epoch": 1.073748308525034, "grad_norm": 0.2671973703500071, "learning_rate": 8.108206501251868e-06, "loss": 0.0257, "step": 1587 }, { "epoch": 1.0744248985115021, "grad_norm": 0.3298742756049819, "learning_rate": 8.105121808849143e-06, "loss": 0.026, "step": 1588 }, { "epoch": 1.0751014884979702, "grad_norm": 0.4953624489261912, "learning_rate": 8.102035191438268e-06, "loss": 0.0387, "step": 1589 }, { "epoch": 1.0757780784844384, "grad_norm": 0.4502736852528534, "learning_rate": 8.098946650932776e-06, "loss": 0.0504, "step": 1590 }, { "epoch": 1.0764546684709067, "grad_norm": 0.5280120427931959, "learning_rate": 8.095856189247396e-06, "loss": 0.0483, "step": 1591 }, { "epoch": 1.077131258457375, "grad_norm": 0.3892483852796154, "learning_rate": 8.092763808298048e-06, "loss": 0.0407, "step": 1592 }, { "epoch": 1.0778078484438431, "grad_norm": 0.3164332934881196, "learning_rate": 8.089669510001843e-06, "loss": 0.0286, "step": 1593 }, { "epoch": 1.0784844384303112, "grad_norm": 0.375134934327732, "learning_rate": 8.086573296277078e-06, "loss": 0.0361, "step": 1594 }, { "epoch": 1.0791610284167794, "grad_norm": 0.3232728457786153, "learning_rate": 8.083475169043237e-06, "loss": 0.0326, "step": 1595 }, { "epoch": 1.0798376184032477, "grad_norm": 0.49497911034836545, "learning_rate": 8.080375130220995e-06, "loss": 0.041, "step": 1596 }, { "epoch": 1.080514208389716, "grad_norm": 0.3878288769567584, "learning_rate": 8.077273181732207e-06, "loss": 0.0279, "step": 1597 }, { "epoch": 1.0811907983761841, "grad_norm": 0.41041669159238475, "learning_rate": 8.074169325499915e-06, "loss": 0.0397, "step": 1598 }, { "epoch": 1.0818673883626522, "grad_norm": 0.4103534818542798, "learning_rate": 8.071063563448341e-06, "loss": 0.031, "step": 1599 }, { "epoch": 1.0825439783491204, "grad_norm": 0.4131078990705404, "learning_rate": 8.06795589750289e-06, "loss": 0.0439, "step": 1600 }, { "epoch": 1.0832205683355887, "grad_norm": 0.41139813037009376, "learning_rate": 8.06484632959015e-06, "loss": 0.0339, "step": 1601 }, { "epoch": 1.083897158322057, "grad_norm": 0.35903652512166706, "learning_rate": 8.061734861637883e-06, "loss": 0.0302, "step": 1602 }, { "epoch": 1.084573748308525, "grad_norm": 0.4265701546683336, "learning_rate": 8.058621495575032e-06, "loss": 0.0381, "step": 1603 }, { "epoch": 1.0852503382949932, "grad_norm": 0.4721907691857595, "learning_rate": 8.055506233331718e-06, "loss": 0.0394, "step": 1604 }, { "epoch": 1.0859269282814614, "grad_norm": 0.485689393088364, "learning_rate": 8.052389076839233e-06, "loss": 0.0361, "step": 1605 }, { "epoch": 1.0866035182679297, "grad_norm": 0.39125175857039185, "learning_rate": 8.049270028030045e-06, "loss": 0.0304, "step": 1606 }, { "epoch": 1.087280108254398, "grad_norm": 0.565839044842079, "learning_rate": 8.046149088837803e-06, "loss": 0.031, "step": 1607 }, { "epoch": 1.087956698240866, "grad_norm": 0.3441864604049982, "learning_rate": 8.043026261197312e-06, "loss": 0.031, "step": 1608 }, { "epoch": 1.0886332882273342, "grad_norm": 0.2861275412838357, "learning_rate": 8.039901547044564e-06, "loss": 0.0319, "step": 1609 }, { "epoch": 1.0893098782138024, "grad_norm": 0.39751366001270527, "learning_rate": 8.03677494831671e-06, "loss": 0.025, "step": 1610 }, { "epoch": 1.0899864682002707, "grad_norm": 0.6844399185434005, "learning_rate": 8.033646466952072e-06, "loss": 0.0505, "step": 1611 }, { "epoch": 1.090663058186739, "grad_norm": 0.40688187672698983, "learning_rate": 8.03051610489014e-06, "loss": 0.036, "step": 1612 }, { "epoch": 1.091339648173207, "grad_norm": 0.5056451013640939, "learning_rate": 8.027383864071573e-06, "loss": 0.0302, "step": 1613 }, { "epoch": 1.0920162381596752, "grad_norm": 0.3301636612204815, "learning_rate": 8.024249746438189e-06, "loss": 0.0271, "step": 1614 }, { "epoch": 1.0926928281461434, "grad_norm": 0.42877224360312527, "learning_rate": 8.021113753932972e-06, "loss": 0.0337, "step": 1615 }, { "epoch": 1.0933694181326117, "grad_norm": 0.43008720041470855, "learning_rate": 8.017975888500067e-06, "loss": 0.0329, "step": 1616 }, { "epoch": 1.09404600811908, "grad_norm": 0.5083512064425184, "learning_rate": 8.014836152084784e-06, "loss": 0.0434, "step": 1617 }, { "epoch": 1.094722598105548, "grad_norm": 0.7713722017793138, "learning_rate": 8.01169454663359e-06, "loss": 0.0637, "step": 1618 }, { "epoch": 1.0953991880920162, "grad_norm": 0.3990509266010287, "learning_rate": 8.008551074094108e-06, "loss": 0.0313, "step": 1619 }, { "epoch": 1.0960757780784844, "grad_norm": 0.35489169548609006, "learning_rate": 8.005405736415127e-06, "loss": 0.0293, "step": 1620 }, { "epoch": 1.0967523680649527, "grad_norm": 0.37090764662473097, "learning_rate": 8.00225853554658e-06, "loss": 0.0312, "step": 1621 }, { "epoch": 1.097428958051421, "grad_norm": 0.4883875153457756, "learning_rate": 7.99910947343957e-06, "loss": 0.05, "step": 1622 }, { "epoch": 1.098105548037889, "grad_norm": 0.43816736897448766, "learning_rate": 7.995958552046338e-06, "loss": 0.0328, "step": 1623 }, { "epoch": 1.0987821380243572, "grad_norm": 0.5871725257052695, "learning_rate": 7.99280577332029e-06, "loss": 0.0493, "step": 1624 }, { "epoch": 1.0994587280108254, "grad_norm": 0.41783825761682186, "learning_rate": 7.989651139215979e-06, "loss": 0.0297, "step": 1625 }, { "epoch": 1.1001353179972937, "grad_norm": 0.566107827599266, "learning_rate": 7.986494651689104e-06, "loss": 0.0397, "step": 1626 }, { "epoch": 1.100811907983762, "grad_norm": 0.4219969821926648, "learning_rate": 7.983336312696521e-06, "loss": 0.0325, "step": 1627 }, { "epoch": 1.10148849797023, "grad_norm": 0.45797002548525556, "learning_rate": 7.980176124196231e-06, "loss": 0.0359, "step": 1628 }, { "epoch": 1.1021650879566982, "grad_norm": 0.4725202273779479, "learning_rate": 7.977014088147375e-06, "loss": 0.0288, "step": 1629 }, { "epoch": 1.1028416779431665, "grad_norm": 0.6259522913144866, "learning_rate": 7.973850206510251e-06, "loss": 0.0336, "step": 1630 }, { "epoch": 1.1035182679296347, "grad_norm": 0.4909351056917921, "learning_rate": 7.970684481246291e-06, "loss": 0.0313, "step": 1631 }, { "epoch": 1.104194857916103, "grad_norm": 0.4852188933981559, "learning_rate": 7.967516914318075e-06, "loss": 0.0346, "step": 1632 }, { "epoch": 1.104871447902571, "grad_norm": 0.7070531892888307, "learning_rate": 7.964347507689325e-06, "loss": 0.0357, "step": 1633 }, { "epoch": 1.1055480378890392, "grad_norm": 0.3683086646902459, "learning_rate": 7.961176263324902e-06, "loss": 0.0362, "step": 1634 }, { "epoch": 1.1062246278755075, "grad_norm": 0.4844979931695627, "learning_rate": 7.958003183190804e-06, "loss": 0.0482, "step": 1635 }, { "epoch": 1.1069012178619757, "grad_norm": 0.4638708289005973, "learning_rate": 7.954828269254173e-06, "loss": 0.0407, "step": 1636 }, { "epoch": 1.1075778078484437, "grad_norm": 0.3266540988896249, "learning_rate": 7.951651523483283e-06, "loss": 0.0312, "step": 1637 }, { "epoch": 1.108254397834912, "grad_norm": 0.2880477755406068, "learning_rate": 7.948472947847546e-06, "loss": 0.0263, "step": 1638 }, { "epoch": 1.1089309878213802, "grad_norm": 0.49976069322425787, "learning_rate": 7.945292544317505e-06, "loss": 0.0453, "step": 1639 }, { "epoch": 1.1096075778078485, "grad_norm": 0.9157470463047522, "learning_rate": 7.942110314864842e-06, "loss": 0.0413, "step": 1640 }, { "epoch": 1.1102841677943167, "grad_norm": 0.39513786066966183, "learning_rate": 7.938926261462366e-06, "loss": 0.0377, "step": 1641 }, { "epoch": 1.1109607577807847, "grad_norm": 0.35044777038520347, "learning_rate": 7.93574038608402e-06, "loss": 0.0315, "step": 1642 }, { "epoch": 1.111637347767253, "grad_norm": 0.4056375593937972, "learning_rate": 7.932552690704871e-06, "loss": 0.0369, "step": 1643 }, { "epoch": 1.1123139377537212, "grad_norm": 0.4688694087789003, "learning_rate": 7.929363177301124e-06, "loss": 0.0458, "step": 1644 }, { "epoch": 1.1129905277401895, "grad_norm": 0.49214825386525457, "learning_rate": 7.926171847850101e-06, "loss": 0.0461, "step": 1645 }, { "epoch": 1.1136671177266577, "grad_norm": 0.4603831493068334, "learning_rate": 7.922978704330257e-06, "loss": 0.0312, "step": 1646 }, { "epoch": 1.1143437077131257, "grad_norm": 0.35540355813278274, "learning_rate": 7.919783748721169e-06, "loss": 0.0358, "step": 1647 }, { "epoch": 1.115020297699594, "grad_norm": 0.3086143597181157, "learning_rate": 7.916586983003534e-06, "loss": 0.0297, "step": 1648 }, { "epoch": 1.1156968876860622, "grad_norm": 0.3404204101476662, "learning_rate": 7.913388409159175e-06, "loss": 0.0358, "step": 1649 }, { "epoch": 1.1163734776725305, "grad_norm": 0.4997484911135465, "learning_rate": 7.910188029171039e-06, "loss": 0.0326, "step": 1650 }, { "epoch": 1.1170500676589987, "grad_norm": 0.2605099264047454, "learning_rate": 7.906985845023187e-06, "loss": 0.0242, "step": 1651 }, { "epoch": 1.1177266576454667, "grad_norm": 0.34049687122226624, "learning_rate": 7.903781858700799e-06, "loss": 0.0279, "step": 1652 }, { "epoch": 1.118403247631935, "grad_norm": 0.40720284283588737, "learning_rate": 7.900576072190177e-06, "loss": 0.0398, "step": 1653 }, { "epoch": 1.1190798376184032, "grad_norm": 0.43246417364287354, "learning_rate": 7.897368487478733e-06, "loss": 0.04, "step": 1654 }, { "epoch": 1.1197564276048715, "grad_norm": 0.46293228867610187, "learning_rate": 7.894159106554997e-06, "loss": 0.0322, "step": 1655 }, { "epoch": 1.1204330175913397, "grad_norm": 0.3614772846544025, "learning_rate": 7.890947931408614e-06, "loss": 0.0333, "step": 1656 }, { "epoch": 1.1211096075778078, "grad_norm": 0.4177570410280789, "learning_rate": 7.887734964030337e-06, "loss": 0.0293, "step": 1657 }, { "epoch": 1.121786197564276, "grad_norm": 0.5524351239758096, "learning_rate": 7.884520206412036e-06, "loss": 0.0409, "step": 1658 }, { "epoch": 1.1224627875507442, "grad_norm": 0.5420011967543492, "learning_rate": 7.881303660546684e-06, "loss": 0.0572, "step": 1659 }, { "epoch": 1.1231393775372125, "grad_norm": 0.28848419254404023, "learning_rate": 7.87808532842837e-06, "loss": 0.0278, "step": 1660 }, { "epoch": 1.1238159675236807, "grad_norm": 0.8523139376772814, "learning_rate": 7.87486521205228e-06, "loss": 0.0373, "step": 1661 }, { "epoch": 1.1244925575101488, "grad_norm": 0.3168921113962619, "learning_rate": 7.871643313414718e-06, "loss": 0.0246, "step": 1662 }, { "epoch": 1.125169147496617, "grad_norm": 0.5133589993063726, "learning_rate": 7.868419634513087e-06, "loss": 0.0363, "step": 1663 }, { "epoch": 1.1258457374830853, "grad_norm": 0.3674668379082116, "learning_rate": 7.865194177345894e-06, "loss": 0.0319, "step": 1664 }, { "epoch": 1.1265223274695535, "grad_norm": 0.6093969710809076, "learning_rate": 7.861966943912746e-06, "loss": 0.0383, "step": 1665 }, { "epoch": 1.1271989174560217, "grad_norm": 0.31434767917615763, "learning_rate": 7.858737936214355e-06, "loss": 0.0229, "step": 1666 }, { "epoch": 1.1278755074424898, "grad_norm": 0.49117740766595575, "learning_rate": 7.855507156252536e-06, "loss": 0.0374, "step": 1667 }, { "epoch": 1.128552097428958, "grad_norm": 0.4135179430121241, "learning_rate": 7.852274606030191e-06, "loss": 0.0348, "step": 1668 }, { "epoch": 1.1292286874154263, "grad_norm": 0.3331697162708948, "learning_rate": 7.849040287551331e-06, "loss": 0.0292, "step": 1669 }, { "epoch": 1.1299052774018945, "grad_norm": 0.37454027897025555, "learning_rate": 7.84580420282106e-06, "loss": 0.0326, "step": 1670 }, { "epoch": 1.1305818673883627, "grad_norm": 0.42709392587927086, "learning_rate": 7.842566353845575e-06, "loss": 0.0428, "step": 1671 }, { "epoch": 1.1312584573748308, "grad_norm": 0.4695476700239723, "learning_rate": 7.839326742632168e-06, "loss": 0.0401, "step": 1672 }, { "epoch": 1.131935047361299, "grad_norm": 0.7265900096506797, "learning_rate": 7.836085371189221e-06, "loss": 0.0336, "step": 1673 }, { "epoch": 1.1326116373477673, "grad_norm": 0.29255653236545787, "learning_rate": 7.832842241526212e-06, "loss": 0.0367, "step": 1674 }, { "epoch": 1.1332882273342355, "grad_norm": 0.3024686470353999, "learning_rate": 7.829597355653707e-06, "loss": 0.0242, "step": 1675 }, { "epoch": 1.1339648173207038, "grad_norm": 0.33658204307282275, "learning_rate": 7.82635071558336e-06, "loss": 0.0307, "step": 1676 }, { "epoch": 1.1346414073071718, "grad_norm": 0.4023445895726629, "learning_rate": 7.82310232332791e-06, "loss": 0.0405, "step": 1677 }, { "epoch": 1.13531799729364, "grad_norm": 0.26451858971973113, "learning_rate": 7.81985218090119e-06, "loss": 0.0262, "step": 1678 }, { "epoch": 1.1359945872801083, "grad_norm": 0.3952379143134822, "learning_rate": 7.81660029031811e-06, "loss": 0.0271, "step": 1679 }, { "epoch": 1.1366711772665765, "grad_norm": 0.36027556723477727, "learning_rate": 7.813346653594667e-06, "loss": 0.0263, "step": 1680 }, { "epoch": 1.1373477672530448, "grad_norm": 0.4853700627381573, "learning_rate": 7.810091272747943e-06, "loss": 0.0315, "step": 1681 }, { "epoch": 1.1380243572395128, "grad_norm": 0.4164582623763797, "learning_rate": 7.806834149796094e-06, "loss": 0.0345, "step": 1682 }, { "epoch": 1.138700947225981, "grad_norm": 0.3382194588460656, "learning_rate": 7.803575286758365e-06, "loss": 0.0278, "step": 1683 }, { "epoch": 1.1393775372124493, "grad_norm": 0.48303131698605367, "learning_rate": 7.800314685655072e-06, "loss": 0.0455, "step": 1684 }, { "epoch": 1.1400541271989175, "grad_norm": 0.4337702265887451, "learning_rate": 7.797052348507614e-06, "loss": 0.0376, "step": 1685 }, { "epoch": 1.1407307171853858, "grad_norm": 0.5900178022531811, "learning_rate": 7.793788277338464e-06, "loss": 0.0552, "step": 1686 }, { "epoch": 1.1414073071718538, "grad_norm": 0.37966231199451633, "learning_rate": 7.790522474171171e-06, "loss": 0.0316, "step": 1687 }, { "epoch": 1.142083897158322, "grad_norm": 0.33225280887417386, "learning_rate": 7.787254941030353e-06, "loss": 0.0297, "step": 1688 }, { "epoch": 1.1427604871447903, "grad_norm": 0.4131074510358068, "learning_rate": 7.78398567994171e-06, "loss": 0.0294, "step": 1689 }, { "epoch": 1.1434370771312585, "grad_norm": 0.6372941134981561, "learning_rate": 7.780714692932002e-06, "loss": 0.0356, "step": 1690 }, { "epoch": 1.1441136671177268, "grad_norm": 0.33031146965199987, "learning_rate": 7.777441982029072e-06, "loss": 0.0356, "step": 1691 }, { "epoch": 1.1447902571041948, "grad_norm": 0.3332906140704906, "learning_rate": 7.774167549261817e-06, "loss": 0.0254, "step": 1692 }, { "epoch": 1.145466847090663, "grad_norm": 0.39298673791634586, "learning_rate": 7.770891396660212e-06, "loss": 0.0369, "step": 1693 }, { "epoch": 1.1461434370771313, "grad_norm": 0.4165825025098529, "learning_rate": 7.767613526255296e-06, "loss": 0.0282, "step": 1694 }, { "epoch": 1.1468200270635995, "grad_norm": 0.34886050427002396, "learning_rate": 7.764333940079169e-06, "loss": 0.0318, "step": 1695 }, { "epoch": 1.1474966170500678, "grad_norm": 0.5327647557270032, "learning_rate": 7.761052640165e-06, "loss": 0.03, "step": 1696 }, { "epoch": 1.1481732070365358, "grad_norm": 0.4615044713226057, "learning_rate": 7.757769628547018e-06, "loss": 0.0434, "step": 1697 }, { "epoch": 1.148849797023004, "grad_norm": 0.4304149838702361, "learning_rate": 7.754484907260513e-06, "loss": 0.0422, "step": 1698 }, { "epoch": 1.1495263870094723, "grad_norm": 0.37820064932543607, "learning_rate": 7.751198478341836e-06, "loss": 0.0317, "step": 1699 }, { "epoch": 1.1502029769959405, "grad_norm": 0.4371365288523147, "learning_rate": 7.747910343828391e-06, "loss": 0.0435, "step": 1700 }, { "epoch": 1.1508795669824086, "grad_norm": 0.35221916919671453, "learning_rate": 7.744620505758652e-06, "loss": 0.0392, "step": 1701 }, { "epoch": 1.1515561569688768, "grad_norm": 0.32769852974318486, "learning_rate": 7.741328966172134e-06, "loss": 0.0283, "step": 1702 }, { "epoch": 1.152232746955345, "grad_norm": 0.2944078529104389, "learning_rate": 7.738035727109418e-06, "loss": 0.0309, "step": 1703 }, { "epoch": 1.1529093369418133, "grad_norm": 0.43745468977194324, "learning_rate": 7.734740790612137e-06, "loss": 0.0376, "step": 1704 }, { "epoch": 1.1535859269282815, "grad_norm": 0.301414702108444, "learning_rate": 7.731444158722967e-06, "loss": 0.0255, "step": 1705 }, { "epoch": 1.1542625169147496, "grad_norm": 0.3095088765629941, "learning_rate": 7.728145833485647e-06, "loss": 0.0252, "step": 1706 }, { "epoch": 1.1549391069012178, "grad_norm": 0.34503916445165483, "learning_rate": 7.724845816944962e-06, "loss": 0.0347, "step": 1707 }, { "epoch": 1.155615696887686, "grad_norm": 0.3577350655249529, "learning_rate": 7.72154411114674e-06, "loss": 0.0346, "step": 1708 }, { "epoch": 1.1562922868741543, "grad_norm": 0.40226713175297785, "learning_rate": 7.718240718137863e-06, "loss": 0.037, "step": 1709 }, { "epoch": 1.1569688768606226, "grad_norm": 0.5002419556023001, "learning_rate": 7.714935639966257e-06, "loss": 0.0373, "step": 1710 }, { "epoch": 1.1576454668470906, "grad_norm": 0.3026300902484468, "learning_rate": 7.711628878680892e-06, "loss": 0.04, "step": 1711 }, { "epoch": 1.1583220568335588, "grad_norm": 0.3416151011693065, "learning_rate": 7.708320436331782e-06, "loss": 0.0266, "step": 1712 }, { "epoch": 1.158998646820027, "grad_norm": 0.36286593113461185, "learning_rate": 7.705010314969983e-06, "loss": 0.0284, "step": 1713 }, { "epoch": 1.1596752368064953, "grad_norm": 0.35775642200205643, "learning_rate": 7.70169851664759e-06, "loss": 0.0375, "step": 1714 }, { "epoch": 1.1603518267929636, "grad_norm": 0.4725787571443051, "learning_rate": 7.698385043417741e-06, "loss": 0.048, "step": 1715 }, { "epoch": 1.1610284167794316, "grad_norm": 0.29847538616807273, "learning_rate": 7.695069897334613e-06, "loss": 0.0251, "step": 1716 }, { "epoch": 1.1617050067658998, "grad_norm": 0.41082923259581106, "learning_rate": 7.691753080453413e-06, "loss": 0.0399, "step": 1717 }, { "epoch": 1.162381596752368, "grad_norm": 0.5190388420490879, "learning_rate": 7.688434594830392e-06, "loss": 0.0343, "step": 1718 }, { "epoch": 1.1630581867388363, "grad_norm": 0.28683473141450755, "learning_rate": 7.685114442522831e-06, "loss": 0.0283, "step": 1719 }, { "epoch": 1.1637347767253043, "grad_norm": 0.5867114519178951, "learning_rate": 7.681792625589046e-06, "loss": 0.0465, "step": 1720 }, { "epoch": 1.1644113667117726, "grad_norm": 0.28189626443705196, "learning_rate": 7.678469146088385e-06, "loss": 0.0377, "step": 1721 }, { "epoch": 1.1650879566982408, "grad_norm": 0.4257169140629805, "learning_rate": 7.675144006081225e-06, "loss": 0.0313, "step": 1722 }, { "epoch": 1.165764546684709, "grad_norm": 0.26975661652548166, "learning_rate": 7.671817207628973e-06, "loss": 0.0218, "step": 1723 }, { "epoch": 1.1664411366711773, "grad_norm": 0.29291059896048743, "learning_rate": 7.668488752794067e-06, "loss": 0.0262, "step": 1724 }, { "epoch": 1.1671177266576453, "grad_norm": 0.2923749383286343, "learning_rate": 7.66515864363997e-06, "loss": 0.0325, "step": 1725 }, { "epoch": 1.1677943166441136, "grad_norm": 0.42348482931588116, "learning_rate": 7.661826882231165e-06, "loss": 0.0472, "step": 1726 }, { "epoch": 1.1684709066305818, "grad_norm": 0.4901156885176605, "learning_rate": 7.658493470633173e-06, "loss": 0.0406, "step": 1727 }, { "epoch": 1.16914749661705, "grad_norm": 0.3180984357336836, "learning_rate": 7.65515841091252e-06, "loss": 0.0305, "step": 1728 }, { "epoch": 1.1698240866035183, "grad_norm": 0.6441721154116925, "learning_rate": 7.651821705136771e-06, "loss": 0.0526, "step": 1729 }, { "epoch": 1.1705006765899864, "grad_norm": 0.26853044046386193, "learning_rate": 7.648483355374496e-06, "loss": 0.0277, "step": 1730 }, { "epoch": 1.1711772665764546, "grad_norm": 0.3139187607695551, "learning_rate": 7.645143363695302e-06, "loss": 0.0284, "step": 1731 }, { "epoch": 1.1718538565629228, "grad_norm": 0.5718022912501142, "learning_rate": 7.641801732169796e-06, "loss": 0.0465, "step": 1732 }, { "epoch": 1.172530446549391, "grad_norm": 0.5447100324678501, "learning_rate": 7.63845846286961e-06, "loss": 0.033, "step": 1733 }, { "epoch": 1.1732070365358593, "grad_norm": 0.41472159633268796, "learning_rate": 7.635113557867395e-06, "loss": 0.0333, "step": 1734 }, { "epoch": 1.1738836265223274, "grad_norm": 0.49728259053599116, "learning_rate": 7.63176701923681e-06, "loss": 0.0409, "step": 1735 }, { "epoch": 1.1745602165087956, "grad_norm": 0.6398719692509215, "learning_rate": 7.628418849052523e-06, "loss": 0.0394, "step": 1736 }, { "epoch": 1.1752368064952639, "grad_norm": 0.36361617017018366, "learning_rate": 7.625069049390228e-06, "loss": 0.0375, "step": 1737 }, { "epoch": 1.175913396481732, "grad_norm": 0.5502760150541254, "learning_rate": 7.621717622326617e-06, "loss": 0.0369, "step": 1738 }, { "epoch": 1.1765899864682003, "grad_norm": 0.5090515892709198, "learning_rate": 7.61836456993939e-06, "loss": 0.049, "step": 1739 }, { "epoch": 1.1772665764546684, "grad_norm": 0.44776416479226083, "learning_rate": 7.615009894307263e-06, "loss": 0.0339, "step": 1740 }, { "epoch": 1.1779431664411366, "grad_norm": 0.3492141392783174, "learning_rate": 7.611653597509954e-06, "loss": 0.0247, "step": 1741 }, { "epoch": 1.1786197564276049, "grad_norm": 0.41289570839557044, "learning_rate": 7.608295681628185e-06, "loss": 0.0314, "step": 1742 }, { "epoch": 1.179296346414073, "grad_norm": 0.3852313302007077, "learning_rate": 7.604936148743682e-06, "loss": 0.0328, "step": 1743 }, { "epoch": 1.1799729364005414, "grad_norm": 0.36440277176471003, "learning_rate": 7.6015750009391776e-06, "loss": 0.0357, "step": 1744 }, { "epoch": 1.1806495263870094, "grad_norm": 0.33358825995538066, "learning_rate": 7.5982122402983986e-06, "loss": 0.0305, "step": 1745 }, { "epoch": 1.1813261163734776, "grad_norm": 0.33472694610144493, "learning_rate": 7.594847868906076e-06, "loss": 0.0375, "step": 1746 }, { "epoch": 1.1820027063599459, "grad_norm": 0.33714868177143426, "learning_rate": 7.5914818888479406e-06, "loss": 0.0272, "step": 1747 }, { "epoch": 1.182679296346414, "grad_norm": 0.46217272974324325, "learning_rate": 7.588114302210719e-06, "loss": 0.0413, "step": 1748 }, { "epoch": 1.1833558863328824, "grad_norm": 0.39229166482071326, "learning_rate": 7.584745111082128e-06, "loss": 0.0378, "step": 1749 }, { "epoch": 1.1840324763193504, "grad_norm": 0.40934798728743826, "learning_rate": 7.5813743175508914e-06, "loss": 0.039, "step": 1750 }, { "epoch": 1.1847090663058186, "grad_norm": 0.36785624519295507, "learning_rate": 7.578001923706715e-06, "loss": 0.0278, "step": 1751 }, { "epoch": 1.1853856562922869, "grad_norm": 0.33291977893035213, "learning_rate": 7.574627931640304e-06, "loss": 0.0307, "step": 1752 }, { "epoch": 1.1860622462787551, "grad_norm": 0.3110764512490773, "learning_rate": 7.571252343443349e-06, "loss": 0.0284, "step": 1753 }, { "epoch": 1.1867388362652234, "grad_norm": 0.32015619790102334, "learning_rate": 7.5678751612085344e-06, "loss": 0.0309, "step": 1754 }, { "epoch": 1.1874154262516914, "grad_norm": 0.44750536856238265, "learning_rate": 7.564496387029532e-06, "loss": 0.0346, "step": 1755 }, { "epoch": 1.1880920162381596, "grad_norm": 0.32415843516157805, "learning_rate": 7.5611160230009975e-06, "loss": 0.0298, "step": 1756 }, { "epoch": 1.1887686062246279, "grad_norm": 0.4935743931579038, "learning_rate": 7.557734071218575e-06, "loss": 0.0397, "step": 1757 }, { "epoch": 1.1894451962110961, "grad_norm": 0.39357605337326074, "learning_rate": 7.5543505337788934e-06, "loss": 0.0418, "step": 1758 }, { "epoch": 1.1901217861975644, "grad_norm": 0.370324508135328, "learning_rate": 7.550965412779563e-06, "loss": 0.0377, "step": 1759 }, { "epoch": 1.1907983761840324, "grad_norm": 0.3817327709787257, "learning_rate": 7.547578710319174e-06, "loss": 0.0377, "step": 1760 }, { "epoch": 1.1914749661705006, "grad_norm": 0.4073392230960148, "learning_rate": 7.544190428497304e-06, "loss": 0.03, "step": 1761 }, { "epoch": 1.1921515561569689, "grad_norm": 0.332922118773184, "learning_rate": 7.540800569414501e-06, "loss": 0.0333, "step": 1762 }, { "epoch": 1.1928281461434371, "grad_norm": 0.3137655265944553, "learning_rate": 7.537409135172298e-06, "loss": 0.0285, "step": 1763 }, { "epoch": 1.1935047361299054, "grad_norm": 0.3998108253196923, "learning_rate": 7.5340161278732e-06, "loss": 0.0312, "step": 1764 }, { "epoch": 1.1941813261163734, "grad_norm": 0.4620898570120324, "learning_rate": 7.530621549620689e-06, "loss": 0.0366, "step": 1765 }, { "epoch": 1.1948579161028416, "grad_norm": 0.3314723928152739, "learning_rate": 7.527225402519218e-06, "loss": 0.0348, "step": 1766 }, { "epoch": 1.19553450608931, "grad_norm": 0.3256254347500369, "learning_rate": 7.52382768867422e-06, "loss": 0.0291, "step": 1767 }, { "epoch": 1.1962110960757781, "grad_norm": 0.4716243068193622, "learning_rate": 7.52042841019209e-06, "loss": 0.0391, "step": 1768 }, { "epoch": 1.1968876860622464, "grad_norm": 0.505941339169538, "learning_rate": 7.5170275691802e-06, "loss": 0.0468, "step": 1769 }, { "epoch": 1.1975642760487144, "grad_norm": 0.2440627906387663, "learning_rate": 7.5136251677468856e-06, "loss": 0.0265, "step": 1770 }, { "epoch": 1.1982408660351827, "grad_norm": 0.3862800024403366, "learning_rate": 7.510221208001457e-06, "loss": 0.0311, "step": 1771 }, { "epoch": 1.198917456021651, "grad_norm": 0.4494524757815659, "learning_rate": 7.50681569205418e-06, "loss": 0.041, "step": 1772 }, { "epoch": 1.1995940460081191, "grad_norm": 0.30573292592723056, "learning_rate": 7.5034086220162945e-06, "loss": 0.029, "step": 1773 }, { "epoch": 1.2002706359945874, "grad_norm": 0.35724743501500195, "learning_rate": 7.500000000000001e-06, "loss": 0.0436, "step": 1774 }, { "epoch": 1.2009472259810554, "grad_norm": 0.28470371882579004, "learning_rate": 7.496589828118458e-06, "loss": 0.0268, "step": 1775 }, { "epoch": 1.2016238159675237, "grad_norm": 0.5802818204184877, "learning_rate": 7.4931781084857915e-06, "loss": 0.0393, "step": 1776 }, { "epoch": 1.202300405953992, "grad_norm": 0.37036387455969116, "learning_rate": 7.489764843217082e-06, "loss": 0.0275, "step": 1777 }, { "epoch": 1.2029769959404601, "grad_norm": 0.4040523667928298, "learning_rate": 7.4863500344283715e-06, "loss": 0.0443, "step": 1778 }, { "epoch": 1.2036535859269284, "grad_norm": 0.33092259192120466, "learning_rate": 7.482933684236654e-06, "loss": 0.0277, "step": 1779 }, { "epoch": 1.2043301759133964, "grad_norm": 0.31198159640990164, "learning_rate": 7.4795157947598864e-06, "loss": 0.0267, "step": 1780 }, { "epoch": 1.2050067658998647, "grad_norm": 0.25570069748428265, "learning_rate": 7.476096368116974e-06, "loss": 0.0294, "step": 1781 }, { "epoch": 1.205683355886333, "grad_norm": 0.6124980425015061, "learning_rate": 7.4726754064277775e-06, "loss": 0.0434, "step": 1782 }, { "epoch": 1.2063599458728012, "grad_norm": 0.3730907914223387, "learning_rate": 7.469252911813107e-06, "loss": 0.0282, "step": 1783 }, { "epoch": 1.2070365358592694, "grad_norm": 0.45770328786927994, "learning_rate": 7.465828886394729e-06, "loss": 0.0365, "step": 1784 }, { "epoch": 1.2077131258457374, "grad_norm": 0.32839720355086915, "learning_rate": 7.462403332295351e-06, "loss": 0.0297, "step": 1785 }, { "epoch": 1.2083897158322057, "grad_norm": 0.3069509341440066, "learning_rate": 7.458976251638632e-06, "loss": 0.0213, "step": 1786 }, { "epoch": 1.209066305818674, "grad_norm": 0.3121843224806762, "learning_rate": 7.455547646549179e-06, "loss": 0.0264, "step": 1787 }, { "epoch": 1.2097428958051422, "grad_norm": 0.4398713007216972, "learning_rate": 7.452117519152542e-06, "loss": 0.0561, "step": 1788 }, { "epoch": 1.2104194857916104, "grad_norm": 0.3437192919415782, "learning_rate": 7.448685871575213e-06, "loss": 0.032, "step": 1789 }, { "epoch": 1.2110960757780784, "grad_norm": 0.3043326496305167, "learning_rate": 7.445252705944632e-06, "loss": 0.0261, "step": 1790 }, { "epoch": 1.2117726657645467, "grad_norm": 0.442295644348202, "learning_rate": 7.441818024389173e-06, "loss": 0.0304, "step": 1791 }, { "epoch": 1.212449255751015, "grad_norm": 0.4861660836252403, "learning_rate": 7.438381829038157e-06, "loss": 0.0419, "step": 1792 }, { "epoch": 1.2131258457374832, "grad_norm": 0.31993392324287556, "learning_rate": 7.434944122021837e-06, "loss": 0.0304, "step": 1793 }, { "epoch": 1.2138024357239512, "grad_norm": 0.41406701587352657, "learning_rate": 7.431504905471407e-06, "loss": 0.0266, "step": 1794 }, { "epoch": 1.2144790257104194, "grad_norm": 0.3633991929204166, "learning_rate": 7.428064181518997e-06, "loss": 0.0357, "step": 1795 }, { "epoch": 1.2151556156968877, "grad_norm": 0.4237964217740215, "learning_rate": 7.424621952297668e-06, "loss": 0.034, "step": 1796 }, { "epoch": 1.215832205683356, "grad_norm": 0.38030150791077555, "learning_rate": 7.4211782199414204e-06, "loss": 0.0414, "step": 1797 }, { "epoch": 1.2165087956698242, "grad_norm": 0.2702929059974458, "learning_rate": 7.417732986585179e-06, "loss": 0.0256, "step": 1798 }, { "epoch": 1.2171853856562922, "grad_norm": 0.3970911186679923, "learning_rate": 7.414286254364804e-06, "loss": 0.0231, "step": 1799 }, { "epoch": 1.2178619756427604, "grad_norm": 0.5690876899384154, "learning_rate": 7.410838025417083e-06, "loss": 0.0328, "step": 1800 }, { "epoch": 1.2185385656292287, "grad_norm": 0.5606083143485081, "learning_rate": 7.407388301879735e-06, "loss": 0.0499, "step": 1801 }, { "epoch": 1.219215155615697, "grad_norm": 0.2974123739096487, "learning_rate": 7.403937085891397e-06, "loss": 0.0379, "step": 1802 }, { "epoch": 1.2198917456021652, "grad_norm": 0.4254067773408788, "learning_rate": 7.400484379591644e-06, "loss": 0.0384, "step": 1803 }, { "epoch": 1.2205683355886332, "grad_norm": 0.419035886897327, "learning_rate": 7.397030185120962e-06, "loss": 0.0358, "step": 1804 }, { "epoch": 1.2212449255751014, "grad_norm": 0.3306811361233171, "learning_rate": 7.393574504620767e-06, "loss": 0.0279, "step": 1805 }, { "epoch": 1.2219215155615697, "grad_norm": 0.4373800575024946, "learning_rate": 7.390117340233396e-06, "loss": 0.0309, "step": 1806 }, { "epoch": 1.222598105548038, "grad_norm": 0.3412603697202751, "learning_rate": 7.386658694102103e-06, "loss": 0.0334, "step": 1807 }, { "epoch": 1.2232746955345062, "grad_norm": 0.36595538328644733, "learning_rate": 7.383198568371064e-06, "loss": 0.0372, "step": 1808 }, { "epoch": 1.2239512855209742, "grad_norm": 0.4175259270789885, "learning_rate": 7.379736965185369e-06, "loss": 0.034, "step": 1809 }, { "epoch": 1.2246278755074425, "grad_norm": 0.4358483411091658, "learning_rate": 7.376273886691024e-06, "loss": 0.0433, "step": 1810 }, { "epoch": 1.2253044654939107, "grad_norm": 0.47099895438816536, "learning_rate": 7.372809335034955e-06, "loss": 0.0343, "step": 1811 }, { "epoch": 1.225981055480379, "grad_norm": 0.39883289123379007, "learning_rate": 7.369343312364994e-06, "loss": 0.0355, "step": 1812 }, { "epoch": 1.226657645466847, "grad_norm": 0.2570217171563449, "learning_rate": 7.365875820829889e-06, "loss": 0.0274, "step": 1813 }, { "epoch": 1.2273342354533152, "grad_norm": 0.36785410865452167, "learning_rate": 7.362406862579299e-06, "loss": 0.0488, "step": 1814 }, { "epoch": 1.2280108254397835, "grad_norm": 0.5117751581032768, "learning_rate": 7.358936439763789e-06, "loss": 0.0479, "step": 1815 }, { "epoch": 1.2286874154262517, "grad_norm": 0.25366821073798473, "learning_rate": 7.355464554534837e-06, "loss": 0.025, "step": 1816 }, { "epoch": 1.22936400541272, "grad_norm": 0.4914107184195335, "learning_rate": 7.351991209044822e-06, "loss": 0.0345, "step": 1817 }, { "epoch": 1.230040595399188, "grad_norm": 0.32382934645796957, "learning_rate": 7.348516405447031e-06, "loss": 0.0275, "step": 1818 }, { "epoch": 1.2307171853856562, "grad_norm": 0.3904025573575658, "learning_rate": 7.345040145895656e-06, "loss": 0.0351, "step": 1819 }, { "epoch": 1.2313937753721245, "grad_norm": 0.3828009858030226, "learning_rate": 7.341562432545793e-06, "loss": 0.0432, "step": 1820 }, { "epoch": 1.2320703653585927, "grad_norm": 0.3545668565357232, "learning_rate": 7.338083267553433e-06, "loss": 0.0339, "step": 1821 }, { "epoch": 1.232746955345061, "grad_norm": 0.37698907015804395, "learning_rate": 7.334602653075471e-06, "loss": 0.0304, "step": 1822 }, { "epoch": 1.233423545331529, "grad_norm": 0.5494252593816147, "learning_rate": 7.331120591269701e-06, "loss": 0.0419, "step": 1823 }, { "epoch": 1.2341001353179972, "grad_norm": 0.3371825382799375, "learning_rate": 7.327637084294818e-06, "loss": 0.0311, "step": 1824 }, { "epoch": 1.2347767253044655, "grad_norm": 0.32525737487534057, "learning_rate": 7.324152134310401e-06, "loss": 0.0272, "step": 1825 }, { "epoch": 1.2354533152909337, "grad_norm": 0.30485570847197085, "learning_rate": 7.3206657434769354e-06, "loss": 0.0296, "step": 1826 }, { "epoch": 1.236129905277402, "grad_norm": 0.5896781547259747, "learning_rate": 7.317177913955795e-06, "loss": 0.0452, "step": 1827 }, { "epoch": 1.23680649526387, "grad_norm": 0.24082812103365153, "learning_rate": 7.313688647909245e-06, "loss": 0.0191, "step": 1828 }, { "epoch": 1.2374830852503382, "grad_norm": 0.2739898053952858, "learning_rate": 7.310197947500446e-06, "loss": 0.0228, "step": 1829 }, { "epoch": 1.2381596752368065, "grad_norm": 0.3626964636222225, "learning_rate": 7.30670581489344e-06, "loss": 0.0308, "step": 1830 }, { "epoch": 1.2388362652232747, "grad_norm": 0.4300869336134017, "learning_rate": 7.303212252253163e-06, "loss": 0.0421, "step": 1831 }, { "epoch": 1.239512855209743, "grad_norm": 0.3578639679889823, "learning_rate": 7.2997172617454335e-06, "loss": 0.032, "step": 1832 }, { "epoch": 1.240189445196211, "grad_norm": 0.30010753600803947, "learning_rate": 7.29622084553696e-06, "loss": 0.0285, "step": 1833 }, { "epoch": 1.2408660351826792, "grad_norm": 0.30841554347311356, "learning_rate": 7.29272300579533e-06, "loss": 0.0351, "step": 1834 }, { "epoch": 1.2415426251691475, "grad_norm": 0.5914157627033085, "learning_rate": 7.289223744689018e-06, "loss": 0.0313, "step": 1835 }, { "epoch": 1.2422192151556157, "grad_norm": 0.6591605263758334, "learning_rate": 7.285723064387373e-06, "loss": 0.0533, "step": 1836 }, { "epoch": 1.242895805142084, "grad_norm": 0.39489023938145035, "learning_rate": 7.282220967060634e-06, "loss": 0.0378, "step": 1837 }, { "epoch": 1.243572395128552, "grad_norm": 0.3579266660032314, "learning_rate": 7.278717454879907e-06, "loss": 0.0349, "step": 1838 }, { "epoch": 1.2442489851150202, "grad_norm": 1.5192761124350649, "learning_rate": 7.2752125300171835e-06, "loss": 0.0453, "step": 1839 }, { "epoch": 1.2449255751014885, "grad_norm": 0.46901360526469527, "learning_rate": 7.271706194645327e-06, "loss": 0.0318, "step": 1840 }, { "epoch": 1.2456021650879567, "grad_norm": 0.2941993859492877, "learning_rate": 7.26819845093808e-06, "loss": 0.0272, "step": 1841 }, { "epoch": 1.246278755074425, "grad_norm": 0.5656120002648428, "learning_rate": 7.264689301070048e-06, "loss": 0.0318, "step": 1842 }, { "epoch": 1.246955345060893, "grad_norm": 0.34297030662363087, "learning_rate": 7.2611787472167194e-06, "loss": 0.0297, "step": 1843 }, { "epoch": 1.2476319350473613, "grad_norm": 0.3427656632543216, "learning_rate": 7.257666791554448e-06, "loss": 0.0316, "step": 1844 }, { "epoch": 1.2483085250338295, "grad_norm": 0.3986940128566045, "learning_rate": 7.254153436260456e-06, "loss": 0.048, "step": 1845 }, { "epoch": 1.2489851150202977, "grad_norm": 0.4102796662454975, "learning_rate": 7.250638683512833e-06, "loss": 0.0465, "step": 1846 }, { "epoch": 1.249661705006766, "grad_norm": 0.36303880433424535, "learning_rate": 7.247122535490539e-06, "loss": 0.0407, "step": 1847 }, { "epoch": 1.250338294993234, "grad_norm": 0.4227726410120517, "learning_rate": 7.2436049943733955e-06, "loss": 0.0484, "step": 1848 }, { "epoch": 1.2510148849797023, "grad_norm": 0.35731347162707455, "learning_rate": 7.240086062342087e-06, "loss": 0.0378, "step": 1849 }, { "epoch": 1.2516914749661705, "grad_norm": 0.3898581094476785, "learning_rate": 7.236565741578163e-06, "loss": 0.0376, "step": 1850 }, { "epoch": 1.2523680649526387, "grad_norm": 0.31193670850306715, "learning_rate": 7.233044034264034e-06, "loss": 0.0351, "step": 1851 }, { "epoch": 1.253044654939107, "grad_norm": 0.37208656401812507, "learning_rate": 7.229520942582965e-06, "loss": 0.0314, "step": 1852 }, { "epoch": 1.253721244925575, "grad_norm": 0.2917320550304776, "learning_rate": 7.2259964687190855e-06, "loss": 0.0312, "step": 1853 }, { "epoch": 1.2543978349120433, "grad_norm": 0.3768590619705817, "learning_rate": 7.22247061485738e-06, "loss": 0.0407, "step": 1854 }, { "epoch": 1.2550744248985115, "grad_norm": 0.2695118904034237, "learning_rate": 7.218943383183684e-06, "loss": 0.0236, "step": 1855 }, { "epoch": 1.2557510148849798, "grad_norm": 0.28625741408281885, "learning_rate": 7.215414775884695e-06, "loss": 0.0265, "step": 1856 }, { "epoch": 1.256427604871448, "grad_norm": 0.2960892263088646, "learning_rate": 7.211884795147958e-06, "loss": 0.0265, "step": 1857 }, { "epoch": 1.257104194857916, "grad_norm": 0.33672543886910594, "learning_rate": 7.208353443161871e-06, "loss": 0.0417, "step": 1858 }, { "epoch": 1.2577807848443843, "grad_norm": 0.3705340418236436, "learning_rate": 7.204820722115681e-06, "loss": 0.0295, "step": 1859 }, { "epoch": 1.2584573748308525, "grad_norm": 0.39785029994746385, "learning_rate": 7.201286634199484e-06, "loss": 0.0417, "step": 1860 }, { "epoch": 1.2591339648173208, "grad_norm": 0.22744172659411263, "learning_rate": 7.197751181604228e-06, "loss": 0.0241, "step": 1861 }, { "epoch": 1.259810554803789, "grad_norm": 0.31737065279219406, "learning_rate": 7.194214366521699e-06, "loss": 0.036, "step": 1862 }, { "epoch": 1.260487144790257, "grad_norm": 0.3558706744710046, "learning_rate": 7.190676191144532e-06, "loss": 0.0323, "step": 1863 }, { "epoch": 1.2611637347767253, "grad_norm": 0.5041509694465267, "learning_rate": 7.187136657666208e-06, "loss": 0.0455, "step": 1864 }, { "epoch": 1.2618403247631935, "grad_norm": 0.7918153565133799, "learning_rate": 7.183595768281044e-06, "loss": 0.0369, "step": 1865 }, { "epoch": 1.2625169147496618, "grad_norm": 0.3576372518860047, "learning_rate": 7.180053525184202e-06, "loss": 0.0318, "step": 1866 }, { "epoch": 1.26319350473613, "grad_norm": 0.3545731617593706, "learning_rate": 7.176509930571682e-06, "loss": 0.04, "step": 1867 }, { "epoch": 1.263870094722598, "grad_norm": 0.24907884616057627, "learning_rate": 7.172964986640319e-06, "loss": 0.0258, "step": 1868 }, { "epoch": 1.2645466847090663, "grad_norm": 0.3990611988547069, "learning_rate": 7.169418695587791e-06, "loss": 0.0387, "step": 1869 }, { "epoch": 1.2652232746955345, "grad_norm": 0.8533603526557988, "learning_rate": 7.165871059612604e-06, "loss": 0.0322, "step": 1870 }, { "epoch": 1.2658998646820028, "grad_norm": 0.4201196052175664, "learning_rate": 7.162322080914106e-06, "loss": 0.0404, "step": 1871 }, { "epoch": 1.266576454668471, "grad_norm": 0.38394986726536406, "learning_rate": 7.158771761692464e-06, "loss": 0.0486, "step": 1872 }, { "epoch": 1.267253044654939, "grad_norm": 0.2778453641283127, "learning_rate": 7.155220104148694e-06, "loss": 0.0243, "step": 1873 }, { "epoch": 1.2679296346414073, "grad_norm": 0.347755938494088, "learning_rate": 7.151667110484626e-06, "loss": 0.0249, "step": 1874 }, { "epoch": 1.2686062246278755, "grad_norm": 0.321177144443505, "learning_rate": 7.148112782902927e-06, "loss": 0.0312, "step": 1875 }, { "epoch": 1.2692828146143438, "grad_norm": 0.3345751051084106, "learning_rate": 7.144557123607087e-06, "loss": 0.0327, "step": 1876 }, { "epoch": 1.269959404600812, "grad_norm": 0.3646963498599837, "learning_rate": 7.141000134801426e-06, "loss": 0.0337, "step": 1877 }, { "epoch": 1.27063599458728, "grad_norm": 0.25366441604914525, "learning_rate": 7.137441818691081e-06, "loss": 0.0219, "step": 1878 }, { "epoch": 1.2713125845737483, "grad_norm": 0.29505248364640535, "learning_rate": 7.133882177482019e-06, "loss": 0.0285, "step": 1879 }, { "epoch": 1.2719891745602165, "grad_norm": 0.41020582940222533, "learning_rate": 7.130321213381025e-06, "loss": 0.0363, "step": 1880 }, { "epoch": 1.2726657645466848, "grad_norm": 0.32971167969871995, "learning_rate": 7.1267589285957075e-06, "loss": 0.0324, "step": 1881 }, { "epoch": 1.273342354533153, "grad_norm": 0.3866276317569233, "learning_rate": 7.123195325334486e-06, "loss": 0.0336, "step": 1882 }, { "epoch": 1.274018944519621, "grad_norm": 0.4475787946434262, "learning_rate": 7.119630405806607e-06, "loss": 0.0504, "step": 1883 }, { "epoch": 1.2746955345060893, "grad_norm": 0.28936968902941973, "learning_rate": 7.1160641722221255e-06, "loss": 0.0224, "step": 1884 }, { "epoch": 1.2753721244925575, "grad_norm": 0.4049126240542631, "learning_rate": 7.112496626791915e-06, "loss": 0.0418, "step": 1885 }, { "epoch": 1.2760487144790258, "grad_norm": 0.3552857013720632, "learning_rate": 7.108927771727661e-06, "loss": 0.0334, "step": 1886 }, { "epoch": 1.276725304465494, "grad_norm": 0.35734244118327135, "learning_rate": 7.105357609241863e-06, "loss": 0.0318, "step": 1887 }, { "epoch": 1.277401894451962, "grad_norm": 0.4029530974663675, "learning_rate": 7.101786141547829e-06, "loss": 0.034, "step": 1888 }, { "epoch": 1.2780784844384303, "grad_norm": 0.24957046377592598, "learning_rate": 7.098213370859673e-06, "loss": 0.0273, "step": 1889 }, { "epoch": 1.2787550744248986, "grad_norm": 0.4552743843761523, "learning_rate": 7.094639299392324e-06, "loss": 0.0492, "step": 1890 }, { "epoch": 1.2794316644113666, "grad_norm": 0.4478278647204221, "learning_rate": 7.0910639293615125e-06, "loss": 0.0446, "step": 1891 }, { "epoch": 1.280108254397835, "grad_norm": 0.3632687438028061, "learning_rate": 7.087487262983776e-06, "loss": 0.0319, "step": 1892 }, { "epoch": 1.280784844384303, "grad_norm": 0.34356582039614136, "learning_rate": 7.083909302476453e-06, "loss": 0.0314, "step": 1893 }, { "epoch": 1.2814614343707713, "grad_norm": 0.47234299537785407, "learning_rate": 7.080330050057687e-06, "loss": 0.0377, "step": 1894 }, { "epoch": 1.2821380243572396, "grad_norm": 0.46778752210844915, "learning_rate": 7.076749507946422e-06, "loss": 0.0483, "step": 1895 }, { "epoch": 1.2828146143437076, "grad_norm": 0.25889490958952643, "learning_rate": 7.0731676783624015e-06, "loss": 0.0287, "step": 1896 }, { "epoch": 1.283491204330176, "grad_norm": 0.39166620194980056, "learning_rate": 7.069584563526166e-06, "loss": 0.0416, "step": 1897 }, { "epoch": 1.284167794316644, "grad_norm": 0.686299029669835, "learning_rate": 7.066000165659054e-06, "loss": 0.0354, "step": 1898 }, { "epoch": 1.2848443843031123, "grad_norm": 0.4802383448666566, "learning_rate": 7.062414486983197e-06, "loss": 0.0581, "step": 1899 }, { "epoch": 1.2855209742895806, "grad_norm": 0.31311888043709935, "learning_rate": 7.058827529721526e-06, "loss": 0.0333, "step": 1900 }, { "epoch": 1.2861975642760486, "grad_norm": 0.4387072159234433, "learning_rate": 7.055239296097758e-06, "loss": 0.0357, "step": 1901 }, { "epoch": 1.2868741542625168, "grad_norm": 0.5647885966002203, "learning_rate": 7.051649788336405e-06, "loss": 0.0331, "step": 1902 }, { "epoch": 1.287550744248985, "grad_norm": 0.39020468286056914, "learning_rate": 7.048059008662772e-06, "loss": 0.0304, "step": 1903 }, { "epoch": 1.2882273342354533, "grad_norm": 0.4040841233603088, "learning_rate": 7.044466959302945e-06, "loss": 0.0292, "step": 1904 }, { "epoch": 1.2889039242219216, "grad_norm": 0.4370296663488622, "learning_rate": 7.040873642483801e-06, "loss": 0.0284, "step": 1905 }, { "epoch": 1.2895805142083896, "grad_norm": 0.5615939264222903, "learning_rate": 7.037279060433004e-06, "loss": 0.0363, "step": 1906 }, { "epoch": 1.2902571041948578, "grad_norm": 0.41362958692463603, "learning_rate": 7.033683215379002e-06, "loss": 0.03, "step": 1907 }, { "epoch": 1.290933694181326, "grad_norm": 0.4240644099885667, "learning_rate": 7.030086109551023e-06, "loss": 0.0379, "step": 1908 }, { "epoch": 1.2916102841677943, "grad_norm": 0.3927369609998387, "learning_rate": 7.02648774517908e-06, "loss": 0.0325, "step": 1909 }, { "epoch": 1.2922868741542626, "grad_norm": 0.5016041617659053, "learning_rate": 7.022888124493964e-06, "loss": 0.0349, "step": 1910 }, { "epoch": 1.2929634641407306, "grad_norm": 0.4350517164606389, "learning_rate": 7.019287249727248e-06, "loss": 0.0296, "step": 1911 }, { "epoch": 1.2936400541271988, "grad_norm": 0.36137303117135344, "learning_rate": 7.015685123111276e-06, "loss": 0.0366, "step": 1912 }, { "epoch": 1.294316644113667, "grad_norm": 0.4479583549009364, "learning_rate": 7.012081746879178e-06, "loss": 0.0388, "step": 1913 }, { "epoch": 1.2949932341001353, "grad_norm": 0.5723533232917354, "learning_rate": 7.008477123264849e-06, "loss": 0.0383, "step": 1914 }, { "epoch": 1.2956698240866036, "grad_norm": 0.35527692033264874, "learning_rate": 7.004871254502962e-06, "loss": 0.0299, "step": 1915 }, { "epoch": 1.2963464140730716, "grad_norm": 0.30625989024013195, "learning_rate": 7.001264142828961e-06, "loss": 0.0276, "step": 1916 }, { "epoch": 1.2970230040595399, "grad_norm": 0.4021769539521218, "learning_rate": 6.997655790479062e-06, "loss": 0.0296, "step": 1917 }, { "epoch": 1.297699594046008, "grad_norm": 0.46910891089868806, "learning_rate": 6.9940461996902495e-06, "loss": 0.0357, "step": 1918 }, { "epoch": 1.2983761840324763, "grad_norm": 0.48277128794886304, "learning_rate": 6.990435372700273e-06, "loss": 0.0393, "step": 1919 }, { "epoch": 1.2990527740189446, "grad_norm": 0.2897119627855233, "learning_rate": 6.986823311747652e-06, "loss": 0.0216, "step": 1920 }, { "epoch": 1.2997293640054126, "grad_norm": 0.29404834759520837, "learning_rate": 6.983210019071671e-06, "loss": 0.0298, "step": 1921 }, { "epoch": 1.3004059539918809, "grad_norm": 0.4654456507271897, "learning_rate": 6.979595496912374e-06, "loss": 0.0523, "step": 1922 }, { "epoch": 1.301082543978349, "grad_norm": 0.3861213528569032, "learning_rate": 6.97597974751057e-06, "loss": 0.0424, "step": 1923 }, { "epoch": 1.3017591339648173, "grad_norm": 0.3763987234193048, "learning_rate": 6.972362773107832e-06, "loss": 0.0274, "step": 1924 }, { "epoch": 1.3024357239512856, "grad_norm": 0.4930265693347563, "learning_rate": 6.968744575946484e-06, "loss": 0.0454, "step": 1925 }, { "epoch": 1.3031123139377536, "grad_norm": 0.7085680617892084, "learning_rate": 6.965125158269619e-06, "loss": 0.0452, "step": 1926 }, { "epoch": 1.3037889039242219, "grad_norm": 0.29466271097816943, "learning_rate": 6.961504522321077e-06, "loss": 0.0334, "step": 1927 }, { "epoch": 1.30446549391069, "grad_norm": 0.3254460745028027, "learning_rate": 6.957882670345458e-06, "loss": 0.0253, "step": 1928 }, { "epoch": 1.3051420838971584, "grad_norm": 0.35444113858214193, "learning_rate": 6.954259604588114e-06, "loss": 0.0323, "step": 1929 }, { "epoch": 1.3058186738836266, "grad_norm": 0.52127584285171, "learning_rate": 6.950635327295154e-06, "loss": 0.049, "step": 1930 }, { "epoch": 1.3064952638700946, "grad_norm": 0.3550872489898909, "learning_rate": 6.94700984071343e-06, "loss": 0.0331, "step": 1931 }, { "epoch": 1.3071718538565629, "grad_norm": 0.42030445015020074, "learning_rate": 6.943383147090552e-06, "loss": 0.045, "step": 1932 }, { "epoch": 1.3078484438430311, "grad_norm": 0.29625798135552134, "learning_rate": 6.939755248674872e-06, "loss": 0.0268, "step": 1933 }, { "epoch": 1.3085250338294994, "grad_norm": 0.49240760765291725, "learning_rate": 6.936126147715494e-06, "loss": 0.0335, "step": 1934 }, { "epoch": 1.3092016238159676, "grad_norm": 0.42465284862854297, "learning_rate": 6.932495846462262e-06, "loss": 0.0256, "step": 1935 }, { "epoch": 1.3098782138024356, "grad_norm": 0.3805865277222013, "learning_rate": 6.928864347165769e-06, "loss": 0.0353, "step": 1936 }, { "epoch": 1.3105548037889039, "grad_norm": 0.5019950449960542, "learning_rate": 6.925231652077349e-06, "loss": 0.0559, "step": 1937 }, { "epoch": 1.3112313937753721, "grad_norm": 0.3886221758964247, "learning_rate": 6.921597763449075e-06, "loss": 0.0381, "step": 1938 }, { "epoch": 1.3119079837618404, "grad_norm": 0.5536427584541227, "learning_rate": 6.917962683533765e-06, "loss": 0.0472, "step": 1939 }, { "epoch": 1.3125845737483086, "grad_norm": 0.48745194443929857, "learning_rate": 6.914326414584971e-06, "loss": 0.0417, "step": 1940 }, { "epoch": 1.3132611637347766, "grad_norm": 0.32193328227593826, "learning_rate": 6.9106889588569845e-06, "loss": 0.0285, "step": 1941 }, { "epoch": 1.3139377537212449, "grad_norm": 0.49476158649818397, "learning_rate": 6.907050318604831e-06, "loss": 0.0465, "step": 1942 }, { "epoch": 1.3146143437077131, "grad_norm": 0.39431421931011645, "learning_rate": 6.903410496084272e-06, "loss": 0.0294, "step": 1943 }, { "epoch": 1.3152909336941814, "grad_norm": 0.47405875743912096, "learning_rate": 6.8997694935518e-06, "loss": 0.042, "step": 1944 }, { "epoch": 1.3159675236806496, "grad_norm": 0.29733195878313373, "learning_rate": 6.896127313264643e-06, "loss": 0.0259, "step": 1945 }, { "epoch": 1.3166441136671176, "grad_norm": 0.39496217525237887, "learning_rate": 6.892483957480754e-06, "loss": 0.0434, "step": 1946 }, { "epoch": 1.317320703653586, "grad_norm": 0.34049160066288403, "learning_rate": 6.888839428458819e-06, "loss": 0.0455, "step": 1947 }, { "epoch": 1.3179972936400541, "grad_norm": 0.48592835205029217, "learning_rate": 6.885193728458247e-06, "loss": 0.037, "step": 1948 }, { "epoch": 1.3186738836265224, "grad_norm": 0.3316655074997355, "learning_rate": 6.8815468597391785e-06, "loss": 0.0339, "step": 1949 }, { "epoch": 1.3193504736129906, "grad_norm": 0.5105973850235418, "learning_rate": 6.877898824562472e-06, "loss": 0.0426, "step": 1950 }, { "epoch": 1.3200270635994586, "grad_norm": 0.4015952845350009, "learning_rate": 6.8742496251897185e-06, "loss": 0.0258, "step": 1951 }, { "epoch": 1.320703653585927, "grad_norm": 0.3082620160475625, "learning_rate": 6.8705992638832185e-06, "loss": 0.0289, "step": 1952 }, { "epoch": 1.3213802435723951, "grad_norm": 0.3552939542081474, "learning_rate": 6.8669477429060026e-06, "loss": 0.0311, "step": 1953 }, { "epoch": 1.3220568335588634, "grad_norm": 0.3160118419064857, "learning_rate": 6.863295064521816e-06, "loss": 0.0273, "step": 1954 }, { "epoch": 1.3227334235453316, "grad_norm": 0.3641908306233171, "learning_rate": 6.859641230995123e-06, "loss": 0.0302, "step": 1955 }, { "epoch": 1.3234100135317997, "grad_norm": 0.29663698696202934, "learning_rate": 6.855986244591104e-06, "loss": 0.0343, "step": 1956 }, { "epoch": 1.324086603518268, "grad_norm": 0.5164054662110036, "learning_rate": 6.852330107575653e-06, "loss": 0.0502, "step": 1957 }, { "epoch": 1.3247631935047361, "grad_norm": 0.37482013806126385, "learning_rate": 6.848672822215378e-06, "loss": 0.0498, "step": 1958 }, { "epoch": 1.3254397834912044, "grad_norm": 0.3877114629635358, "learning_rate": 6.845014390777595e-06, "loss": 0.0369, "step": 1959 }, { "epoch": 1.3261163734776726, "grad_norm": 0.4727065765763033, "learning_rate": 6.841354815530341e-06, "loss": 0.0416, "step": 1960 }, { "epoch": 1.3267929634641407, "grad_norm": 0.2551714333929746, "learning_rate": 6.8376940987423526e-06, "loss": 0.031, "step": 1961 }, { "epoch": 1.327469553450609, "grad_norm": 0.3885309030982085, "learning_rate": 6.834032242683075e-06, "loss": 0.0362, "step": 1962 }, { "epoch": 1.3281461434370772, "grad_norm": 0.43539867609887284, "learning_rate": 6.830369249622663e-06, "loss": 0.032, "step": 1963 }, { "epoch": 1.3288227334235454, "grad_norm": 0.3410568789390074, "learning_rate": 6.8267051218319766e-06, "loss": 0.0404, "step": 1964 }, { "epoch": 1.3294993234100136, "grad_norm": 0.2712607415709858, "learning_rate": 6.823039861582574e-06, "loss": 0.0333, "step": 1965 }, { "epoch": 1.3301759133964817, "grad_norm": 0.32396143704614094, "learning_rate": 6.819373471146722e-06, "loss": 0.0307, "step": 1966 }, { "epoch": 1.33085250338295, "grad_norm": 0.35684203155714, "learning_rate": 6.815705952797383e-06, "loss": 0.0318, "step": 1967 }, { "epoch": 1.3315290933694182, "grad_norm": 0.4010634033800326, "learning_rate": 6.8120373088082215e-06, "loss": 0.0384, "step": 1968 }, { "epoch": 1.3322056833558864, "grad_norm": 0.3089504714330527, "learning_rate": 6.808367541453599e-06, "loss": 0.0298, "step": 1969 }, { "epoch": 1.3328822733423547, "grad_norm": 0.40216323878568305, "learning_rate": 6.804696653008574e-06, "loss": 0.0354, "step": 1970 }, { "epoch": 1.3335588633288227, "grad_norm": 0.3192824884643534, "learning_rate": 6.801024645748899e-06, "loss": 0.0338, "step": 1971 }, { "epoch": 1.334235453315291, "grad_norm": 0.35068944988584044, "learning_rate": 6.797351521951021e-06, "loss": 0.0342, "step": 1972 }, { "epoch": 1.3349120433017592, "grad_norm": 0.39623052144026705, "learning_rate": 6.793677283892077e-06, "loss": 0.027, "step": 1973 }, { "epoch": 1.3355886332882274, "grad_norm": 0.390208636935748, "learning_rate": 6.7900019338499005e-06, "loss": 0.0321, "step": 1974 }, { "epoch": 1.3362652232746957, "grad_norm": 0.3745006285107433, "learning_rate": 6.786325474103006e-06, "loss": 0.0333, "step": 1975 }, { "epoch": 1.3369418132611637, "grad_norm": 0.5320986808788182, "learning_rate": 6.782647906930602e-06, "loss": 0.0457, "step": 1976 }, { "epoch": 1.337618403247632, "grad_norm": 0.37218918371951465, "learning_rate": 6.778969234612583e-06, "loss": 0.0304, "step": 1977 }, { "epoch": 1.3382949932341002, "grad_norm": 0.41581495398695617, "learning_rate": 6.775289459429526e-06, "loss": 0.0331, "step": 1978 }, { "epoch": 1.3389715832205684, "grad_norm": 0.3083089395931107, "learning_rate": 6.771608583662694e-06, "loss": 0.0305, "step": 1979 }, { "epoch": 1.3396481732070367, "grad_norm": 0.4549625165103553, "learning_rate": 6.767926609594032e-06, "loss": 0.0352, "step": 1980 }, { "epoch": 1.3403247631935047, "grad_norm": 0.29487283752090165, "learning_rate": 6.764243539506166e-06, "loss": 0.0272, "step": 1981 }, { "epoch": 1.341001353179973, "grad_norm": 0.3915630691845097, "learning_rate": 6.760559375682398e-06, "loss": 0.029, "step": 1982 }, { "epoch": 1.3416779431664412, "grad_norm": 0.2985063865689262, "learning_rate": 6.7568741204067145e-06, "loss": 0.0254, "step": 1983 }, { "epoch": 1.3423545331529092, "grad_norm": 0.37609563254971934, "learning_rate": 6.753187775963773e-06, "loss": 0.0271, "step": 1984 }, { "epoch": 1.3430311231393777, "grad_norm": 0.2960225373939122, "learning_rate": 6.749500344638908e-06, "loss": 0.0241, "step": 1985 }, { "epoch": 1.3437077131258457, "grad_norm": 0.46484952982762223, "learning_rate": 6.74581182871813e-06, "loss": 0.0312, "step": 1986 }, { "epoch": 1.344384303112314, "grad_norm": 0.4643978417506475, "learning_rate": 6.7421222304881194e-06, "loss": 0.0415, "step": 1987 }, { "epoch": 1.3450608930987822, "grad_norm": 0.3629192973826768, "learning_rate": 6.738431552236228e-06, "loss": 0.0315, "step": 1988 }, { "epoch": 1.3457374830852502, "grad_norm": 0.34050995548259366, "learning_rate": 6.734739796250477e-06, "loss": 0.0291, "step": 1989 }, { "epoch": 1.3464140730717187, "grad_norm": 0.374090343086368, "learning_rate": 6.731046964819555e-06, "loss": 0.0331, "step": 1990 }, { "epoch": 1.3470906630581867, "grad_norm": 0.36890224068615757, "learning_rate": 6.727353060232822e-06, "loss": 0.0255, "step": 1991 }, { "epoch": 1.347767253044655, "grad_norm": 0.31052678606574163, "learning_rate": 6.723658084780297e-06, "loss": 0.0289, "step": 1992 }, { "epoch": 1.3484438430311232, "grad_norm": 0.29195805083667253, "learning_rate": 6.719962040752665e-06, "loss": 0.0305, "step": 1993 }, { "epoch": 1.3491204330175912, "grad_norm": 0.346054583503512, "learning_rate": 6.716264930441279e-06, "loss": 0.0395, "step": 1994 }, { "epoch": 1.3497970230040595, "grad_norm": 0.5532799972686093, "learning_rate": 6.712566756138142e-06, "loss": 0.0361, "step": 1995 }, { "epoch": 1.3504736129905277, "grad_norm": 0.3070130836221577, "learning_rate": 6.708867520135924e-06, "loss": 0.0387, "step": 1996 }, { "epoch": 1.351150202976996, "grad_norm": 0.36365030320565084, "learning_rate": 6.705167224727956e-06, "loss": 0.0345, "step": 1997 }, { "epoch": 1.3518267929634642, "grad_norm": 0.2936265965360264, "learning_rate": 6.701465872208216e-06, "loss": 0.0273, "step": 1998 }, { "epoch": 1.3525033829499322, "grad_norm": 0.26117973313412873, "learning_rate": 6.697763464871346e-06, "loss": 0.0236, "step": 1999 }, { "epoch": 1.3531799729364005, "grad_norm": 0.30964815143078134, "learning_rate": 6.694060005012642e-06, "loss": 0.0263, "step": 2000 }, { "epoch": 1.3538565629228687, "grad_norm": 0.4914254563625275, "learning_rate": 6.690355494928043e-06, "loss": 0.0297, "step": 2001 }, { "epoch": 1.354533152909337, "grad_norm": 0.522963248563417, "learning_rate": 6.686649936914151e-06, "loss": 0.0375, "step": 2002 }, { "epoch": 1.3552097428958052, "grad_norm": 0.3153906403863871, "learning_rate": 6.682943333268208e-06, "loss": 0.0322, "step": 2003 }, { "epoch": 1.3558863328822732, "grad_norm": 0.3064065236158712, "learning_rate": 6.6792356862881144e-06, "loss": 0.0274, "step": 2004 }, { "epoch": 1.3565629228687415, "grad_norm": 0.5655353933523952, "learning_rate": 6.675526998272405e-06, "loss": 0.0363, "step": 2005 }, { "epoch": 1.3572395128552097, "grad_norm": 0.5824549510550345, "learning_rate": 6.671817271520269e-06, "loss": 0.0323, "step": 2006 }, { "epoch": 1.357916102841678, "grad_norm": 0.2908098025143807, "learning_rate": 6.668106508331539e-06, "loss": 0.0281, "step": 2007 }, { "epoch": 1.3585926928281462, "grad_norm": 0.3886613946837747, "learning_rate": 6.664394711006684e-06, "loss": 0.0593, "step": 2008 }, { "epoch": 1.3592692828146142, "grad_norm": 0.2602980440199633, "learning_rate": 6.660681881846822e-06, "loss": 0.0237, "step": 2009 }, { "epoch": 1.3599458728010825, "grad_norm": 0.2962619776666657, "learning_rate": 6.656968023153706e-06, "loss": 0.0293, "step": 2010 }, { "epoch": 1.3606224627875507, "grad_norm": 0.4715239486702429, "learning_rate": 6.653253137229727e-06, "loss": 0.0242, "step": 2011 }, { "epoch": 1.361299052774019, "grad_norm": 0.30883648305118566, "learning_rate": 6.6495372263779145e-06, "loss": 0.0279, "step": 2012 }, { "epoch": 1.3619756427604872, "grad_norm": 0.4829350071057054, "learning_rate": 6.6458202929019345e-06, "loss": 0.0383, "step": 2013 }, { "epoch": 1.3626522327469552, "grad_norm": 0.27575244400290594, "learning_rate": 6.6421023391060845e-06, "loss": 0.0231, "step": 2014 }, { "epoch": 1.3633288227334235, "grad_norm": 0.43115525602974536, "learning_rate": 6.6383833672952945e-06, "loss": 0.0316, "step": 2015 }, { "epoch": 1.3640054127198917, "grad_norm": 0.45118528428735527, "learning_rate": 6.634663379775126e-06, "loss": 0.0473, "step": 2016 }, { "epoch": 1.36468200270636, "grad_norm": 0.4366983129632561, "learning_rate": 6.630942378851774e-06, "loss": 0.0335, "step": 2017 }, { "epoch": 1.3653585926928282, "grad_norm": 0.4462867555229021, "learning_rate": 6.627220366832056e-06, "loss": 0.0391, "step": 2018 }, { "epoch": 1.3660351826792962, "grad_norm": 0.32642262322940635, "learning_rate": 6.6234973460234184e-06, "loss": 0.0266, "step": 2019 }, { "epoch": 1.3667117726657645, "grad_norm": 0.22280107789149386, "learning_rate": 6.619773318733934e-06, "loss": 0.0221, "step": 2020 }, { "epoch": 1.3673883626522327, "grad_norm": 0.35837691241255937, "learning_rate": 6.616048287272301e-06, "loss": 0.0358, "step": 2021 }, { "epoch": 1.368064952638701, "grad_norm": 0.33777792151625974, "learning_rate": 6.612322253947836e-06, "loss": 0.032, "step": 2022 }, { "epoch": 1.3687415426251692, "grad_norm": 0.7163729402437692, "learning_rate": 6.608595221070478e-06, "loss": 0.0338, "step": 2023 }, { "epoch": 1.3694181326116373, "grad_norm": 0.319789911005006, "learning_rate": 6.60486719095079e-06, "loss": 0.0283, "step": 2024 }, { "epoch": 1.3700947225981055, "grad_norm": 0.27226424546217287, "learning_rate": 6.601138165899945e-06, "loss": 0.0227, "step": 2025 }, { "epoch": 1.3707713125845737, "grad_norm": 0.3273309937541878, "learning_rate": 6.597408148229742e-06, "loss": 0.0314, "step": 2026 }, { "epoch": 1.371447902571042, "grad_norm": 0.27041595036966815, "learning_rate": 6.5936771402525875e-06, "loss": 0.0328, "step": 2027 }, { "epoch": 1.3721244925575102, "grad_norm": 0.32856085638934746, "learning_rate": 6.589945144281508e-06, "loss": 0.0359, "step": 2028 }, { "epoch": 1.3728010825439783, "grad_norm": 0.5082650016533564, "learning_rate": 6.586212162630137e-06, "loss": 0.0429, "step": 2029 }, { "epoch": 1.3734776725304465, "grad_norm": 0.36097284492020404, "learning_rate": 6.582478197612725e-06, "loss": 0.0237, "step": 2030 }, { "epoch": 1.3741542625169147, "grad_norm": 0.28282032291287545, "learning_rate": 6.578743251544128e-06, "loss": 0.0256, "step": 2031 }, { "epoch": 1.374830852503383, "grad_norm": 0.26420694962027813, "learning_rate": 6.57500732673981e-06, "loss": 0.0274, "step": 2032 }, { "epoch": 1.3755074424898512, "grad_norm": 0.4219218950231299, "learning_rate": 6.571270425515843e-06, "loss": 0.0294, "step": 2033 }, { "epoch": 1.3761840324763193, "grad_norm": 0.3313638757920285, "learning_rate": 6.567532550188908e-06, "loss": 0.0324, "step": 2034 }, { "epoch": 1.3768606224627875, "grad_norm": 0.39554296467903294, "learning_rate": 6.56379370307628e-06, "loss": 0.0365, "step": 2035 }, { "epoch": 1.3775372124492558, "grad_norm": 0.541519278217376, "learning_rate": 6.560053886495847e-06, "loss": 0.0553, "step": 2036 }, { "epoch": 1.378213802435724, "grad_norm": 0.36617668515633667, "learning_rate": 6.556313102766094e-06, "loss": 0.0387, "step": 2037 }, { "epoch": 1.3788903924221922, "grad_norm": 0.42547372186926263, "learning_rate": 6.552571354206104e-06, "loss": 0.0276, "step": 2038 }, { "epoch": 1.3795669824086603, "grad_norm": 0.38643235510601825, "learning_rate": 6.548828643135559e-06, "loss": 0.0338, "step": 2039 }, { "epoch": 1.3802435723951285, "grad_norm": 0.42679781985591464, "learning_rate": 6.545084971874738e-06, "loss": 0.0334, "step": 2040 }, { "epoch": 1.3809201623815968, "grad_norm": 0.4811872195591035, "learning_rate": 6.541340342744517e-06, "loss": 0.0314, "step": 2041 }, { "epoch": 1.381596752368065, "grad_norm": 0.3881378711290422, "learning_rate": 6.537594758066362e-06, "loss": 0.0289, "step": 2042 }, { "epoch": 1.3822733423545333, "grad_norm": 0.363316686679428, "learning_rate": 6.533848220162336e-06, "loss": 0.0329, "step": 2043 }, { "epoch": 1.3829499323410013, "grad_norm": 0.5194872511315176, "learning_rate": 6.530100731355089e-06, "loss": 0.0364, "step": 2044 }, { "epoch": 1.3836265223274695, "grad_norm": 0.3509987046782082, "learning_rate": 6.5263522939678626e-06, "loss": 0.0367, "step": 2045 }, { "epoch": 1.3843031123139378, "grad_norm": 0.8794986059479333, "learning_rate": 6.5226029103244846e-06, "loss": 0.0277, "step": 2046 }, { "epoch": 1.384979702300406, "grad_norm": 0.37079855582402726, "learning_rate": 6.518852582749373e-06, "loss": 0.026, "step": 2047 }, { "epoch": 1.3856562922868743, "grad_norm": 0.38243280438514937, "learning_rate": 6.515101313567529e-06, "loss": 0.0342, "step": 2048 }, { "epoch": 1.3863328822733423, "grad_norm": 0.3566315120700252, "learning_rate": 6.511349105104534e-06, "loss": 0.039, "step": 2049 }, { "epoch": 1.3870094722598105, "grad_norm": 0.2737242000700114, "learning_rate": 6.507595959686558e-06, "loss": 0.0212, "step": 2050 }, { "epoch": 1.3876860622462788, "grad_norm": 0.401899097675513, "learning_rate": 6.503841879640349e-06, "loss": 0.0315, "step": 2051 }, { "epoch": 1.388362652232747, "grad_norm": 0.42994403962960515, "learning_rate": 6.500086867293231e-06, "loss": 0.0458, "step": 2052 }, { "epoch": 1.3890392422192153, "grad_norm": 0.3298658103107547, "learning_rate": 6.496330924973112e-06, "loss": 0.0257, "step": 2053 }, { "epoch": 1.3897158322056833, "grad_norm": 0.457177812307383, "learning_rate": 6.492574055008474e-06, "loss": 0.03, "step": 2054 }, { "epoch": 1.3903924221921515, "grad_norm": 0.5646728212522865, "learning_rate": 6.488816259728372e-06, "loss": 0.0453, "step": 2055 }, { "epoch": 1.3910690121786198, "grad_norm": 0.3077669655382729, "learning_rate": 6.4850575414624385e-06, "loss": 0.0352, "step": 2056 }, { "epoch": 1.391745602165088, "grad_norm": 0.2585666570782783, "learning_rate": 6.481297902540875e-06, "loss": 0.0275, "step": 2057 }, { "epoch": 1.3924221921515563, "grad_norm": 0.32267053665643863, "learning_rate": 6.477537345294455e-06, "loss": 0.0344, "step": 2058 }, { "epoch": 1.3930987821380243, "grad_norm": 0.35264057294196477, "learning_rate": 6.473775872054522e-06, "loss": 0.0313, "step": 2059 }, { "epoch": 1.3937753721244925, "grad_norm": 0.2807470775739839, "learning_rate": 6.4700134851529864e-06, "loss": 0.0264, "step": 2060 }, { "epoch": 1.3944519621109608, "grad_norm": 0.4160332993314981, "learning_rate": 6.466250186922325e-06, "loss": 0.039, "step": 2061 }, { "epoch": 1.395128552097429, "grad_norm": 0.29364956421954663, "learning_rate": 6.46248597969558e-06, "loss": 0.0276, "step": 2062 }, { "epoch": 1.3958051420838973, "grad_norm": 0.35315452768437977, "learning_rate": 6.458720865806356e-06, "loss": 0.0298, "step": 2063 }, { "epoch": 1.3964817320703653, "grad_norm": 0.35059909971098374, "learning_rate": 6.454954847588824e-06, "loss": 0.0346, "step": 2064 }, { "epoch": 1.3971583220568335, "grad_norm": 0.3451610708419374, "learning_rate": 6.4511879273777065e-06, "loss": 0.0292, "step": 2065 }, { "epoch": 1.3978349120433018, "grad_norm": 0.3260737404156102, "learning_rate": 6.447420107508297e-06, "loss": 0.0361, "step": 2066 }, { "epoch": 1.39851150202977, "grad_norm": 0.26665138272941735, "learning_rate": 6.443651390316438e-06, "loss": 0.0191, "step": 2067 }, { "epoch": 1.3991880920162383, "grad_norm": 0.3316380161766256, "learning_rate": 6.439881778138531e-06, "loss": 0.0286, "step": 2068 }, { "epoch": 1.3998646820027063, "grad_norm": 0.2555171129823219, "learning_rate": 6.436111273311533e-06, "loss": 0.0239, "step": 2069 }, { "epoch": 1.4005412719891746, "grad_norm": 0.31877515349966057, "learning_rate": 6.4323398781729525e-06, "loss": 0.0388, "step": 2070 }, { "epoch": 1.4012178619756428, "grad_norm": 0.34526680355294986, "learning_rate": 6.428567595060853e-06, "loss": 0.0347, "step": 2071 }, { "epoch": 1.401894451962111, "grad_norm": 0.3541673705549516, "learning_rate": 6.424794426313845e-06, "loss": 0.0297, "step": 2072 }, { "epoch": 1.4025710419485793, "grad_norm": 0.3846339376565212, "learning_rate": 6.42102037427109e-06, "loss": 0.0366, "step": 2073 }, { "epoch": 1.4032476319350473, "grad_norm": 0.40517755933658284, "learning_rate": 6.417245441272299e-06, "loss": 0.0401, "step": 2074 }, { "epoch": 1.4039242219215156, "grad_norm": 0.34974420366334524, "learning_rate": 6.413469629657724e-06, "loss": 0.0304, "step": 2075 }, { "epoch": 1.4046008119079838, "grad_norm": 0.31033898464225507, "learning_rate": 6.409692941768166e-06, "loss": 0.0261, "step": 2076 }, { "epoch": 1.4052774018944518, "grad_norm": 0.3105272439758102, "learning_rate": 6.405915379944967e-06, "loss": 0.0369, "step": 2077 }, { "epoch": 1.4059539918809203, "grad_norm": 0.30416573331952884, "learning_rate": 6.402136946530014e-06, "loss": 0.0249, "step": 2078 }, { "epoch": 1.4066305818673883, "grad_norm": 0.3677384878897852, "learning_rate": 6.398357643865731e-06, "loss": 0.0413, "step": 2079 }, { "epoch": 1.4073071718538566, "grad_norm": 0.26852596319192984, "learning_rate": 6.394577474295081e-06, "loss": 0.0231, "step": 2080 }, { "epoch": 1.4079837618403248, "grad_norm": 0.2714719235055002, "learning_rate": 6.390796440161566e-06, "loss": 0.024, "step": 2081 }, { "epoch": 1.4086603518267928, "grad_norm": 0.4053568590348655, "learning_rate": 6.387014543809224e-06, "loss": 0.0529, "step": 2082 }, { "epoch": 1.4093369418132613, "grad_norm": 0.38264869853034933, "learning_rate": 6.383231787582625e-06, "loss": 0.043, "step": 2083 }, { "epoch": 1.4100135317997293, "grad_norm": 0.3023966476066303, "learning_rate": 6.3794481738268765e-06, "loss": 0.0254, "step": 2084 }, { "epoch": 1.4106901217861976, "grad_norm": 0.25204219489059054, "learning_rate": 6.375663704887614e-06, "loss": 0.0247, "step": 2085 }, { "epoch": 1.4113667117726658, "grad_norm": 0.3603784899646957, "learning_rate": 6.371878383111002e-06, "loss": 0.0282, "step": 2086 }, { "epoch": 1.4120433017591338, "grad_norm": 0.3448311011389403, "learning_rate": 6.368092210843739e-06, "loss": 0.026, "step": 2087 }, { "epoch": 1.412719891745602, "grad_norm": 0.2691654436255992, "learning_rate": 6.364305190433049e-06, "loss": 0.0217, "step": 2088 }, { "epoch": 1.4133964817320703, "grad_norm": 0.36562016184734747, "learning_rate": 6.360517324226676e-06, "loss": 0.0413, "step": 2089 }, { "epoch": 1.4140730717185386, "grad_norm": 0.49998657397031354, "learning_rate": 6.3567286145728944e-06, "loss": 0.0475, "step": 2090 }, { "epoch": 1.4147496617050068, "grad_norm": 0.35948438061063, "learning_rate": 6.3529390638205036e-06, "loss": 0.0338, "step": 2091 }, { "epoch": 1.4154262516914748, "grad_norm": 0.30024840156854604, "learning_rate": 6.349148674318816e-06, "loss": 0.0223, "step": 2092 }, { "epoch": 1.416102841677943, "grad_norm": 0.37323546109523026, "learning_rate": 6.34535744841767e-06, "loss": 0.0348, "step": 2093 }, { "epoch": 1.4167794316644113, "grad_norm": 0.3138761193803162, "learning_rate": 6.341565388467425e-06, "loss": 0.0212, "step": 2094 }, { "epoch": 1.4174560216508796, "grad_norm": 0.42562570612532213, "learning_rate": 6.3377724968189494e-06, "loss": 0.0427, "step": 2095 }, { "epoch": 1.4181326116373478, "grad_norm": 0.2975809896826504, "learning_rate": 6.3339787758236316e-06, "loss": 0.0319, "step": 2096 }, { "epoch": 1.4188092016238159, "grad_norm": 0.20055310290567982, "learning_rate": 6.330184227833376e-06, "loss": 0.0192, "step": 2097 }, { "epoch": 1.419485791610284, "grad_norm": 0.2903888548762818, "learning_rate": 6.326388855200598e-06, "loss": 0.0332, "step": 2098 }, { "epoch": 1.4201623815967523, "grad_norm": 0.3237604981373609, "learning_rate": 6.322592660278223e-06, "loss": 0.0211, "step": 2099 }, { "epoch": 1.4208389715832206, "grad_norm": 0.4563084635253882, "learning_rate": 6.3187956454196885e-06, "loss": 0.0293, "step": 2100 }, { "epoch": 1.4215155615696888, "grad_norm": 0.3565677324507497, "learning_rate": 6.314997812978938e-06, "loss": 0.0471, "step": 2101 }, { "epoch": 1.4221921515561569, "grad_norm": 0.37222445082887984, "learning_rate": 6.311199165310422e-06, "loss": 0.0299, "step": 2102 }, { "epoch": 1.422868741542625, "grad_norm": 0.3603043808313665, "learning_rate": 6.3073997047691e-06, "loss": 0.0345, "step": 2103 }, { "epoch": 1.4235453315290933, "grad_norm": 0.28793353670094385, "learning_rate": 6.30359943371043e-06, "loss": 0.03, "step": 2104 }, { "epoch": 1.4242219215155616, "grad_norm": 0.31778201674399575, "learning_rate": 6.299798354490376e-06, "loss": 0.0326, "step": 2105 }, { "epoch": 1.4248985115020298, "grad_norm": 0.39267194353182716, "learning_rate": 6.295996469465404e-06, "loss": 0.0441, "step": 2106 }, { "epoch": 1.4255751014884979, "grad_norm": 0.2555114962569351, "learning_rate": 6.292193780992475e-06, "loss": 0.0209, "step": 2107 }, { "epoch": 1.426251691474966, "grad_norm": 0.36004345414357736, "learning_rate": 6.288390291429054e-06, "loss": 0.033, "step": 2108 }, { "epoch": 1.4269282814614344, "grad_norm": 0.5615797430841993, "learning_rate": 6.284586003133096e-06, "loss": 0.0382, "step": 2109 }, { "epoch": 1.4276048714479026, "grad_norm": 0.29624884789625017, "learning_rate": 6.280780918463057e-06, "loss": 0.0338, "step": 2110 }, { "epoch": 1.4282814614343708, "grad_norm": 0.3837618125131463, "learning_rate": 6.276975039777885e-06, "loss": 0.0342, "step": 2111 }, { "epoch": 1.4289580514208389, "grad_norm": 0.27034435250466143, "learning_rate": 6.2731683694370185e-06, "loss": 0.0295, "step": 2112 }, { "epoch": 1.4296346414073071, "grad_norm": 0.7620266575277731, "learning_rate": 6.269360909800386e-06, "loss": 0.0314, "step": 2113 }, { "epoch": 1.4303112313937754, "grad_norm": 0.3480021200860554, "learning_rate": 6.265552663228411e-06, "loss": 0.035, "step": 2114 }, { "epoch": 1.4309878213802436, "grad_norm": 0.2758089710725665, "learning_rate": 6.261743632081998e-06, "loss": 0.0248, "step": 2115 }, { "epoch": 1.4316644113667119, "grad_norm": 0.3379064505582351, "learning_rate": 6.257933818722544e-06, "loss": 0.0265, "step": 2116 }, { "epoch": 1.4323410013531799, "grad_norm": 0.38029596555033884, "learning_rate": 6.254123225511924e-06, "loss": 0.0488, "step": 2117 }, { "epoch": 1.4330175913396481, "grad_norm": 0.27409117177720943, "learning_rate": 6.250311854812504e-06, "loss": 0.0248, "step": 2118 }, { "epoch": 1.4336941813261164, "grad_norm": 0.36908567109447665, "learning_rate": 6.246499708987127e-06, "loss": 0.0395, "step": 2119 }, { "epoch": 1.4343707713125846, "grad_norm": 0.48804597539011046, "learning_rate": 6.242686790399117e-06, "loss": 0.0522, "step": 2120 }, { "epoch": 1.4350473612990529, "grad_norm": 0.2535007906418984, "learning_rate": 6.238873101412282e-06, "loss": 0.0308, "step": 2121 }, { "epoch": 1.4357239512855209, "grad_norm": 0.41145685151365685, "learning_rate": 6.2350586443908965e-06, "loss": 0.0352, "step": 2122 }, { "epoch": 1.4364005412719891, "grad_norm": 0.2782286467786743, "learning_rate": 6.231243421699725e-06, "loss": 0.0381, "step": 2123 }, { "epoch": 1.4370771312584574, "grad_norm": 0.46114348105248215, "learning_rate": 6.227427435703997e-06, "loss": 0.036, "step": 2124 }, { "epoch": 1.4377537212449256, "grad_norm": 0.4419496506779566, "learning_rate": 6.223610688769418e-06, "loss": 0.024, "step": 2125 }, { "epoch": 1.4384303112313939, "grad_norm": 0.3390263762200865, "learning_rate": 6.219793183262165e-06, "loss": 0.0419, "step": 2126 }, { "epoch": 1.439106901217862, "grad_norm": 0.5057159702902083, "learning_rate": 6.215974921548888e-06, "loss": 0.0553, "step": 2127 }, { "epoch": 1.4397834912043301, "grad_norm": 0.38168261802940295, "learning_rate": 6.2121559059966995e-06, "loss": 0.0383, "step": 2128 }, { "epoch": 1.4404600811907984, "grad_norm": 0.35613068846279383, "learning_rate": 6.2083361389731874e-06, "loss": 0.0369, "step": 2129 }, { "epoch": 1.4411366711772666, "grad_norm": 0.3524748625297267, "learning_rate": 6.204515622846399e-06, "loss": 0.0284, "step": 2130 }, { "epoch": 1.4418132611637349, "grad_norm": 0.6151239085092787, "learning_rate": 6.200694359984849e-06, "loss": 0.0307, "step": 2131 }, { "epoch": 1.442489851150203, "grad_norm": 0.2957639819925075, "learning_rate": 6.1968723527575155e-06, "loss": 0.0251, "step": 2132 }, { "epoch": 1.4431664411366711, "grad_norm": 0.44334858465795796, "learning_rate": 6.193049603533835e-06, "loss": 0.0273, "step": 2133 }, { "epoch": 1.4438430311231394, "grad_norm": 0.3829246618926111, "learning_rate": 6.189226114683708e-06, "loss": 0.0354, "step": 2134 }, { "epoch": 1.4445196211096076, "grad_norm": 0.319004469884991, "learning_rate": 6.185401888577488e-06, "loss": 0.0342, "step": 2135 }, { "epoch": 1.4451962110960759, "grad_norm": 1.0098018314065516, "learning_rate": 6.181576927585993e-06, "loss": 0.039, "step": 2136 }, { "epoch": 1.445872801082544, "grad_norm": 0.23140407916564779, "learning_rate": 6.177751234080491e-06, "loss": 0.0206, "step": 2137 }, { "epoch": 1.4465493910690121, "grad_norm": 0.5468696369070107, "learning_rate": 6.173924810432705e-06, "loss": 0.0379, "step": 2138 }, { "epoch": 1.4472259810554804, "grad_norm": 0.5154559749265718, "learning_rate": 6.170097659014812e-06, "loss": 0.039, "step": 2139 }, { "epoch": 1.4479025710419486, "grad_norm": 0.37189366775822497, "learning_rate": 6.166269782199441e-06, "loss": 0.0309, "step": 2140 }, { "epoch": 1.4485791610284169, "grad_norm": 0.5175418728381058, "learning_rate": 6.162441182359667e-06, "loss": 0.0463, "step": 2141 }, { "epoch": 1.449255751014885, "grad_norm": 0.3518796223354458, "learning_rate": 6.158611861869018e-06, "loss": 0.0323, "step": 2142 }, { "epoch": 1.4499323410013532, "grad_norm": 0.8374256432474917, "learning_rate": 6.154781823101463e-06, "loss": 0.0361, "step": 2143 }, { "epoch": 1.4506089309878214, "grad_norm": 0.5178072670670487, "learning_rate": 6.150951068431424e-06, "loss": 0.0419, "step": 2144 }, { "epoch": 1.4512855209742896, "grad_norm": 0.6220397251423426, "learning_rate": 6.147119600233758e-06, "loss": 0.0393, "step": 2145 }, { "epoch": 1.451962110960758, "grad_norm": 0.37835908831438864, "learning_rate": 6.143287420883772e-06, "loss": 0.0285, "step": 2146 }, { "epoch": 1.452638700947226, "grad_norm": 0.5095509553729675, "learning_rate": 6.1394545327572086e-06, "loss": 0.0348, "step": 2147 }, { "epoch": 1.4533152909336942, "grad_norm": 0.29047671694508453, "learning_rate": 6.135620938230254e-06, "loss": 0.0303, "step": 2148 }, { "epoch": 1.4539918809201624, "grad_norm": 0.4705487897755824, "learning_rate": 6.131786639679527e-06, "loss": 0.0358, "step": 2149 }, { "epoch": 1.4546684709066307, "grad_norm": 0.37836045179522076, "learning_rate": 6.127951639482088e-06, "loss": 0.0319, "step": 2150 }, { "epoch": 1.455345060893099, "grad_norm": 0.304184930680853, "learning_rate": 6.1241159400154306e-06, "loss": 0.0276, "step": 2151 }, { "epoch": 1.456021650879567, "grad_norm": 0.325903651893824, "learning_rate": 6.12027954365748e-06, "loss": 0.0338, "step": 2152 }, { "epoch": 1.4566982408660352, "grad_norm": 0.29062088304713973, "learning_rate": 6.116442452786599e-06, "loss": 0.029, "step": 2153 }, { "epoch": 1.4573748308525034, "grad_norm": 0.30661073480979945, "learning_rate": 6.112604669781572e-06, "loss": 0.0314, "step": 2154 }, { "epoch": 1.4580514208389717, "grad_norm": 0.4290895580278508, "learning_rate": 6.108766197021623e-06, "loss": 0.0363, "step": 2155 }, { "epoch": 1.45872801082544, "grad_norm": 0.3229603064211765, "learning_rate": 6.104927036886392e-06, "loss": 0.0204, "step": 2156 }, { "epoch": 1.459404600811908, "grad_norm": 0.30348680465543104, "learning_rate": 6.101087191755958e-06, "loss": 0.0349, "step": 2157 }, { "epoch": 1.4600811907983762, "grad_norm": 0.29203304265229446, "learning_rate": 6.097246664010813e-06, "loss": 0.0321, "step": 2158 }, { "epoch": 1.4607577807848444, "grad_norm": 0.3626742231462104, "learning_rate": 6.09340545603188e-06, "loss": 0.0221, "step": 2159 }, { "epoch": 1.4614343707713127, "grad_norm": 0.4282709049330542, "learning_rate": 6.0895635702004985e-06, "loss": 0.0315, "step": 2160 }, { "epoch": 1.462110960757781, "grad_norm": 0.37997009810871746, "learning_rate": 6.085721008898434e-06, "loss": 0.0412, "step": 2161 }, { "epoch": 1.462787550744249, "grad_norm": 0.4162757999240797, "learning_rate": 6.081877774507864e-06, "loss": 0.0423, "step": 2162 }, { "epoch": 1.4634641407307172, "grad_norm": 0.36280176386712804, "learning_rate": 6.078033869411389e-06, "loss": 0.0302, "step": 2163 }, { "epoch": 1.4641407307171854, "grad_norm": 0.48194851027098257, "learning_rate": 6.0741892959920205e-06, "loss": 0.03, "step": 2164 }, { "epoch": 1.4648173207036537, "grad_norm": 0.4484108166297937, "learning_rate": 6.070344056633189e-06, "loss": 0.0329, "step": 2165 }, { "epoch": 1.465493910690122, "grad_norm": 0.3294514538164327, "learning_rate": 6.066498153718735e-06, "loss": 0.0318, "step": 2166 }, { "epoch": 1.46617050067659, "grad_norm": 0.8495940935008968, "learning_rate": 6.062651589632911e-06, "loss": 0.0368, "step": 2167 }, { "epoch": 1.4668470906630582, "grad_norm": 0.35158915451929507, "learning_rate": 6.05880436676038e-06, "loss": 0.0307, "step": 2168 }, { "epoch": 1.4675236806495264, "grad_norm": 0.45524338738755626, "learning_rate": 6.054956487486212e-06, "loss": 0.0297, "step": 2169 }, { "epoch": 1.4682002706359945, "grad_norm": 0.45007889929618056, "learning_rate": 6.0511079541958825e-06, "loss": 0.0383, "step": 2170 }, { "epoch": 1.468876860622463, "grad_norm": 0.283783063897314, "learning_rate": 6.04725876927528e-06, "loss": 0.0274, "step": 2171 }, { "epoch": 1.469553450608931, "grad_norm": 0.5068185378853123, "learning_rate": 6.043408935110688e-06, "loss": 0.0347, "step": 2172 }, { "epoch": 1.4702300405953992, "grad_norm": 0.40950528843006856, "learning_rate": 6.039558454088796e-06, "loss": 0.0433, "step": 2173 }, { "epoch": 1.4709066305818674, "grad_norm": 0.4398326416941755, "learning_rate": 6.035707328596698e-06, "loss": 0.025, "step": 2174 }, { "epoch": 1.4715832205683355, "grad_norm": 0.3655865643045538, "learning_rate": 6.0318555610218796e-06, "loss": 0.0339, "step": 2175 }, { "epoch": 1.472259810554804, "grad_norm": 0.8971567567375772, "learning_rate": 6.0280031537522335e-06, "loss": 0.0338, "step": 2176 }, { "epoch": 1.472936400541272, "grad_norm": 0.36910870295649967, "learning_rate": 6.02415010917604e-06, "loss": 0.0314, "step": 2177 }, { "epoch": 1.4736129905277402, "grad_norm": 0.507486121609872, "learning_rate": 6.020296429681985e-06, "loss": 0.0512, "step": 2178 }, { "epoch": 1.4742895805142084, "grad_norm": 0.43022508829325473, "learning_rate": 6.016442117659135e-06, "loss": 0.0306, "step": 2179 }, { "epoch": 1.4749661705006765, "grad_norm": 0.6069253084374097, "learning_rate": 6.0125871754969614e-06, "loss": 0.0681, "step": 2180 }, { "epoch": 1.4756427604871447, "grad_norm": 0.29274371865920534, "learning_rate": 6.0087316055853175e-06, "loss": 0.0252, "step": 2181 }, { "epoch": 1.476319350473613, "grad_norm": 0.32838593532178906, "learning_rate": 6.00487541031445e-06, "loss": 0.0307, "step": 2182 }, { "epoch": 1.4769959404600812, "grad_norm": 0.3881233925938184, "learning_rate": 6.001018592074991e-06, "loss": 0.0442, "step": 2183 }, { "epoch": 1.4776725304465494, "grad_norm": 0.4491432196267391, "learning_rate": 5.997161153257963e-06, "loss": 0.0357, "step": 2184 }, { "epoch": 1.4783491204330175, "grad_norm": 0.745493908416148, "learning_rate": 5.9933030962547656e-06, "loss": 0.0393, "step": 2185 }, { "epoch": 1.4790257104194857, "grad_norm": 0.4476644226503889, "learning_rate": 5.989444423457189e-06, "loss": 0.03, "step": 2186 }, { "epoch": 1.479702300405954, "grad_norm": 0.385099145514051, "learning_rate": 5.985585137257401e-06, "loss": 0.0379, "step": 2187 }, { "epoch": 1.4803788903924222, "grad_norm": 0.31941391994184515, "learning_rate": 5.981725240047954e-06, "loss": 0.0325, "step": 2188 }, { "epoch": 1.4810554803788905, "grad_norm": 0.5583607111370749, "learning_rate": 5.977864734221773e-06, "loss": 0.0375, "step": 2189 }, { "epoch": 1.4817320703653585, "grad_norm": 0.38885996053755606, "learning_rate": 5.974003622172167e-06, "loss": 0.0304, "step": 2190 }, { "epoch": 1.4824086603518267, "grad_norm": 0.3172314371438738, "learning_rate": 5.9701419062928125e-06, "loss": 0.0298, "step": 2191 }, { "epoch": 1.483085250338295, "grad_norm": 0.37107791908123133, "learning_rate": 5.9662795889777666e-06, "loss": 0.0252, "step": 2192 }, { "epoch": 1.4837618403247632, "grad_norm": 0.5656949660516135, "learning_rate": 5.962416672621461e-06, "loss": 0.0395, "step": 2193 }, { "epoch": 1.4844384303112315, "grad_norm": 0.2715393500302299, "learning_rate": 5.958553159618693e-06, "loss": 0.0247, "step": 2194 }, { "epoch": 1.4851150202976995, "grad_norm": 0.27915366615382764, "learning_rate": 5.954689052364633e-06, "loss": 0.0174, "step": 2195 }, { "epoch": 1.4857916102841677, "grad_norm": 0.30230696083916986, "learning_rate": 5.950824353254818e-06, "loss": 0.0268, "step": 2196 }, { "epoch": 1.486468200270636, "grad_norm": 0.7262061409576978, "learning_rate": 5.946959064685156e-06, "loss": 0.0335, "step": 2197 }, { "epoch": 1.4871447902571042, "grad_norm": 0.37117066478352984, "learning_rate": 5.943093189051916e-06, "loss": 0.0268, "step": 2198 }, { "epoch": 1.4878213802435725, "grad_norm": 0.30444721089664023, "learning_rate": 5.939226728751733e-06, "loss": 0.0294, "step": 2199 }, { "epoch": 1.4884979702300405, "grad_norm": 0.3720986974214217, "learning_rate": 5.9353596861816e-06, "loss": 0.0312, "step": 2200 }, { "epoch": 1.4891745602165087, "grad_norm": 0.2934579870646905, "learning_rate": 5.931492063738882e-06, "loss": 0.0305, "step": 2201 }, { "epoch": 1.489851150202977, "grad_norm": 0.2507508948273361, "learning_rate": 5.92762386382129e-06, "loss": 0.0285, "step": 2202 }, { "epoch": 1.4905277401894452, "grad_norm": 0.3384469594998731, "learning_rate": 5.9237550888269045e-06, "loss": 0.0318, "step": 2203 }, { "epoch": 1.4912043301759135, "grad_norm": 0.41935236318360014, "learning_rate": 5.919885741154155e-06, "loss": 0.0298, "step": 2204 }, { "epoch": 1.4918809201623815, "grad_norm": 0.3659203155838567, "learning_rate": 5.916015823201827e-06, "loss": 0.0409, "step": 2205 }, { "epoch": 1.4925575101488497, "grad_norm": 0.7338320709405541, "learning_rate": 5.912145337369064e-06, "loss": 0.0307, "step": 2206 }, { "epoch": 1.493234100135318, "grad_norm": 0.3538766905901768, "learning_rate": 5.908274286055358e-06, "loss": 0.0282, "step": 2207 }, { "epoch": 1.4939106901217862, "grad_norm": 0.46303894457737094, "learning_rate": 5.904402671660551e-06, "loss": 0.0325, "step": 2208 }, { "epoch": 1.4945872801082545, "grad_norm": 0.333197174440168, "learning_rate": 5.900530496584834e-06, "loss": 0.0332, "step": 2209 }, { "epoch": 1.4952638700947225, "grad_norm": 0.33949094401429847, "learning_rate": 5.8966577632287506e-06, "loss": 0.0323, "step": 2210 }, { "epoch": 1.4959404600811907, "grad_norm": 0.3747968067888869, "learning_rate": 5.892784473993184e-06, "loss": 0.036, "step": 2211 }, { "epoch": 1.496617050067659, "grad_norm": 0.3905306867661988, "learning_rate": 5.888910631279366e-06, "loss": 0.0433, "step": 2212 }, { "epoch": 1.4972936400541272, "grad_norm": 0.3683225446896326, "learning_rate": 5.885036237488868e-06, "loss": 0.037, "step": 2213 }, { "epoch": 1.4979702300405955, "grad_norm": 0.5040762838033244, "learning_rate": 5.88116129502361e-06, "loss": 0.0356, "step": 2214 }, { "epoch": 1.4986468200270635, "grad_norm": 0.3361864175196371, "learning_rate": 5.8772858062858414e-06, "loss": 0.03, "step": 2215 }, { "epoch": 1.4993234100135318, "grad_norm": 0.32950178770432687, "learning_rate": 5.873409773678163e-06, "loss": 0.0348, "step": 2216 }, { "epoch": 1.5, "grad_norm": 0.2545920551466513, "learning_rate": 5.869533199603498e-06, "loss": 0.0283, "step": 2217 }, { "epoch": 1.5006765899864682, "grad_norm": 0.38917650015289157, "learning_rate": 5.8656560864651225e-06, "loss": 0.0277, "step": 2218 }, { "epoch": 1.5013531799729365, "grad_norm": 0.34582128523983924, "learning_rate": 5.861778436666631e-06, "loss": 0.0319, "step": 2219 }, { "epoch": 1.5020297699594045, "grad_norm": 0.3980767468611284, "learning_rate": 5.857900252611959e-06, "loss": 0.0333, "step": 2220 }, { "epoch": 1.5027063599458728, "grad_norm": 0.2666736899074194, "learning_rate": 5.854021536705373e-06, "loss": 0.0376, "step": 2221 }, { "epoch": 1.503382949932341, "grad_norm": 0.3238074387153149, "learning_rate": 5.8501422913514665e-06, "loss": 0.0312, "step": 2222 }, { "epoch": 1.5040595399188093, "grad_norm": 0.7465006023551094, "learning_rate": 5.846262518955163e-06, "loss": 0.042, "step": 2223 }, { "epoch": 1.5047361299052775, "grad_norm": 0.3007257851162775, "learning_rate": 5.842382221921711e-06, "loss": 0.0277, "step": 2224 }, { "epoch": 1.5054127198917455, "grad_norm": 0.28283027914607606, "learning_rate": 5.838501402656688e-06, "loss": 0.0313, "step": 2225 }, { "epoch": 1.5060893098782138, "grad_norm": 0.4113052310883682, "learning_rate": 5.83462006356599e-06, "loss": 0.0332, "step": 2226 }, { "epoch": 1.506765899864682, "grad_norm": 0.3135099299519799, "learning_rate": 5.830738207055841e-06, "loss": 0.0293, "step": 2227 }, { "epoch": 1.5074424898511503, "grad_norm": 0.3175534859605412, "learning_rate": 5.8268558355327795e-06, "loss": 0.0289, "step": 2228 }, { "epoch": 1.5081190798376185, "grad_norm": 0.3817557999387755, "learning_rate": 5.82297295140367e-06, "loss": 0.0394, "step": 2229 }, { "epoch": 1.5087956698240865, "grad_norm": 0.400476420224585, "learning_rate": 5.819089557075689e-06, "loss": 0.0285, "step": 2230 }, { "epoch": 1.5094722598105548, "grad_norm": 0.28199296079401087, "learning_rate": 5.815205654956333e-06, "loss": 0.0224, "step": 2231 }, { "epoch": 1.510148849797023, "grad_norm": 0.387485117063692, "learning_rate": 5.811321247453409e-06, "loss": 0.0477, "step": 2232 }, { "epoch": 1.510825439783491, "grad_norm": 0.31527053910146335, "learning_rate": 5.807436336975045e-06, "loss": 0.0319, "step": 2233 }, { "epoch": 1.5115020297699595, "grad_norm": 0.3878838111364579, "learning_rate": 5.803550925929673e-06, "loss": 0.024, "step": 2234 }, { "epoch": 1.5121786197564275, "grad_norm": 0.3884093592351876, "learning_rate": 5.799665016726039e-06, "loss": 0.0237, "step": 2235 }, { "epoch": 1.5128552097428958, "grad_norm": 0.30494177236123693, "learning_rate": 5.795778611773197e-06, "loss": 0.0271, "step": 2236 }, { "epoch": 1.513531799729364, "grad_norm": 0.2890206836051052, "learning_rate": 5.791891713480509e-06, "loss": 0.0254, "step": 2237 }, { "epoch": 1.514208389715832, "grad_norm": 0.41133323803561245, "learning_rate": 5.788004324257643e-06, "loss": 0.0355, "step": 2238 }, { "epoch": 1.5148849797023005, "grad_norm": 0.44773139766116443, "learning_rate": 5.784116446514571e-06, "loss": 0.0405, "step": 2239 }, { "epoch": 1.5155615696887685, "grad_norm": 0.35202944055249824, "learning_rate": 5.780228082661564e-06, "loss": 0.0283, "step": 2240 }, { "epoch": 1.5162381596752368, "grad_norm": 0.3679419256015076, "learning_rate": 5.776339235109203e-06, "loss": 0.0304, "step": 2241 }, { "epoch": 1.516914749661705, "grad_norm": 0.4012639841648305, "learning_rate": 5.772449906268362e-06, "loss": 0.0372, "step": 2242 }, { "epoch": 1.517591339648173, "grad_norm": 0.26874391610278875, "learning_rate": 5.768560098550213e-06, "loss": 0.0285, "step": 2243 }, { "epoch": 1.5182679296346415, "grad_norm": 0.280920653440193, "learning_rate": 5.764669814366231e-06, "loss": 0.0303, "step": 2244 }, { "epoch": 1.5189445196211095, "grad_norm": 0.43178951630195245, "learning_rate": 5.760779056128178e-06, "loss": 0.0364, "step": 2245 }, { "epoch": 1.5196211096075778, "grad_norm": 0.3379292088933386, "learning_rate": 5.756887826248118e-06, "loss": 0.026, "step": 2246 }, { "epoch": 1.520297699594046, "grad_norm": 0.48985792995504546, "learning_rate": 5.752996127138404e-06, "loss": 0.0256, "step": 2247 }, { "epoch": 1.520974289580514, "grad_norm": 0.42483730153783744, "learning_rate": 5.749103961211679e-06, "loss": 0.0392, "step": 2248 }, { "epoch": 1.5216508795669825, "grad_norm": 0.3047063888515312, "learning_rate": 5.745211330880872e-06, "loss": 0.0303, "step": 2249 }, { "epoch": 1.5223274695534506, "grad_norm": 0.23780136080719255, "learning_rate": 5.74131823855921e-06, "loss": 0.0199, "step": 2250 }, { "epoch": 1.5230040595399188, "grad_norm": 0.23773757627075567, "learning_rate": 5.737424686660198e-06, "loss": 0.0264, "step": 2251 }, { "epoch": 1.523680649526387, "grad_norm": 0.5277599293734544, "learning_rate": 5.733530677597627e-06, "loss": 0.0299, "step": 2252 }, { "epoch": 1.524357239512855, "grad_norm": 0.4383727747813701, "learning_rate": 5.729636213785574e-06, "loss": 0.0333, "step": 2253 }, { "epoch": 1.5250338294993235, "grad_norm": 0.2804393276276087, "learning_rate": 5.725741297638399e-06, "loss": 0.0254, "step": 2254 }, { "epoch": 1.5257104194857916, "grad_norm": 0.29051578642396086, "learning_rate": 5.721845931570734e-06, "loss": 0.0273, "step": 2255 }, { "epoch": 1.5263870094722598, "grad_norm": 0.46143304691040654, "learning_rate": 5.717950117997502e-06, "loss": 0.0317, "step": 2256 }, { "epoch": 1.527063599458728, "grad_norm": 0.30504259152648083, "learning_rate": 5.714053859333893e-06, "loss": 0.0242, "step": 2257 }, { "epoch": 1.527740189445196, "grad_norm": 0.369149715164369, "learning_rate": 5.710157157995382e-06, "loss": 0.0407, "step": 2258 }, { "epoch": 1.5284167794316645, "grad_norm": 0.7067227051887994, "learning_rate": 5.70626001639771e-06, "loss": 0.0369, "step": 2259 }, { "epoch": 1.5290933694181326, "grad_norm": 0.39630297252409374, "learning_rate": 5.702362436956895e-06, "loss": 0.0331, "step": 2260 }, { "epoch": 1.5297699594046008, "grad_norm": 0.2983715595820596, "learning_rate": 5.6984644220892295e-06, "loss": 0.0248, "step": 2261 }, { "epoch": 1.530446549391069, "grad_norm": 0.2916272939939936, "learning_rate": 5.694565974211267e-06, "loss": 0.0311, "step": 2262 }, { "epoch": 1.531123139377537, "grad_norm": 0.28916619163731966, "learning_rate": 5.69066709573984e-06, "loss": 0.0278, "step": 2263 }, { "epoch": 1.5317997293640055, "grad_norm": 0.5797745890372059, "learning_rate": 5.686767789092041e-06, "loss": 0.0354, "step": 2264 }, { "epoch": 1.5324763193504736, "grad_norm": 0.41935238131521263, "learning_rate": 5.6828680566852314e-06, "loss": 0.0372, "step": 2265 }, { "epoch": 1.5331529093369418, "grad_norm": 0.31269369456931645, "learning_rate": 5.678967900937032e-06, "loss": 0.0282, "step": 2266 }, { "epoch": 1.53382949932341, "grad_norm": 0.3502492713207318, "learning_rate": 5.675067324265332e-06, "loss": 0.0299, "step": 2267 }, { "epoch": 1.534506089309878, "grad_norm": 0.48059011983953875, "learning_rate": 5.671166329088278e-06, "loss": 0.0389, "step": 2268 }, { "epoch": 1.5351826792963466, "grad_norm": 0.35812795442247, "learning_rate": 5.667264917824277e-06, "loss": 0.0313, "step": 2269 }, { "epoch": 1.5358592692828146, "grad_norm": 0.4503601989414651, "learning_rate": 5.663363092891991e-06, "loss": 0.0371, "step": 2270 }, { "epoch": 1.5365358592692828, "grad_norm": 0.27188587094119093, "learning_rate": 5.659460856710346e-06, "loss": 0.0258, "step": 2271 }, { "epoch": 1.537212449255751, "grad_norm": 0.46550547520971586, "learning_rate": 5.655558211698513e-06, "loss": 0.0383, "step": 2272 }, { "epoch": 1.537889039242219, "grad_norm": 0.33906217259787863, "learning_rate": 5.651655160275925e-06, "loss": 0.032, "step": 2273 }, { "epoch": 1.5385656292286876, "grad_norm": 0.2995042962480075, "learning_rate": 5.647751704862263e-06, "loss": 0.026, "step": 2274 }, { "epoch": 1.5392422192151556, "grad_norm": 0.2687361674785122, "learning_rate": 5.643847847877458e-06, "loss": 0.0226, "step": 2275 }, { "epoch": 1.5399188092016238, "grad_norm": 0.3224258661734978, "learning_rate": 5.639943591741691e-06, "loss": 0.0295, "step": 2276 }, { "epoch": 1.540595399188092, "grad_norm": 0.361300059365036, "learning_rate": 5.636038938875391e-06, "loss": 0.0314, "step": 2277 }, { "epoch": 1.54127198917456, "grad_norm": 0.3031849303460406, "learning_rate": 5.632133891699232e-06, "loss": 0.0224, "step": 2278 }, { "epoch": 1.5419485791610286, "grad_norm": 0.3635016257159539, "learning_rate": 5.628228452634132e-06, "loss": 0.0269, "step": 2279 }, { "epoch": 1.5426251691474966, "grad_norm": 0.3291595141471702, "learning_rate": 5.624322624101255e-06, "loss": 0.0309, "step": 2280 }, { "epoch": 1.5433017591339648, "grad_norm": 0.4136027896750933, "learning_rate": 5.620416408522002e-06, "loss": 0.0311, "step": 2281 }, { "epoch": 1.543978349120433, "grad_norm": 0.3654256380068809, "learning_rate": 5.616509808318017e-06, "loss": 0.03, "step": 2282 }, { "epoch": 1.544654939106901, "grad_norm": 0.4088012751290403, "learning_rate": 5.612602825911179e-06, "loss": 0.0239, "step": 2283 }, { "epoch": 1.5453315290933696, "grad_norm": 0.3327583838679735, "learning_rate": 5.608695463723614e-06, "loss": 0.0332, "step": 2284 }, { "epoch": 1.5460081190798376, "grad_norm": 0.28656806895970455, "learning_rate": 5.604787724177666e-06, "loss": 0.0252, "step": 2285 }, { "epoch": 1.5466847090663058, "grad_norm": 0.3756101794457761, "learning_rate": 5.600879609695929e-06, "loss": 0.0224, "step": 2286 }, { "epoch": 1.547361299052774, "grad_norm": 0.3372624946666842, "learning_rate": 5.596971122701221e-06, "loss": 0.0264, "step": 2287 }, { "epoch": 1.548037889039242, "grad_norm": 0.5262564576474407, "learning_rate": 5.593062265616598e-06, "loss": 0.0273, "step": 2288 }, { "epoch": 1.5487144790257106, "grad_norm": 0.2780470196228257, "learning_rate": 5.589153040865333e-06, "loss": 0.0307, "step": 2289 }, { "epoch": 1.5493910690121786, "grad_norm": 0.2863435793032491, "learning_rate": 5.585243450870941e-06, "loss": 0.0249, "step": 2290 }, { "epoch": 1.5500676589986468, "grad_norm": 0.3354611138791834, "learning_rate": 5.581333498057153e-06, "loss": 0.0251, "step": 2291 }, { "epoch": 1.550744248985115, "grad_norm": 0.352696713815565, "learning_rate": 5.577423184847932e-06, "loss": 0.0272, "step": 2292 }, { "epoch": 1.5514208389715831, "grad_norm": 0.29367697450554253, "learning_rate": 5.573512513667459e-06, "loss": 0.0262, "step": 2293 }, { "epoch": 1.5520974289580516, "grad_norm": 0.3406886200568296, "learning_rate": 5.56960148694014e-06, "loss": 0.0312, "step": 2294 }, { "epoch": 1.5527740189445196, "grad_norm": 0.2841177723734298, "learning_rate": 5.565690107090603e-06, "loss": 0.0252, "step": 2295 }, { "epoch": 1.5534506089309879, "grad_norm": 0.5547936101971093, "learning_rate": 5.5617783765436894e-06, "loss": 0.0377, "step": 2296 }, { "epoch": 1.554127198917456, "grad_norm": 0.5201124496523982, "learning_rate": 5.557866297724462e-06, "loss": 0.047, "step": 2297 }, { "epoch": 1.5548037889039241, "grad_norm": 0.28795915904164315, "learning_rate": 5.553953873058201e-06, "loss": 0.0276, "step": 2298 }, { "epoch": 1.5554803788903924, "grad_norm": 0.4327150106187772, "learning_rate": 5.550041104970398e-06, "loss": 0.0436, "step": 2299 }, { "epoch": 1.5561569688768606, "grad_norm": 0.3780508913737479, "learning_rate": 5.5461279958867556e-06, "loss": 0.0348, "step": 2300 }, { "epoch": 1.5568335588633289, "grad_norm": 0.7123830628748699, "learning_rate": 5.542214548233195e-06, "loss": 0.0427, "step": 2301 }, { "epoch": 1.557510148849797, "grad_norm": 0.44688756294255394, "learning_rate": 5.538300764435838e-06, "loss": 0.0372, "step": 2302 }, { "epoch": 1.5581867388362651, "grad_norm": 0.31050222373873143, "learning_rate": 5.534386646921023e-06, "loss": 0.028, "step": 2303 }, { "epoch": 1.5588633288227334, "grad_norm": 0.2474705453010173, "learning_rate": 5.530472198115291e-06, "loss": 0.0224, "step": 2304 }, { "epoch": 1.5595399188092016, "grad_norm": 0.4243315473822393, "learning_rate": 5.52655742044539e-06, "loss": 0.0434, "step": 2305 }, { "epoch": 1.5602165087956699, "grad_norm": 0.3073804022735799, "learning_rate": 5.522642316338268e-06, "loss": 0.024, "step": 2306 }, { "epoch": 1.560893098782138, "grad_norm": 0.30508481179625546, "learning_rate": 5.518726888221082e-06, "loss": 0.0261, "step": 2307 }, { "epoch": 1.5615696887686061, "grad_norm": 0.3645415300263851, "learning_rate": 5.514811138521186e-06, "loss": 0.0457, "step": 2308 }, { "epoch": 1.5622462787550744, "grad_norm": 0.3731880232304017, "learning_rate": 5.510895069666132e-06, "loss": 0.0332, "step": 2309 }, { "epoch": 1.5629228687415426, "grad_norm": 0.29881040608542214, "learning_rate": 5.506978684083672e-06, "loss": 0.0322, "step": 2310 }, { "epoch": 1.5635994587280109, "grad_norm": 0.4139647800047502, "learning_rate": 5.503061984201755e-06, "loss": 0.0353, "step": 2311 }, { "epoch": 1.5642760487144791, "grad_norm": 0.26339450128141606, "learning_rate": 5.499144972448525e-06, "loss": 0.0221, "step": 2312 }, { "epoch": 1.5649526387009471, "grad_norm": 0.27030552970296745, "learning_rate": 5.495227651252315e-06, "loss": 0.0281, "step": 2313 }, { "epoch": 1.5656292286874154, "grad_norm": 0.3430571955467693, "learning_rate": 5.4913100230416536e-06, "loss": 0.0259, "step": 2314 }, { "epoch": 1.5663058186738836, "grad_norm": 0.41751827223862137, "learning_rate": 5.48739209024526e-06, "loss": 0.0432, "step": 2315 }, { "epoch": 1.5669824086603519, "grad_norm": 0.38928508117837146, "learning_rate": 5.483473855292043e-06, "loss": 0.0509, "step": 2316 }, { "epoch": 1.5676589986468201, "grad_norm": 0.597502646451781, "learning_rate": 5.479555320611094e-06, "loss": 0.0389, "step": 2317 }, { "epoch": 1.5683355886332881, "grad_norm": 0.30570336204655224, "learning_rate": 5.475636488631697e-06, "loss": 0.0331, "step": 2318 }, { "epoch": 1.5690121786197564, "grad_norm": 0.3724477977801581, "learning_rate": 5.471717361783312e-06, "loss": 0.0351, "step": 2319 }, { "epoch": 1.5696887686062246, "grad_norm": 0.3717105129186735, "learning_rate": 5.46779794249559e-06, "loss": 0.0327, "step": 2320 }, { "epoch": 1.5703653585926927, "grad_norm": 0.32443787395412504, "learning_rate": 5.463878233198358e-06, "loss": 0.0338, "step": 2321 }, { "epoch": 1.5710419485791611, "grad_norm": 0.2759446934870076, "learning_rate": 5.459958236321625e-06, "loss": 0.0242, "step": 2322 }, { "epoch": 1.5717185385656292, "grad_norm": 0.3020330020083215, "learning_rate": 5.4560379542955766e-06, "loss": 0.0312, "step": 2323 }, { "epoch": 1.5723951285520974, "grad_norm": 0.7976163557677621, "learning_rate": 5.45211738955058e-06, "loss": 0.0342, "step": 2324 }, { "epoch": 1.5730717185385656, "grad_norm": 0.30519498323082467, "learning_rate": 5.448196544517168e-06, "loss": 0.0289, "step": 2325 }, { "epoch": 1.5737483085250337, "grad_norm": 0.45976303799466584, "learning_rate": 5.444275421626058e-06, "loss": 0.0303, "step": 2326 }, { "epoch": 1.5744248985115021, "grad_norm": 0.5640631094334173, "learning_rate": 5.440354023308134e-06, "loss": 0.04, "step": 2327 }, { "epoch": 1.5751014884979702, "grad_norm": 0.3212546983456934, "learning_rate": 5.436432351994452e-06, "loss": 0.0315, "step": 2328 }, { "epoch": 1.5757780784844384, "grad_norm": 0.2697860133399002, "learning_rate": 5.4325104101162345e-06, "loss": 0.0215, "step": 2329 }, { "epoch": 1.5764546684709067, "grad_norm": 0.45636905948917267, "learning_rate": 5.428588200104875e-06, "loss": 0.0304, "step": 2330 }, { "epoch": 1.5771312584573747, "grad_norm": 0.42084923241510136, "learning_rate": 5.4246657243919345e-06, "loss": 0.0319, "step": 2331 }, { "epoch": 1.5778078484438431, "grad_norm": 0.3275868438390289, "learning_rate": 5.420742985409132e-06, "loss": 0.0333, "step": 2332 }, { "epoch": 1.5784844384303112, "grad_norm": 0.356416041888783, "learning_rate": 5.41681998558836e-06, "loss": 0.0338, "step": 2333 }, { "epoch": 1.5791610284167794, "grad_norm": 0.2923914256852067, "learning_rate": 5.412896727361663e-06, "loss": 0.0275, "step": 2334 }, { "epoch": 1.5798376184032477, "grad_norm": 0.40928252595061093, "learning_rate": 5.408973213161251e-06, "loss": 0.0301, "step": 2335 }, { "epoch": 1.5805142083897157, "grad_norm": 0.3685539206759366, "learning_rate": 5.405049445419488e-06, "loss": 0.0331, "step": 2336 }, { "epoch": 1.5811907983761841, "grad_norm": 0.29125932019013834, "learning_rate": 5.401125426568904e-06, "loss": 0.0285, "step": 2337 }, { "epoch": 1.5818673883626522, "grad_norm": 0.43011279214871445, "learning_rate": 5.397201159042176e-06, "loss": 0.0358, "step": 2338 }, { "epoch": 1.5825439783491204, "grad_norm": 0.5382754877623533, "learning_rate": 5.393276645272139e-06, "loss": 0.0361, "step": 2339 }, { "epoch": 1.5832205683355887, "grad_norm": 0.4224877364186262, "learning_rate": 5.3893518876917795e-06, "loss": 0.0382, "step": 2340 }, { "epoch": 1.5838971583220567, "grad_norm": 0.5357649876397851, "learning_rate": 5.385426888734237e-06, "loss": 0.0273, "step": 2341 }, { "epoch": 1.5845737483085252, "grad_norm": 0.3313339071953703, "learning_rate": 5.381501650832798e-06, "loss": 0.0285, "step": 2342 }, { "epoch": 1.5852503382949932, "grad_norm": 0.37701724090596933, "learning_rate": 5.377576176420899e-06, "loss": 0.0298, "step": 2343 }, { "epoch": 1.5859269282814614, "grad_norm": 0.38807658159329533, "learning_rate": 5.373650467932122e-06, "loss": 0.0301, "step": 2344 }, { "epoch": 1.5866035182679297, "grad_norm": 0.2930869365951142, "learning_rate": 5.3697245278001956e-06, "loss": 0.0256, "step": 2345 }, { "epoch": 1.5872801082543977, "grad_norm": 0.34352526166541836, "learning_rate": 5.365798358458989e-06, "loss": 0.031, "step": 2346 }, { "epoch": 1.5879566982408662, "grad_norm": 0.4198102347999944, "learning_rate": 5.361871962342519e-06, "loss": 0.0354, "step": 2347 }, { "epoch": 1.5886332882273342, "grad_norm": 0.4117109193265844, "learning_rate": 5.357945341884936e-06, "loss": 0.0444, "step": 2348 }, { "epoch": 1.5893098782138024, "grad_norm": 0.42754919195883917, "learning_rate": 5.354018499520536e-06, "loss": 0.0195, "step": 2349 }, { "epoch": 1.5899864682002707, "grad_norm": 0.29354593613448376, "learning_rate": 5.350091437683746e-06, "loss": 0.0308, "step": 2350 }, { "epoch": 1.5906630581867387, "grad_norm": 0.2599587562244208, "learning_rate": 5.346164158809136e-06, "loss": 0.0209, "step": 2351 }, { "epoch": 1.5913396481732072, "grad_norm": 0.5137231231769998, "learning_rate": 5.342236665331407e-06, "loss": 0.0447, "step": 2352 }, { "epoch": 1.5920162381596752, "grad_norm": 0.28814223157523017, "learning_rate": 5.338308959685391e-06, "loss": 0.0293, "step": 2353 }, { "epoch": 1.5926928281461434, "grad_norm": 0.3324234006014201, "learning_rate": 5.334381044306057e-06, "loss": 0.0275, "step": 2354 }, { "epoch": 1.5933694181326117, "grad_norm": 0.8135230429138659, "learning_rate": 5.3304529216284974e-06, "loss": 0.039, "step": 2355 }, { "epoch": 1.5940460081190797, "grad_norm": 0.24029161254048562, "learning_rate": 5.32652459408794e-06, "loss": 0.0219, "step": 2356 }, { "epoch": 1.5947225981055482, "grad_norm": 0.39312159225943955, "learning_rate": 5.322596064119731e-06, "loss": 0.0319, "step": 2357 }, { "epoch": 1.5953991880920162, "grad_norm": 0.31450535898429005, "learning_rate": 5.318667334159354e-06, "loss": 0.0277, "step": 2358 }, { "epoch": 1.5960757780784844, "grad_norm": 0.28790052265587107, "learning_rate": 5.314738406642405e-06, "loss": 0.0255, "step": 2359 }, { "epoch": 1.5967523680649527, "grad_norm": 0.3466919896879651, "learning_rate": 5.310809284004608e-06, "loss": 0.0257, "step": 2360 }, { "epoch": 1.5974289580514207, "grad_norm": 0.30185329987038856, "learning_rate": 5.306879968681808e-06, "loss": 0.0242, "step": 2361 }, { "epoch": 1.5981055480378892, "grad_norm": 0.476468114203344, "learning_rate": 5.30295046310997e-06, "loss": 0.0379, "step": 2362 }, { "epoch": 1.5987821380243572, "grad_norm": 0.3266348346067902, "learning_rate": 5.299020769725172e-06, "loss": 0.0306, "step": 2363 }, { "epoch": 1.5994587280108254, "grad_norm": 0.3510866291031492, "learning_rate": 5.2950908909636144e-06, "loss": 0.0301, "step": 2364 }, { "epoch": 1.6001353179972937, "grad_norm": 0.29795316071876665, "learning_rate": 5.2911608292616116e-06, "loss": 0.0238, "step": 2365 }, { "epoch": 1.6008119079837617, "grad_norm": 0.3357903221858778, "learning_rate": 5.2872305870555874e-06, "loss": 0.0336, "step": 2366 }, { "epoch": 1.6014884979702302, "grad_norm": 0.3222633508321025, "learning_rate": 5.2833001667820815e-06, "loss": 0.0305, "step": 2367 }, { "epoch": 1.6021650879566982, "grad_norm": 0.3013168859199103, "learning_rate": 5.279369570877742e-06, "loss": 0.0245, "step": 2368 }, { "epoch": 1.6028416779431665, "grad_norm": 0.3594003014147857, "learning_rate": 5.275438801779328e-06, "loss": 0.0423, "step": 2369 }, { "epoch": 1.6035182679296347, "grad_norm": 0.4092351587208865, "learning_rate": 5.271507861923701e-06, "loss": 0.0498, "step": 2370 }, { "epoch": 1.6041948579161027, "grad_norm": 0.33582306878695767, "learning_rate": 5.267576753747839e-06, "loss": 0.028, "step": 2371 }, { "epoch": 1.6048714479025712, "grad_norm": 0.3557381764205679, "learning_rate": 5.263645479688807e-06, "loss": 0.0417, "step": 2372 }, { "epoch": 1.6055480378890392, "grad_norm": 0.348355854143184, "learning_rate": 5.2597140421837915e-06, "loss": 0.0345, "step": 2373 }, { "epoch": 1.6062246278755075, "grad_norm": 0.3523634460129781, "learning_rate": 5.255782443670068e-06, "loss": 0.0297, "step": 2374 }, { "epoch": 1.6069012178619757, "grad_norm": 0.41331346487260207, "learning_rate": 5.251850686585015e-06, "loss": 0.0376, "step": 2375 }, { "epoch": 1.6075778078484437, "grad_norm": 0.9209189140563419, "learning_rate": 5.247918773366112e-06, "loss": 0.0387, "step": 2376 }, { "epoch": 1.6082543978349122, "grad_norm": 0.2853025462118463, "learning_rate": 5.243986706450933e-06, "loss": 0.037, "step": 2377 }, { "epoch": 1.6089309878213802, "grad_norm": 0.269536875487749, "learning_rate": 5.240054488277148e-06, "loss": 0.0252, "step": 2378 }, { "epoch": 1.6096075778078485, "grad_norm": 0.43886888149438347, "learning_rate": 5.2361221212825175e-06, "loss": 0.0381, "step": 2379 }, { "epoch": 1.6102841677943167, "grad_norm": 0.3042639669769169, "learning_rate": 5.2321896079048994e-06, "loss": 0.0299, "step": 2380 }, { "epoch": 1.6109607577807847, "grad_norm": 0.3857116430689522, "learning_rate": 5.2282569505822414e-06, "loss": 0.027, "step": 2381 }, { "epoch": 1.6116373477672532, "grad_norm": 0.36262497922029185, "learning_rate": 5.224324151752575e-06, "loss": 0.0388, "step": 2382 }, { "epoch": 1.6123139377537212, "grad_norm": 0.34505466311143673, "learning_rate": 5.220391213854028e-06, "loss": 0.0348, "step": 2383 }, { "epoch": 1.6129905277401895, "grad_norm": 0.37328643137737, "learning_rate": 5.216458139324806e-06, "loss": 0.0311, "step": 2384 }, { "epoch": 1.6136671177266577, "grad_norm": 0.2640567293362263, "learning_rate": 5.212524930603205e-06, "loss": 0.0271, "step": 2385 }, { "epoch": 1.6143437077131257, "grad_norm": 0.29616927061434556, "learning_rate": 5.208591590127603e-06, "loss": 0.0297, "step": 2386 }, { "epoch": 1.6150202976995942, "grad_norm": 0.24550333446499212, "learning_rate": 5.2046581203364585e-06, "loss": 0.0261, "step": 2387 }, { "epoch": 1.6156968876860622, "grad_norm": 0.29369395840996215, "learning_rate": 5.200724523668311e-06, "loss": 0.0245, "step": 2388 }, { "epoch": 1.6163734776725305, "grad_norm": 0.2683000629854068, "learning_rate": 5.196790802561776e-06, "loss": 0.0221, "step": 2389 }, { "epoch": 1.6170500676589987, "grad_norm": 0.34670876389850336, "learning_rate": 5.192856959455552e-06, "loss": 0.0391, "step": 2390 }, { "epoch": 1.6177266576454667, "grad_norm": 0.36926778359752926, "learning_rate": 5.188922996788409e-06, "loss": 0.0333, "step": 2391 }, { "epoch": 1.618403247631935, "grad_norm": 0.31394306373678366, "learning_rate": 5.184988916999191e-06, "loss": 0.0272, "step": 2392 }, { "epoch": 1.6190798376184032, "grad_norm": 0.2900385153166722, "learning_rate": 5.181054722526815e-06, "loss": 0.0249, "step": 2393 }, { "epoch": 1.6197564276048715, "grad_norm": 0.2782432898626616, "learning_rate": 5.177120415810271e-06, "loss": 0.0277, "step": 2394 }, { "epoch": 1.6204330175913397, "grad_norm": 0.4515117647214088, "learning_rate": 5.173185999288615e-06, "loss": 0.0254, "step": 2395 }, { "epoch": 1.6211096075778078, "grad_norm": 0.3454856270531984, "learning_rate": 5.1692514754009744e-06, "loss": 0.033, "step": 2396 }, { "epoch": 1.621786197564276, "grad_norm": 0.2614263919421152, "learning_rate": 5.165316846586541e-06, "loss": 0.0238, "step": 2397 }, { "epoch": 1.6224627875507442, "grad_norm": 0.34588880208782086, "learning_rate": 5.161382115284576e-06, "loss": 0.0263, "step": 2398 }, { "epoch": 1.6231393775372125, "grad_norm": 0.3024368957354851, "learning_rate": 5.1574472839343956e-06, "loss": 0.0255, "step": 2399 }, { "epoch": 1.6238159675236807, "grad_norm": 0.3230674290561768, "learning_rate": 5.153512354975388e-06, "loss": 0.0292, "step": 2400 }, { "epoch": 1.6244925575101488, "grad_norm": 0.33268775259496547, "learning_rate": 5.1495773308469935e-06, "loss": 0.0242, "step": 2401 }, { "epoch": 1.625169147496617, "grad_norm": 0.3434921825358834, "learning_rate": 5.145642213988716e-06, "loss": 0.0282, "step": 2402 }, { "epoch": 1.6258457374830853, "grad_norm": 0.28930577449493566, "learning_rate": 5.1417070068401165e-06, "loss": 0.0261, "step": 2403 }, { "epoch": 1.6265223274695535, "grad_norm": 0.41573144359694525, "learning_rate": 5.137771711840811e-06, "loss": 0.0332, "step": 2404 }, { "epoch": 1.6271989174560217, "grad_norm": 0.30668670795523817, "learning_rate": 5.133836331430469e-06, "loss": 0.0334, "step": 2405 }, { "epoch": 1.6278755074424898, "grad_norm": 0.38110277999573, "learning_rate": 5.129900868048817e-06, "loss": 0.0335, "step": 2406 }, { "epoch": 1.628552097428958, "grad_norm": 0.3087558173880505, "learning_rate": 5.1259653241356275e-06, "loss": 0.0267, "step": 2407 }, { "epoch": 1.6292286874154263, "grad_norm": 0.36578731267965897, "learning_rate": 5.1220297021307275e-06, "loss": 0.0244, "step": 2408 }, { "epoch": 1.6299052774018945, "grad_norm": 0.30876751239844696, "learning_rate": 5.11809400447399e-06, "loss": 0.0295, "step": 2409 }, { "epoch": 1.6305818673883627, "grad_norm": 0.3979178676891502, "learning_rate": 5.114158233605334e-06, "loss": 0.0333, "step": 2410 }, { "epoch": 1.6312584573748308, "grad_norm": 0.4297870999131062, "learning_rate": 5.110222391964728e-06, "loss": 0.0414, "step": 2411 }, { "epoch": 1.631935047361299, "grad_norm": 0.31896124832207173, "learning_rate": 5.106286481992179e-06, "loss": 0.0351, "step": 2412 }, { "epoch": 1.6326116373477673, "grad_norm": 0.23096169080932097, "learning_rate": 5.1023505061277405e-06, "loss": 0.0227, "step": 2413 }, { "epoch": 1.6332882273342353, "grad_norm": 0.3413028792115987, "learning_rate": 5.098414466811504e-06, "loss": 0.0374, "step": 2414 }, { "epoch": 1.6339648173207038, "grad_norm": 0.45352921659674916, "learning_rate": 5.094478366483604e-06, "loss": 0.0339, "step": 2415 }, { "epoch": 1.6346414073071718, "grad_norm": 0.4903068175504176, "learning_rate": 5.090542207584207e-06, "loss": 0.0321, "step": 2416 }, { "epoch": 1.63531799729364, "grad_norm": 0.4244606937624204, "learning_rate": 5.086605992553524e-06, "loss": 0.0297, "step": 2417 }, { "epoch": 1.6359945872801083, "grad_norm": 0.3013416344160587, "learning_rate": 5.082669723831793e-06, "loss": 0.0287, "step": 2418 }, { "epoch": 1.6366711772665763, "grad_norm": 0.3393425034746203, "learning_rate": 5.07873340385929e-06, "loss": 0.0302, "step": 2419 }, { "epoch": 1.6373477672530448, "grad_norm": 0.3068218935542081, "learning_rate": 5.074797035076319e-06, "loss": 0.0412, "step": 2420 }, { "epoch": 1.6380243572395128, "grad_norm": 0.3287251477257142, "learning_rate": 5.070860619923218e-06, "loss": 0.0348, "step": 2421 }, { "epoch": 1.638700947225981, "grad_norm": 0.2956531549869574, "learning_rate": 5.066924160840353e-06, "loss": 0.0291, "step": 2422 }, { "epoch": 1.6393775372124493, "grad_norm": 0.40633426779896475, "learning_rate": 5.062987660268114e-06, "loss": 0.0399, "step": 2423 }, { "epoch": 1.6400541271989173, "grad_norm": 0.35961004553091297, "learning_rate": 5.059051120646924e-06, "loss": 0.0349, "step": 2424 }, { "epoch": 1.6407307171853858, "grad_norm": 0.4275448811460775, "learning_rate": 5.055114544417219e-06, "loss": 0.0324, "step": 2425 }, { "epoch": 1.6414073071718538, "grad_norm": 0.2749068378521967, "learning_rate": 5.051177934019468e-06, "loss": 0.0265, "step": 2426 }, { "epoch": 1.642083897158322, "grad_norm": 0.31688921004024967, "learning_rate": 5.047241291894156e-06, "loss": 0.0257, "step": 2427 }, { "epoch": 1.6427604871447903, "grad_norm": 0.3297456720984391, "learning_rate": 5.043304620481791e-06, "loss": 0.0245, "step": 2428 }, { "epoch": 1.6434370771312583, "grad_norm": 0.32117853754405856, "learning_rate": 5.039367922222894e-06, "loss": 0.0293, "step": 2429 }, { "epoch": 1.6441136671177268, "grad_norm": 0.2742748087180567, "learning_rate": 5.035431199558008e-06, "loss": 0.0262, "step": 2430 }, { "epoch": 1.6447902571041948, "grad_norm": 0.39990732911072946, "learning_rate": 5.031494454927688e-06, "loss": 0.0286, "step": 2431 }, { "epoch": 1.645466847090663, "grad_norm": 0.3086953831666762, "learning_rate": 5.027557690772503e-06, "loss": 0.0283, "step": 2432 }, { "epoch": 1.6461434370771313, "grad_norm": 0.3869513723833396, "learning_rate": 5.0236209095330344e-06, "loss": 0.0304, "step": 2433 }, { "epoch": 1.6468200270635993, "grad_norm": 0.363960957477567, "learning_rate": 5.019684113649877e-06, "loss": 0.0321, "step": 2434 }, { "epoch": 1.6474966170500678, "grad_norm": 0.3834734462758225, "learning_rate": 5.0157473055636285e-06, "loss": 0.0402, "step": 2435 }, { "epoch": 1.6481732070365358, "grad_norm": 0.3305860583863713, "learning_rate": 5.011810487714901e-06, "loss": 0.0333, "step": 2436 }, { "epoch": 1.648849797023004, "grad_norm": 0.3971169135342817, "learning_rate": 5.007873662544306e-06, "loss": 0.0371, "step": 2437 }, { "epoch": 1.6495263870094723, "grad_norm": 0.3796118208418505, "learning_rate": 5.003936832492465e-06, "loss": 0.028, "step": 2438 }, { "epoch": 1.6502029769959403, "grad_norm": 0.40048953027734274, "learning_rate": 5e-06, "loss": 0.0352, "step": 2439 }, { "epoch": 1.6508795669824088, "grad_norm": 0.26909899622698136, "learning_rate": 4.9960631675075364e-06, "loss": 0.0258, "step": 2440 }, { "epoch": 1.6515561569688768, "grad_norm": 0.30799193045806944, "learning_rate": 4.9921263374556946e-06, "loss": 0.0316, "step": 2441 }, { "epoch": 1.652232746955345, "grad_norm": 0.28341985680868104, "learning_rate": 4.988189512285101e-06, "loss": 0.0318, "step": 2442 }, { "epoch": 1.6529093369418133, "grad_norm": 0.3149542562422749, "learning_rate": 4.984252694436373e-06, "loss": 0.0318, "step": 2443 }, { "epoch": 1.6535859269282813, "grad_norm": 0.4672164364042736, "learning_rate": 4.980315886350125e-06, "loss": 0.0281, "step": 2444 }, { "epoch": 1.6542625169147498, "grad_norm": 0.39497733540158936, "learning_rate": 4.976379090466966e-06, "loss": 0.0305, "step": 2445 }, { "epoch": 1.6549391069012178, "grad_norm": 0.3048033655998791, "learning_rate": 4.972442309227498e-06, "loss": 0.0326, "step": 2446 }, { "epoch": 1.655615696887686, "grad_norm": 0.3438137923735947, "learning_rate": 4.968505545072314e-06, "loss": 0.0252, "step": 2447 }, { "epoch": 1.6562922868741543, "grad_norm": 0.4526020897772555, "learning_rate": 4.964568800441993e-06, "loss": 0.0319, "step": 2448 }, { "epoch": 1.6569688768606223, "grad_norm": 0.4090532831167456, "learning_rate": 4.960632077777107e-06, "loss": 0.0374, "step": 2449 }, { "epoch": 1.6576454668470908, "grad_norm": 0.284886721313467, "learning_rate": 4.956695379518211e-06, "loss": 0.0255, "step": 2450 }, { "epoch": 1.6583220568335588, "grad_norm": 0.5135484250527513, "learning_rate": 4.952758708105845e-06, "loss": 0.0271, "step": 2451 }, { "epoch": 1.658998646820027, "grad_norm": 0.32544962255843, "learning_rate": 4.948822065980533e-06, "loss": 0.0293, "step": 2452 }, { "epoch": 1.6596752368064953, "grad_norm": 0.36630927223611487, "learning_rate": 4.944885455582783e-06, "loss": 0.0339, "step": 2453 }, { "epoch": 1.6603518267929633, "grad_norm": 0.4938733572333338, "learning_rate": 4.940948879353078e-06, "loss": 0.0393, "step": 2454 }, { "epoch": 1.6610284167794318, "grad_norm": 0.28527794194948547, "learning_rate": 4.937012339731886e-06, "loss": 0.0239, "step": 2455 }, { "epoch": 1.6617050067658998, "grad_norm": 0.35862330240430557, "learning_rate": 4.933075839159649e-06, "loss": 0.0367, "step": 2456 }, { "epoch": 1.662381596752368, "grad_norm": 0.31645960995043676, "learning_rate": 4.929139380076784e-06, "loss": 0.0296, "step": 2457 }, { "epoch": 1.6630581867388363, "grad_norm": 0.3838919728468458, "learning_rate": 4.9252029649236835e-06, "loss": 0.0344, "step": 2458 }, { "epoch": 1.6637347767253043, "grad_norm": 0.35149806363624375, "learning_rate": 4.921266596140712e-06, "loss": 0.0325, "step": 2459 }, { "epoch": 1.6644113667117728, "grad_norm": 0.31589055620712414, "learning_rate": 4.917330276168208e-06, "loss": 0.0258, "step": 2460 }, { "epoch": 1.6650879566982408, "grad_norm": 0.3752516646182594, "learning_rate": 4.913394007446477e-06, "loss": 0.0335, "step": 2461 }, { "epoch": 1.665764546684709, "grad_norm": 0.440045427122306, "learning_rate": 4.909457792415793e-06, "loss": 0.0373, "step": 2462 }, { "epoch": 1.6664411366711773, "grad_norm": 0.3143434042022228, "learning_rate": 4.905521633516399e-06, "loss": 0.027, "step": 2463 }, { "epoch": 1.6671177266576453, "grad_norm": 0.49174735617574794, "learning_rate": 4.9015855331884984e-06, "loss": 0.0441, "step": 2464 }, { "epoch": 1.6677943166441138, "grad_norm": 0.2501179762617151, "learning_rate": 4.897649493872262e-06, "loss": 0.0213, "step": 2465 }, { "epoch": 1.6684709066305818, "grad_norm": 0.26357079947156115, "learning_rate": 4.8937135180078236e-06, "loss": 0.0281, "step": 2466 }, { "epoch": 1.66914749661705, "grad_norm": 0.5002463762113306, "learning_rate": 4.889777608035273e-06, "loss": 0.0473, "step": 2467 }, { "epoch": 1.6698240866035183, "grad_norm": 0.2597923656039043, "learning_rate": 4.8858417663946665e-06, "loss": 0.0304, "step": 2468 }, { "epoch": 1.6705006765899864, "grad_norm": 0.30532394499123533, "learning_rate": 4.8819059955260105e-06, "loss": 0.0282, "step": 2469 }, { "epoch": 1.6711772665764548, "grad_norm": 0.3669725059102211, "learning_rate": 4.877970297869273e-06, "loss": 0.028, "step": 2470 }, { "epoch": 1.6718538565629228, "grad_norm": 0.2958032207690955, "learning_rate": 4.874034675864373e-06, "loss": 0.0308, "step": 2471 }, { "epoch": 1.672530446549391, "grad_norm": 0.29591575842976964, "learning_rate": 4.870099131951185e-06, "loss": 0.0261, "step": 2472 }, { "epoch": 1.6732070365358593, "grad_norm": 0.444414438652638, "learning_rate": 4.866163668569531e-06, "loss": 0.0356, "step": 2473 }, { "epoch": 1.6738836265223274, "grad_norm": 0.2284144790986997, "learning_rate": 4.862228288159191e-06, "loss": 0.0203, "step": 2474 }, { "epoch": 1.6745602165087958, "grad_norm": 0.3168089399181314, "learning_rate": 4.858292993159884e-06, "loss": 0.0266, "step": 2475 }, { "epoch": 1.6752368064952639, "grad_norm": 0.45337324058368567, "learning_rate": 4.854357786011286e-06, "loss": 0.0304, "step": 2476 }, { "epoch": 1.675913396481732, "grad_norm": 0.30255064608622795, "learning_rate": 4.850422669153009e-06, "loss": 0.0219, "step": 2477 }, { "epoch": 1.6765899864682003, "grad_norm": 0.550612196678239, "learning_rate": 4.846487645024614e-06, "loss": 0.037, "step": 2478 }, { "epoch": 1.6772665764546684, "grad_norm": 0.4686831388655964, "learning_rate": 4.842552716065605e-06, "loss": 0.0306, "step": 2479 }, { "epoch": 1.6779431664411368, "grad_norm": 0.4019281797042012, "learning_rate": 4.838617884715425e-06, "loss": 0.0258, "step": 2480 }, { "epoch": 1.6786197564276049, "grad_norm": 0.34616581680942493, "learning_rate": 4.8346831534134595e-06, "loss": 0.0266, "step": 2481 }, { "epoch": 1.679296346414073, "grad_norm": 1.1356553221371501, "learning_rate": 4.830748524599026e-06, "loss": 0.0457, "step": 2482 }, { "epoch": 1.6799729364005414, "grad_norm": 0.301710670880077, "learning_rate": 4.826814000711388e-06, "loss": 0.0288, "step": 2483 }, { "epoch": 1.6806495263870094, "grad_norm": 0.37289610939268875, "learning_rate": 4.822879584189732e-06, "loss": 0.0272, "step": 2484 }, { "epoch": 1.6813261163734776, "grad_norm": 0.34094199109719636, "learning_rate": 4.818945277473187e-06, "loss": 0.0299, "step": 2485 }, { "epoch": 1.6820027063599459, "grad_norm": 0.3537092429111352, "learning_rate": 4.81501108300081e-06, "loss": 0.0241, "step": 2486 }, { "epoch": 1.682679296346414, "grad_norm": 0.33125130972563754, "learning_rate": 4.811077003211592e-06, "loss": 0.0313, "step": 2487 }, { "epoch": 1.6833558863328824, "grad_norm": 0.3773197200871978, "learning_rate": 4.807143040544448e-06, "loss": 0.0367, "step": 2488 }, { "epoch": 1.6840324763193504, "grad_norm": 0.5071851311613988, "learning_rate": 4.803209197438224e-06, "loss": 0.0287, "step": 2489 }, { "epoch": 1.6847090663058186, "grad_norm": 0.3337540398275131, "learning_rate": 4.799275476331692e-06, "loss": 0.0325, "step": 2490 }, { "epoch": 1.6853856562922869, "grad_norm": 0.31481810707745517, "learning_rate": 4.795341879663543e-06, "loss": 0.0371, "step": 2491 }, { "epoch": 1.6860622462787551, "grad_norm": 0.26514726928104737, "learning_rate": 4.791408409872398e-06, "loss": 0.0332, "step": 2492 }, { "epoch": 1.6867388362652234, "grad_norm": 0.3757558973504405, "learning_rate": 4.787475069396796e-06, "loss": 0.0326, "step": 2493 }, { "epoch": 1.6874154262516914, "grad_norm": 0.31743216087205844, "learning_rate": 4.783541860675195e-06, "loss": 0.0301, "step": 2494 }, { "epoch": 1.6880920162381596, "grad_norm": 0.3045483562125322, "learning_rate": 4.779608786145974e-06, "loss": 0.032, "step": 2495 }, { "epoch": 1.6887686062246279, "grad_norm": 0.4876288542970148, "learning_rate": 4.775675848247427e-06, "loss": 0.0398, "step": 2496 }, { "epoch": 1.6894451962110961, "grad_norm": 0.33917181742610175, "learning_rate": 4.771743049417761e-06, "loss": 0.0266, "step": 2497 }, { "epoch": 1.6901217861975644, "grad_norm": 0.4500498165699519, "learning_rate": 4.767810392095102e-06, "loss": 0.0352, "step": 2498 }, { "epoch": 1.6907983761840324, "grad_norm": 0.46955843946063675, "learning_rate": 4.763877878717484e-06, "loss": 0.0293, "step": 2499 }, { "epoch": 1.6914749661705006, "grad_norm": 0.3647226957252124, "learning_rate": 4.759945511722854e-06, "loss": 0.0319, "step": 2500 }, { "epoch": 1.6921515561569689, "grad_norm": 0.3249651773966415, "learning_rate": 4.756013293549067e-06, "loss": 0.0327, "step": 2501 }, { "epoch": 1.6928281461434371, "grad_norm": 0.4342956740924432, "learning_rate": 4.752081226633888e-06, "loss": 0.0346, "step": 2502 }, { "epoch": 1.6935047361299054, "grad_norm": 0.314857610946973, "learning_rate": 4.748149313414987e-06, "loss": 0.0318, "step": 2503 }, { "epoch": 1.6941813261163734, "grad_norm": 0.3860453440238996, "learning_rate": 4.744217556329935e-06, "loss": 0.0307, "step": 2504 }, { "epoch": 1.6948579161028416, "grad_norm": 0.29012309026341493, "learning_rate": 4.740285957816211e-06, "loss": 0.0266, "step": 2505 }, { "epoch": 1.69553450608931, "grad_norm": 0.24644734015822517, "learning_rate": 4.736354520311194e-06, "loss": 0.0264, "step": 2506 }, { "epoch": 1.696211096075778, "grad_norm": 0.27533726904417705, "learning_rate": 4.732423246252164e-06, "loss": 0.0241, "step": 2507 }, { "epoch": 1.6968876860622464, "grad_norm": 0.3284741190072217, "learning_rate": 4.728492138076299e-06, "loss": 0.025, "step": 2508 }, { "epoch": 1.6975642760487144, "grad_norm": 0.3313886100791312, "learning_rate": 4.724561198220672e-06, "loss": 0.0277, "step": 2509 }, { "epoch": 1.6982408660351827, "grad_norm": 0.40421821706479716, "learning_rate": 4.7206304291222585e-06, "loss": 0.0374, "step": 2510 }, { "epoch": 1.698917456021651, "grad_norm": 0.27182154889405613, "learning_rate": 4.71669983321792e-06, "loss": 0.0259, "step": 2511 }, { "epoch": 1.699594046008119, "grad_norm": 0.3896546706705147, "learning_rate": 4.712769412944413e-06, "loss": 0.0481, "step": 2512 }, { "epoch": 1.7002706359945874, "grad_norm": 0.30186878962276914, "learning_rate": 4.70883917073839e-06, "loss": 0.0338, "step": 2513 }, { "epoch": 1.7009472259810554, "grad_norm": 0.3036270775035042, "learning_rate": 4.704909109036387e-06, "loss": 0.0314, "step": 2514 }, { "epoch": 1.7016238159675237, "grad_norm": 0.33079428571205244, "learning_rate": 4.700979230274829e-06, "loss": 0.031, "step": 2515 }, { "epoch": 1.702300405953992, "grad_norm": 0.3306106553601701, "learning_rate": 4.697049536890033e-06, "loss": 0.0244, "step": 2516 }, { "epoch": 1.70297699594046, "grad_norm": 0.2869525173971686, "learning_rate": 4.693120031318194e-06, "loss": 0.0245, "step": 2517 }, { "epoch": 1.7036535859269284, "grad_norm": 0.2852776126066731, "learning_rate": 4.6891907159953935e-06, "loss": 0.0333, "step": 2518 }, { "epoch": 1.7043301759133964, "grad_norm": 0.3109897284406666, "learning_rate": 4.685261593357598e-06, "loss": 0.0339, "step": 2519 }, { "epoch": 1.7050067658998647, "grad_norm": 0.33832212060387673, "learning_rate": 4.681332665840647e-06, "loss": 0.0316, "step": 2520 }, { "epoch": 1.705683355886333, "grad_norm": 0.48143311083384405, "learning_rate": 4.677403935880269e-06, "loss": 0.0435, "step": 2521 }, { "epoch": 1.706359945872801, "grad_norm": 0.3352376146112779, "learning_rate": 4.673475405912061e-06, "loss": 0.0347, "step": 2522 }, { "epoch": 1.7070365358592694, "grad_norm": 0.26621326568856285, "learning_rate": 4.669547078371503e-06, "loss": 0.0217, "step": 2523 }, { "epoch": 1.7077131258457374, "grad_norm": 0.27771554687799066, "learning_rate": 4.6656189556939446e-06, "loss": 0.0228, "step": 2524 }, { "epoch": 1.7083897158322057, "grad_norm": 0.3584084542308951, "learning_rate": 4.6616910403146095e-06, "loss": 0.0359, "step": 2525 }, { "epoch": 1.709066305818674, "grad_norm": 0.405356741402472, "learning_rate": 4.657763334668594e-06, "loss": 0.0341, "step": 2526 }, { "epoch": 1.709742895805142, "grad_norm": 0.2899337703771637, "learning_rate": 4.653835841190865e-06, "loss": 0.0319, "step": 2527 }, { "epoch": 1.7104194857916104, "grad_norm": 0.4226805581452788, "learning_rate": 4.649908562316255e-06, "loss": 0.0254, "step": 2528 }, { "epoch": 1.7110960757780784, "grad_norm": 0.3731275190776583, "learning_rate": 4.645981500479466e-06, "loss": 0.0272, "step": 2529 }, { "epoch": 1.7117726657645467, "grad_norm": 0.26488359407055706, "learning_rate": 4.6420546581150665e-06, "loss": 0.0254, "step": 2530 }, { "epoch": 1.712449255751015, "grad_norm": 0.2624951430826583, "learning_rate": 4.6381280376574836e-06, "loss": 0.0279, "step": 2531 }, { "epoch": 1.713125845737483, "grad_norm": 0.2714821116235528, "learning_rate": 4.634201641541013e-06, "loss": 0.029, "step": 2532 }, { "epoch": 1.7138024357239514, "grad_norm": 0.28853122708498474, "learning_rate": 4.630275472199805e-06, "loss": 0.0304, "step": 2533 }, { "epoch": 1.7144790257104194, "grad_norm": 0.3895490198463758, "learning_rate": 4.626349532067879e-06, "loss": 0.041, "step": 2534 }, { "epoch": 1.7151556156968877, "grad_norm": 0.35021800573512446, "learning_rate": 4.622423823579102e-06, "loss": 0.0298, "step": 2535 }, { "epoch": 1.715832205683356, "grad_norm": 0.2758281987995898, "learning_rate": 4.618498349167204e-06, "loss": 0.0335, "step": 2536 }, { "epoch": 1.716508795669824, "grad_norm": 0.5134424381152581, "learning_rate": 4.6145731112657644e-06, "loss": 0.0554, "step": 2537 }, { "epoch": 1.7171853856562924, "grad_norm": 0.3357813889948677, "learning_rate": 4.610648112308221e-06, "loss": 0.036, "step": 2538 }, { "epoch": 1.7178619756427604, "grad_norm": 0.28051604079149534, "learning_rate": 4.6067233547278614e-06, "loss": 0.0266, "step": 2539 }, { "epoch": 1.7185385656292287, "grad_norm": 0.2503535805027952, "learning_rate": 4.602798840957825e-06, "loss": 0.0236, "step": 2540 }, { "epoch": 1.719215155615697, "grad_norm": 0.29342839889218436, "learning_rate": 4.598874573431097e-06, "loss": 0.0304, "step": 2541 }, { "epoch": 1.719891745602165, "grad_norm": 0.35057222844367764, "learning_rate": 4.594950554580512e-06, "loss": 0.0336, "step": 2542 }, { "epoch": 1.7205683355886334, "grad_norm": 0.32832034013834693, "learning_rate": 4.5910267868387525e-06, "loss": 0.034, "step": 2543 }, { "epoch": 1.7212449255751014, "grad_norm": 0.33188864379037686, "learning_rate": 4.587103272638339e-06, "loss": 0.029, "step": 2544 }, { "epoch": 1.7219215155615697, "grad_norm": 0.2816727963933415, "learning_rate": 4.583180014411642e-06, "loss": 0.0289, "step": 2545 }, { "epoch": 1.722598105548038, "grad_norm": 0.23776351515810717, "learning_rate": 4.579257014590869e-06, "loss": 0.0211, "step": 2546 }, { "epoch": 1.723274695534506, "grad_norm": 0.3457653307991375, "learning_rate": 4.575334275608067e-06, "loss": 0.0274, "step": 2547 }, { "epoch": 1.7239512855209744, "grad_norm": 0.458375658403468, "learning_rate": 4.571411799895126e-06, "loss": 0.0322, "step": 2548 }, { "epoch": 1.7246278755074425, "grad_norm": 0.21555292570133547, "learning_rate": 4.567489589883766e-06, "loss": 0.0224, "step": 2549 }, { "epoch": 1.7253044654939107, "grad_norm": 0.49539941506089014, "learning_rate": 4.563567648005551e-06, "loss": 0.0461, "step": 2550 }, { "epoch": 1.725981055480379, "grad_norm": 0.23672890710910027, "learning_rate": 4.559645976691868e-06, "loss": 0.0205, "step": 2551 }, { "epoch": 1.726657645466847, "grad_norm": 0.3477761464178626, "learning_rate": 4.5557245783739425e-06, "loss": 0.0265, "step": 2552 }, { "epoch": 1.7273342354533154, "grad_norm": 0.4018184710552279, "learning_rate": 4.551803455482833e-06, "loss": 0.0391, "step": 2553 }, { "epoch": 1.7280108254397835, "grad_norm": 0.368434955098215, "learning_rate": 4.5478826104494225e-06, "loss": 0.0336, "step": 2554 }, { "epoch": 1.7286874154262517, "grad_norm": 0.21197450185826905, "learning_rate": 4.543962045704424e-06, "loss": 0.0146, "step": 2555 }, { "epoch": 1.72936400541272, "grad_norm": 0.31679353577409447, "learning_rate": 4.540041763678377e-06, "loss": 0.026, "step": 2556 }, { "epoch": 1.730040595399188, "grad_norm": 0.4608580369354584, "learning_rate": 4.536121766801645e-06, "loss": 0.0389, "step": 2557 }, { "epoch": 1.7307171853856564, "grad_norm": 0.35660547773058215, "learning_rate": 4.532202057504412e-06, "loss": 0.038, "step": 2558 }, { "epoch": 1.7313937753721245, "grad_norm": 0.2979235390461743, "learning_rate": 4.528282638216689e-06, "loss": 0.0282, "step": 2559 }, { "epoch": 1.7320703653585927, "grad_norm": 0.28853628014305954, "learning_rate": 4.524363511368304e-06, "loss": 0.0264, "step": 2560 }, { "epoch": 1.732746955345061, "grad_norm": 0.3837334521930021, "learning_rate": 4.520444679388906e-06, "loss": 0.0328, "step": 2561 }, { "epoch": 1.733423545331529, "grad_norm": 0.3310512307525053, "learning_rate": 4.516526144707957e-06, "loss": 0.0244, "step": 2562 }, { "epoch": 1.7341001353179974, "grad_norm": 0.3076839943784699, "learning_rate": 4.512607909754741e-06, "loss": 0.0277, "step": 2563 }, { "epoch": 1.7347767253044655, "grad_norm": 0.45622340126657446, "learning_rate": 4.508689976958348e-06, "loss": 0.0392, "step": 2564 }, { "epoch": 1.7354533152909337, "grad_norm": 0.28131755964827976, "learning_rate": 4.504772348747687e-06, "loss": 0.0286, "step": 2565 }, { "epoch": 1.736129905277402, "grad_norm": 0.30147650400360737, "learning_rate": 4.500855027551477e-06, "loss": 0.0293, "step": 2566 }, { "epoch": 1.73680649526387, "grad_norm": 0.33851406100999554, "learning_rate": 4.496938015798246e-06, "loss": 0.0263, "step": 2567 }, { "epoch": 1.7374830852503385, "grad_norm": 0.3561164566438635, "learning_rate": 4.493021315916328e-06, "loss": 0.033, "step": 2568 }, { "epoch": 1.7381596752368065, "grad_norm": 0.36434014933441217, "learning_rate": 4.48910493033387e-06, "loss": 0.0185, "step": 2569 }, { "epoch": 1.7388362652232747, "grad_norm": 0.3673289035810428, "learning_rate": 4.485188861478817e-06, "loss": 0.0292, "step": 2570 }, { "epoch": 1.739512855209743, "grad_norm": 0.28756688532278163, "learning_rate": 4.481273111778919e-06, "loss": 0.0239, "step": 2571 }, { "epoch": 1.740189445196211, "grad_norm": 0.36322743263883556, "learning_rate": 4.477357683661734e-06, "loss": 0.0414, "step": 2572 }, { "epoch": 1.7408660351826795, "grad_norm": 0.48899289689185416, "learning_rate": 4.473442579554612e-06, "loss": 0.0379, "step": 2573 }, { "epoch": 1.7415426251691475, "grad_norm": 0.4448440873212462, "learning_rate": 4.46952780188471e-06, "loss": 0.0259, "step": 2574 }, { "epoch": 1.7422192151556157, "grad_norm": 0.26910046156525524, "learning_rate": 4.465613353078978e-06, "loss": 0.0292, "step": 2575 }, { "epoch": 1.742895805142084, "grad_norm": 0.4618472054959546, "learning_rate": 4.461699235564164e-06, "loss": 0.0298, "step": 2576 }, { "epoch": 1.743572395128552, "grad_norm": 0.34108507079214767, "learning_rate": 4.457785451766808e-06, "loss": 0.035, "step": 2577 }, { "epoch": 1.7442489851150202, "grad_norm": 0.5639311474799743, "learning_rate": 4.453872004113247e-06, "loss": 0.0358, "step": 2578 }, { "epoch": 1.7449255751014885, "grad_norm": 0.3426306890416558, "learning_rate": 4.449958895029604e-06, "loss": 0.0332, "step": 2579 }, { "epoch": 1.7456021650879567, "grad_norm": 0.4145944361184108, "learning_rate": 4.446046126941801e-06, "loss": 0.0388, "step": 2580 }, { "epoch": 1.746278755074425, "grad_norm": 0.3062466633155877, "learning_rate": 4.442133702275539e-06, "loss": 0.0242, "step": 2581 }, { "epoch": 1.746955345060893, "grad_norm": 0.37681131182332217, "learning_rate": 4.438221623456312e-06, "loss": 0.0299, "step": 2582 }, { "epoch": 1.7476319350473613, "grad_norm": 0.35980152188751374, "learning_rate": 4.4343098929094e-06, "loss": 0.0279, "step": 2583 }, { "epoch": 1.7483085250338295, "grad_norm": 0.3527994633461267, "learning_rate": 4.4303985130598615e-06, "loss": 0.0335, "step": 2584 }, { "epoch": 1.7489851150202977, "grad_norm": 0.34657169139864935, "learning_rate": 4.426487486332544e-06, "loss": 0.0281, "step": 2585 }, { "epoch": 1.749661705006766, "grad_norm": 0.38271880544219666, "learning_rate": 4.42257681515207e-06, "loss": 0.0404, "step": 2586 }, { "epoch": 1.750338294993234, "grad_norm": 0.3493929775948048, "learning_rate": 4.4186665019428485e-06, "loss": 0.0351, "step": 2587 }, { "epoch": 1.7510148849797023, "grad_norm": 0.3887216055934382, "learning_rate": 4.41475654912906e-06, "loss": 0.0347, "step": 2588 }, { "epoch": 1.7516914749661705, "grad_norm": 0.36373045467764825, "learning_rate": 4.410846959134667e-06, "loss": 0.034, "step": 2589 }, { "epoch": 1.7523680649526387, "grad_norm": 0.3585495419331542, "learning_rate": 4.406937734383405e-06, "loss": 0.0213, "step": 2590 }, { "epoch": 1.753044654939107, "grad_norm": 0.26978918420242576, "learning_rate": 4.4030288772987795e-06, "loss": 0.0218, "step": 2591 }, { "epoch": 1.753721244925575, "grad_norm": 0.4913247264596912, "learning_rate": 4.399120390304072e-06, "loss": 0.0353, "step": 2592 }, { "epoch": 1.7543978349120433, "grad_norm": 0.3640852256705213, "learning_rate": 4.395212275822336e-06, "loss": 0.0316, "step": 2593 }, { "epoch": 1.7550744248985115, "grad_norm": 0.34787757904616445, "learning_rate": 4.391304536276389e-06, "loss": 0.029, "step": 2594 }, { "epoch": 1.7557510148849798, "grad_norm": 0.39433208435920863, "learning_rate": 4.3873971740888205e-06, "loss": 0.0253, "step": 2595 }, { "epoch": 1.756427604871448, "grad_norm": 0.3232978320311612, "learning_rate": 4.383490191681985e-06, "loss": 0.0307, "step": 2596 }, { "epoch": 1.757104194857916, "grad_norm": 0.31679177199140235, "learning_rate": 4.379583591477999e-06, "loss": 0.0266, "step": 2597 }, { "epoch": 1.7577807848443843, "grad_norm": 0.34946238304926175, "learning_rate": 4.375677375898746e-06, "loss": 0.0308, "step": 2598 }, { "epoch": 1.7584573748308525, "grad_norm": 0.44043705194334837, "learning_rate": 4.371771547365869e-06, "loss": 0.0416, "step": 2599 }, { "epoch": 1.7591339648173205, "grad_norm": 0.23837244376575895, "learning_rate": 4.367866108300769e-06, "loss": 0.027, "step": 2600 }, { "epoch": 1.759810554803789, "grad_norm": 0.3711163901861945, "learning_rate": 4.3639610611246106e-06, "loss": 0.0305, "step": 2601 }, { "epoch": 1.760487144790257, "grad_norm": 0.229365775241179, "learning_rate": 4.36005640825831e-06, "loss": 0.0279, "step": 2602 }, { "epoch": 1.7611637347767253, "grad_norm": 0.4369561044549791, "learning_rate": 4.3561521521225445e-06, "loss": 0.0331, "step": 2603 }, { "epoch": 1.7618403247631935, "grad_norm": 0.27636344202214114, "learning_rate": 4.352248295137739e-06, "loss": 0.0265, "step": 2604 }, { "epoch": 1.7625169147496615, "grad_norm": 0.29708739553259333, "learning_rate": 4.348344839724076e-06, "loss": 0.0218, "step": 2605 }, { "epoch": 1.76319350473613, "grad_norm": 0.30476715927779024, "learning_rate": 4.3444417883014885e-06, "loss": 0.0336, "step": 2606 }, { "epoch": 1.763870094722598, "grad_norm": 0.32606089081914547, "learning_rate": 4.340539143289655e-06, "loss": 0.0386, "step": 2607 }, { "epoch": 1.7645466847090663, "grad_norm": 0.6625495622679105, "learning_rate": 4.33663690710801e-06, "loss": 0.0269, "step": 2608 }, { "epoch": 1.7652232746955345, "grad_norm": 0.2261189032678472, "learning_rate": 4.332735082175724e-06, "loss": 0.021, "step": 2609 }, { "epoch": 1.7658998646820026, "grad_norm": 0.3677337458826156, "learning_rate": 4.3288336709117246e-06, "loss": 0.0277, "step": 2610 }, { "epoch": 1.766576454668471, "grad_norm": 0.28741855426393353, "learning_rate": 4.32493267573467e-06, "loss": 0.0334, "step": 2611 }, { "epoch": 1.767253044654939, "grad_norm": 0.30175491897174606, "learning_rate": 4.3210320990629696e-06, "loss": 0.0306, "step": 2612 }, { "epoch": 1.7679296346414073, "grad_norm": 0.4459154430617385, "learning_rate": 4.31713194331477e-06, "loss": 0.0367, "step": 2613 }, { "epoch": 1.7686062246278755, "grad_norm": 0.3301191373871344, "learning_rate": 4.313232210907959e-06, "loss": 0.0278, "step": 2614 }, { "epoch": 1.7692828146143436, "grad_norm": 0.3423866029784998, "learning_rate": 4.30933290426016e-06, "loss": 0.0349, "step": 2615 }, { "epoch": 1.769959404600812, "grad_norm": 0.2764470334730458, "learning_rate": 4.305434025788735e-06, "loss": 0.0279, "step": 2616 }, { "epoch": 1.77063599458728, "grad_norm": 0.33679865197336717, "learning_rate": 4.301535577910774e-06, "loss": 0.0312, "step": 2617 }, { "epoch": 1.7713125845737483, "grad_norm": 0.4093251697612486, "learning_rate": 4.297637563043106e-06, "loss": 0.0367, "step": 2618 }, { "epoch": 1.7719891745602165, "grad_norm": 0.35941913077867427, "learning_rate": 4.293739983602292e-06, "loss": 0.0345, "step": 2619 }, { "epoch": 1.7726657645466846, "grad_norm": 0.269367285779406, "learning_rate": 4.28984284200462e-06, "loss": 0.0323, "step": 2620 }, { "epoch": 1.773342354533153, "grad_norm": 0.2676923665995716, "learning_rate": 4.285946140666107e-06, "loss": 0.0258, "step": 2621 }, { "epoch": 1.774018944519621, "grad_norm": 0.24123311226995156, "learning_rate": 4.282049882002499e-06, "loss": 0.0234, "step": 2622 }, { "epoch": 1.7746955345060893, "grad_norm": 0.29092215210195627, "learning_rate": 4.278154068429268e-06, "loss": 0.0252, "step": 2623 }, { "epoch": 1.7753721244925575, "grad_norm": 0.2611541302096314, "learning_rate": 4.274258702361604e-06, "loss": 0.0301, "step": 2624 }, { "epoch": 1.7760487144790256, "grad_norm": 0.2877502258563499, "learning_rate": 4.270363786214427e-06, "loss": 0.0333, "step": 2625 }, { "epoch": 1.776725304465494, "grad_norm": 0.3961002448448291, "learning_rate": 4.266469322402374e-06, "loss": 0.0268, "step": 2626 }, { "epoch": 1.777401894451962, "grad_norm": 0.38927950641745135, "learning_rate": 4.2625753133398036e-06, "loss": 0.0284, "step": 2627 }, { "epoch": 1.7780784844384303, "grad_norm": 0.6194414898579439, "learning_rate": 4.25868176144079e-06, "loss": 0.0397, "step": 2628 }, { "epoch": 1.7787550744248986, "grad_norm": 0.342068842517698, "learning_rate": 4.254788669119127e-06, "loss": 0.0307, "step": 2629 }, { "epoch": 1.7794316644113666, "grad_norm": 0.27806995435172194, "learning_rate": 4.250896038788324e-06, "loss": 0.028, "step": 2630 }, { "epoch": 1.780108254397835, "grad_norm": 0.34842401607345286, "learning_rate": 4.247003872861598e-06, "loss": 0.0284, "step": 2631 }, { "epoch": 1.780784844384303, "grad_norm": 0.3546096166196037, "learning_rate": 4.2431121737518824e-06, "loss": 0.0253, "step": 2632 }, { "epoch": 1.7814614343707713, "grad_norm": 0.3522986221138865, "learning_rate": 4.239220943871823e-06, "loss": 0.0303, "step": 2633 }, { "epoch": 1.7821380243572396, "grad_norm": 0.30079475954802903, "learning_rate": 4.23533018563377e-06, "loss": 0.0221, "step": 2634 }, { "epoch": 1.7828146143437076, "grad_norm": 0.5347889112997443, "learning_rate": 4.231439901449788e-06, "loss": 0.0457, "step": 2635 }, { "epoch": 1.783491204330176, "grad_norm": 0.3461976124986228, "learning_rate": 4.227550093731641e-06, "loss": 0.0309, "step": 2636 }, { "epoch": 1.784167794316644, "grad_norm": 0.27407351545678477, "learning_rate": 4.223660764890799e-06, "loss": 0.0254, "step": 2637 }, { "epoch": 1.7848443843031123, "grad_norm": 0.30508100978344044, "learning_rate": 4.2197719173384374e-06, "loss": 0.0324, "step": 2638 }, { "epoch": 1.7855209742895806, "grad_norm": 0.3924144964947684, "learning_rate": 4.215883553485431e-06, "loss": 0.0297, "step": 2639 }, { "epoch": 1.7861975642760486, "grad_norm": 0.7441042381740874, "learning_rate": 4.211995675742358e-06, "loss": 0.0399, "step": 2640 }, { "epoch": 1.786874154262517, "grad_norm": 0.2705361268974501, "learning_rate": 4.208108286519491e-06, "loss": 0.0293, "step": 2641 }, { "epoch": 1.787550744248985, "grad_norm": 0.30342958290381394, "learning_rate": 4.204221388226803e-06, "loss": 0.0331, "step": 2642 }, { "epoch": 1.7882273342354533, "grad_norm": 0.26675480220215986, "learning_rate": 4.2003349832739624e-06, "loss": 0.0314, "step": 2643 }, { "epoch": 1.7889039242219216, "grad_norm": 0.3637247014588533, "learning_rate": 4.196449074070329e-06, "loss": 0.0275, "step": 2644 }, { "epoch": 1.7895805142083896, "grad_norm": 0.3042441928119871, "learning_rate": 4.1925636630249565e-06, "loss": 0.0357, "step": 2645 }, { "epoch": 1.790257104194858, "grad_norm": 0.29898440201684634, "learning_rate": 4.1886787525465914e-06, "loss": 0.0277, "step": 2646 }, { "epoch": 1.790933694181326, "grad_norm": 0.3115332185043352, "learning_rate": 4.184794345043668e-06, "loss": 0.0288, "step": 2647 }, { "epoch": 1.7916102841677943, "grad_norm": 0.3192481722771207, "learning_rate": 4.180910442924312e-06, "loss": 0.032, "step": 2648 }, { "epoch": 1.7922868741542626, "grad_norm": 0.35266106657956, "learning_rate": 4.17702704859633e-06, "loss": 0.0323, "step": 2649 }, { "epoch": 1.7929634641407306, "grad_norm": 0.2733674404274185, "learning_rate": 4.173144164467221e-06, "loss": 0.0311, "step": 2650 }, { "epoch": 1.793640054127199, "grad_norm": 0.3901746916827659, "learning_rate": 4.169261792944161e-06, "loss": 0.0256, "step": 2651 }, { "epoch": 1.794316644113667, "grad_norm": 0.4484340132928396, "learning_rate": 4.165379936434011e-06, "loss": 0.0371, "step": 2652 }, { "epoch": 1.7949932341001353, "grad_norm": 0.27622663319068086, "learning_rate": 4.161498597343313e-06, "loss": 0.026, "step": 2653 }, { "epoch": 1.7956698240866036, "grad_norm": 0.3073889438853535, "learning_rate": 4.15761777807829e-06, "loss": 0.0248, "step": 2654 }, { "epoch": 1.7963464140730716, "grad_norm": 0.4537979707068046, "learning_rate": 4.153737481044838e-06, "loss": 0.0308, "step": 2655 }, { "epoch": 1.79702300405954, "grad_norm": 0.2937261921157818, "learning_rate": 4.149857708648536e-06, "loss": 0.0251, "step": 2656 }, { "epoch": 1.797699594046008, "grad_norm": 0.3435845967720961, "learning_rate": 4.1459784632946295e-06, "loss": 0.0296, "step": 2657 }, { "epoch": 1.7983761840324763, "grad_norm": 0.22005209060832764, "learning_rate": 4.142099747388042e-06, "loss": 0.0198, "step": 2658 }, { "epoch": 1.7990527740189446, "grad_norm": 0.31058586661156673, "learning_rate": 4.138221563333371e-06, "loss": 0.0331, "step": 2659 }, { "epoch": 1.7997293640054126, "grad_norm": 0.37392366255520676, "learning_rate": 4.134343913534879e-06, "loss": 0.0329, "step": 2660 }, { "epoch": 1.800405953991881, "grad_norm": 0.3481303544278709, "learning_rate": 4.1304668003965016e-06, "loss": 0.027, "step": 2661 }, { "epoch": 1.801082543978349, "grad_norm": 0.32759244786674374, "learning_rate": 4.126590226321838e-06, "loss": 0.0313, "step": 2662 }, { "epoch": 1.8017591339648173, "grad_norm": 0.32188468006032117, "learning_rate": 4.12271419371416e-06, "loss": 0.0284, "step": 2663 }, { "epoch": 1.8024357239512856, "grad_norm": 0.24912883260939556, "learning_rate": 4.118838704976392e-06, "loss": 0.0196, "step": 2664 }, { "epoch": 1.8031123139377536, "grad_norm": 0.3189010629371992, "learning_rate": 4.114963762511134e-06, "loss": 0.0365, "step": 2665 }, { "epoch": 1.803788903924222, "grad_norm": 0.43688902084727205, "learning_rate": 4.111089368720635e-06, "loss": 0.0452, "step": 2666 }, { "epoch": 1.80446549391069, "grad_norm": 0.4638381936567443, "learning_rate": 4.107215526006818e-06, "loss": 0.0571, "step": 2667 }, { "epoch": 1.8051420838971584, "grad_norm": 0.4125074637965231, "learning_rate": 4.10334223677125e-06, "loss": 0.0309, "step": 2668 }, { "epoch": 1.8058186738836266, "grad_norm": 0.32727486615921647, "learning_rate": 4.099469503415167e-06, "loss": 0.0432, "step": 2669 }, { "epoch": 1.8064952638700946, "grad_norm": 0.3548546321761083, "learning_rate": 4.0955973283394525e-06, "loss": 0.0279, "step": 2670 }, { "epoch": 1.8071718538565629, "grad_norm": 0.5084283120306401, "learning_rate": 4.091725713944644e-06, "loss": 0.0303, "step": 2671 }, { "epoch": 1.8078484438430311, "grad_norm": 0.35906172546652476, "learning_rate": 4.087854662630937e-06, "loss": 0.0318, "step": 2672 }, { "epoch": 1.8085250338294994, "grad_norm": 0.3569372561861631, "learning_rate": 4.083984176798175e-06, "loss": 0.0356, "step": 2673 }, { "epoch": 1.8092016238159676, "grad_norm": 0.33727519351352203, "learning_rate": 4.080114258845846e-06, "loss": 0.0349, "step": 2674 }, { "epoch": 1.8098782138024356, "grad_norm": 0.3902204599412901, "learning_rate": 4.076244911173097e-06, "loss": 0.0229, "step": 2675 }, { "epoch": 1.8105548037889039, "grad_norm": 0.34510078431419167, "learning_rate": 4.072376136178712e-06, "loss": 0.0307, "step": 2676 }, { "epoch": 1.8112313937753721, "grad_norm": 0.5011029538347361, "learning_rate": 4.06850793626112e-06, "loss": 0.0421, "step": 2677 }, { "epoch": 1.8119079837618404, "grad_norm": 0.28226630423543714, "learning_rate": 4.064640313818401e-06, "loss": 0.028, "step": 2678 }, { "epoch": 1.8125845737483086, "grad_norm": 0.345911021189506, "learning_rate": 4.06077327124827e-06, "loss": 0.0375, "step": 2679 }, { "epoch": 1.8132611637347766, "grad_norm": 0.24967775445369783, "learning_rate": 4.056906810948086e-06, "loss": 0.022, "step": 2680 }, { "epoch": 1.8139377537212449, "grad_norm": 0.36561200139382116, "learning_rate": 4.053040935314845e-06, "loss": 0.0326, "step": 2681 }, { "epoch": 1.8146143437077131, "grad_norm": 0.44180192512844846, "learning_rate": 4.049175646745182e-06, "loss": 0.0397, "step": 2682 }, { "epoch": 1.8152909336941814, "grad_norm": 0.45165061132232887, "learning_rate": 4.045310947635369e-06, "loss": 0.0323, "step": 2683 }, { "epoch": 1.8159675236806496, "grad_norm": 0.27801313243001463, "learning_rate": 4.041446840381309e-06, "loss": 0.031, "step": 2684 }, { "epoch": 1.8166441136671176, "grad_norm": 0.3419275007101939, "learning_rate": 4.03758332737854e-06, "loss": 0.026, "step": 2685 }, { "epoch": 1.817320703653586, "grad_norm": 0.3191810698329368, "learning_rate": 4.033720411022235e-06, "loss": 0.0302, "step": 2686 }, { "epoch": 1.8179972936400541, "grad_norm": 0.37863581732633167, "learning_rate": 4.02985809370719e-06, "loss": 0.0316, "step": 2687 }, { "epoch": 1.8186738836265224, "grad_norm": 0.4903037856694656, "learning_rate": 4.025996377827836e-06, "loss": 0.0261, "step": 2688 }, { "epoch": 1.8193504736129906, "grad_norm": 0.33236139853862945, "learning_rate": 4.022135265778226e-06, "loss": 0.025, "step": 2689 }, { "epoch": 1.8200270635994586, "grad_norm": 0.36896052672555035, "learning_rate": 4.018274759952047e-06, "loss": 0.0312, "step": 2690 }, { "epoch": 1.820703653585927, "grad_norm": 0.38812391088835496, "learning_rate": 4.0144148627426e-06, "loss": 0.036, "step": 2691 }, { "epoch": 1.8213802435723951, "grad_norm": 0.415530489904815, "learning_rate": 4.010555576542812e-06, "loss": 0.03, "step": 2692 }, { "epoch": 1.8220568335588632, "grad_norm": 0.29735609643700006, "learning_rate": 4.006696903745236e-06, "loss": 0.0307, "step": 2693 }, { "epoch": 1.8227334235453316, "grad_norm": 0.3116141150028271, "learning_rate": 4.002838846742039e-06, "loss": 0.0264, "step": 2694 }, { "epoch": 1.8234100135317997, "grad_norm": 0.3288701080055487, "learning_rate": 3.998981407925009e-06, "loss": 0.0287, "step": 2695 }, { "epoch": 1.824086603518268, "grad_norm": 0.30263041578993677, "learning_rate": 3.995124589685552e-06, "loss": 0.0308, "step": 2696 }, { "epoch": 1.8247631935047361, "grad_norm": 0.286767356415873, "learning_rate": 3.991268394414685e-06, "loss": 0.0294, "step": 2697 }, { "epoch": 1.8254397834912042, "grad_norm": 0.39894473368693867, "learning_rate": 3.987412824503041e-06, "loss": 0.037, "step": 2698 }, { "epoch": 1.8261163734776726, "grad_norm": 0.3609337918371107, "learning_rate": 3.983557882340866e-06, "loss": 0.0274, "step": 2699 }, { "epoch": 1.8267929634641407, "grad_norm": 0.2664186103399177, "learning_rate": 3.979703570318017e-06, "loss": 0.0217, "step": 2700 }, { "epoch": 1.827469553450609, "grad_norm": 0.29757603731982274, "learning_rate": 3.97584989082396e-06, "loss": 0.0251, "step": 2701 }, { "epoch": 1.8281461434370772, "grad_norm": 0.26038998187107976, "learning_rate": 3.971996846247767e-06, "loss": 0.0247, "step": 2702 }, { "epoch": 1.8288227334235452, "grad_norm": 0.28279378785952597, "learning_rate": 3.968144438978121e-06, "loss": 0.0324, "step": 2703 }, { "epoch": 1.8294993234100136, "grad_norm": 0.24411594025384967, "learning_rate": 3.964292671403303e-06, "loss": 0.0197, "step": 2704 }, { "epoch": 1.8301759133964817, "grad_norm": 0.2704535430184035, "learning_rate": 3.960441545911205e-06, "loss": 0.0287, "step": 2705 }, { "epoch": 1.83085250338295, "grad_norm": 0.39949011410609037, "learning_rate": 3.956591064889313e-06, "loss": 0.035, "step": 2706 }, { "epoch": 1.8315290933694182, "grad_norm": 0.2856086640643192, "learning_rate": 3.952741230724721e-06, "loss": 0.0268, "step": 2707 }, { "epoch": 1.8322056833558862, "grad_norm": 0.5079320070658421, "learning_rate": 3.948892045804117e-06, "loss": 0.036, "step": 2708 }, { "epoch": 1.8328822733423547, "grad_norm": 0.4322514043741924, "learning_rate": 3.94504351251379e-06, "loss": 0.0265, "step": 2709 }, { "epoch": 1.8335588633288227, "grad_norm": 0.35313776553967785, "learning_rate": 3.9411956332396224e-06, "loss": 0.0194, "step": 2710 }, { "epoch": 1.834235453315291, "grad_norm": 0.8542069999554189, "learning_rate": 3.937348410367091e-06, "loss": 0.05, "step": 2711 }, { "epoch": 1.8349120433017592, "grad_norm": 0.5028097463837327, "learning_rate": 3.9335018462812664e-06, "loss": 0.0432, "step": 2712 }, { "epoch": 1.8355886332882272, "grad_norm": 0.5854402146375531, "learning_rate": 3.929655943366812e-06, "loss": 0.0277, "step": 2713 }, { "epoch": 1.8362652232746957, "grad_norm": 0.2546232565257687, "learning_rate": 3.92581070400798e-06, "loss": 0.0238, "step": 2714 }, { "epoch": 1.8369418132611637, "grad_norm": 0.35642062171429384, "learning_rate": 3.921966130588612e-06, "loss": 0.0276, "step": 2715 }, { "epoch": 1.837618403247632, "grad_norm": 0.29038713260225246, "learning_rate": 3.918122225492139e-06, "loss": 0.0255, "step": 2716 }, { "epoch": 1.8382949932341002, "grad_norm": 0.37323932909437657, "learning_rate": 3.914278991101568e-06, "loss": 0.0422, "step": 2717 }, { "epoch": 1.8389715832205682, "grad_norm": 0.31474906581008993, "learning_rate": 3.910436429799503e-06, "loss": 0.0232, "step": 2718 }, { "epoch": 1.8396481732070367, "grad_norm": 0.2768259932365074, "learning_rate": 3.906594543968122e-06, "loss": 0.0231, "step": 2719 }, { "epoch": 1.8403247631935047, "grad_norm": 0.3194073338438866, "learning_rate": 3.902753335989188e-06, "loss": 0.026, "step": 2720 }, { "epoch": 1.841001353179973, "grad_norm": 0.34944793044078193, "learning_rate": 3.898912808244043e-06, "loss": 0.0255, "step": 2721 }, { "epoch": 1.8416779431664412, "grad_norm": 0.39429096759598886, "learning_rate": 3.895072963113607e-06, "loss": 0.0378, "step": 2722 }, { "epoch": 1.8423545331529092, "grad_norm": 0.27068499815451585, "learning_rate": 3.89123380297838e-06, "loss": 0.0221, "step": 2723 }, { "epoch": 1.8430311231393777, "grad_norm": 0.2915361354998442, "learning_rate": 3.887395330218429e-06, "loss": 0.032, "step": 2724 }, { "epoch": 1.8437077131258457, "grad_norm": 0.2229913450537913, "learning_rate": 3.883557547213404e-06, "loss": 0.0243, "step": 2725 }, { "epoch": 1.844384303112314, "grad_norm": 0.3006017740086933, "learning_rate": 3.8797204563425215e-06, "loss": 0.026, "step": 2726 }, { "epoch": 1.8450608930987822, "grad_norm": 0.40672761974402877, "learning_rate": 3.875884059984571e-06, "loss": 0.0395, "step": 2727 }, { "epoch": 1.8457374830852502, "grad_norm": 0.5611133117859646, "learning_rate": 3.872048360517914e-06, "loss": 0.032, "step": 2728 }, { "epoch": 1.8464140730717187, "grad_norm": 0.30084265326422077, "learning_rate": 3.868213360320474e-06, "loss": 0.029, "step": 2729 }, { "epoch": 1.8470906630581867, "grad_norm": 0.29063238857425405, "learning_rate": 3.864379061769749e-06, "loss": 0.0272, "step": 2730 }, { "epoch": 1.847767253044655, "grad_norm": 0.3234239508261535, "learning_rate": 3.860545467242793e-06, "loss": 0.0295, "step": 2731 }, { "epoch": 1.8484438430311232, "grad_norm": 0.2660123733234617, "learning_rate": 3.856712579116229e-06, "loss": 0.0278, "step": 2732 }, { "epoch": 1.8491204330175912, "grad_norm": 0.2748480970316418, "learning_rate": 3.852880399766243e-06, "loss": 0.026, "step": 2733 }, { "epoch": 1.8497970230040597, "grad_norm": 0.4317617381556935, "learning_rate": 3.8490489315685764e-06, "loss": 0.0266, "step": 2734 }, { "epoch": 1.8504736129905277, "grad_norm": 0.256923637429559, "learning_rate": 3.845218176898537e-06, "loss": 0.0221, "step": 2735 }, { "epoch": 1.851150202976996, "grad_norm": 0.312933981334049, "learning_rate": 3.8413881381309845e-06, "loss": 0.0334, "step": 2736 }, { "epoch": 1.8518267929634642, "grad_norm": 0.33124520546911174, "learning_rate": 3.837558817640334e-06, "loss": 0.0262, "step": 2737 }, { "epoch": 1.8525033829499322, "grad_norm": 0.3169719756976648, "learning_rate": 3.8337302178005605e-06, "loss": 0.0314, "step": 2738 }, { "epoch": 1.8531799729364007, "grad_norm": 0.316587761312504, "learning_rate": 3.829902340985189e-06, "loss": 0.0308, "step": 2739 }, { "epoch": 1.8538565629228687, "grad_norm": 0.42645154956655024, "learning_rate": 3.826075189567296e-06, "loss": 0.0412, "step": 2740 }, { "epoch": 1.854533152909337, "grad_norm": 0.37969987986458503, "learning_rate": 3.82224876591951e-06, "loss": 0.0426, "step": 2741 }, { "epoch": 1.8552097428958052, "grad_norm": 0.36015108092892856, "learning_rate": 3.818423072414007e-06, "loss": 0.0285, "step": 2742 }, { "epoch": 1.8558863328822732, "grad_norm": 0.4391809591592842, "learning_rate": 3.8145981114225135e-06, "loss": 0.0357, "step": 2743 }, { "epoch": 1.8565629228687417, "grad_norm": 0.4010653251507355, "learning_rate": 3.8107738853162953e-06, "loss": 0.0308, "step": 2744 }, { "epoch": 1.8572395128552097, "grad_norm": 0.3286902490342025, "learning_rate": 3.8069503964661656e-06, "loss": 0.0282, "step": 2745 }, { "epoch": 1.857916102841678, "grad_norm": 0.26128621160738486, "learning_rate": 3.803127647242486e-06, "loss": 0.025, "step": 2746 }, { "epoch": 1.8585926928281462, "grad_norm": 0.339093432402349, "learning_rate": 3.7993056400151516e-06, "loss": 0.0307, "step": 2747 }, { "epoch": 1.8592692828146142, "grad_norm": 0.38196393377361754, "learning_rate": 3.795484377153601e-06, "loss": 0.0323, "step": 2748 }, { "epoch": 1.8599458728010827, "grad_norm": 0.35884647469612424, "learning_rate": 3.791663861026814e-06, "loss": 0.0308, "step": 2749 }, { "epoch": 1.8606224627875507, "grad_norm": 0.305251079044217, "learning_rate": 3.787844094003302e-06, "loss": 0.0261, "step": 2750 }, { "epoch": 1.861299052774019, "grad_norm": 0.41917420647087306, "learning_rate": 3.7840250784511147e-06, "loss": 0.0346, "step": 2751 }, { "epoch": 1.8619756427604872, "grad_norm": 0.3114860675137204, "learning_rate": 3.780206816737837e-06, "loss": 0.0293, "step": 2752 }, { "epoch": 1.8626522327469552, "grad_norm": 0.3543280815249511, "learning_rate": 3.776389311230584e-06, "loss": 0.0384, "step": 2753 }, { "epoch": 1.8633288227334237, "grad_norm": 0.37800835051382997, "learning_rate": 3.7725725642960047e-06, "loss": 0.0351, "step": 2754 }, { "epoch": 1.8640054127198917, "grad_norm": 0.3372101682646651, "learning_rate": 3.7687565783002754e-06, "loss": 0.0251, "step": 2755 }, { "epoch": 1.86468200270636, "grad_norm": 0.2936300964126187, "learning_rate": 3.7649413556091047e-06, "loss": 0.0255, "step": 2756 }, { "epoch": 1.8653585926928282, "grad_norm": 0.35035127791984333, "learning_rate": 3.7611268985877213e-06, "loss": 0.0296, "step": 2757 }, { "epoch": 1.8660351826792962, "grad_norm": 0.28936593925166243, "learning_rate": 3.7573132096008843e-06, "loss": 0.0287, "step": 2758 }, { "epoch": 1.8667117726657647, "grad_norm": 0.23329604171618087, "learning_rate": 3.753500291012874e-06, "loss": 0.029, "step": 2759 }, { "epoch": 1.8673883626522327, "grad_norm": 0.8916747746121758, "learning_rate": 3.749688145187497e-06, "loss": 0.0318, "step": 2760 }, { "epoch": 1.868064952638701, "grad_norm": 0.5408879210119105, "learning_rate": 3.7458767744880763e-06, "loss": 0.0365, "step": 2761 }, { "epoch": 1.8687415426251692, "grad_norm": 0.27138970699536663, "learning_rate": 3.7420661812774577e-06, "loss": 0.0247, "step": 2762 }, { "epoch": 1.8694181326116373, "grad_norm": 0.2976193614642239, "learning_rate": 3.738256367918004e-06, "loss": 0.0287, "step": 2763 }, { "epoch": 1.8700947225981055, "grad_norm": 0.373410928693832, "learning_rate": 3.734447336771591e-06, "loss": 0.0387, "step": 2764 }, { "epoch": 1.8707713125845737, "grad_norm": 0.31411327834361735, "learning_rate": 3.730639090199616e-06, "loss": 0.0335, "step": 2765 }, { "epoch": 1.871447902571042, "grad_norm": 0.33230149170239837, "learning_rate": 3.7268316305629836e-06, "loss": 0.0302, "step": 2766 }, { "epoch": 1.8721244925575102, "grad_norm": 0.33934420407494387, "learning_rate": 3.7230249602221163e-06, "loss": 0.0309, "step": 2767 }, { "epoch": 1.8728010825439783, "grad_norm": 0.29769907147717395, "learning_rate": 3.719219081536942e-06, "loss": 0.0259, "step": 2768 }, { "epoch": 1.8734776725304465, "grad_norm": 0.37091430711250334, "learning_rate": 3.7154139968669043e-06, "loss": 0.0359, "step": 2769 }, { "epoch": 1.8741542625169147, "grad_norm": 0.35193881203090227, "learning_rate": 3.711609708570948e-06, "loss": 0.0302, "step": 2770 }, { "epoch": 1.874830852503383, "grad_norm": 0.3674154203085046, "learning_rate": 3.7078062190075264e-06, "loss": 0.0277, "step": 2771 }, { "epoch": 1.8755074424898512, "grad_norm": 0.3715442409426697, "learning_rate": 3.704003530534597e-06, "loss": 0.0263, "step": 2772 }, { "epoch": 1.8761840324763193, "grad_norm": 0.33092732279836434, "learning_rate": 3.7002016455096247e-06, "loss": 0.0361, "step": 2773 }, { "epoch": 1.8768606224627875, "grad_norm": 0.23898592217454684, "learning_rate": 3.696400566289571e-06, "loss": 0.0223, "step": 2774 }, { "epoch": 1.8775372124492558, "grad_norm": 0.22496138181328645, "learning_rate": 3.6926002952309015e-06, "loss": 0.0219, "step": 2775 }, { "epoch": 1.878213802435724, "grad_norm": 0.27929651919297416, "learning_rate": 3.6888008346895797e-06, "loss": 0.0251, "step": 2776 }, { "epoch": 1.8788903924221922, "grad_norm": 0.33097525483542106, "learning_rate": 3.685002187021064e-06, "loss": 0.0232, "step": 2777 }, { "epoch": 1.8795669824086603, "grad_norm": 0.45521315284837427, "learning_rate": 3.681204354580313e-06, "loss": 0.031, "step": 2778 }, { "epoch": 1.8802435723951285, "grad_norm": 0.3521683025846162, "learning_rate": 3.6774073397217786e-06, "loss": 0.034, "step": 2779 }, { "epoch": 1.8809201623815968, "grad_norm": 0.29050614400709096, "learning_rate": 3.6736111447994026e-06, "loss": 0.0195, "step": 2780 }, { "epoch": 1.881596752368065, "grad_norm": 0.2268436847521333, "learning_rate": 3.669815772166625e-06, "loss": 0.0196, "step": 2781 }, { "epoch": 1.8822733423545333, "grad_norm": 0.32915281484908304, "learning_rate": 3.6660212241763692e-06, "loss": 0.0279, "step": 2782 }, { "epoch": 1.8829499323410013, "grad_norm": 0.34819830031525706, "learning_rate": 3.662227503181054e-06, "loss": 0.0357, "step": 2783 }, { "epoch": 1.8836265223274695, "grad_norm": 0.5626748289559241, "learning_rate": 3.658434611532578e-06, "loss": 0.0392, "step": 2784 }, { "epoch": 1.8843031123139378, "grad_norm": 0.2884633592876496, "learning_rate": 3.65464255158233e-06, "loss": 0.0256, "step": 2785 }, { "epoch": 1.8849797023004058, "grad_norm": 0.20175821768168112, "learning_rate": 3.6508513256811856e-06, "loss": 0.0187, "step": 2786 }, { "epoch": 1.8856562922868743, "grad_norm": 0.31282576530748646, "learning_rate": 3.6470609361794972e-06, "loss": 0.0255, "step": 2787 }, { "epoch": 1.8863328822733423, "grad_norm": 0.29362490198931823, "learning_rate": 3.643271385427105e-06, "loss": 0.0234, "step": 2788 }, { "epoch": 1.8870094722598105, "grad_norm": 0.37555876152309803, "learning_rate": 3.639482675773324e-06, "loss": 0.0352, "step": 2789 }, { "epoch": 1.8876860622462788, "grad_norm": 0.27726655452882504, "learning_rate": 3.635694809566954e-06, "loss": 0.0273, "step": 2790 }, { "epoch": 1.8883626522327468, "grad_norm": 0.335796705296422, "learning_rate": 3.6319077891562616e-06, "loss": 0.0294, "step": 2791 }, { "epoch": 1.8890392422192153, "grad_norm": 0.4186741035926321, "learning_rate": 3.6281216168889993e-06, "loss": 0.0417, "step": 2792 }, { "epoch": 1.8897158322056833, "grad_norm": 0.2734011889483481, "learning_rate": 3.624336295112388e-06, "loss": 0.0249, "step": 2793 }, { "epoch": 1.8903924221921515, "grad_norm": 0.2893345296623684, "learning_rate": 3.6205518261731247e-06, "loss": 0.0318, "step": 2794 }, { "epoch": 1.8910690121786198, "grad_norm": 0.2862134610686118, "learning_rate": 3.616768212417375e-06, "loss": 0.0244, "step": 2795 }, { "epoch": 1.8917456021650878, "grad_norm": 0.3173863870232804, "learning_rate": 3.6129854561907786e-06, "loss": 0.025, "step": 2796 }, { "epoch": 1.8924221921515563, "grad_norm": 0.43928982539968053, "learning_rate": 3.6092035598384356e-06, "loss": 0.0317, "step": 2797 }, { "epoch": 1.8930987821380243, "grad_norm": 0.43430236350593565, "learning_rate": 3.6054225257049204e-06, "loss": 0.0355, "step": 2798 }, { "epoch": 1.8937753721244925, "grad_norm": 0.3224927637106134, "learning_rate": 3.6016423561342707e-06, "loss": 0.0339, "step": 2799 }, { "epoch": 1.8944519621109608, "grad_norm": 0.38319398271183497, "learning_rate": 3.5978630534699873e-06, "loss": 0.0316, "step": 2800 }, { "epoch": 1.8951285520974288, "grad_norm": 0.2640394799838839, "learning_rate": 3.5940846200550327e-06, "loss": 0.024, "step": 2801 }, { "epoch": 1.8958051420838973, "grad_norm": 0.547423435207649, "learning_rate": 3.5903070582318356e-06, "loss": 0.0406, "step": 2802 }, { "epoch": 1.8964817320703653, "grad_norm": 0.2993570275064508, "learning_rate": 3.5865303703422794e-06, "loss": 0.0255, "step": 2803 }, { "epoch": 1.8971583220568335, "grad_norm": 0.6498621705762274, "learning_rate": 3.5827545587277033e-06, "loss": 0.0463, "step": 2804 }, { "epoch": 1.8978349120433018, "grad_norm": 0.28677708356869375, "learning_rate": 3.5789796257289117e-06, "loss": 0.0268, "step": 2805 }, { "epoch": 1.8985115020297698, "grad_norm": 0.3740110206219344, "learning_rate": 3.5752055736861567e-06, "loss": 0.0399, "step": 2806 }, { "epoch": 1.8991880920162383, "grad_norm": 0.36112072535398315, "learning_rate": 3.571432404939149e-06, "loss": 0.0253, "step": 2807 }, { "epoch": 1.8998646820027063, "grad_norm": 0.33223983356346587, "learning_rate": 3.567660121827048e-06, "loss": 0.0308, "step": 2808 }, { "epoch": 1.9005412719891746, "grad_norm": 0.33198553490346355, "learning_rate": 3.5638887266884682e-06, "loss": 0.0261, "step": 2809 }, { "epoch": 1.9012178619756428, "grad_norm": 0.3097804507949422, "learning_rate": 3.5601182218614706e-06, "loss": 0.0275, "step": 2810 }, { "epoch": 1.9018944519621108, "grad_norm": 0.24213424466380748, "learning_rate": 3.5563486096835643e-06, "loss": 0.0221, "step": 2811 }, { "epoch": 1.9025710419485793, "grad_norm": 0.4353504298756395, "learning_rate": 3.552579892491704e-06, "loss": 0.0289, "step": 2812 }, { "epoch": 1.9032476319350473, "grad_norm": 0.5192967445604084, "learning_rate": 3.548812072622294e-06, "loss": 0.0276, "step": 2813 }, { "epoch": 1.9039242219215156, "grad_norm": 0.4643938534061608, "learning_rate": 3.545045152411178e-06, "loss": 0.0385, "step": 2814 }, { "epoch": 1.9046008119079838, "grad_norm": 0.2651575826675033, "learning_rate": 3.5412791341936446e-06, "loss": 0.0208, "step": 2815 }, { "epoch": 1.9052774018944518, "grad_norm": 0.35986295997323603, "learning_rate": 3.5375140203044233e-06, "loss": 0.0269, "step": 2816 }, { "epoch": 1.9059539918809203, "grad_norm": 0.25351053252441075, "learning_rate": 3.533749813077677e-06, "loss": 0.0236, "step": 2817 }, { "epoch": 1.9066305818673883, "grad_norm": 0.255312732213536, "learning_rate": 3.5299865148470157e-06, "loss": 0.0263, "step": 2818 }, { "epoch": 1.9073071718538566, "grad_norm": 0.31432772507999474, "learning_rate": 3.526224127945479e-06, "loss": 0.0349, "step": 2819 }, { "epoch": 1.9079837618403248, "grad_norm": 0.3648883357223518, "learning_rate": 3.5224626547055463e-06, "loss": 0.031, "step": 2820 }, { "epoch": 1.9086603518267928, "grad_norm": 0.2595013606780327, "learning_rate": 3.518702097459126e-06, "loss": 0.0321, "step": 2821 }, { "epoch": 1.9093369418132613, "grad_norm": 0.33185174904071235, "learning_rate": 3.5149424585375623e-06, "loss": 0.0346, "step": 2822 }, { "epoch": 1.9100135317997293, "grad_norm": 0.28682765527430787, "learning_rate": 3.5111837402716297e-06, "loss": 0.0303, "step": 2823 }, { "epoch": 1.9106901217861976, "grad_norm": 0.3847310611875437, "learning_rate": 3.507425944991529e-06, "loss": 0.0251, "step": 2824 }, { "epoch": 1.9113667117726658, "grad_norm": 0.3072857451242148, "learning_rate": 3.5036690750268897e-06, "loss": 0.0279, "step": 2825 }, { "epoch": 1.9120433017591338, "grad_norm": 0.23586619817907892, "learning_rate": 3.499913132706771e-06, "loss": 0.0209, "step": 2826 }, { "epoch": 1.9127198917456023, "grad_norm": 0.32330068896389064, "learning_rate": 3.496158120359653e-06, "loss": 0.0332, "step": 2827 }, { "epoch": 1.9133964817320703, "grad_norm": 0.2751689962619378, "learning_rate": 3.492404040313443e-06, "loss": 0.0323, "step": 2828 }, { "epoch": 1.9140730717185386, "grad_norm": 0.38804992328882687, "learning_rate": 3.4886508948954656e-06, "loss": 0.0273, "step": 2829 }, { "epoch": 1.9147496617050068, "grad_norm": 0.24126696684207583, "learning_rate": 3.484898686432473e-06, "loss": 0.0221, "step": 2830 }, { "epoch": 1.9154262516914748, "grad_norm": 0.4072994040840315, "learning_rate": 3.4811474172506277e-06, "loss": 0.0323, "step": 2831 }, { "epoch": 1.9161028416779433, "grad_norm": 0.30978967865564594, "learning_rate": 3.4773970896755167e-06, "loss": 0.03, "step": 2832 }, { "epoch": 1.9167794316644113, "grad_norm": 0.3871956545025896, "learning_rate": 3.4736477060321387e-06, "loss": 0.0334, "step": 2833 }, { "epoch": 1.9174560216508796, "grad_norm": 0.32370998680942925, "learning_rate": 3.469899268644913e-06, "loss": 0.0262, "step": 2834 }, { "epoch": 1.9181326116373478, "grad_norm": 0.28469184318348056, "learning_rate": 3.466151779837665e-06, "loss": 0.0309, "step": 2835 }, { "epoch": 1.9188092016238159, "grad_norm": 0.2795111004603757, "learning_rate": 3.4624052419336395e-06, "loss": 0.0249, "step": 2836 }, { "epoch": 1.9194857916102843, "grad_norm": 0.2683034552931114, "learning_rate": 3.458659657255486e-06, "loss": 0.0219, "step": 2837 }, { "epoch": 1.9201623815967523, "grad_norm": 0.256257738319039, "learning_rate": 3.4549150281252635e-06, "loss": 0.0277, "step": 2838 }, { "epoch": 1.9208389715832206, "grad_norm": 0.33001994167723386, "learning_rate": 3.4511713568644432e-06, "loss": 0.0242, "step": 2839 }, { "epoch": 1.9215155615696888, "grad_norm": 0.3366863569402713, "learning_rate": 3.4474286457938976e-06, "loss": 0.0329, "step": 2840 }, { "epoch": 1.9221921515561569, "grad_norm": 0.3280153443728017, "learning_rate": 3.4436868972339073e-06, "loss": 0.0281, "step": 2841 }, { "epoch": 1.9228687415426253, "grad_norm": 0.3350255705132479, "learning_rate": 3.4399461135041525e-06, "loss": 0.0202, "step": 2842 }, { "epoch": 1.9235453315290933, "grad_norm": 0.29081080464471826, "learning_rate": 3.4362062969237227e-06, "loss": 0.0235, "step": 2843 }, { "epoch": 1.9242219215155616, "grad_norm": 0.2621212734470656, "learning_rate": 3.4324674498110956e-06, "loss": 0.0235, "step": 2844 }, { "epoch": 1.9248985115020298, "grad_norm": 0.3380242772080527, "learning_rate": 3.4287295744841588e-06, "loss": 0.0262, "step": 2845 }, { "epoch": 1.9255751014884979, "grad_norm": 0.43964945789010224, "learning_rate": 3.4249926732601914e-06, "loss": 0.0238, "step": 2846 }, { "epoch": 1.9262516914749663, "grad_norm": 0.2710127173580096, "learning_rate": 3.4212567484558735e-06, "loss": 0.0263, "step": 2847 }, { "epoch": 1.9269282814614344, "grad_norm": 0.26758662188394866, "learning_rate": 3.4175218023872753e-06, "loss": 0.0287, "step": 2848 }, { "epoch": 1.9276048714479026, "grad_norm": 0.3432446968148877, "learning_rate": 3.413787837369863e-06, "loss": 0.0278, "step": 2849 }, { "epoch": 1.9282814614343708, "grad_norm": 0.2539660225248182, "learning_rate": 3.4100548557184944e-06, "loss": 0.0216, "step": 2850 }, { "epoch": 1.9289580514208389, "grad_norm": 0.4090591753487882, "learning_rate": 3.4063228597474133e-06, "loss": 0.0286, "step": 2851 }, { "epoch": 1.9296346414073073, "grad_norm": 0.2759084303466922, "learning_rate": 3.40259185177026e-06, "loss": 0.0275, "step": 2852 }, { "epoch": 1.9303112313937754, "grad_norm": 0.2875694538711192, "learning_rate": 3.3988618341000566e-06, "loss": 0.0255, "step": 2853 }, { "epoch": 1.9309878213802436, "grad_norm": 0.23398353942622174, "learning_rate": 3.395132809049212e-06, "loss": 0.0188, "step": 2854 }, { "epoch": 1.9316644113667119, "grad_norm": 0.3082346577153797, "learning_rate": 3.391404778929523e-06, "loss": 0.0274, "step": 2855 }, { "epoch": 1.9323410013531799, "grad_norm": 0.36266806858388334, "learning_rate": 3.3876777460521647e-06, "loss": 0.0279, "step": 2856 }, { "epoch": 1.9330175913396481, "grad_norm": 0.32350208627996513, "learning_rate": 3.383951712727701e-06, "loss": 0.0338, "step": 2857 }, { "epoch": 1.9336941813261164, "grad_norm": 0.2780696818957692, "learning_rate": 3.3802266812660674e-06, "loss": 0.0262, "step": 2858 }, { "epoch": 1.9343707713125846, "grad_norm": 0.24737711764911177, "learning_rate": 3.3765026539765832e-06, "loss": 0.0233, "step": 2859 }, { "epoch": 1.9350473612990529, "grad_norm": 0.29716412290164596, "learning_rate": 3.372779633167946e-06, "loss": 0.0263, "step": 2860 }, { "epoch": 1.9357239512855209, "grad_norm": 0.3110587006998367, "learning_rate": 3.369057621148227e-06, "loss": 0.0202, "step": 2861 }, { "epoch": 1.9364005412719891, "grad_norm": 0.30215191701365823, "learning_rate": 3.3653366202248738e-06, "loss": 0.039, "step": 2862 }, { "epoch": 1.9370771312584574, "grad_norm": 0.29661925810829787, "learning_rate": 3.3616166327047084e-06, "loss": 0.0262, "step": 2863 }, { "epoch": 1.9377537212449256, "grad_norm": 0.3110558330578675, "learning_rate": 3.3578976608939184e-06, "loss": 0.0325, "step": 2864 }, { "epoch": 1.9384303112313939, "grad_norm": 0.49275140021791575, "learning_rate": 3.3541797070980663e-06, "loss": 0.044, "step": 2865 }, { "epoch": 1.939106901217862, "grad_norm": 0.23297219224990126, "learning_rate": 3.3504627736220863e-06, "loss": 0.0215, "step": 2866 }, { "epoch": 1.9397834912043301, "grad_norm": 0.3354038043048891, "learning_rate": 3.3467468627702736e-06, "loss": 0.0285, "step": 2867 }, { "epoch": 1.9404600811907984, "grad_norm": 0.3270926549379668, "learning_rate": 3.3430319768462956e-06, "loss": 0.0383, "step": 2868 }, { "epoch": 1.9411366711772666, "grad_norm": 0.3071772486584624, "learning_rate": 3.3393181181531785e-06, "loss": 0.034, "step": 2869 }, { "epoch": 1.9418132611637349, "grad_norm": 0.29458655876061124, "learning_rate": 3.3356052889933177e-06, "loss": 0.0254, "step": 2870 }, { "epoch": 1.942489851150203, "grad_norm": 0.4383006372306806, "learning_rate": 3.331893491668464e-06, "loss": 0.0392, "step": 2871 }, { "epoch": 1.9431664411366711, "grad_norm": 0.21682665553500347, "learning_rate": 3.3281827284797317e-06, "loss": 0.0137, "step": 2872 }, { "epoch": 1.9438430311231394, "grad_norm": 0.4158094120952702, "learning_rate": 3.3244730017275974e-06, "loss": 0.0338, "step": 2873 }, { "epoch": 1.9445196211096076, "grad_norm": 0.2673568657738758, "learning_rate": 3.3207643137118872e-06, "loss": 0.0227, "step": 2874 }, { "epoch": 1.9451962110960759, "grad_norm": 0.334600763745565, "learning_rate": 3.3170566667317917e-06, "loss": 0.0269, "step": 2875 }, { "epoch": 1.945872801082544, "grad_norm": 0.34411316409786435, "learning_rate": 3.3133500630858507e-06, "loss": 0.0298, "step": 2876 }, { "epoch": 1.9465493910690121, "grad_norm": 0.24721970581413663, "learning_rate": 3.309644505071959e-06, "loss": 0.026, "step": 2877 }, { "epoch": 1.9472259810554804, "grad_norm": 0.3337237396309158, "learning_rate": 3.3059399949873605e-06, "loss": 0.0441, "step": 2878 }, { "epoch": 1.9479025710419484, "grad_norm": 0.46310487928044103, "learning_rate": 3.3022365351286545e-06, "loss": 0.0216, "step": 2879 }, { "epoch": 1.9485791610284169, "grad_norm": 0.27732618646127893, "learning_rate": 3.298534127791785e-06, "loss": 0.0295, "step": 2880 }, { "epoch": 1.949255751014885, "grad_norm": 0.4301121159037394, "learning_rate": 3.2948327752720464e-06, "loss": 0.0357, "step": 2881 }, { "epoch": 1.9499323410013532, "grad_norm": 0.2607109714269544, "learning_rate": 3.2911324798640764e-06, "loss": 0.0311, "step": 2882 }, { "epoch": 1.9506089309878214, "grad_norm": 0.3760848305883796, "learning_rate": 3.2874332438618607e-06, "loss": 0.0364, "step": 2883 }, { "epoch": 1.9512855209742894, "grad_norm": 0.5506064272337056, "learning_rate": 3.2837350695587237e-06, "loss": 0.0199, "step": 2884 }, { "epoch": 1.951962110960758, "grad_norm": 0.341082574308908, "learning_rate": 3.280037959247336e-06, "loss": 0.0252, "step": 2885 }, { "epoch": 1.952638700947226, "grad_norm": 0.3719396780015257, "learning_rate": 3.276341915219704e-06, "loss": 0.035, "step": 2886 }, { "epoch": 1.9533152909336942, "grad_norm": 0.3037992621206681, "learning_rate": 3.2726469397671797e-06, "loss": 0.026, "step": 2887 }, { "epoch": 1.9539918809201624, "grad_norm": 0.44635049308627206, "learning_rate": 3.268953035180445e-06, "loss": 0.0486, "step": 2888 }, { "epoch": 1.9546684709066304, "grad_norm": 0.282533040163484, "learning_rate": 3.2652602037495247e-06, "loss": 0.0274, "step": 2889 }, { "epoch": 1.955345060893099, "grad_norm": 0.3544760189574648, "learning_rate": 3.261568447763775e-06, "loss": 0.0349, "step": 2890 }, { "epoch": 1.956021650879567, "grad_norm": 0.3731113037171858, "learning_rate": 3.2578777695118822e-06, "loss": 0.0425, "step": 2891 }, { "epoch": 1.9566982408660352, "grad_norm": 0.29749706884463234, "learning_rate": 3.254188171281871e-06, "loss": 0.0156, "step": 2892 }, { "epoch": 1.9573748308525034, "grad_norm": 0.3173142864726359, "learning_rate": 3.2504996553610924e-06, "loss": 0.0307, "step": 2893 }, { "epoch": 1.9580514208389714, "grad_norm": 0.35033591623661653, "learning_rate": 3.2468122240362287e-06, "loss": 0.0344, "step": 2894 }, { "epoch": 1.95872801082544, "grad_norm": 0.3322956626141054, "learning_rate": 3.2431258795932863e-06, "loss": 0.0314, "step": 2895 }, { "epoch": 1.959404600811908, "grad_norm": 0.3749911133312623, "learning_rate": 3.2394406243176025e-06, "loss": 0.032, "step": 2896 }, { "epoch": 1.9600811907983762, "grad_norm": 0.2466760191045161, "learning_rate": 3.2357564604938363e-06, "loss": 0.026, "step": 2897 }, { "epoch": 1.9607577807848444, "grad_norm": 0.31893554908880567, "learning_rate": 3.232073390405969e-06, "loss": 0.0239, "step": 2898 }, { "epoch": 1.9614343707713124, "grad_norm": 0.33148467587806907, "learning_rate": 3.2283914163373064e-06, "loss": 0.0338, "step": 2899 }, { "epoch": 1.962110960757781, "grad_norm": 0.4047833615238191, "learning_rate": 3.224710540570475e-06, "loss": 0.0395, "step": 2900 }, { "epoch": 1.962787550744249, "grad_norm": 0.381552121459052, "learning_rate": 3.2210307653874175e-06, "loss": 0.0232, "step": 2901 }, { "epoch": 1.9634641407307172, "grad_norm": 0.3282149561077376, "learning_rate": 3.2173520930693987e-06, "loss": 0.0301, "step": 2902 }, { "epoch": 1.9641407307171854, "grad_norm": 0.2952940794290675, "learning_rate": 3.2136745258969965e-06, "loss": 0.0308, "step": 2903 }, { "epoch": 1.9648173207036534, "grad_norm": 0.6005180166323237, "learning_rate": 3.2099980661501016e-06, "loss": 0.0263, "step": 2904 }, { "epoch": 1.965493910690122, "grad_norm": 0.34264443079170587, "learning_rate": 3.2063227161079234e-06, "loss": 0.0282, "step": 2905 }, { "epoch": 1.96617050067659, "grad_norm": 0.3382328779946316, "learning_rate": 3.202648478048981e-06, "loss": 0.0365, "step": 2906 }, { "epoch": 1.9668470906630582, "grad_norm": 0.2934545968682589, "learning_rate": 3.1989753542511016e-06, "loss": 0.0282, "step": 2907 }, { "epoch": 1.9675236806495264, "grad_norm": 0.39804762599738625, "learning_rate": 3.1953033469914273e-06, "loss": 0.0296, "step": 2908 }, { "epoch": 1.9682002706359945, "grad_norm": 0.23323274395145344, "learning_rate": 3.191632458546401e-06, "loss": 0.0244, "step": 2909 }, { "epoch": 1.968876860622463, "grad_norm": 0.2560444335549138, "learning_rate": 3.1879626911917806e-06, "loss": 0.0226, "step": 2910 }, { "epoch": 1.969553450608931, "grad_norm": 0.472277980688637, "learning_rate": 3.1842940472026194e-06, "loss": 0.0295, "step": 2911 }, { "epoch": 1.9702300405953992, "grad_norm": 0.28628681285349195, "learning_rate": 3.18062652885328e-06, "loss": 0.0278, "step": 2912 }, { "epoch": 1.9709066305818674, "grad_norm": 0.3220856641535677, "learning_rate": 3.1769601384174274e-06, "loss": 0.0318, "step": 2913 }, { "epoch": 1.9715832205683355, "grad_norm": 0.17608460434720677, "learning_rate": 3.173294878168025e-06, "loss": 0.015, "step": 2914 }, { "epoch": 1.972259810554804, "grad_norm": 0.348395752132745, "learning_rate": 3.169630750377337e-06, "loss": 0.0246, "step": 2915 }, { "epoch": 1.972936400541272, "grad_norm": 1.1154799791550307, "learning_rate": 3.165967757316925e-06, "loss": 0.0393, "step": 2916 }, { "epoch": 1.9736129905277402, "grad_norm": 0.4427655176412636, "learning_rate": 3.16230590125765e-06, "loss": 0.0457, "step": 2917 }, { "epoch": 1.9742895805142084, "grad_norm": 0.279935305564462, "learning_rate": 3.1586451844696596e-06, "loss": 0.0263, "step": 2918 }, { "epoch": 1.9749661705006765, "grad_norm": 0.38887833912929604, "learning_rate": 3.154985609222405e-06, "loss": 0.0591, "step": 2919 }, { "epoch": 1.975642760487145, "grad_norm": 0.40384933616899205, "learning_rate": 3.1513271777846244e-06, "loss": 0.033, "step": 2920 }, { "epoch": 1.976319350473613, "grad_norm": 0.7291599951587879, "learning_rate": 3.1476698924243487e-06, "loss": 0.0249, "step": 2921 }, { "epoch": 1.9769959404600812, "grad_norm": 0.28898467345484863, "learning_rate": 3.1440137554088957e-06, "loss": 0.0312, "step": 2922 }, { "epoch": 1.9776725304465494, "grad_norm": 0.36265665243672346, "learning_rate": 3.1403587690048775e-06, "loss": 0.0266, "step": 2923 }, { "epoch": 1.9783491204330175, "grad_norm": 0.3259886252116135, "learning_rate": 3.1367049354781854e-06, "loss": 0.0314, "step": 2924 }, { "epoch": 1.979025710419486, "grad_norm": 0.3582763584476965, "learning_rate": 3.1330522570939987e-06, "loss": 0.0232, "step": 2925 }, { "epoch": 1.979702300405954, "grad_norm": 0.2935990823159862, "learning_rate": 3.129400736116783e-06, "loss": 0.0269, "step": 2926 }, { "epoch": 1.9803788903924222, "grad_norm": 0.3274565880793247, "learning_rate": 3.125750374810283e-06, "loss": 0.0259, "step": 2927 }, { "epoch": 1.9810554803788905, "grad_norm": 0.2658822310608906, "learning_rate": 3.1221011754375275e-06, "loss": 0.0283, "step": 2928 }, { "epoch": 1.9817320703653585, "grad_norm": 0.3847013296697889, "learning_rate": 3.118453140260823e-06, "loss": 0.0298, "step": 2929 }, { "epoch": 1.982408660351827, "grad_norm": 0.7130181415800749, "learning_rate": 3.1148062715417553e-06, "loss": 0.0294, "step": 2930 }, { "epoch": 1.983085250338295, "grad_norm": 0.4373450083423354, "learning_rate": 3.111160571541183e-06, "loss": 0.0429, "step": 2931 }, { "epoch": 1.9837618403247632, "grad_norm": 0.3110882682758484, "learning_rate": 3.107516042519248e-06, "loss": 0.0248, "step": 2932 }, { "epoch": 1.9844384303112315, "grad_norm": 0.36071841806590565, "learning_rate": 3.1038726867353587e-06, "loss": 0.0384, "step": 2933 }, { "epoch": 1.9851150202976995, "grad_norm": 0.28256102100177616, "learning_rate": 3.1002305064482006e-06, "loss": 0.0232, "step": 2934 }, { "epoch": 1.985791610284168, "grad_norm": 0.2988723458507057, "learning_rate": 3.096589503915729e-06, "loss": 0.0323, "step": 2935 }, { "epoch": 1.986468200270636, "grad_norm": 0.2731757881803187, "learning_rate": 3.09294968139517e-06, "loss": 0.0308, "step": 2936 }, { "epoch": 1.9871447902571042, "grad_norm": 0.39227075933969013, "learning_rate": 3.089311041143017e-06, "loss": 0.0353, "step": 2937 }, { "epoch": 1.9878213802435725, "grad_norm": 0.2858002786161629, "learning_rate": 3.085673585415031e-06, "loss": 0.0235, "step": 2938 }, { "epoch": 1.9884979702300405, "grad_norm": 0.2443499835741995, "learning_rate": 3.082037316466236e-06, "loss": 0.0231, "step": 2939 }, { "epoch": 1.989174560216509, "grad_norm": 0.320468895425486, "learning_rate": 3.078402236550926e-06, "loss": 0.0297, "step": 2940 }, { "epoch": 1.989851150202977, "grad_norm": 0.3353473200075891, "learning_rate": 3.074768347922652e-06, "loss": 0.0346, "step": 2941 }, { "epoch": 1.9905277401894452, "grad_norm": 0.9569463985916461, "learning_rate": 3.0711356528342316e-06, "loss": 0.0252, "step": 2942 }, { "epoch": 1.9912043301759135, "grad_norm": 0.25127605328321234, "learning_rate": 3.06750415353774e-06, "loss": 0.0246, "step": 2943 }, { "epoch": 1.9918809201623815, "grad_norm": 0.37107075055984906, "learning_rate": 3.063873852284508e-06, "loss": 0.0337, "step": 2944 }, { "epoch": 1.9925575101488497, "grad_norm": 0.2612803381010322, "learning_rate": 3.0602447513251287e-06, "loss": 0.0233, "step": 2945 }, { "epoch": 1.993234100135318, "grad_norm": 0.6852125222550688, "learning_rate": 3.0566168529094485e-06, "loss": 0.0322, "step": 2946 }, { "epoch": 1.9939106901217862, "grad_norm": 0.4500098880831753, "learning_rate": 3.0529901592865705e-06, "loss": 0.0288, "step": 2947 }, { "epoch": 1.9945872801082545, "grad_norm": 0.3593941089907377, "learning_rate": 3.0493646727048463e-06, "loss": 0.0306, "step": 2948 }, { "epoch": 1.9952638700947225, "grad_norm": 0.4390521539617445, "learning_rate": 3.045740395411886e-06, "loss": 0.0417, "step": 2949 }, { "epoch": 1.9959404600811907, "grad_norm": 0.3389952295118972, "learning_rate": 3.042117329654544e-06, "loss": 0.0286, "step": 2950 }, { "epoch": 1.996617050067659, "grad_norm": 0.266164907803794, "learning_rate": 3.0384954776789255e-06, "loss": 0.0176, "step": 2951 }, { "epoch": 1.9972936400541272, "grad_norm": 0.3084683429644584, "learning_rate": 3.0348748417303826e-06, "loss": 0.0262, "step": 2952 }, { "epoch": 1.9979702300405955, "grad_norm": 0.4066343568805604, "learning_rate": 3.0312554240535166e-06, "loss": 0.0308, "step": 2953 }, { "epoch": 1.9986468200270635, "grad_norm": 0.2967193506919042, "learning_rate": 3.0276372268921694e-06, "loss": 0.0266, "step": 2954 }, { "epoch": 1.9993234100135318, "grad_norm": 0.4838567587953983, "learning_rate": 3.0240202524894304e-06, "loss": 0.0378, "step": 2955 }, { "epoch": 2.0, "grad_norm": 0.24361262369124725, "learning_rate": 3.0204045030876267e-06, "loss": 0.0229, "step": 2956 }, { "epoch": 2.0, "eval_loss": 0.03232486918568611, "eval_runtime": 234.0891, "eval_samples_per_second": 42.527, "eval_steps_per_second": 1.333, "step": 2956 }, { "epoch": 2.000676589986468, "grad_norm": 0.2863479162677051, "learning_rate": 3.016789980928331e-06, "loss": 0.0242, "step": 2957 }, { "epoch": 2.0013531799729365, "grad_norm": 0.2446970841308206, "learning_rate": 3.013176688252349e-06, "loss": 0.0267, "step": 2958 }, { "epoch": 2.0020297699594045, "grad_norm": 0.27546559067848164, "learning_rate": 3.009564627299728e-06, "loss": 0.0277, "step": 2959 }, { "epoch": 2.002706359945873, "grad_norm": 0.25041569161875626, "learning_rate": 3.005953800309752e-06, "loss": 0.0178, "step": 2960 }, { "epoch": 2.003382949932341, "grad_norm": 0.2936556310346428, "learning_rate": 3.0023442095209386e-06, "loss": 0.0208, "step": 2961 }, { "epoch": 2.004059539918809, "grad_norm": 0.2900960504196555, "learning_rate": 2.9987358571710394e-06, "loss": 0.0203, "step": 2962 }, { "epoch": 2.0047361299052775, "grad_norm": 0.23453405387146745, "learning_rate": 2.9951287454970405e-06, "loss": 0.0166, "step": 2963 }, { "epoch": 2.0054127198917455, "grad_norm": 0.24436914995189996, "learning_rate": 2.991522876735154e-06, "loss": 0.0166, "step": 2964 }, { "epoch": 2.006089309878214, "grad_norm": 0.30259742460555106, "learning_rate": 2.987918253120824e-06, "loss": 0.0199, "step": 2965 }, { "epoch": 2.006765899864682, "grad_norm": 0.3949464234058585, "learning_rate": 2.984314876888725e-06, "loss": 0.0201, "step": 2966 }, { "epoch": 2.00744248985115, "grad_norm": 0.3185317265892623, "learning_rate": 2.980712750272754e-06, "loss": 0.0173, "step": 2967 }, { "epoch": 2.0081190798376185, "grad_norm": 0.28231839208490533, "learning_rate": 2.9771118755060368e-06, "loss": 0.0168, "step": 2968 }, { "epoch": 2.0087956698240865, "grad_norm": 0.3113576343695042, "learning_rate": 2.9735122548209204e-06, "loss": 0.0259, "step": 2969 }, { "epoch": 2.009472259810555, "grad_norm": 0.2661010558709563, "learning_rate": 2.96991389044898e-06, "loss": 0.0205, "step": 2970 }, { "epoch": 2.010148849797023, "grad_norm": 0.360734775155085, "learning_rate": 2.966316784621e-06, "loss": 0.0285, "step": 2971 }, { "epoch": 2.010825439783491, "grad_norm": 0.3333442128135005, "learning_rate": 2.9627209395669978e-06, "loss": 0.0215, "step": 2972 }, { "epoch": 2.0115020297699595, "grad_norm": 0.3111768204221705, "learning_rate": 2.9591263575162e-06, "loss": 0.0191, "step": 2973 }, { "epoch": 2.0121786197564275, "grad_norm": 0.3302427223756702, "learning_rate": 2.9555330406970568e-06, "loss": 0.0199, "step": 2974 }, { "epoch": 2.012855209742896, "grad_norm": 0.29770343162182134, "learning_rate": 2.9519409913372286e-06, "loss": 0.0181, "step": 2975 }, { "epoch": 2.013531799729364, "grad_norm": 0.3314595406050617, "learning_rate": 2.9483502116635943e-06, "loss": 0.0253, "step": 2976 }, { "epoch": 2.014208389715832, "grad_norm": 0.2927950786090539, "learning_rate": 2.9447607039022443e-06, "loss": 0.0165, "step": 2977 }, { "epoch": 2.0148849797023005, "grad_norm": 0.36899655053084923, "learning_rate": 2.9411724702784762e-06, "loss": 0.031, "step": 2978 }, { "epoch": 2.0155615696887685, "grad_norm": 0.3289829198876795, "learning_rate": 2.9375855130168046e-06, "loss": 0.0197, "step": 2979 }, { "epoch": 2.016238159675237, "grad_norm": 0.440796677024592, "learning_rate": 2.9339998343409484e-06, "loss": 0.0317, "step": 2980 }, { "epoch": 2.016914749661705, "grad_norm": 0.29986528125333717, "learning_rate": 2.9304154364738358e-06, "loss": 0.0167, "step": 2981 }, { "epoch": 2.017591339648173, "grad_norm": 0.5231906812809761, "learning_rate": 2.9268323216375997e-06, "loss": 0.0209, "step": 2982 }, { "epoch": 2.0182679296346415, "grad_norm": 0.49567420473066737, "learning_rate": 2.92325049205358e-06, "loss": 0.0232, "step": 2983 }, { "epoch": 2.0189445196211095, "grad_norm": 0.3374261257695268, "learning_rate": 2.9196699499423143e-06, "loss": 0.0183, "step": 2984 }, { "epoch": 2.019621109607578, "grad_norm": 0.3234196639187083, "learning_rate": 2.9160906975235493e-06, "loss": 0.0186, "step": 2985 }, { "epoch": 2.020297699594046, "grad_norm": 0.3633058512930501, "learning_rate": 2.9125127370162253e-06, "loss": 0.0266, "step": 2986 }, { "epoch": 2.020974289580514, "grad_norm": 0.3182453062676391, "learning_rate": 2.908936070638487e-06, "loss": 0.0214, "step": 2987 }, { "epoch": 2.0216508795669825, "grad_norm": 0.5490919136205034, "learning_rate": 2.9053607006076766e-06, "loss": 0.0194, "step": 2988 }, { "epoch": 2.0223274695534506, "grad_norm": 0.28036413832885343, "learning_rate": 2.9017866291403275e-06, "loss": 0.0169, "step": 2989 }, { "epoch": 2.023004059539919, "grad_norm": 0.3326998051076093, "learning_rate": 2.8982138584521734e-06, "loss": 0.0208, "step": 2990 }, { "epoch": 2.023680649526387, "grad_norm": 0.24252640345277615, "learning_rate": 2.8946423907581377e-06, "loss": 0.0168, "step": 2991 }, { "epoch": 2.024357239512855, "grad_norm": 0.26774010808471116, "learning_rate": 2.8910722282723404e-06, "loss": 0.0136, "step": 2992 }, { "epoch": 2.0250338294993235, "grad_norm": 0.3893872262951025, "learning_rate": 2.8875033732080865e-06, "loss": 0.0254, "step": 2993 }, { "epoch": 2.0257104194857916, "grad_norm": 0.36350093252970944, "learning_rate": 2.8839358277778758e-06, "loss": 0.0232, "step": 2994 }, { "epoch": 2.02638700947226, "grad_norm": 0.4113375425303043, "learning_rate": 2.8803695941933933e-06, "loss": 0.021, "step": 2995 }, { "epoch": 2.027063599458728, "grad_norm": 0.3424460948782749, "learning_rate": 2.876804674665515e-06, "loss": 0.0166, "step": 2996 }, { "epoch": 2.027740189445196, "grad_norm": 0.34633947375349966, "learning_rate": 2.873241071404296e-06, "loss": 0.0261, "step": 2997 }, { "epoch": 2.0284167794316645, "grad_norm": 0.31695482248283474, "learning_rate": 2.869678786618976e-06, "loss": 0.0258, "step": 2998 }, { "epoch": 2.0290933694181326, "grad_norm": 0.4051790685017009, "learning_rate": 2.866117822517982e-06, "loss": 0.0288, "step": 2999 }, { "epoch": 2.029769959404601, "grad_norm": 0.297721583264592, "learning_rate": 2.86255818130892e-06, "loss": 0.0151, "step": 3000 }, { "epoch": 2.030446549391069, "grad_norm": 0.3263110566690717, "learning_rate": 2.8589998651985775e-06, "loss": 0.0264, "step": 3001 }, { "epoch": 2.031123139377537, "grad_norm": 0.36359138307229244, "learning_rate": 2.855442876392914e-06, "loss": 0.0225, "step": 3002 }, { "epoch": 2.0317997293640055, "grad_norm": 0.38974921943148677, "learning_rate": 2.8518872170970758e-06, "loss": 0.0204, "step": 3003 }, { "epoch": 2.0324763193504736, "grad_norm": 0.3246598767415876, "learning_rate": 2.848332889515375e-06, "loss": 0.0159, "step": 3004 }, { "epoch": 2.033152909336942, "grad_norm": 0.2715035243485544, "learning_rate": 2.8447798958513082e-06, "loss": 0.0199, "step": 3005 }, { "epoch": 2.03382949932341, "grad_norm": 0.30416151930942176, "learning_rate": 2.8412282383075362e-06, "loss": 0.0195, "step": 3006 }, { "epoch": 2.034506089309878, "grad_norm": 0.3174930394310743, "learning_rate": 2.837677919085896e-06, "loss": 0.0239, "step": 3007 }, { "epoch": 2.0351826792963466, "grad_norm": 0.2872144907614338, "learning_rate": 2.8341289403873952e-06, "loss": 0.0171, "step": 3008 }, { "epoch": 2.0358592692828146, "grad_norm": 0.3420128841586019, "learning_rate": 2.83058130441221e-06, "loss": 0.017, "step": 3009 }, { "epoch": 2.0365358592692826, "grad_norm": 0.3087533550670056, "learning_rate": 2.8270350133596824e-06, "loss": 0.0164, "step": 3010 }, { "epoch": 2.037212449255751, "grad_norm": 0.36783863784836157, "learning_rate": 2.82349006942832e-06, "loss": 0.0246, "step": 3011 }, { "epoch": 2.037889039242219, "grad_norm": 0.3192419287887565, "learning_rate": 2.8199464748157983e-06, "loss": 0.0188, "step": 3012 }, { "epoch": 2.0385656292286876, "grad_norm": 0.3207256988137708, "learning_rate": 2.816404231718958e-06, "loss": 0.0225, "step": 3013 }, { "epoch": 2.0392422192151556, "grad_norm": 0.2848406159498065, "learning_rate": 2.8128633423337932e-06, "loss": 0.0209, "step": 3014 }, { "epoch": 2.0399188092016236, "grad_norm": 0.3398180925374401, "learning_rate": 2.8093238088554676e-06, "loss": 0.0273, "step": 3015 }, { "epoch": 2.040595399188092, "grad_norm": 0.2995605132938713, "learning_rate": 2.8057856334783006e-06, "loss": 0.0168, "step": 3016 }, { "epoch": 2.04127198917456, "grad_norm": 0.44030263845454437, "learning_rate": 2.802248818395773e-06, "loss": 0.0195, "step": 3017 }, { "epoch": 2.0419485791610286, "grad_norm": 0.2993737804960732, "learning_rate": 2.7987133658005174e-06, "loss": 0.0191, "step": 3018 }, { "epoch": 2.0426251691474966, "grad_norm": 0.2954601996897239, "learning_rate": 2.795179277884321e-06, "loss": 0.0203, "step": 3019 }, { "epoch": 2.0433017591339646, "grad_norm": 0.2733113064635633, "learning_rate": 2.79164655683813e-06, "loss": 0.0165, "step": 3020 }, { "epoch": 2.043978349120433, "grad_norm": 0.25405037608310793, "learning_rate": 2.788115204852042e-06, "loss": 0.0145, "step": 3021 }, { "epoch": 2.044654939106901, "grad_norm": 0.3284269913514382, "learning_rate": 2.7845852241153063e-06, "loss": 0.0222, "step": 3022 }, { "epoch": 2.0453315290933696, "grad_norm": 0.3107430259541557, "learning_rate": 2.781056616816319e-06, "loss": 0.0219, "step": 3023 }, { "epoch": 2.0460081190798376, "grad_norm": 0.2697386714777377, "learning_rate": 2.7775293851426233e-06, "loss": 0.0151, "step": 3024 }, { "epoch": 2.0466847090663056, "grad_norm": 0.3617580629923284, "learning_rate": 2.7740035312809153e-06, "loss": 0.0179, "step": 3025 }, { "epoch": 2.047361299052774, "grad_norm": 0.33728690392370864, "learning_rate": 2.7704790574170372e-06, "loss": 0.0181, "step": 3026 }, { "epoch": 2.048037889039242, "grad_norm": 0.35894504724803605, "learning_rate": 2.766955965735968e-06, "loss": 0.0177, "step": 3027 }, { "epoch": 2.0487144790257106, "grad_norm": 0.3998741942334366, "learning_rate": 2.7634342584218364e-06, "loss": 0.0228, "step": 3028 }, { "epoch": 2.0493910690121786, "grad_norm": 0.3556964497640884, "learning_rate": 2.759913937657912e-06, "loss": 0.023, "step": 3029 }, { "epoch": 2.0500676589986466, "grad_norm": 0.24363937450340487, "learning_rate": 2.7563950056266053e-06, "loss": 0.0117, "step": 3030 }, { "epoch": 2.050744248985115, "grad_norm": 0.4135397355331111, "learning_rate": 2.752877464509463e-06, "loss": 0.0261, "step": 3031 }, { "epoch": 2.051420838971583, "grad_norm": 0.29385288459782477, "learning_rate": 2.7493613164871678e-06, "loss": 0.0145, "step": 3032 }, { "epoch": 2.0520974289580516, "grad_norm": 0.8182621401967292, "learning_rate": 2.745846563739546e-06, "loss": 0.0411, "step": 3033 }, { "epoch": 2.0527740189445196, "grad_norm": 0.3541771665927555, "learning_rate": 2.7423332084455543e-06, "loss": 0.0184, "step": 3034 }, { "epoch": 2.0534506089309876, "grad_norm": 0.30746052699659804, "learning_rate": 2.7388212527832814e-06, "loss": 0.0162, "step": 3035 }, { "epoch": 2.054127198917456, "grad_norm": 0.3681810925856484, "learning_rate": 2.7353106989299528e-06, "loss": 0.0233, "step": 3036 }, { "epoch": 2.054803788903924, "grad_norm": 0.3064550317174914, "learning_rate": 2.731801549061923e-06, "loss": 0.0162, "step": 3037 }, { "epoch": 2.0554803788903926, "grad_norm": 0.2513444814612412, "learning_rate": 2.7282938053546727e-06, "loss": 0.0154, "step": 3038 }, { "epoch": 2.0561569688768606, "grad_norm": 0.397932289746068, "learning_rate": 2.7247874699828186e-06, "loss": 0.0204, "step": 3039 }, { "epoch": 2.0568335588633286, "grad_norm": 0.2790640626372702, "learning_rate": 2.7212825451200942e-06, "loss": 0.0208, "step": 3040 }, { "epoch": 2.057510148849797, "grad_norm": 0.32071415577478357, "learning_rate": 2.7177790329393674e-06, "loss": 0.0204, "step": 3041 }, { "epoch": 2.058186738836265, "grad_norm": 0.31070051610266236, "learning_rate": 2.7142769356126258e-06, "loss": 0.0186, "step": 3042 }, { "epoch": 2.0588633288227336, "grad_norm": 0.3340397252026217, "learning_rate": 2.710776255310984e-06, "loss": 0.0204, "step": 3043 }, { "epoch": 2.0595399188092016, "grad_norm": 0.4273394375208807, "learning_rate": 2.7072769942046716e-06, "loss": 0.0282, "step": 3044 }, { "epoch": 2.0602165087956696, "grad_norm": 0.2965104082661981, "learning_rate": 2.7037791544630414e-06, "loss": 0.0188, "step": 3045 }, { "epoch": 2.060893098782138, "grad_norm": 0.35599459837318065, "learning_rate": 2.700282738254567e-06, "loss": 0.0268, "step": 3046 }, { "epoch": 2.061569688768606, "grad_norm": 0.27228564751494283, "learning_rate": 2.6967877477468394e-06, "loss": 0.0167, "step": 3047 }, { "epoch": 2.0622462787550746, "grad_norm": 0.2715035449675256, "learning_rate": 2.693294185106562e-06, "loss": 0.017, "step": 3048 }, { "epoch": 2.0629228687415426, "grad_norm": 0.2897796173533462, "learning_rate": 2.689802052499555e-06, "loss": 0.0167, "step": 3049 }, { "epoch": 2.0635994587280106, "grad_norm": 0.21111631517365298, "learning_rate": 2.686311352090756e-06, "loss": 0.0109, "step": 3050 }, { "epoch": 2.064276048714479, "grad_norm": 0.3062700790169697, "learning_rate": 2.682822086044206e-06, "loss": 0.0174, "step": 3051 }, { "epoch": 2.064952638700947, "grad_norm": 0.28150179689863614, "learning_rate": 2.6793342565230675e-06, "loss": 0.0156, "step": 3052 }, { "epoch": 2.0656292286874156, "grad_norm": 0.2448549983091798, "learning_rate": 2.6758478656896015e-06, "loss": 0.0136, "step": 3053 }, { "epoch": 2.0663058186738836, "grad_norm": 0.26608039737433065, "learning_rate": 2.6723629157051844e-06, "loss": 0.0164, "step": 3054 }, { "epoch": 2.0669824086603517, "grad_norm": 0.2764342270796369, "learning_rate": 2.6688794087302993e-06, "loss": 0.02, "step": 3055 }, { "epoch": 2.06765899864682, "grad_norm": 0.25363760014574555, "learning_rate": 2.66539734692453e-06, "loss": 0.0177, "step": 3056 }, { "epoch": 2.068335588633288, "grad_norm": 0.4076233031632493, "learning_rate": 2.66191673244657e-06, "loss": 0.0225, "step": 3057 }, { "epoch": 2.0690121786197566, "grad_norm": 0.3626422278457513, "learning_rate": 2.658437567454209e-06, "loss": 0.0245, "step": 3058 }, { "epoch": 2.0696887686062246, "grad_norm": 0.32647866554382204, "learning_rate": 2.6549598541043433e-06, "loss": 0.024, "step": 3059 }, { "epoch": 2.0703653585926927, "grad_norm": 0.44525429835360186, "learning_rate": 2.6514835945529706e-06, "loss": 0.025, "step": 3060 }, { "epoch": 2.071041948579161, "grad_norm": 0.4043225721957029, "learning_rate": 2.64800879095518e-06, "loss": 0.022, "step": 3061 }, { "epoch": 2.071718538565629, "grad_norm": 0.28299749687391407, "learning_rate": 2.644535445465164e-06, "loss": 0.0166, "step": 3062 }, { "epoch": 2.0723951285520976, "grad_norm": 0.5222510352469268, "learning_rate": 2.641063560236212e-06, "loss": 0.0325, "step": 3063 }, { "epoch": 2.0730717185385656, "grad_norm": 0.2704778046181561, "learning_rate": 2.637593137420702e-06, "loss": 0.0129, "step": 3064 }, { "epoch": 2.0737483085250337, "grad_norm": 0.3198939038082152, "learning_rate": 2.6341241791701126e-06, "loss": 0.0184, "step": 3065 }, { "epoch": 2.074424898511502, "grad_norm": 0.29084713587711386, "learning_rate": 2.6306566876350072e-06, "loss": 0.0164, "step": 3066 }, { "epoch": 2.07510148849797, "grad_norm": 0.360979655960413, "learning_rate": 2.627190664965046e-06, "loss": 0.0236, "step": 3067 }, { "epoch": 2.0757780784844386, "grad_norm": 0.36181318513778404, "learning_rate": 2.623726113308977e-06, "loss": 0.0206, "step": 3068 }, { "epoch": 2.0764546684709067, "grad_norm": 0.322372342952916, "learning_rate": 2.6202630348146323e-06, "loss": 0.0218, "step": 3069 }, { "epoch": 2.0771312584573747, "grad_norm": 0.3414598551719795, "learning_rate": 2.616801431628938e-06, "loss": 0.017, "step": 3070 }, { "epoch": 2.077807848443843, "grad_norm": 0.4051066445113148, "learning_rate": 2.613341305897898e-06, "loss": 0.0177, "step": 3071 }, { "epoch": 2.078484438430311, "grad_norm": 0.3209181294474512, "learning_rate": 2.609882659766605e-06, "loss": 0.0224, "step": 3072 }, { "epoch": 2.0791610284167796, "grad_norm": 0.28578895357623013, "learning_rate": 2.6064254953792344e-06, "loss": 0.0165, "step": 3073 }, { "epoch": 2.0798376184032477, "grad_norm": 0.6075318475491459, "learning_rate": 2.6029698148790392e-06, "loss": 0.0381, "step": 3074 }, { "epoch": 2.0805142083897157, "grad_norm": 0.30737186625660634, "learning_rate": 2.5995156204083573e-06, "loss": 0.0289, "step": 3075 }, { "epoch": 2.081190798376184, "grad_norm": 0.31925899766813914, "learning_rate": 2.5960629141086014e-06, "loss": 0.0273, "step": 3076 }, { "epoch": 2.081867388362652, "grad_norm": 0.3806833163468387, "learning_rate": 2.5926116981202688e-06, "loss": 0.0211, "step": 3077 }, { "epoch": 2.0825439783491206, "grad_norm": 0.43544524307895166, "learning_rate": 2.5891619745829184e-06, "loss": 0.0184, "step": 3078 }, { "epoch": 2.0832205683355887, "grad_norm": 0.3352435432551951, "learning_rate": 2.585713745635197e-06, "loss": 0.0248, "step": 3079 }, { "epoch": 2.0838971583220567, "grad_norm": 0.2718929984125413, "learning_rate": 2.5822670134148216e-06, "loss": 0.0178, "step": 3080 }, { "epoch": 2.084573748308525, "grad_norm": 0.3430436391679866, "learning_rate": 2.5788217800585812e-06, "loss": 0.0217, "step": 3081 }, { "epoch": 2.085250338294993, "grad_norm": 0.395822758178957, "learning_rate": 2.5753780477023314e-06, "loss": 0.0258, "step": 3082 }, { "epoch": 2.0859269282814616, "grad_norm": 0.2922371050137746, "learning_rate": 2.571935818481005e-06, "loss": 0.0197, "step": 3083 }, { "epoch": 2.0866035182679297, "grad_norm": 0.34230089275034653, "learning_rate": 2.5684950945285937e-06, "loss": 0.0228, "step": 3084 }, { "epoch": 2.0872801082543977, "grad_norm": 0.307148243647793, "learning_rate": 2.5650558779781635e-06, "loss": 0.0208, "step": 3085 }, { "epoch": 2.087956698240866, "grad_norm": 0.3059716364571522, "learning_rate": 2.5616181709618447e-06, "loss": 0.0216, "step": 3086 }, { "epoch": 2.088633288227334, "grad_norm": 0.32743146079087154, "learning_rate": 2.558181975610827e-06, "loss": 0.017, "step": 3087 }, { "epoch": 2.089309878213802, "grad_norm": 0.3360812753346708, "learning_rate": 2.5547472940553685e-06, "loss": 0.0192, "step": 3088 }, { "epoch": 2.0899864682002707, "grad_norm": 0.28990393329614506, "learning_rate": 2.551314128424788e-06, "loss": 0.0165, "step": 3089 }, { "epoch": 2.0906630581867387, "grad_norm": 0.33940305663292186, "learning_rate": 2.5478824808474613e-06, "loss": 0.0241, "step": 3090 }, { "epoch": 2.091339648173207, "grad_norm": 0.3011730856791906, "learning_rate": 2.5444523534508225e-06, "loss": 0.0188, "step": 3091 }, { "epoch": 2.092016238159675, "grad_norm": 0.43797419013313976, "learning_rate": 2.5410237483613685e-06, "loss": 0.0456, "step": 3092 }, { "epoch": 2.092692828146143, "grad_norm": 0.36926319415432013, "learning_rate": 2.53759666770465e-06, "loss": 0.0201, "step": 3093 }, { "epoch": 2.0933694181326117, "grad_norm": 0.3753754161314381, "learning_rate": 2.5341711136052728e-06, "loss": 0.0241, "step": 3094 }, { "epoch": 2.0940460081190797, "grad_norm": 0.36140481303337674, "learning_rate": 2.530747088186893e-06, "loss": 0.0202, "step": 3095 }, { "epoch": 2.094722598105548, "grad_norm": 0.2812130611026342, "learning_rate": 2.527324593572223e-06, "loss": 0.0182, "step": 3096 }, { "epoch": 2.095399188092016, "grad_norm": 0.32819748804745974, "learning_rate": 2.523903631883028e-06, "loss": 0.0141, "step": 3097 }, { "epoch": 2.096075778078484, "grad_norm": 0.23769268205062138, "learning_rate": 2.520484205240116e-06, "loss": 0.0145, "step": 3098 }, { "epoch": 2.0967523680649527, "grad_norm": 0.25897294118421665, "learning_rate": 2.517066315763348e-06, "loss": 0.0129, "step": 3099 }, { "epoch": 2.0974289580514207, "grad_norm": 0.2811692980432601, "learning_rate": 2.5136499655716306e-06, "loss": 0.0241, "step": 3100 }, { "epoch": 2.098105548037889, "grad_norm": 0.30983498713273716, "learning_rate": 2.5102351567829187e-06, "loss": 0.018, "step": 3101 }, { "epoch": 2.098782138024357, "grad_norm": 0.3454829575212401, "learning_rate": 2.5068218915142093e-06, "loss": 0.0295, "step": 3102 }, { "epoch": 2.0994587280108252, "grad_norm": 0.3549707454350051, "learning_rate": 2.503410171881544e-06, "loss": 0.0242, "step": 3103 }, { "epoch": 2.1001353179972937, "grad_norm": 0.29752709096247937, "learning_rate": 2.5000000000000015e-06, "loss": 0.0206, "step": 3104 }, { "epoch": 2.1008119079837617, "grad_norm": 0.29067978319376647, "learning_rate": 2.496591377983706e-06, "loss": 0.0146, "step": 3105 }, { "epoch": 2.10148849797023, "grad_norm": 0.31437900460949353, "learning_rate": 2.49318430794582e-06, "loss": 0.0174, "step": 3106 }, { "epoch": 2.102165087956698, "grad_norm": 0.4734757269035232, "learning_rate": 2.4897787919985457e-06, "loss": 0.0315, "step": 3107 }, { "epoch": 2.1028416779431662, "grad_norm": 0.9767500849091907, "learning_rate": 2.4863748322531144e-06, "loss": 0.0856, "step": 3108 }, { "epoch": 2.1035182679296347, "grad_norm": 0.2395614894247743, "learning_rate": 2.4829724308198003e-06, "loss": 0.012, "step": 3109 }, { "epoch": 2.1041948579161027, "grad_norm": 0.2639550064345917, "learning_rate": 2.4795715898079116e-06, "loss": 0.0144, "step": 3110 }, { "epoch": 2.104871447902571, "grad_norm": 0.33505158691687015, "learning_rate": 2.476172311325783e-06, "loss": 0.0258, "step": 3111 }, { "epoch": 2.105548037889039, "grad_norm": 0.4326205521958161, "learning_rate": 2.472774597480783e-06, "loss": 0.0199, "step": 3112 }, { "epoch": 2.1062246278755072, "grad_norm": 0.33457247400374934, "learning_rate": 2.4693784503793128e-06, "loss": 0.0162, "step": 3113 }, { "epoch": 2.1069012178619757, "grad_norm": 0.26248819364761294, "learning_rate": 2.4659838721268005e-06, "loss": 0.0159, "step": 3114 }, { "epoch": 2.1075778078484437, "grad_norm": 0.29913466215394774, "learning_rate": 2.462590864827703e-06, "loss": 0.0175, "step": 3115 }, { "epoch": 2.108254397834912, "grad_norm": 0.32815787935026647, "learning_rate": 2.4591994305854988e-06, "loss": 0.0206, "step": 3116 }, { "epoch": 2.10893098782138, "grad_norm": 0.34816461794815484, "learning_rate": 2.4558095715026975e-06, "loss": 0.0254, "step": 3117 }, { "epoch": 2.1096075778078482, "grad_norm": 0.3432385006117984, "learning_rate": 2.4524212896808265e-06, "loss": 0.0138, "step": 3118 }, { "epoch": 2.1102841677943167, "grad_norm": 0.42969136301244204, "learning_rate": 2.4490345872204403e-06, "loss": 0.0182, "step": 3119 }, { "epoch": 2.1109607577807847, "grad_norm": 0.3903531142555779, "learning_rate": 2.4456494662211082e-06, "loss": 0.0189, "step": 3120 }, { "epoch": 2.111637347767253, "grad_norm": 0.38684894804978254, "learning_rate": 2.442265928781426e-06, "loss": 0.0193, "step": 3121 }, { "epoch": 2.1123139377537212, "grad_norm": 0.31597492188614235, "learning_rate": 2.438883976999003e-06, "loss": 0.0177, "step": 3122 }, { "epoch": 2.1129905277401893, "grad_norm": 0.2959327742325448, "learning_rate": 2.43550361297047e-06, "loss": 0.0192, "step": 3123 }, { "epoch": 2.1136671177266577, "grad_norm": 0.7283818726336723, "learning_rate": 2.4321248387914677e-06, "loss": 0.0219, "step": 3124 }, { "epoch": 2.1143437077131257, "grad_norm": 0.2040244513760422, "learning_rate": 2.4287476565566525e-06, "loss": 0.0117, "step": 3125 }, { "epoch": 2.115020297699594, "grad_norm": 0.274361357398989, "learning_rate": 2.4253720683596976e-06, "loss": 0.0173, "step": 3126 }, { "epoch": 2.1156968876860622, "grad_norm": 0.2942109825610358, "learning_rate": 2.421998076293285e-06, "loss": 0.0213, "step": 3127 }, { "epoch": 2.1163734776725303, "grad_norm": 0.34853093582183314, "learning_rate": 2.4186256824491106e-06, "loss": 0.0178, "step": 3128 }, { "epoch": 2.1170500676589987, "grad_norm": 0.4101569612373172, "learning_rate": 2.4152548889178722e-06, "loss": 0.0293, "step": 3129 }, { "epoch": 2.1177266576454667, "grad_norm": 0.43937037555293795, "learning_rate": 2.4118856977892846e-06, "loss": 0.0193, "step": 3130 }, { "epoch": 2.118403247631935, "grad_norm": 0.3322120577445713, "learning_rate": 2.4085181111520607e-06, "loss": 0.0216, "step": 3131 }, { "epoch": 2.1190798376184032, "grad_norm": 0.29239964634064164, "learning_rate": 2.4051521310939258e-06, "loss": 0.0141, "step": 3132 }, { "epoch": 2.1197564276048713, "grad_norm": 0.37022345024811043, "learning_rate": 2.401787759701603e-06, "loss": 0.0271, "step": 3133 }, { "epoch": 2.1204330175913397, "grad_norm": 0.24897850304443564, "learning_rate": 2.3984249990608237e-06, "loss": 0.0129, "step": 3134 }, { "epoch": 2.1211096075778078, "grad_norm": 0.4290963757508825, "learning_rate": 2.3950638512563173e-06, "loss": 0.0144, "step": 3135 }, { "epoch": 2.121786197564276, "grad_norm": 0.36143080362684216, "learning_rate": 2.3917043183718162e-06, "loss": 0.0151, "step": 3136 }, { "epoch": 2.1224627875507442, "grad_norm": 0.4450079992651522, "learning_rate": 2.3883464024900484e-06, "loss": 0.0217, "step": 3137 }, { "epoch": 2.1231393775372123, "grad_norm": 0.3034654070729038, "learning_rate": 2.3849901056927383e-06, "loss": 0.02, "step": 3138 }, { "epoch": 2.1238159675236807, "grad_norm": 0.3380960064178022, "learning_rate": 2.381635430060611e-06, "loss": 0.0218, "step": 3139 }, { "epoch": 2.1244925575101488, "grad_norm": 0.3219803482217954, "learning_rate": 2.3782823776733866e-06, "loss": 0.0167, "step": 3140 }, { "epoch": 2.1251691474966172, "grad_norm": 0.3350566290144601, "learning_rate": 2.374930950609773e-06, "loss": 0.0244, "step": 3141 }, { "epoch": 2.1258457374830853, "grad_norm": 0.3168758393416301, "learning_rate": 2.371581150947476e-06, "loss": 0.0191, "step": 3142 }, { "epoch": 2.1265223274695533, "grad_norm": 0.25228403321014414, "learning_rate": 2.368232980763194e-06, "loss": 0.0123, "step": 3143 }, { "epoch": 2.1271989174560217, "grad_norm": 0.3507719167433762, "learning_rate": 2.364886442132606e-06, "loss": 0.0181, "step": 3144 }, { "epoch": 2.1278755074424898, "grad_norm": 0.4039840478946242, "learning_rate": 2.361541537130392e-06, "loss": 0.0255, "step": 3145 }, { "epoch": 2.1285520974289582, "grad_norm": 0.337061361222386, "learning_rate": 2.358198267830206e-06, "loss": 0.0203, "step": 3146 }, { "epoch": 2.1292286874154263, "grad_norm": 0.3906657092238292, "learning_rate": 2.3548566363046993e-06, "loss": 0.0223, "step": 3147 }, { "epoch": 2.1299052774018943, "grad_norm": 0.35249780243469786, "learning_rate": 2.351516644625502e-06, "loss": 0.018, "step": 3148 }, { "epoch": 2.1305818673883627, "grad_norm": 0.46141388142612605, "learning_rate": 2.3481782948632317e-06, "loss": 0.0265, "step": 3149 }, { "epoch": 2.1312584573748308, "grad_norm": 0.2826702497054701, "learning_rate": 2.344841589087482e-06, "loss": 0.018, "step": 3150 }, { "epoch": 2.1319350473612992, "grad_norm": 0.3455073675603878, "learning_rate": 2.34150652936683e-06, "loss": 0.0185, "step": 3151 }, { "epoch": 2.1326116373477673, "grad_norm": 0.38621961715633646, "learning_rate": 2.3381731177688346e-06, "loss": 0.0265, "step": 3152 }, { "epoch": 2.1332882273342353, "grad_norm": 0.323081885914747, "learning_rate": 2.3348413563600324e-06, "loss": 0.0181, "step": 3153 }, { "epoch": 2.1339648173207038, "grad_norm": 0.3681896960518674, "learning_rate": 2.331511247205933e-06, "loss": 0.0286, "step": 3154 }, { "epoch": 2.134641407307172, "grad_norm": 0.3662293584547147, "learning_rate": 2.3281827923710265e-06, "loss": 0.0248, "step": 3155 }, { "epoch": 2.1353179972936402, "grad_norm": 0.31740753045573344, "learning_rate": 2.324855993918775e-06, "loss": 0.0206, "step": 3156 }, { "epoch": 2.1359945872801083, "grad_norm": 0.31070838197533174, "learning_rate": 2.321530853911616e-06, "loss": 0.0222, "step": 3157 }, { "epoch": 2.1366711772665763, "grad_norm": 0.22895039731281813, "learning_rate": 2.318207374410956e-06, "loss": 0.0141, "step": 3158 }, { "epoch": 2.1373477672530448, "grad_norm": 0.31200385704446354, "learning_rate": 2.3148855574771706e-06, "loss": 0.0163, "step": 3159 }, { "epoch": 2.138024357239513, "grad_norm": 0.3070878337888588, "learning_rate": 2.3115654051696097e-06, "loss": 0.0194, "step": 3160 }, { "epoch": 2.1387009472259813, "grad_norm": 0.3616286300226413, "learning_rate": 2.3082469195465893e-06, "loss": 0.0206, "step": 3161 }, { "epoch": 2.1393775372124493, "grad_norm": 0.24900051851747546, "learning_rate": 2.304930102665389e-06, "loss": 0.0131, "step": 3162 }, { "epoch": 2.1400541271989173, "grad_norm": 0.2791204943878726, "learning_rate": 2.3016149565822608e-06, "loss": 0.0188, "step": 3163 }, { "epoch": 2.1407307171853858, "grad_norm": 0.34760553752948525, "learning_rate": 2.2983014833524115e-06, "loss": 0.0195, "step": 3164 }, { "epoch": 2.141407307171854, "grad_norm": 0.24185930008667683, "learning_rate": 2.2949896850300186e-06, "loss": 0.0145, "step": 3165 }, { "epoch": 2.1420838971583223, "grad_norm": 0.2506993012488286, "learning_rate": 2.2916795636682197e-06, "loss": 0.0126, "step": 3166 }, { "epoch": 2.1427604871447903, "grad_norm": 0.32172613565753005, "learning_rate": 2.288371121319109e-06, "loss": 0.0205, "step": 3167 }, { "epoch": 2.1434370771312583, "grad_norm": 0.4611798141323994, "learning_rate": 2.2850643600337435e-06, "loss": 0.0351, "step": 3168 }, { "epoch": 2.1441136671177268, "grad_norm": 0.3419789356555703, "learning_rate": 2.281759281862137e-06, "loss": 0.0216, "step": 3169 }, { "epoch": 2.144790257104195, "grad_norm": 0.24546682359372127, "learning_rate": 2.278455888853262e-06, "loss": 0.0116, "step": 3170 }, { "epoch": 2.1454668470906633, "grad_norm": 0.35225760769126374, "learning_rate": 2.2751541830550417e-06, "loss": 0.0253, "step": 3171 }, { "epoch": 2.1461434370771313, "grad_norm": 0.3203742931779411, "learning_rate": 2.2718541665143546e-06, "loss": 0.0211, "step": 3172 }, { "epoch": 2.1468200270635993, "grad_norm": 0.2889708131145228, "learning_rate": 2.2685558412770344e-06, "loss": 0.0179, "step": 3173 }, { "epoch": 2.147496617050068, "grad_norm": 0.3505469783998427, "learning_rate": 2.265259209387867e-06, "loss": 0.0233, "step": 3174 }, { "epoch": 2.148173207036536, "grad_norm": 0.32734335525877833, "learning_rate": 2.261964272890582e-06, "loss": 0.0213, "step": 3175 }, { "epoch": 2.1488497970230043, "grad_norm": 0.3017664556990426, "learning_rate": 2.258671033827866e-06, "loss": 0.0169, "step": 3176 }, { "epoch": 2.1495263870094723, "grad_norm": 0.3256373908456404, "learning_rate": 2.2553794942413506e-06, "loss": 0.016, "step": 3177 }, { "epoch": 2.1502029769959403, "grad_norm": 0.31744304201815216, "learning_rate": 2.2520896561716086e-06, "loss": 0.0212, "step": 3178 }, { "epoch": 2.150879566982409, "grad_norm": 0.38536951168032857, "learning_rate": 2.248801521658167e-06, "loss": 0.0188, "step": 3179 }, { "epoch": 2.151556156968877, "grad_norm": 0.28270267223725937, "learning_rate": 2.245515092739488e-06, "loss": 0.0163, "step": 3180 }, { "epoch": 2.1522327469553453, "grad_norm": 0.224035320442952, "learning_rate": 2.242230371452982e-06, "loss": 0.0138, "step": 3181 }, { "epoch": 2.1529093369418133, "grad_norm": 0.3133768757718769, "learning_rate": 2.2389473598349994e-06, "loss": 0.0175, "step": 3182 }, { "epoch": 2.1535859269282813, "grad_norm": 0.30665383612985975, "learning_rate": 2.2356660599208335e-06, "loss": 0.0227, "step": 3183 }, { "epoch": 2.15426251691475, "grad_norm": 0.3078336811363443, "learning_rate": 2.2323864737447067e-06, "loss": 0.0215, "step": 3184 }, { "epoch": 2.154939106901218, "grad_norm": 0.33968603058332386, "learning_rate": 2.229108603339789e-06, "loss": 0.0223, "step": 3185 }, { "epoch": 2.1556156968876863, "grad_norm": 0.3595968077075736, "learning_rate": 2.2258324507381834e-06, "loss": 0.0188, "step": 3186 }, { "epoch": 2.1562922868741543, "grad_norm": 0.2904430152947272, "learning_rate": 2.2225580179709303e-06, "loss": 0.0153, "step": 3187 }, { "epoch": 2.1569688768606223, "grad_norm": 0.3394738888015632, "learning_rate": 2.219285307067997e-06, "loss": 0.0188, "step": 3188 }, { "epoch": 2.157645466847091, "grad_norm": 0.3501521027907099, "learning_rate": 2.2160143200582906e-06, "loss": 0.0248, "step": 3189 }, { "epoch": 2.158322056833559, "grad_norm": 0.3040340349490807, "learning_rate": 2.2127450589696475e-06, "loss": 0.0182, "step": 3190 }, { "epoch": 2.1589986468200273, "grad_norm": 0.31513527812824993, "learning_rate": 2.209477525828831e-06, "loss": 0.019, "step": 3191 }, { "epoch": 2.1596752368064953, "grad_norm": 0.64752223556514, "learning_rate": 2.2062117226615375e-06, "loss": 0.0232, "step": 3192 }, { "epoch": 2.1603518267929633, "grad_norm": 0.3599564774739713, "learning_rate": 2.202947651492387e-06, "loss": 0.0207, "step": 3193 }, { "epoch": 2.161028416779432, "grad_norm": 0.29667305394672316, "learning_rate": 2.1996853143449285e-06, "loss": 0.0146, "step": 3194 }, { "epoch": 2.1617050067659, "grad_norm": 0.2850668544793242, "learning_rate": 2.1964247132416373e-06, "loss": 0.0158, "step": 3195 }, { "epoch": 2.1623815967523683, "grad_norm": 0.2922406175180509, "learning_rate": 2.1931658502039067e-06, "loss": 0.017, "step": 3196 }, { "epoch": 2.1630581867388363, "grad_norm": 0.2814097867526757, "learning_rate": 2.1899087272520596e-06, "loss": 0.0179, "step": 3197 }, { "epoch": 2.1637347767253043, "grad_norm": 0.3060937668295674, "learning_rate": 2.186653346405333e-06, "loss": 0.0204, "step": 3198 }, { "epoch": 2.164411366711773, "grad_norm": 0.36995219678170427, "learning_rate": 2.1833997096818897e-06, "loss": 0.0206, "step": 3199 }, { "epoch": 2.165087956698241, "grad_norm": 0.32436797225167857, "learning_rate": 2.1801478190988107e-06, "loss": 0.0194, "step": 3200 }, { "epoch": 2.1657645466847093, "grad_norm": 0.3338826377427646, "learning_rate": 2.1768976766720896e-06, "loss": 0.0206, "step": 3201 }, { "epoch": 2.1664411366711773, "grad_norm": 0.32769878792295004, "learning_rate": 2.1736492844166406e-06, "loss": 0.0213, "step": 3202 }, { "epoch": 2.1671177266576453, "grad_norm": 0.32818583449722943, "learning_rate": 2.170402644346294e-06, "loss": 0.0208, "step": 3203 }, { "epoch": 2.167794316644114, "grad_norm": 0.2628270966287541, "learning_rate": 2.16715775847379e-06, "loss": 0.0099, "step": 3204 }, { "epoch": 2.168470906630582, "grad_norm": 0.3491051999150974, "learning_rate": 2.163914628810781e-06, "loss": 0.0208, "step": 3205 }, { "epoch": 2.16914749661705, "grad_norm": 0.27665429608391007, "learning_rate": 2.1606732573678344e-06, "loss": 0.0159, "step": 3206 }, { "epoch": 2.1698240866035183, "grad_norm": 0.3409165219663345, "learning_rate": 2.157433646154426e-06, "loss": 0.021, "step": 3207 }, { "epoch": 2.1705006765899864, "grad_norm": 0.48716337299657286, "learning_rate": 2.154195797178941e-06, "loss": 0.0173, "step": 3208 }, { "epoch": 2.171177266576455, "grad_norm": 0.36184449937916396, "learning_rate": 2.1509597124486693e-06, "loss": 0.0244, "step": 3209 }, { "epoch": 2.171853856562923, "grad_norm": 0.38203593083620024, "learning_rate": 2.147725393969811e-06, "loss": 0.0315, "step": 3210 }, { "epoch": 2.172530446549391, "grad_norm": 0.29355225838059074, "learning_rate": 2.1444928437474667e-06, "loss": 0.02, "step": 3211 }, { "epoch": 2.1732070365358593, "grad_norm": 0.320653509069443, "learning_rate": 2.1412620637856445e-06, "loss": 0.0147, "step": 3212 }, { "epoch": 2.1738836265223274, "grad_norm": 0.2894634971415182, "learning_rate": 2.138033056087256e-06, "loss": 0.0157, "step": 3213 }, { "epoch": 2.174560216508796, "grad_norm": 0.3086742816786467, "learning_rate": 2.1348058226541072e-06, "loss": 0.0173, "step": 3214 }, { "epoch": 2.175236806495264, "grad_norm": 0.38367991285926245, "learning_rate": 2.1315803654869125e-06, "loss": 0.0251, "step": 3215 }, { "epoch": 2.175913396481732, "grad_norm": 0.33939296708239935, "learning_rate": 2.1283566865852824e-06, "loss": 0.0189, "step": 3216 }, { "epoch": 2.1765899864682003, "grad_norm": 0.3613853776427313, "learning_rate": 2.1251347879477217e-06, "loss": 0.0213, "step": 3217 }, { "epoch": 2.1772665764546684, "grad_norm": 0.388599815378119, "learning_rate": 2.1219146715716332e-06, "loss": 0.03, "step": 3218 }, { "epoch": 2.177943166441137, "grad_norm": 0.3012896606065024, "learning_rate": 2.1186963394533165e-06, "loss": 0.0162, "step": 3219 }, { "epoch": 2.178619756427605, "grad_norm": 0.32609586136770297, "learning_rate": 2.1154797935879647e-06, "loss": 0.0175, "step": 3220 }, { "epoch": 2.179296346414073, "grad_norm": 0.32685332068870293, "learning_rate": 2.112265035969664e-06, "loss": 0.0158, "step": 3221 }, { "epoch": 2.1799729364005414, "grad_norm": 0.3908665491980813, "learning_rate": 2.1090520685913874e-06, "loss": 0.0214, "step": 3222 }, { "epoch": 2.1806495263870094, "grad_norm": 0.30518133848043216, "learning_rate": 2.1058408934450055e-06, "loss": 0.0131, "step": 3223 }, { "epoch": 2.181326116373478, "grad_norm": 0.2627082568803207, "learning_rate": 2.102631512521269e-06, "loss": 0.0163, "step": 3224 }, { "epoch": 2.182002706359946, "grad_norm": 0.29533056788281364, "learning_rate": 2.099423927809826e-06, "loss": 0.0198, "step": 3225 }, { "epoch": 2.182679296346414, "grad_norm": 0.30782129434951133, "learning_rate": 2.096218141299203e-06, "loss": 0.0182, "step": 3226 }, { "epoch": 2.1833558863328824, "grad_norm": 0.3685495402218638, "learning_rate": 2.0930141549768145e-06, "loss": 0.0246, "step": 3227 }, { "epoch": 2.1840324763193504, "grad_norm": 0.3730644700321736, "learning_rate": 2.089811970828961e-06, "loss": 0.0232, "step": 3228 }, { "epoch": 2.184709066305819, "grad_norm": 0.26012433544422686, "learning_rate": 2.086611590840826e-06, "loss": 0.0129, "step": 3229 }, { "epoch": 2.185385656292287, "grad_norm": 0.3506394183014197, "learning_rate": 2.0834130169964695e-06, "loss": 0.0206, "step": 3230 }, { "epoch": 2.186062246278755, "grad_norm": 0.3047006489407835, "learning_rate": 2.0802162512788337e-06, "loss": 0.0155, "step": 3231 }, { "epoch": 2.1867388362652234, "grad_norm": 0.2975742048756299, "learning_rate": 2.0770212956697435e-06, "loss": 0.019, "step": 3232 }, { "epoch": 2.1874154262516914, "grad_norm": 0.42409559410941233, "learning_rate": 2.073828152149898e-06, "loss": 0.0404, "step": 3233 }, { "epoch": 2.18809201623816, "grad_norm": 0.29268843453271054, "learning_rate": 2.0706368226988772e-06, "loss": 0.0167, "step": 3234 }, { "epoch": 2.188768606224628, "grad_norm": 0.30444783746080334, "learning_rate": 2.0674473092951286e-06, "loss": 0.0162, "step": 3235 }, { "epoch": 2.189445196211096, "grad_norm": 0.41625888111181386, "learning_rate": 2.064259613915981e-06, "loss": 0.0267, "step": 3236 }, { "epoch": 2.1901217861975644, "grad_norm": 0.3221104298750319, "learning_rate": 2.061073738537635e-06, "loss": 0.0166, "step": 3237 }, { "epoch": 2.1907983761840324, "grad_norm": 0.23311987996484307, "learning_rate": 2.0578896851351606e-06, "loss": 0.0101, "step": 3238 }, { "epoch": 2.191474966170501, "grad_norm": 0.2947568327994009, "learning_rate": 2.0547074556824964e-06, "loss": 0.0189, "step": 3239 }, { "epoch": 2.192151556156969, "grad_norm": 0.2907318768196047, "learning_rate": 2.0515270521524562e-06, "loss": 0.0195, "step": 3240 }, { "epoch": 2.192828146143437, "grad_norm": 0.295783654449266, "learning_rate": 2.0483484765167172e-06, "loss": 0.021, "step": 3241 }, { "epoch": 2.1935047361299054, "grad_norm": 0.27507509133129837, "learning_rate": 2.0451717307458287e-06, "loss": 0.0155, "step": 3242 }, { "epoch": 2.1941813261163734, "grad_norm": 0.34791367861363576, "learning_rate": 2.041996816809197e-06, "loss": 0.0178, "step": 3243 }, { "epoch": 2.194857916102842, "grad_norm": 0.33254911691864697, "learning_rate": 2.0388237366751005e-06, "loss": 0.02, "step": 3244 }, { "epoch": 2.19553450608931, "grad_norm": 0.31488586876258784, "learning_rate": 2.0356524923106763e-06, "loss": 0.0198, "step": 3245 }, { "epoch": 2.196211096075778, "grad_norm": 0.39976937951504854, "learning_rate": 2.032483085681927e-06, "loss": 0.0225, "step": 3246 }, { "epoch": 2.1968876860622464, "grad_norm": 0.31041717262696583, "learning_rate": 2.029315518753711e-06, "loss": 0.0291, "step": 3247 }, { "epoch": 2.1975642760487144, "grad_norm": 0.2528109532127451, "learning_rate": 2.0261497934897507e-06, "loss": 0.0191, "step": 3248 }, { "epoch": 2.198240866035183, "grad_norm": 0.30236700353663026, "learning_rate": 2.0229859118526244e-06, "loss": 0.0255, "step": 3249 }, { "epoch": 2.198917456021651, "grad_norm": 0.35257299480078474, "learning_rate": 2.019823875803771e-06, "loss": 0.0167, "step": 3250 }, { "epoch": 2.199594046008119, "grad_norm": 0.31437589864559173, "learning_rate": 2.0166636873034807e-06, "loss": 0.0183, "step": 3251 }, { "epoch": 2.2002706359945874, "grad_norm": 0.349225535822057, "learning_rate": 2.0135053483108973e-06, "loss": 0.0186, "step": 3252 }, { "epoch": 2.2009472259810554, "grad_norm": 0.46514667041805957, "learning_rate": 2.0103488607840233e-06, "loss": 0.018, "step": 3253 }, { "epoch": 2.201623815967524, "grad_norm": 0.342102432161444, "learning_rate": 2.00719422667971e-06, "loss": 0.0192, "step": 3254 }, { "epoch": 2.202300405953992, "grad_norm": 0.3930160959618195, "learning_rate": 2.004041447953663e-06, "loss": 0.0206, "step": 3255 }, { "epoch": 2.20297699594046, "grad_norm": 0.3251662407712104, "learning_rate": 2.0008905265604316e-06, "loss": 0.0155, "step": 3256 }, { "epoch": 2.2036535859269284, "grad_norm": 0.3090610137339622, "learning_rate": 1.9977414644534206e-06, "loss": 0.0145, "step": 3257 }, { "epoch": 2.2043301759133964, "grad_norm": 0.26289042865730033, "learning_rate": 1.9945942635848745e-06, "loss": 0.0152, "step": 3258 }, { "epoch": 2.205006765899865, "grad_norm": 0.5642809810025953, "learning_rate": 1.9914489259058933e-06, "loss": 0.0259, "step": 3259 }, { "epoch": 2.205683355886333, "grad_norm": 0.32713953201831053, "learning_rate": 1.9883054533664128e-06, "loss": 0.0259, "step": 3260 }, { "epoch": 2.206359945872801, "grad_norm": 0.3162273027872668, "learning_rate": 1.985163847915217e-06, "loss": 0.0147, "step": 3261 }, { "epoch": 2.2070365358592694, "grad_norm": 0.37714029508563507, "learning_rate": 1.9820241114999334e-06, "loss": 0.0174, "step": 3262 }, { "epoch": 2.2077131258457374, "grad_norm": 0.46112262827519634, "learning_rate": 1.9788862460670305e-06, "loss": 0.0162, "step": 3263 }, { "epoch": 2.208389715832206, "grad_norm": 0.3185607785282328, "learning_rate": 1.9757502535618137e-06, "loss": 0.0199, "step": 3264 }, { "epoch": 2.209066305818674, "grad_norm": 0.2971899984451772, "learning_rate": 1.9726161359284283e-06, "loss": 0.0159, "step": 3265 }, { "epoch": 2.209742895805142, "grad_norm": 0.362931081049006, "learning_rate": 1.96948389510986e-06, "loss": 0.0202, "step": 3266 }, { "epoch": 2.2104194857916104, "grad_norm": 0.2878952502965713, "learning_rate": 1.9663535330479305e-06, "loss": 0.015, "step": 3267 }, { "epoch": 2.2110960757780784, "grad_norm": 0.40934712757172603, "learning_rate": 1.963225051683292e-06, "loss": 0.0175, "step": 3268 }, { "epoch": 2.2117726657645465, "grad_norm": 0.32443670045538464, "learning_rate": 1.9600984529554366e-06, "loss": 0.0143, "step": 3269 }, { "epoch": 2.212449255751015, "grad_norm": 0.3538126115344034, "learning_rate": 1.956973738802689e-06, "loss": 0.0147, "step": 3270 }, { "epoch": 2.213125845737483, "grad_norm": 0.29986686834136383, "learning_rate": 1.953850911162199e-06, "loss": 0.0164, "step": 3271 }, { "epoch": 2.2138024357239514, "grad_norm": 0.6028385981170276, "learning_rate": 1.950729971969955e-06, "loss": 0.0236, "step": 3272 }, { "epoch": 2.2144790257104194, "grad_norm": 0.36524417102212586, "learning_rate": 1.9476109231607687e-06, "loss": 0.0214, "step": 3273 }, { "epoch": 2.2151556156968875, "grad_norm": 0.2795522132726754, "learning_rate": 1.9444937666682834e-06, "loss": 0.0199, "step": 3274 }, { "epoch": 2.215832205683356, "grad_norm": 0.2694536250827638, "learning_rate": 1.941378504424968e-06, "loss": 0.0164, "step": 3275 }, { "epoch": 2.216508795669824, "grad_norm": 0.3112459841930046, "learning_rate": 1.938265138362118e-06, "loss": 0.0189, "step": 3276 }, { "epoch": 2.2171853856562924, "grad_norm": 0.3207034243512326, "learning_rate": 1.935153670409853e-06, "loss": 0.018, "step": 3277 }, { "epoch": 2.2178619756427604, "grad_norm": 0.31740722486130024, "learning_rate": 1.9320441024971113e-06, "loss": 0.0138, "step": 3278 }, { "epoch": 2.2185385656292285, "grad_norm": 0.3036380412383107, "learning_rate": 1.928936436551661e-06, "loss": 0.0137, "step": 3279 }, { "epoch": 2.219215155615697, "grad_norm": 0.39343985298433704, "learning_rate": 1.925830674500088e-06, "loss": 0.0289, "step": 3280 }, { "epoch": 2.219891745602165, "grad_norm": 0.3290770852233371, "learning_rate": 1.922726818267795e-06, "loss": 0.019, "step": 3281 }, { "epoch": 2.2205683355886334, "grad_norm": 0.3092438350443896, "learning_rate": 1.9196248697790066e-06, "loss": 0.0164, "step": 3282 }, { "epoch": 2.2212449255751014, "grad_norm": 0.4131225849760436, "learning_rate": 1.916524830956763e-06, "loss": 0.0231, "step": 3283 }, { "epoch": 2.2219215155615695, "grad_norm": 0.3571328444470763, "learning_rate": 1.913426703722924e-06, "loss": 0.0188, "step": 3284 }, { "epoch": 2.222598105548038, "grad_norm": 0.3010393276105099, "learning_rate": 1.9103304899981603e-06, "loss": 0.019, "step": 3285 }, { "epoch": 2.223274695534506, "grad_norm": 0.39026975137224207, "learning_rate": 1.9072361917019538e-06, "loss": 0.0219, "step": 3286 }, { "epoch": 2.2239512855209744, "grad_norm": 0.36305577116596943, "learning_rate": 1.9041438107526055e-06, "loss": 0.0197, "step": 3287 }, { "epoch": 2.2246278755074425, "grad_norm": 0.3174543111115988, "learning_rate": 1.901053349067225e-06, "loss": 0.0169, "step": 3288 }, { "epoch": 2.2253044654939105, "grad_norm": 0.36631910025895825, "learning_rate": 1.8979648085617342e-06, "loss": 0.02, "step": 3289 }, { "epoch": 2.225981055480379, "grad_norm": 0.45650184645697806, "learning_rate": 1.894878191150859e-06, "loss": 0.027, "step": 3290 }, { "epoch": 2.226657645466847, "grad_norm": 0.36677429686660684, "learning_rate": 1.891793498748134e-06, "loss": 0.021, "step": 3291 }, { "epoch": 2.2273342354533154, "grad_norm": 0.25755487214056577, "learning_rate": 1.888710733265905e-06, "loss": 0.0158, "step": 3292 }, { "epoch": 2.2280108254397835, "grad_norm": 0.3702697622738798, "learning_rate": 1.8856298966153214e-06, "loss": 0.0205, "step": 3293 }, { "epoch": 2.2286874154262515, "grad_norm": 0.3684115091572002, "learning_rate": 1.8825509907063328e-06, "loss": 0.0251, "step": 3294 }, { "epoch": 2.22936400541272, "grad_norm": 0.33524908870469344, "learning_rate": 1.8794740174476966e-06, "loss": 0.0172, "step": 3295 }, { "epoch": 2.230040595399188, "grad_norm": 0.27152983687193905, "learning_rate": 1.87639897874697e-06, "loss": 0.0172, "step": 3296 }, { "epoch": 2.2307171853856564, "grad_norm": 0.3108717034521195, "learning_rate": 1.8733258765105129e-06, "loss": 0.0178, "step": 3297 }, { "epoch": 2.2313937753721245, "grad_norm": 0.3713682052869584, "learning_rate": 1.8702547126434818e-06, "loss": 0.0279, "step": 3298 }, { "epoch": 2.2320703653585925, "grad_norm": 0.3713718158434182, "learning_rate": 1.8671854890498308e-06, "loss": 0.0209, "step": 3299 }, { "epoch": 2.232746955345061, "grad_norm": 0.3262102498384552, "learning_rate": 1.864118207632315e-06, "loss": 0.0227, "step": 3300 }, { "epoch": 2.233423545331529, "grad_norm": 0.3359589824418625, "learning_rate": 1.8610528702924851e-06, "loss": 0.0212, "step": 3301 }, { "epoch": 2.2341001353179974, "grad_norm": 0.300309902695362, "learning_rate": 1.8579894789306813e-06, "loss": 0.0161, "step": 3302 }, { "epoch": 2.2347767253044655, "grad_norm": 0.2912282101498163, "learning_rate": 1.8549280354460437e-06, "loss": 0.0178, "step": 3303 }, { "epoch": 2.2354533152909335, "grad_norm": 0.3718179520180523, "learning_rate": 1.851868541736503e-06, "loss": 0.0179, "step": 3304 }, { "epoch": 2.236129905277402, "grad_norm": 0.3315459690239705, "learning_rate": 1.8488109996987774e-06, "loss": 0.0193, "step": 3305 }, { "epoch": 2.23680649526387, "grad_norm": 0.3404284163203787, "learning_rate": 1.845755411228382e-06, "loss": 0.0252, "step": 3306 }, { "epoch": 2.2374830852503385, "grad_norm": 0.3214098997727977, "learning_rate": 1.8427017782196126e-06, "loss": 0.016, "step": 3307 }, { "epoch": 2.2381596752368065, "grad_norm": 0.28455585200848127, "learning_rate": 1.8396501025655594e-06, "loss": 0.0182, "step": 3308 }, { "epoch": 2.2388362652232745, "grad_norm": 0.2405288163158476, "learning_rate": 1.8366003861580966e-06, "loss": 0.0137, "step": 3309 }, { "epoch": 2.239512855209743, "grad_norm": 0.32872578102522326, "learning_rate": 1.8335526308878877e-06, "loss": 0.0166, "step": 3310 }, { "epoch": 2.240189445196211, "grad_norm": 0.29057845428120543, "learning_rate": 1.8305068386443696e-06, "loss": 0.0204, "step": 3311 }, { "epoch": 2.2408660351826795, "grad_norm": 0.34077445036319737, "learning_rate": 1.8274630113157727e-06, "loss": 0.0242, "step": 3312 }, { "epoch": 2.2415426251691475, "grad_norm": 0.3095720384911149, "learning_rate": 1.8244211507891064e-06, "loss": 0.0177, "step": 3313 }, { "epoch": 2.2422192151556155, "grad_norm": 0.364990408897162, "learning_rate": 1.8213812589501611e-06, "loss": 0.0227, "step": 3314 }, { "epoch": 2.242895805142084, "grad_norm": 0.32634833013060577, "learning_rate": 1.818343337683503e-06, "loss": 0.0231, "step": 3315 }, { "epoch": 2.243572395128552, "grad_norm": 0.27229027873812145, "learning_rate": 1.815307388872481e-06, "loss": 0.0175, "step": 3316 }, { "epoch": 2.2442489851150205, "grad_norm": 0.2409021564066747, "learning_rate": 1.8122734143992216e-06, "loss": 0.0138, "step": 3317 }, { "epoch": 2.2449255751014885, "grad_norm": 0.28672227986611876, "learning_rate": 1.8092414161446225e-06, "loss": 0.0149, "step": 3318 }, { "epoch": 2.2456021650879565, "grad_norm": 0.32337583923203034, "learning_rate": 1.8062113959883616e-06, "loss": 0.0264, "step": 3319 }, { "epoch": 2.246278755074425, "grad_norm": 0.3304537785642103, "learning_rate": 1.8031833558088858e-06, "loss": 0.0278, "step": 3320 }, { "epoch": 2.246955345060893, "grad_norm": 0.3545009666826769, "learning_rate": 1.8001572974834169e-06, "loss": 0.0206, "step": 3321 }, { "epoch": 2.2476319350473615, "grad_norm": 0.32211374285095357, "learning_rate": 1.7971332228879518e-06, "loss": 0.0194, "step": 3322 }, { "epoch": 2.2483085250338295, "grad_norm": 0.3635517758902661, "learning_rate": 1.7941111338972484e-06, "loss": 0.017, "step": 3323 }, { "epoch": 2.2489851150202975, "grad_norm": 0.2504823537719474, "learning_rate": 1.7910910323848435e-06, "loss": 0.0147, "step": 3324 }, { "epoch": 2.249661705006766, "grad_norm": 0.33661846951747487, "learning_rate": 1.7880729202230334e-06, "loss": 0.0257, "step": 3325 }, { "epoch": 2.250338294993234, "grad_norm": 0.3957781612156996, "learning_rate": 1.7850567992828865e-06, "loss": 0.02, "step": 3326 }, { "epoch": 2.2510148849797025, "grad_norm": 0.3159214464212701, "learning_rate": 1.7820426714342375e-06, "loss": 0.0174, "step": 3327 }, { "epoch": 2.2516914749661705, "grad_norm": 0.33062886479922, "learning_rate": 1.7790305385456797e-06, "loss": 0.0194, "step": 3328 }, { "epoch": 2.2523680649526385, "grad_norm": 0.4406893847042817, "learning_rate": 1.7760204024845745e-06, "loss": 0.0232, "step": 3329 }, { "epoch": 2.253044654939107, "grad_norm": 0.3160202959529048, "learning_rate": 1.7730122651170457e-06, "loss": 0.0224, "step": 3330 }, { "epoch": 2.253721244925575, "grad_norm": 0.37480327409660913, "learning_rate": 1.7700061283079744e-06, "loss": 0.0174, "step": 3331 }, { "epoch": 2.2543978349120435, "grad_norm": 0.4228752435563574, "learning_rate": 1.7670019939210025e-06, "loss": 0.0315, "step": 3332 }, { "epoch": 2.2550744248985115, "grad_norm": 0.2693928943056749, "learning_rate": 1.763999863818533e-06, "loss": 0.0196, "step": 3333 }, { "epoch": 2.2557510148849795, "grad_norm": 0.3366616168555836, "learning_rate": 1.760999739861724e-06, "loss": 0.023, "step": 3334 }, { "epoch": 2.256427604871448, "grad_norm": 0.4049835032094548, "learning_rate": 1.7580016239104924e-06, "loss": 0.0178, "step": 3335 }, { "epoch": 2.257104194857916, "grad_norm": 0.28705145507234686, "learning_rate": 1.755005517823506e-06, "loss": 0.0193, "step": 3336 }, { "epoch": 2.2577807848443845, "grad_norm": 0.3626690760858157, "learning_rate": 1.7520114234581914e-06, "loss": 0.027, "step": 3337 }, { "epoch": 2.2584573748308525, "grad_norm": 0.2852905675647844, "learning_rate": 1.7490193426707236e-06, "loss": 0.016, "step": 3338 }, { "epoch": 2.2591339648173205, "grad_norm": 0.36799778032248737, "learning_rate": 1.7460292773160315e-06, "loss": 0.0176, "step": 3339 }, { "epoch": 2.259810554803789, "grad_norm": 0.34353736716847627, "learning_rate": 1.7430412292477978e-06, "loss": 0.0161, "step": 3340 }, { "epoch": 2.260487144790257, "grad_norm": 0.4684909137468032, "learning_rate": 1.7400552003184463e-06, "loss": 0.0164, "step": 3341 }, { "epoch": 2.2611637347767255, "grad_norm": 0.3139210344528339, "learning_rate": 1.7370711923791567e-06, "loss": 0.0181, "step": 3342 }, { "epoch": 2.2618403247631935, "grad_norm": 0.39734702734172855, "learning_rate": 1.7340892072798544e-06, "loss": 0.0164, "step": 3343 }, { "epoch": 2.2625169147496615, "grad_norm": 0.331187064414431, "learning_rate": 1.7311092468692082e-06, "loss": 0.0258, "step": 3344 }, { "epoch": 2.26319350473613, "grad_norm": 0.3470376153450806, "learning_rate": 1.7281313129946302e-06, "loss": 0.0182, "step": 3345 }, { "epoch": 2.263870094722598, "grad_norm": 0.2907836320723048, "learning_rate": 1.725155407502282e-06, "loss": 0.0123, "step": 3346 }, { "epoch": 2.2645466847090665, "grad_norm": 0.38607837864990185, "learning_rate": 1.7221815322370633e-06, "loss": 0.0196, "step": 3347 }, { "epoch": 2.2652232746955345, "grad_norm": 0.28422494135273507, "learning_rate": 1.7192096890426192e-06, "loss": 0.0166, "step": 3348 }, { "epoch": 2.2658998646820026, "grad_norm": 0.2641311829510012, "learning_rate": 1.7162398797613284e-06, "loss": 0.0135, "step": 3349 }, { "epoch": 2.266576454668471, "grad_norm": 0.3203543867591377, "learning_rate": 1.7132721062343156e-06, "loss": 0.016, "step": 3350 }, { "epoch": 2.267253044654939, "grad_norm": 0.457206063763029, "learning_rate": 1.7103063703014372e-06, "loss": 0.0171, "step": 3351 }, { "epoch": 2.2679296346414075, "grad_norm": 0.29120129204884165, "learning_rate": 1.7073426738012939e-06, "loss": 0.0138, "step": 3352 }, { "epoch": 2.2686062246278755, "grad_norm": 0.32769236145685815, "learning_rate": 1.7043810185712135e-06, "loss": 0.0207, "step": 3353 }, { "epoch": 2.2692828146143436, "grad_norm": 0.3115450139158369, "learning_rate": 1.7014214064472646e-06, "loss": 0.0164, "step": 3354 }, { "epoch": 2.269959404600812, "grad_norm": 0.31194935580325733, "learning_rate": 1.6984638392642467e-06, "loss": 0.0211, "step": 3355 }, { "epoch": 2.27063599458728, "grad_norm": 0.36413969011600267, "learning_rate": 1.6955083188556947e-06, "loss": 0.0188, "step": 3356 }, { "epoch": 2.2713125845737485, "grad_norm": 0.36765580778539764, "learning_rate": 1.6925548470538695e-06, "loss": 0.0202, "step": 3357 }, { "epoch": 2.2719891745602165, "grad_norm": 0.41977518152765175, "learning_rate": 1.6896034256897626e-06, "loss": 0.0205, "step": 3358 }, { "epoch": 2.2726657645466846, "grad_norm": 0.30153968052530306, "learning_rate": 1.686654056593099e-06, "loss": 0.0167, "step": 3359 }, { "epoch": 2.273342354533153, "grad_norm": 0.2897073649002921, "learning_rate": 1.683706741592327e-06, "loss": 0.0152, "step": 3360 }, { "epoch": 2.274018944519621, "grad_norm": 0.3040463950675685, "learning_rate": 1.6807614825146258e-06, "loss": 0.0189, "step": 3361 }, { "epoch": 2.2746955345060895, "grad_norm": 0.29797419596106484, "learning_rate": 1.6778182811858934e-06, "loss": 0.0174, "step": 3362 }, { "epoch": 2.2753721244925575, "grad_norm": 0.2961945532484389, "learning_rate": 1.6748771394307584e-06, "loss": 0.0165, "step": 3363 }, { "epoch": 2.2760487144790256, "grad_norm": 0.3990668565554466, "learning_rate": 1.671938059072571e-06, "loss": 0.02, "step": 3364 }, { "epoch": 2.276725304465494, "grad_norm": 0.4480022991768558, "learning_rate": 1.6690010419334008e-06, "loss": 0.0438, "step": 3365 }, { "epoch": 2.277401894451962, "grad_norm": 0.337434658168982, "learning_rate": 1.6660660898340392e-06, "loss": 0.0206, "step": 3366 }, { "epoch": 2.2780784844384305, "grad_norm": 0.35270920413483253, "learning_rate": 1.6631332045939996e-06, "loss": 0.0209, "step": 3367 }, { "epoch": 2.2787550744248986, "grad_norm": 0.31489615453735553, "learning_rate": 1.6602023880315126e-06, "loss": 0.0192, "step": 3368 }, { "epoch": 2.2794316644113666, "grad_norm": 0.3605404755993179, "learning_rate": 1.6572736419635288e-06, "loss": 0.0177, "step": 3369 }, { "epoch": 2.280108254397835, "grad_norm": 0.30101785860707747, "learning_rate": 1.6543469682057105e-06, "loss": 0.0142, "step": 3370 }, { "epoch": 2.280784844384303, "grad_norm": 0.2753648508252713, "learning_rate": 1.651422368572436e-06, "loss": 0.018, "step": 3371 }, { "epoch": 2.2814614343707715, "grad_norm": 0.3273227895585344, "learning_rate": 1.648499844876802e-06, "loss": 0.0182, "step": 3372 }, { "epoch": 2.2821380243572396, "grad_norm": 0.29784328791567416, "learning_rate": 1.6455793989306169e-06, "loss": 0.0136, "step": 3373 }, { "epoch": 2.2828146143437076, "grad_norm": 0.33450496584038497, "learning_rate": 1.642661032544396e-06, "loss": 0.0172, "step": 3374 }, { "epoch": 2.283491204330176, "grad_norm": 0.28921021704318484, "learning_rate": 1.639744747527371e-06, "loss": 0.0141, "step": 3375 }, { "epoch": 2.284167794316644, "grad_norm": 0.29539931758206955, "learning_rate": 1.636830545687481e-06, "loss": 0.0144, "step": 3376 }, { "epoch": 2.2848443843031125, "grad_norm": 0.3591264355407285, "learning_rate": 1.6339184288313769e-06, "loss": 0.028, "step": 3377 }, { "epoch": 2.2855209742895806, "grad_norm": 0.44566592922721643, "learning_rate": 1.631008398764412e-06, "loss": 0.0288, "step": 3378 }, { "epoch": 2.2861975642760486, "grad_norm": 0.5247644817268662, "learning_rate": 1.6281004572906462e-06, "loss": 0.0337, "step": 3379 }, { "epoch": 2.286874154262517, "grad_norm": 0.3956964017910272, "learning_rate": 1.6251946062128482e-06, "loss": 0.0168, "step": 3380 }, { "epoch": 2.287550744248985, "grad_norm": 0.3002357813842024, "learning_rate": 1.6222908473324889e-06, "loss": 0.0177, "step": 3381 }, { "epoch": 2.2882273342354535, "grad_norm": 0.27279860216353286, "learning_rate": 1.6193891824497438e-06, "loss": 0.0188, "step": 3382 }, { "epoch": 2.2889039242219216, "grad_norm": 0.4217966814469358, "learning_rate": 1.616489613363486e-06, "loss": 0.0271, "step": 3383 }, { "epoch": 2.2895805142083896, "grad_norm": 0.28064319219119543, "learning_rate": 1.6135921418712959e-06, "loss": 0.0157, "step": 3384 }, { "epoch": 2.290257104194858, "grad_norm": 0.27570323148005854, "learning_rate": 1.6106967697694442e-06, "loss": 0.0171, "step": 3385 }, { "epoch": 2.290933694181326, "grad_norm": 0.35091574079144383, "learning_rate": 1.6078034988529112e-06, "loss": 0.0188, "step": 3386 }, { "epoch": 2.2916102841677946, "grad_norm": 0.3721782944501329, "learning_rate": 1.604912330915364e-06, "loss": 0.0253, "step": 3387 }, { "epoch": 2.2922868741542626, "grad_norm": 0.29111923911747417, "learning_rate": 1.6020232677491732e-06, "loss": 0.0166, "step": 3388 }, { "epoch": 2.2929634641407306, "grad_norm": 0.29045324372586967, "learning_rate": 1.5991363111454023e-06, "loss": 0.0164, "step": 3389 }, { "epoch": 2.293640054127199, "grad_norm": 0.3534138112428688, "learning_rate": 1.5962514628938103e-06, "loss": 0.0221, "step": 3390 }, { "epoch": 2.294316644113667, "grad_norm": 0.32072838866259135, "learning_rate": 1.5933687247828462e-06, "loss": 0.0195, "step": 3391 }, { "epoch": 2.2949932341001356, "grad_norm": 0.3098815018278398, "learning_rate": 1.59048809859965e-06, "loss": 0.0176, "step": 3392 }, { "epoch": 2.2956698240866036, "grad_norm": 0.3926479778143456, "learning_rate": 1.5876095861300567e-06, "loss": 0.0215, "step": 3393 }, { "epoch": 2.2963464140730716, "grad_norm": 0.38504918431993057, "learning_rate": 1.5847331891585888e-06, "loss": 0.0336, "step": 3394 }, { "epoch": 2.29702300405954, "grad_norm": 0.35860940232195915, "learning_rate": 1.5818589094684594e-06, "loss": 0.0187, "step": 3395 }, { "epoch": 2.297699594046008, "grad_norm": 0.36040946508834704, "learning_rate": 1.5789867488415633e-06, "loss": 0.0238, "step": 3396 }, { "epoch": 2.2983761840324766, "grad_norm": 0.5018342132225554, "learning_rate": 1.5761167090584885e-06, "loss": 0.018, "step": 3397 }, { "epoch": 2.2990527740189446, "grad_norm": 0.2834935241222944, "learning_rate": 1.5732487918985017e-06, "loss": 0.0146, "step": 3398 }, { "epoch": 2.2997293640054126, "grad_norm": 0.28017211065018244, "learning_rate": 1.5703829991395602e-06, "loss": 0.0126, "step": 3399 }, { "epoch": 2.300405953991881, "grad_norm": 0.3812149184324965, "learning_rate": 1.5675193325582983e-06, "loss": 0.0145, "step": 3400 }, { "epoch": 2.301082543978349, "grad_norm": 0.33818481581544535, "learning_rate": 1.5646577939300362e-06, "loss": 0.0166, "step": 3401 }, { "epoch": 2.301759133964817, "grad_norm": 0.3891537744780298, "learning_rate": 1.5617983850287737e-06, "loss": 0.0264, "step": 3402 }, { "epoch": 2.3024357239512856, "grad_norm": 0.3547798750552972, "learning_rate": 1.5589411076271916e-06, "loss": 0.0293, "step": 3403 }, { "epoch": 2.3031123139377536, "grad_norm": 0.344589393233328, "learning_rate": 1.5560859634966457e-06, "loss": 0.0206, "step": 3404 }, { "epoch": 2.303788903924222, "grad_norm": 0.31333696680940976, "learning_rate": 1.5532329544071712e-06, "loss": 0.0195, "step": 3405 }, { "epoch": 2.30446549391069, "grad_norm": 0.27836311682266196, "learning_rate": 1.5503820821274812e-06, "loss": 0.016, "step": 3406 }, { "epoch": 2.305142083897158, "grad_norm": 0.3281966681069725, "learning_rate": 1.5475333484249633e-06, "loss": 0.0199, "step": 3407 }, { "epoch": 2.3058186738836266, "grad_norm": 0.31803083744014177, "learning_rate": 1.544686755065677e-06, "loss": 0.019, "step": 3408 }, { "epoch": 2.3064952638700946, "grad_norm": 0.2976133648194919, "learning_rate": 1.5418423038143576e-06, "loss": 0.0103, "step": 3409 }, { "epoch": 2.307171853856563, "grad_norm": 0.5054667403002192, "learning_rate": 1.5389999964344138e-06, "loss": 0.0289, "step": 3410 }, { "epoch": 2.307848443843031, "grad_norm": 0.27112112428549096, "learning_rate": 1.5361598346879193e-06, "loss": 0.0122, "step": 3411 }, { "epoch": 2.308525033829499, "grad_norm": 0.3464400771438308, "learning_rate": 1.5333218203356243e-06, "loss": 0.0204, "step": 3412 }, { "epoch": 2.3092016238159676, "grad_norm": 0.3975492838739599, "learning_rate": 1.5304859551369417e-06, "loss": 0.0249, "step": 3413 }, { "epoch": 2.3098782138024356, "grad_norm": 0.2738991600856642, "learning_rate": 1.5276522408499567e-06, "loss": 0.0144, "step": 3414 }, { "epoch": 2.310554803788904, "grad_norm": 0.32361113515357953, "learning_rate": 1.5248206792314197e-06, "loss": 0.0244, "step": 3415 }, { "epoch": 2.311231393775372, "grad_norm": 0.3037149226711114, "learning_rate": 1.5219912720367474e-06, "loss": 0.0194, "step": 3416 }, { "epoch": 2.31190798376184, "grad_norm": 0.2594839056926496, "learning_rate": 1.5191640210200186e-06, "loss": 0.0151, "step": 3417 }, { "epoch": 2.3125845737483086, "grad_norm": 0.3193148236143407, "learning_rate": 1.5163389279339746e-06, "loss": 0.0161, "step": 3418 }, { "epoch": 2.3132611637347766, "grad_norm": 0.3299916254958434, "learning_rate": 1.5135159945300232e-06, "loss": 0.0232, "step": 3419 }, { "epoch": 2.313937753721245, "grad_norm": 0.356079585830143, "learning_rate": 1.5106952225582312e-06, "loss": 0.0228, "step": 3420 }, { "epoch": 2.314614343707713, "grad_norm": 0.346548967899213, "learning_rate": 1.5078766137673229e-06, "loss": 0.0286, "step": 3421 }, { "epoch": 2.315290933694181, "grad_norm": 0.31734678333429894, "learning_rate": 1.5050601699046852e-06, "loss": 0.0176, "step": 3422 }, { "epoch": 2.3159675236806496, "grad_norm": 0.3138444770545622, "learning_rate": 1.5022458927163618e-06, "loss": 0.0185, "step": 3423 }, { "epoch": 2.3166441136671176, "grad_norm": 0.3872281981192431, "learning_rate": 1.499433783947054e-06, "loss": 0.04, "step": 3424 }, { "epoch": 2.317320703653586, "grad_norm": 0.3047666004806965, "learning_rate": 1.4966238453401161e-06, "loss": 0.0159, "step": 3425 }, { "epoch": 2.317997293640054, "grad_norm": 0.28401856592096936, "learning_rate": 1.4938160786375571e-06, "loss": 0.012, "step": 3426 }, { "epoch": 2.318673883626522, "grad_norm": 0.37776503020377644, "learning_rate": 1.4910104855800429e-06, "loss": 0.0177, "step": 3427 }, { "epoch": 2.3193504736129906, "grad_norm": 0.31668460822759414, "learning_rate": 1.488207067906891e-06, "loss": 0.0142, "step": 3428 }, { "epoch": 2.3200270635994586, "grad_norm": 0.2910053986557296, "learning_rate": 1.4854058273560667e-06, "loss": 0.0178, "step": 3429 }, { "epoch": 2.320703653585927, "grad_norm": 0.31869749440623857, "learning_rate": 1.4826067656641912e-06, "loss": 0.0165, "step": 3430 }, { "epoch": 2.321380243572395, "grad_norm": 0.38881098890086335, "learning_rate": 1.479809884566528e-06, "loss": 0.0207, "step": 3431 }, { "epoch": 2.322056833558863, "grad_norm": 0.36727104723757226, "learning_rate": 1.477015185796995e-06, "loss": 0.0238, "step": 3432 }, { "epoch": 2.3227334235453316, "grad_norm": 0.29700678786934914, "learning_rate": 1.4742226710881558e-06, "loss": 0.0156, "step": 3433 }, { "epoch": 2.3234100135317997, "grad_norm": 0.34989861500261144, "learning_rate": 1.4714323421712163e-06, "loss": 0.0194, "step": 3434 }, { "epoch": 2.324086603518268, "grad_norm": 0.3807261903374077, "learning_rate": 1.4686442007760315e-06, "loss": 0.0238, "step": 3435 }, { "epoch": 2.324763193504736, "grad_norm": 0.32828486518153954, "learning_rate": 1.465858248631099e-06, "loss": 0.0205, "step": 3436 }, { "epoch": 2.325439783491204, "grad_norm": 0.3854799403276748, "learning_rate": 1.4630744874635611e-06, "loss": 0.0241, "step": 3437 }, { "epoch": 2.3261163734776726, "grad_norm": 0.3381987589387271, "learning_rate": 1.460292918999195e-06, "loss": 0.0232, "step": 3438 }, { "epoch": 2.3267929634641407, "grad_norm": 0.28866416084621654, "learning_rate": 1.4575135449624251e-06, "loss": 0.0141, "step": 3439 }, { "epoch": 2.3274695534506087, "grad_norm": 0.3468131314919482, "learning_rate": 1.4547363670763138e-06, "loss": 0.0179, "step": 3440 }, { "epoch": 2.328146143437077, "grad_norm": 0.27370334148235237, "learning_rate": 1.4519613870625632e-06, "loss": 0.0159, "step": 3441 }, { "epoch": 2.328822733423545, "grad_norm": 0.3461188549124858, "learning_rate": 1.4491886066415084e-06, "loss": 0.0127, "step": 3442 }, { "epoch": 2.3294993234100136, "grad_norm": 0.2953882693563509, "learning_rate": 1.4464180275321255e-06, "loss": 0.0186, "step": 3443 }, { "epoch": 2.3301759133964817, "grad_norm": 0.3380921062470368, "learning_rate": 1.4436496514520253e-06, "loss": 0.0141, "step": 3444 }, { "epoch": 2.3308525033829497, "grad_norm": 0.3157462679666608, "learning_rate": 1.4408834801174492e-06, "loss": 0.0215, "step": 3445 }, { "epoch": 2.331529093369418, "grad_norm": 0.2838315171078676, "learning_rate": 1.438119515243277e-06, "loss": 0.019, "step": 3446 }, { "epoch": 2.332205683355886, "grad_norm": 0.29102971186807003, "learning_rate": 1.4353577585430152e-06, "loss": 0.0185, "step": 3447 }, { "epoch": 2.3328822733423547, "grad_norm": 0.36108657223021884, "learning_rate": 1.4325982117288052e-06, "loss": 0.0232, "step": 3448 }, { "epoch": 2.3335588633288227, "grad_norm": 0.34128339886805575, "learning_rate": 1.4298408765114191e-06, "loss": 0.0179, "step": 3449 }, { "epoch": 2.3342354533152907, "grad_norm": 0.29909188182042723, "learning_rate": 1.4270857546002548e-06, "loss": 0.0168, "step": 3450 }, { "epoch": 2.334912043301759, "grad_norm": 0.3854950782607295, "learning_rate": 1.4243328477033369e-06, "loss": 0.0262, "step": 3451 }, { "epoch": 2.335588633288227, "grad_norm": 0.35774118889171214, "learning_rate": 1.4215821575273219e-06, "loss": 0.0216, "step": 3452 }, { "epoch": 2.3362652232746957, "grad_norm": 0.406041101629836, "learning_rate": 1.4188336857774892e-06, "loss": 0.0293, "step": 3453 }, { "epoch": 2.3369418132611637, "grad_norm": 0.36314789122100627, "learning_rate": 1.4160874341577447e-06, "loss": 0.02, "step": 3454 }, { "epoch": 2.3376184032476317, "grad_norm": 0.3202102887868807, "learning_rate": 1.413343404370613e-06, "loss": 0.02, "step": 3455 }, { "epoch": 2.3382949932341, "grad_norm": 0.39385977245190484, "learning_rate": 1.410601598117246e-06, "loss": 0.0222, "step": 3456 }, { "epoch": 2.338971583220568, "grad_norm": 0.33812379984256224, "learning_rate": 1.4078620170974178e-06, "loss": 0.0127, "step": 3457 }, { "epoch": 2.3396481732070367, "grad_norm": 0.3835284152767019, "learning_rate": 1.4051246630095195e-06, "loss": 0.0219, "step": 3458 }, { "epoch": 2.3403247631935047, "grad_norm": 0.45913432753312217, "learning_rate": 1.4023895375505608e-06, "loss": 0.0225, "step": 3459 }, { "epoch": 2.3410013531799727, "grad_norm": 0.2930001985549042, "learning_rate": 1.3996566424161746e-06, "loss": 0.0145, "step": 3460 }, { "epoch": 2.341677943166441, "grad_norm": 0.29618365578774825, "learning_rate": 1.396925979300608e-06, "loss": 0.0189, "step": 3461 }, { "epoch": 2.342354533152909, "grad_norm": 0.3156416431072414, "learning_rate": 1.3941975498967265e-06, "loss": 0.0146, "step": 3462 }, { "epoch": 2.3430311231393777, "grad_norm": 0.34450896180533125, "learning_rate": 1.3914713558960064e-06, "loss": 0.0222, "step": 3463 }, { "epoch": 2.3437077131258457, "grad_norm": 0.37433044639671526, "learning_rate": 1.3887473989885441e-06, "loss": 0.0227, "step": 3464 }, { "epoch": 2.3443843031123137, "grad_norm": 0.2967831896285163, "learning_rate": 1.3860256808630429e-06, "loss": 0.0198, "step": 3465 }, { "epoch": 2.345060893098782, "grad_norm": 0.35596070996685975, "learning_rate": 1.383306203206823e-06, "loss": 0.0192, "step": 3466 }, { "epoch": 2.34573748308525, "grad_norm": 0.27285421060885284, "learning_rate": 1.3805889677058148e-06, "loss": 0.015, "step": 3467 }, { "epoch": 2.3464140730717187, "grad_norm": 0.3379030178564435, "learning_rate": 1.3778739760445552e-06, "loss": 0.0182, "step": 3468 }, { "epoch": 2.3470906630581867, "grad_norm": 0.37442555210690887, "learning_rate": 1.375161229906195e-06, "loss": 0.0176, "step": 3469 }, { "epoch": 2.3477672530446547, "grad_norm": 0.3659478507841638, "learning_rate": 1.372450730972491e-06, "loss": 0.0216, "step": 3470 }, { "epoch": 2.348443843031123, "grad_norm": 0.6884523305018001, "learning_rate": 1.3697424809238058e-06, "loss": 0.0239, "step": 3471 }, { "epoch": 2.349120433017591, "grad_norm": 0.2773584734882262, "learning_rate": 1.3670364814391062e-06, "loss": 0.0199, "step": 3472 }, { "epoch": 2.3497970230040597, "grad_norm": 0.3683873083754821, "learning_rate": 1.3643327341959684e-06, "loss": 0.0236, "step": 3473 }, { "epoch": 2.3504736129905277, "grad_norm": 0.3465351617126101, "learning_rate": 1.361631240870569e-06, "loss": 0.0248, "step": 3474 }, { "epoch": 2.3511502029769957, "grad_norm": 0.673446290767098, "learning_rate": 1.35893200313769e-06, "loss": 0.0232, "step": 3475 }, { "epoch": 2.351826792963464, "grad_norm": 0.453653251164762, "learning_rate": 1.3562350226707106e-06, "loss": 0.0345, "step": 3476 }, { "epoch": 2.352503382949932, "grad_norm": 0.3427863808370553, "learning_rate": 1.3535403011416159e-06, "loss": 0.0267, "step": 3477 }, { "epoch": 2.3531799729364007, "grad_norm": 0.2961253295322023, "learning_rate": 1.3508478402209858e-06, "loss": 0.02, "step": 3478 }, { "epoch": 2.3538565629228687, "grad_norm": 0.33042714299973824, "learning_rate": 1.3481576415780035e-06, "loss": 0.02, "step": 3479 }, { "epoch": 2.3545331529093367, "grad_norm": 0.2886625596054307, "learning_rate": 1.3454697068804434e-06, "loss": 0.0145, "step": 3480 }, { "epoch": 2.355209742895805, "grad_norm": 0.5073975258555212, "learning_rate": 1.3427840377946826e-06, "loss": 0.0278, "step": 3481 }, { "epoch": 2.3558863328822732, "grad_norm": 0.3175605735846637, "learning_rate": 1.3401006359856916e-06, "loss": 0.0157, "step": 3482 }, { "epoch": 2.3565629228687417, "grad_norm": 0.4040167872806406, "learning_rate": 1.337419503117035e-06, "loss": 0.0426, "step": 3483 }, { "epoch": 2.3572395128552097, "grad_norm": 0.22903622237412308, "learning_rate": 1.3347406408508695e-06, "loss": 0.0121, "step": 3484 }, { "epoch": 2.3579161028416777, "grad_norm": 0.4586465906503626, "learning_rate": 1.332064050847945e-06, "loss": 0.0163, "step": 3485 }, { "epoch": 2.358592692828146, "grad_norm": 0.37767507286954305, "learning_rate": 1.3293897347676032e-06, "loss": 0.0218, "step": 3486 }, { "epoch": 2.3592692828146142, "grad_norm": 0.5212575672701087, "learning_rate": 1.3267176942677763e-06, "loss": 0.0185, "step": 3487 }, { "epoch": 2.3599458728010827, "grad_norm": 0.3895583593845578, "learning_rate": 1.324047931004987e-06, "loss": 0.0242, "step": 3488 }, { "epoch": 2.3606224627875507, "grad_norm": 0.30814132888671697, "learning_rate": 1.321380446634342e-06, "loss": 0.0144, "step": 3489 }, { "epoch": 2.3612990527740187, "grad_norm": 0.29548396843448144, "learning_rate": 1.31871524280954e-06, "loss": 0.0159, "step": 3490 }, { "epoch": 2.361975642760487, "grad_norm": 0.29620029474076864, "learning_rate": 1.3160523211828612e-06, "loss": 0.0154, "step": 3491 }, { "epoch": 2.3626522327469552, "grad_norm": 0.3089293933425394, "learning_rate": 1.313391683405177e-06, "loss": 0.0183, "step": 3492 }, { "epoch": 2.3633288227334237, "grad_norm": 0.29458168000264845, "learning_rate": 1.310733331125935e-06, "loss": 0.0135, "step": 3493 }, { "epoch": 2.3640054127198917, "grad_norm": 0.5333665226371411, "learning_rate": 1.3080772659931728e-06, "loss": 0.0216, "step": 3494 }, { "epoch": 2.3646820027063598, "grad_norm": 0.4694395097376205, "learning_rate": 1.305423489653508e-06, "loss": 0.0268, "step": 3495 }, { "epoch": 2.365358592692828, "grad_norm": 0.26610297140918115, "learning_rate": 1.3027720037521397e-06, "loss": 0.0129, "step": 3496 }, { "epoch": 2.3660351826792962, "grad_norm": 0.3216074320309018, "learning_rate": 1.3001228099328445e-06, "loss": 0.0158, "step": 3497 }, { "epoch": 2.3667117726657647, "grad_norm": 0.3675644364816066, "learning_rate": 1.297475909837979e-06, "loss": 0.0207, "step": 3498 }, { "epoch": 2.3673883626522327, "grad_norm": 0.4496383494220818, "learning_rate": 1.29483130510848e-06, "loss": 0.0239, "step": 3499 }, { "epoch": 2.3680649526387008, "grad_norm": 0.38480410347240224, "learning_rate": 1.2921889973838591e-06, "loss": 0.0254, "step": 3500 }, { "epoch": 2.3687415426251692, "grad_norm": 0.3474437991994648, "learning_rate": 1.289548988302207e-06, "loss": 0.016, "step": 3501 }, { "epoch": 2.3694181326116373, "grad_norm": 0.3643345445652335, "learning_rate": 1.2869112795001836e-06, "loss": 0.0168, "step": 3502 }, { "epoch": 2.3700947225981057, "grad_norm": 0.3653840433304237, "learning_rate": 1.2842758726130283e-06, "loss": 0.0201, "step": 3503 }, { "epoch": 2.3707713125845737, "grad_norm": 0.4239909885433043, "learning_rate": 1.281642769274552e-06, "loss": 0.0176, "step": 3504 }, { "epoch": 2.3714479025710418, "grad_norm": 0.372786400830861, "learning_rate": 1.2790119711171356e-06, "loss": 0.0255, "step": 3505 }, { "epoch": 2.3721244925575102, "grad_norm": 0.5484580889192269, "learning_rate": 1.2763834797717312e-06, "loss": 0.0256, "step": 3506 }, { "epoch": 2.3728010825439783, "grad_norm": 0.327930154927057, "learning_rate": 1.2737572968678624e-06, "loss": 0.0196, "step": 3507 }, { "epoch": 2.3734776725304467, "grad_norm": 0.45904627217132027, "learning_rate": 1.2711334240336216e-06, "loss": 0.0225, "step": 3508 }, { "epoch": 2.3741542625169147, "grad_norm": 0.35917177934795713, "learning_rate": 1.26851186289567e-06, "loss": 0.019, "step": 3509 }, { "epoch": 2.3748308525033828, "grad_norm": 0.847389695305474, "learning_rate": 1.2658926150792321e-06, "loss": 0.0275, "step": 3510 }, { "epoch": 2.3755074424898512, "grad_norm": 0.43979032251488054, "learning_rate": 1.2632756822081e-06, "loss": 0.0242, "step": 3511 }, { "epoch": 2.3761840324763193, "grad_norm": 0.4016908873869214, "learning_rate": 1.2606610659046314e-06, "loss": 0.0223, "step": 3512 }, { "epoch": 2.3768606224627877, "grad_norm": 0.3964610840879998, "learning_rate": 1.2580487677897496e-06, "loss": 0.0294, "step": 3513 }, { "epoch": 2.3775372124492558, "grad_norm": 0.32040431574055594, "learning_rate": 1.255438789482935e-06, "loss": 0.0209, "step": 3514 }, { "epoch": 2.378213802435724, "grad_norm": 0.2733499958799494, "learning_rate": 1.2528311326022364e-06, "loss": 0.0175, "step": 3515 }, { "epoch": 2.3788903924221922, "grad_norm": 0.41348718080267, "learning_rate": 1.250225798764259e-06, "loss": 0.0193, "step": 3516 }, { "epoch": 2.3795669824086603, "grad_norm": 0.3065936874861683, "learning_rate": 1.2476227895841714e-06, "loss": 0.0196, "step": 3517 }, { "epoch": 2.3802435723951287, "grad_norm": 0.26321713581050965, "learning_rate": 1.2450221066756973e-06, "loss": 0.0164, "step": 3518 }, { "epoch": 2.3809201623815968, "grad_norm": 0.24304563264346346, "learning_rate": 1.242423751651119e-06, "loss": 0.0161, "step": 3519 }, { "epoch": 2.381596752368065, "grad_norm": 0.2887832946381938, "learning_rate": 1.2398277261212777e-06, "loss": 0.0141, "step": 3520 }, { "epoch": 2.3822733423545333, "grad_norm": 0.3201531185077651, "learning_rate": 1.2372340316955694e-06, "loss": 0.0224, "step": 3521 }, { "epoch": 2.3829499323410013, "grad_norm": 0.2673111245201406, "learning_rate": 1.234642669981946e-06, "loss": 0.0151, "step": 3522 }, { "epoch": 2.3836265223274697, "grad_norm": 0.361959365189735, "learning_rate": 1.232053642586909e-06, "loss": 0.0255, "step": 3523 }, { "epoch": 2.3843031123139378, "grad_norm": 0.23907565095316533, "learning_rate": 1.2294669511155193e-06, "loss": 0.0149, "step": 3524 }, { "epoch": 2.384979702300406, "grad_norm": 0.2761112499672755, "learning_rate": 1.2268825971713833e-06, "loss": 0.017, "step": 3525 }, { "epoch": 2.3856562922868743, "grad_norm": 0.32362861719233066, "learning_rate": 1.2243005823566638e-06, "loss": 0.0188, "step": 3526 }, { "epoch": 2.3863328822733423, "grad_norm": 0.2613132688502163, "learning_rate": 1.2217209082720677e-06, "loss": 0.0147, "step": 3527 }, { "epoch": 2.3870094722598107, "grad_norm": 0.3284213473585238, "learning_rate": 1.2191435765168557e-06, "loss": 0.0188, "step": 3528 }, { "epoch": 2.3876860622462788, "grad_norm": 0.3256730328988581, "learning_rate": 1.2165685886888346e-06, "loss": 0.0333, "step": 3529 }, { "epoch": 2.388362652232747, "grad_norm": 0.3677415875121975, "learning_rate": 1.2139959463843593e-06, "loss": 0.0198, "step": 3530 }, { "epoch": 2.3890392422192153, "grad_norm": 0.4418753233954852, "learning_rate": 1.2114256511983274e-06, "loss": 0.025, "step": 3531 }, { "epoch": 2.3897158322056833, "grad_norm": 0.34847431989760974, "learning_rate": 1.2088577047241834e-06, "loss": 0.0185, "step": 3532 }, { "epoch": 2.3903924221921518, "grad_norm": 0.37733463812285334, "learning_rate": 1.2062921085539152e-06, "loss": 0.0214, "step": 3533 }, { "epoch": 2.39106901217862, "grad_norm": 0.45738770036944465, "learning_rate": 1.2037288642780575e-06, "loss": 0.0152, "step": 3534 }, { "epoch": 2.391745602165088, "grad_norm": 0.3755377152715656, "learning_rate": 1.2011679734856796e-06, "loss": 0.0214, "step": 3535 }, { "epoch": 2.3924221921515563, "grad_norm": 0.40400668220835156, "learning_rate": 1.1986094377643976e-06, "loss": 0.0183, "step": 3536 }, { "epoch": 2.3930987821380243, "grad_norm": 0.31864548279603405, "learning_rate": 1.1960532587003666e-06, "loss": 0.019, "step": 3537 }, { "epoch": 2.3937753721244928, "grad_norm": 0.3939370885379173, "learning_rate": 1.193499437878277e-06, "loss": 0.0196, "step": 3538 }, { "epoch": 2.394451962110961, "grad_norm": 0.23943780206307763, "learning_rate": 1.1909479768813641e-06, "loss": 0.0107, "step": 3539 }, { "epoch": 2.395128552097429, "grad_norm": 0.28387392170040904, "learning_rate": 1.1883988772913924e-06, "loss": 0.0203, "step": 3540 }, { "epoch": 2.3958051420838973, "grad_norm": 0.3548361942597856, "learning_rate": 1.1858521406886674e-06, "loss": 0.0177, "step": 3541 }, { "epoch": 2.3964817320703653, "grad_norm": 0.2809746690444741, "learning_rate": 1.183307768652029e-06, "loss": 0.0133, "step": 3542 }, { "epoch": 2.3971583220568338, "grad_norm": 0.24331556416682687, "learning_rate": 1.180765762758852e-06, "loss": 0.0125, "step": 3543 }, { "epoch": 2.397834912043302, "grad_norm": 0.36627104128099164, "learning_rate": 1.1782261245850417e-06, "loss": 0.0166, "step": 3544 }, { "epoch": 2.39851150202977, "grad_norm": 0.3347947577313801, "learning_rate": 1.1756888557050356e-06, "loss": 0.0242, "step": 3545 }, { "epoch": 2.3991880920162383, "grad_norm": 0.4153508901482643, "learning_rate": 1.173153957691805e-06, "loss": 0.0384, "step": 3546 }, { "epoch": 2.3998646820027063, "grad_norm": 0.3022178539899148, "learning_rate": 1.1706214321168513e-06, "loss": 0.0203, "step": 3547 }, { "epoch": 2.4005412719891748, "grad_norm": 0.38753508758662086, "learning_rate": 1.1680912805502008e-06, "loss": 0.0238, "step": 3548 }, { "epoch": 2.401217861975643, "grad_norm": 0.3144206326333019, "learning_rate": 1.165563504560413e-06, "loss": 0.0158, "step": 3549 }, { "epoch": 2.401894451962111, "grad_norm": 0.3268120250622495, "learning_rate": 1.1630381057145735e-06, "loss": 0.0108, "step": 3550 }, { "epoch": 2.4025710419485793, "grad_norm": 0.30599363281197867, "learning_rate": 1.1605150855782916e-06, "loss": 0.021, "step": 3551 }, { "epoch": 2.4032476319350473, "grad_norm": 0.3216079548933299, "learning_rate": 1.157994445715706e-06, "loss": 0.0166, "step": 3552 }, { "epoch": 2.403924221921516, "grad_norm": 0.32487055741332094, "learning_rate": 1.155476187689475e-06, "loss": 0.0227, "step": 3553 }, { "epoch": 2.404600811907984, "grad_norm": 0.2592134713163622, "learning_rate": 1.1529603130607837e-06, "loss": 0.0151, "step": 3554 }, { "epoch": 2.405277401894452, "grad_norm": 0.5691157660691436, "learning_rate": 1.1504468233893408e-06, "loss": 0.0286, "step": 3555 }, { "epoch": 2.4059539918809203, "grad_norm": 0.35597353966299355, "learning_rate": 1.1479357202333707e-06, "loss": 0.0188, "step": 3556 }, { "epoch": 2.4066305818673883, "grad_norm": 0.24157505857099631, "learning_rate": 1.1454270051496264e-06, "loss": 0.012, "step": 3557 }, { "epoch": 2.407307171853857, "grad_norm": 0.4523490673598951, "learning_rate": 1.1429206796933717e-06, "loss": 0.0225, "step": 3558 }, { "epoch": 2.407983761840325, "grad_norm": 0.27794856013467706, "learning_rate": 1.1404167454183957e-06, "loss": 0.0155, "step": 3559 }, { "epoch": 2.408660351826793, "grad_norm": 0.3850251532065914, "learning_rate": 1.137915203877003e-06, "loss": 0.0253, "step": 3560 }, { "epoch": 2.4093369418132613, "grad_norm": 0.32746507418285364, "learning_rate": 1.1354160566200128e-06, "loss": 0.0183, "step": 3561 }, { "epoch": 2.4100135317997293, "grad_norm": 0.3644506235132886, "learning_rate": 1.132919305196763e-06, "loss": 0.0208, "step": 3562 }, { "epoch": 2.410690121786198, "grad_norm": 0.3174355500605888, "learning_rate": 1.130424951155104e-06, "loss": 0.0215, "step": 3563 }, { "epoch": 2.411366711772666, "grad_norm": 0.31997648625186803, "learning_rate": 1.1279329960414047e-06, "loss": 0.0176, "step": 3564 }, { "epoch": 2.412043301759134, "grad_norm": 0.3186085921629601, "learning_rate": 1.1254434414005367e-06, "loss": 0.0164, "step": 3565 }, { "epoch": 2.4127198917456023, "grad_norm": 0.32475947283866646, "learning_rate": 1.1229562887758927e-06, "loss": 0.0188, "step": 3566 }, { "epoch": 2.4133964817320703, "grad_norm": 0.31312475654368, "learning_rate": 1.1204715397093735e-06, "loss": 0.0173, "step": 3567 }, { "epoch": 2.414073071718539, "grad_norm": 0.33532270460929536, "learning_rate": 1.1179891957413908e-06, "loss": 0.0238, "step": 3568 }, { "epoch": 2.414749661705007, "grad_norm": 0.6103818765626172, "learning_rate": 1.1155092584108606e-06, "loss": 0.0301, "step": 3569 }, { "epoch": 2.415426251691475, "grad_norm": 0.28215322588991454, "learning_rate": 1.113031729255214e-06, "loss": 0.015, "step": 3570 }, { "epoch": 2.4161028416779433, "grad_norm": 0.3777151223630397, "learning_rate": 1.1105566098103825e-06, "loss": 0.0227, "step": 3571 }, { "epoch": 2.4167794316644113, "grad_norm": 0.29649804984965666, "learning_rate": 1.1080839016108086e-06, "loss": 0.0194, "step": 3572 }, { "epoch": 2.41745602165088, "grad_norm": 0.33306378870190073, "learning_rate": 1.1056136061894386e-06, "loss": 0.0182, "step": 3573 }, { "epoch": 2.418132611637348, "grad_norm": 0.38621214047041735, "learning_rate": 1.1031457250777206e-06, "loss": 0.0258, "step": 3574 }, { "epoch": 2.418809201623816, "grad_norm": 0.32207222992722523, "learning_rate": 1.1006802598056081e-06, "loss": 0.0152, "step": 3575 }, { "epoch": 2.4194857916102843, "grad_norm": 0.3316487167521873, "learning_rate": 1.0982172119015594e-06, "loss": 0.0222, "step": 3576 }, { "epoch": 2.4201623815967523, "grad_norm": 0.4281193426466968, "learning_rate": 1.0957565828925292e-06, "loss": 0.026, "step": 3577 }, { "epoch": 2.420838971583221, "grad_norm": 0.3457078644941504, "learning_rate": 1.0932983743039739e-06, "loss": 0.0193, "step": 3578 }, { "epoch": 2.421515561569689, "grad_norm": 0.35555742541029767, "learning_rate": 1.0908425876598512e-06, "loss": 0.0203, "step": 3579 }, { "epoch": 2.422192151556157, "grad_norm": 0.26932840661341484, "learning_rate": 1.0883892244826173e-06, "loss": 0.0151, "step": 3580 }, { "epoch": 2.4228687415426253, "grad_norm": 0.3705681130606943, "learning_rate": 1.0859382862932255e-06, "loss": 0.027, "step": 3581 }, { "epoch": 2.4235453315290933, "grad_norm": 0.38953695957431494, "learning_rate": 1.0834897746111233e-06, "loss": 0.0198, "step": 3582 }, { "epoch": 2.424221921515562, "grad_norm": 0.3317674407950328, "learning_rate": 1.0810436909542571e-06, "loss": 0.0187, "step": 3583 }, { "epoch": 2.42489851150203, "grad_norm": 0.3219899912954504, "learning_rate": 1.0786000368390686e-06, "loss": 0.0189, "step": 3584 }, { "epoch": 2.425575101488498, "grad_norm": 0.3255756865159592, "learning_rate": 1.0761588137804896e-06, "loss": 0.0207, "step": 3585 }, { "epoch": 2.4262516914749663, "grad_norm": 0.3572568166615696, "learning_rate": 1.0737200232919465e-06, "loss": 0.0179, "step": 3586 }, { "epoch": 2.4269282814614344, "grad_norm": 0.4304802398588593, "learning_rate": 1.0712836668853583e-06, "loss": 0.0197, "step": 3587 }, { "epoch": 2.4276048714479024, "grad_norm": 0.2816174167407849, "learning_rate": 1.0688497460711345e-06, "loss": 0.0146, "step": 3588 }, { "epoch": 2.428281461434371, "grad_norm": 0.3331761050155744, "learning_rate": 1.0664182623581777e-06, "loss": 0.0215, "step": 3589 }, { "epoch": 2.428958051420839, "grad_norm": 0.3461907404987719, "learning_rate": 1.0639892172538734e-06, "loss": 0.02, "step": 3590 }, { "epoch": 2.4296346414073073, "grad_norm": 0.3791631041733398, "learning_rate": 1.0615626122640988e-06, "loss": 0.0186, "step": 3591 }, { "epoch": 2.4303112313937754, "grad_norm": 0.3245515999284233, "learning_rate": 1.0591384488932188e-06, "loss": 0.0251, "step": 3592 }, { "epoch": 2.4309878213802434, "grad_norm": 0.2593365509195044, "learning_rate": 1.0567167286440844e-06, "loss": 0.0158, "step": 3593 }, { "epoch": 2.431664411366712, "grad_norm": 0.4416984677568257, "learning_rate": 1.0542974530180327e-06, "loss": 0.0231, "step": 3594 }, { "epoch": 2.43234100135318, "grad_norm": 0.2752010486638857, "learning_rate": 1.0518806235148814e-06, "loss": 0.0161, "step": 3595 }, { "epoch": 2.4330175913396483, "grad_norm": 0.2913899193060892, "learning_rate": 1.0494662416329366e-06, "loss": 0.0157, "step": 3596 }, { "epoch": 2.4336941813261164, "grad_norm": 0.3917553111418661, "learning_rate": 1.0470543088689855e-06, "loss": 0.022, "step": 3597 }, { "epoch": 2.4343707713125844, "grad_norm": 0.3034249763115742, "learning_rate": 1.044644826718295e-06, "loss": 0.0154, "step": 3598 }, { "epoch": 2.435047361299053, "grad_norm": 0.3145332035277517, "learning_rate": 1.0422377966746133e-06, "loss": 0.0182, "step": 3599 }, { "epoch": 2.435723951285521, "grad_norm": 0.2589253114587557, "learning_rate": 1.0398332202301708e-06, "loss": 0.0123, "step": 3600 }, { "epoch": 2.4364005412719894, "grad_norm": 0.24622442813531342, "learning_rate": 1.0374310988756747e-06, "loss": 0.0118, "step": 3601 }, { "epoch": 2.4370771312584574, "grad_norm": 0.2982017403070333, "learning_rate": 1.0350314341003121e-06, "loss": 0.0161, "step": 3602 }, { "epoch": 2.4377537212449254, "grad_norm": 0.2828273007905014, "learning_rate": 1.0326342273917432e-06, "loss": 0.0192, "step": 3603 }, { "epoch": 2.438430311231394, "grad_norm": 0.2905337340739197, "learning_rate": 1.0302394802361104e-06, "loss": 0.0115, "step": 3604 }, { "epoch": 2.439106901217862, "grad_norm": 0.3221955757275716, "learning_rate": 1.0278471941180245e-06, "loss": 0.0189, "step": 3605 }, { "epoch": 2.4397834912043304, "grad_norm": 0.39995015244508103, "learning_rate": 1.0254573705205751e-06, "loss": 0.0201, "step": 3606 }, { "epoch": 2.4404600811907984, "grad_norm": 0.27357873960222684, "learning_rate": 1.0230700109253255e-06, "loss": 0.0167, "step": 3607 }, { "epoch": 2.4411366711772664, "grad_norm": 0.2724211804132972, "learning_rate": 1.0206851168123078e-06, "loss": 0.0137, "step": 3608 }, { "epoch": 2.441813261163735, "grad_norm": 0.3582863886440724, "learning_rate": 1.0183026896600284e-06, "loss": 0.0184, "step": 3609 }, { "epoch": 2.442489851150203, "grad_norm": 0.3258502584036862, "learning_rate": 1.0159227309454662e-06, "loss": 0.016, "step": 3610 }, { "epoch": 2.4431664411366714, "grad_norm": 0.42611940152469097, "learning_rate": 1.0135452421440645e-06, "loss": 0.0225, "step": 3611 }, { "epoch": 2.4438430311231394, "grad_norm": 0.27375167470137807, "learning_rate": 1.0111702247297372e-06, "loss": 0.0149, "step": 3612 }, { "epoch": 2.4445196211096074, "grad_norm": 0.3884996950545533, "learning_rate": 1.0087976801748694e-06, "loss": 0.0224, "step": 3613 }, { "epoch": 2.445196211096076, "grad_norm": 0.3679433733704196, "learning_rate": 1.00642760995031e-06, "loss": 0.0166, "step": 3614 }, { "epoch": 2.445872801082544, "grad_norm": 0.3342586133317336, "learning_rate": 1.0040600155253766e-06, "loss": 0.0194, "step": 3615 }, { "epoch": 2.4465493910690124, "grad_norm": 0.3673869880514802, "learning_rate": 1.0016948983678471e-06, "loss": 0.0236, "step": 3616 }, { "epoch": 2.4472259810554804, "grad_norm": 0.35815533304459596, "learning_rate": 9.993322599439692e-07, "loss": 0.0176, "step": 3617 }, { "epoch": 2.4479025710419484, "grad_norm": 0.38776650522245165, "learning_rate": 9.969721017184492e-07, "loss": 0.0218, "step": 3618 }, { "epoch": 2.448579161028417, "grad_norm": 0.33733432858135887, "learning_rate": 9.946144251544604e-07, "loss": 0.018, "step": 3619 }, { "epoch": 2.449255751014885, "grad_norm": 0.42845636603426923, "learning_rate": 9.92259231713632e-07, "loss": 0.0224, "step": 3620 }, { "epoch": 2.449932341001353, "grad_norm": 0.31609882237347636, "learning_rate": 9.899065228560596e-07, "loss": 0.0191, "step": 3621 }, { "epoch": 2.4506089309878214, "grad_norm": 0.32348897993986947, "learning_rate": 9.87556300040295e-07, "loss": 0.0167, "step": 3622 }, { "epoch": 2.4512855209742894, "grad_norm": 0.36727952481851095, "learning_rate": 9.852085647233505e-07, "loss": 0.0218, "step": 3623 }, { "epoch": 2.451962110960758, "grad_norm": 0.3426880024563584, "learning_rate": 9.82863318360695e-07, "loss": 0.0209, "step": 3624 }, { "epoch": 2.452638700947226, "grad_norm": 0.29728264085109457, "learning_rate": 9.805205624062535e-07, "loss": 0.0175, "step": 3625 }, { "epoch": 2.453315290933694, "grad_norm": 0.31534817697654494, "learning_rate": 9.781802983124094e-07, "loss": 0.0167, "step": 3626 }, { "epoch": 2.4539918809201624, "grad_norm": 0.3601449965736396, "learning_rate": 9.758425275299998e-07, "loss": 0.0242, "step": 3627 }, { "epoch": 2.4546684709066304, "grad_norm": 0.2857567366562343, "learning_rate": 9.735072515083193e-07, "loss": 0.0176, "step": 3628 }, { "epoch": 2.455345060893099, "grad_norm": 0.25617022661644473, "learning_rate": 9.711744716951093e-07, "loss": 0.0135, "step": 3629 }, { "epoch": 2.456021650879567, "grad_norm": 0.3529573135201078, "learning_rate": 9.688441895365708e-07, "loss": 0.0207, "step": 3630 }, { "epoch": 2.456698240866035, "grad_norm": 0.33674943059847223, "learning_rate": 9.665164064773496e-07, "loss": 0.0175, "step": 3631 }, { "epoch": 2.4573748308525034, "grad_norm": 0.37782206613329106, "learning_rate": 9.641911239605494e-07, "loss": 0.0196, "step": 3632 }, { "epoch": 2.4580514208389714, "grad_norm": 0.3862462874137182, "learning_rate": 9.618683434277176e-07, "loss": 0.0228, "step": 3633 }, { "epoch": 2.45872801082544, "grad_norm": 0.34999219455681013, "learning_rate": 9.595480663188528e-07, "loss": 0.0195, "step": 3634 }, { "epoch": 2.459404600811908, "grad_norm": 0.4626169830326315, "learning_rate": 9.572302940724032e-07, "loss": 0.0393, "step": 3635 }, { "epoch": 2.460081190798376, "grad_norm": 0.2913626246404951, "learning_rate": 9.549150281252633e-07, "loss": 0.0158, "step": 3636 }, { "epoch": 2.4607577807848444, "grad_norm": 0.3279146939460105, "learning_rate": 9.526022699127718e-07, "loss": 0.0169, "step": 3637 }, { "epoch": 2.4614343707713124, "grad_norm": 0.2571865806121024, "learning_rate": 9.502920208687133e-07, "loss": 0.0166, "step": 3638 }, { "epoch": 2.462110960757781, "grad_norm": 0.30294011483908945, "learning_rate": 9.479842824253182e-07, "loss": 0.0167, "step": 3639 }, { "epoch": 2.462787550744249, "grad_norm": 0.39661258407298433, "learning_rate": 9.456790560132617e-07, "loss": 0.0195, "step": 3640 }, { "epoch": 2.463464140730717, "grad_norm": 0.3158447434300703, "learning_rate": 9.433763430616577e-07, "loss": 0.0186, "step": 3641 }, { "epoch": 2.4641407307171854, "grad_norm": 0.38141902736677236, "learning_rate": 9.410761449980654e-07, "loss": 0.0188, "step": 3642 }, { "epoch": 2.4648173207036534, "grad_norm": 0.27162725093169054, "learning_rate": 9.387784632484825e-07, "loss": 0.016, "step": 3643 }, { "epoch": 2.465493910690122, "grad_norm": 0.2828561253646193, "learning_rate": 9.364832992373501e-07, "loss": 0.0153, "step": 3644 }, { "epoch": 2.46617050067659, "grad_norm": 0.3750875612411601, "learning_rate": 9.341906543875451e-07, "loss": 0.0243, "step": 3645 }, { "epoch": 2.466847090663058, "grad_norm": 0.30320451689941996, "learning_rate": 9.319005301203821e-07, "loss": 0.0116, "step": 3646 }, { "epoch": 2.4675236806495264, "grad_norm": 0.30771201182797003, "learning_rate": 9.296129278556155e-07, "loss": 0.0144, "step": 3647 }, { "epoch": 2.4682002706359945, "grad_norm": 0.34404982430849945, "learning_rate": 9.273278490114357e-07, "loss": 0.0199, "step": 3648 }, { "epoch": 2.468876860622463, "grad_norm": 0.27712231545755306, "learning_rate": 9.250452950044702e-07, "loss": 0.0151, "step": 3649 }, { "epoch": 2.469553450608931, "grad_norm": 0.3061379277484997, "learning_rate": 9.227652672497761e-07, "loss": 0.0161, "step": 3650 }, { "epoch": 2.470230040595399, "grad_norm": 0.2580160324430578, "learning_rate": 9.204877671608515e-07, "loss": 0.012, "step": 3651 }, { "epoch": 2.4709066305818674, "grad_norm": 0.2458200032209575, "learning_rate": 9.182127961496196e-07, "loss": 0.0168, "step": 3652 }, { "epoch": 2.4715832205683355, "grad_norm": 0.30992285070756337, "learning_rate": 9.159403556264435e-07, "loss": 0.0171, "step": 3653 }, { "epoch": 2.472259810554804, "grad_norm": 0.27087369368753017, "learning_rate": 9.136704470001101e-07, "loss": 0.0144, "step": 3654 }, { "epoch": 2.472936400541272, "grad_norm": 0.3955016414382731, "learning_rate": 9.114030716778433e-07, "loss": 0.0183, "step": 3655 }, { "epoch": 2.47361299052774, "grad_norm": 0.2533859532558727, "learning_rate": 9.091382310652925e-07, "loss": 0.0115, "step": 3656 }, { "epoch": 2.4742895805142084, "grad_norm": 0.3260767643275918, "learning_rate": 9.068759265665384e-07, "loss": 0.0223, "step": 3657 }, { "epoch": 2.4749661705006765, "grad_norm": 0.288620760090034, "learning_rate": 9.046161595840858e-07, "loss": 0.0117, "step": 3658 }, { "epoch": 2.475642760487145, "grad_norm": 0.3286027645711313, "learning_rate": 9.023589315188686e-07, "loss": 0.019, "step": 3659 }, { "epoch": 2.476319350473613, "grad_norm": 0.32142937478643163, "learning_rate": 9.001042437702468e-07, "loss": 0.0172, "step": 3660 }, { "epoch": 2.476995940460081, "grad_norm": 0.2941548716427655, "learning_rate": 8.978520977360067e-07, "loss": 0.0146, "step": 3661 }, { "epoch": 2.4776725304465494, "grad_norm": 0.26511273894977394, "learning_rate": 8.956024948123549e-07, "loss": 0.0159, "step": 3662 }, { "epoch": 2.4783491204330175, "grad_norm": 0.31953367946362177, "learning_rate": 8.933554363939256e-07, "loss": 0.0148, "step": 3663 }, { "epoch": 2.479025710419486, "grad_norm": 0.31577564757791765, "learning_rate": 8.911109238737748e-07, "loss": 0.0213, "step": 3664 }, { "epoch": 2.479702300405954, "grad_norm": 0.2829178437388904, "learning_rate": 8.888689586433768e-07, "loss": 0.0144, "step": 3665 }, { "epoch": 2.480378890392422, "grad_norm": 0.2735767790253131, "learning_rate": 8.866295420926319e-07, "loss": 0.0179, "step": 3666 }, { "epoch": 2.4810554803788905, "grad_norm": 0.31001672218423, "learning_rate": 8.843926756098548e-07, "loss": 0.0215, "step": 3667 }, { "epoch": 2.4817320703653585, "grad_norm": 0.33414976016780795, "learning_rate": 8.821583605817835e-07, "loss": 0.02, "step": 3668 }, { "epoch": 2.482408660351827, "grad_norm": 0.284854847309466, "learning_rate": 8.799265983935734e-07, "loss": 0.017, "step": 3669 }, { "epoch": 2.483085250338295, "grad_norm": 0.26849249382240087, "learning_rate": 8.776973904287972e-07, "loss": 0.0144, "step": 3670 }, { "epoch": 2.483761840324763, "grad_norm": 0.3337874131085429, "learning_rate": 8.754707380694427e-07, "loss": 0.0184, "step": 3671 }, { "epoch": 2.4844384303112315, "grad_norm": 0.2735184717788136, "learning_rate": 8.732466426959135e-07, "loss": 0.016, "step": 3672 }, { "epoch": 2.4851150202976995, "grad_norm": 0.3753345660246591, "learning_rate": 8.7102510568703e-07, "loss": 0.0264, "step": 3673 }, { "epoch": 2.485791610284168, "grad_norm": 0.35112101487173825, "learning_rate": 8.688061284200266e-07, "loss": 0.0185, "step": 3674 }, { "epoch": 2.486468200270636, "grad_norm": 0.48090583087705707, "learning_rate": 8.665897122705463e-07, "loss": 0.0211, "step": 3675 }, { "epoch": 2.487144790257104, "grad_norm": 0.34084347866296383, "learning_rate": 8.6437585861265e-07, "loss": 0.0219, "step": 3676 }, { "epoch": 2.4878213802435725, "grad_norm": 0.3247018053195929, "learning_rate": 8.621645688188085e-07, "loss": 0.0154, "step": 3677 }, { "epoch": 2.4884979702300405, "grad_norm": 0.3268995129040612, "learning_rate": 8.599558442598998e-07, "loss": 0.014, "step": 3678 }, { "epoch": 2.489174560216509, "grad_norm": 0.3043518421228281, "learning_rate": 8.577496863052165e-07, "loss": 0.0147, "step": 3679 }, { "epoch": 2.489851150202977, "grad_norm": 0.34875701962468597, "learning_rate": 8.555460963224549e-07, "loss": 0.0194, "step": 3680 }, { "epoch": 2.490527740189445, "grad_norm": 0.2949701090986178, "learning_rate": 8.53345075677724e-07, "loss": 0.0167, "step": 3681 }, { "epoch": 2.4912043301759135, "grad_norm": 0.31695728671430934, "learning_rate": 8.511466257355384e-07, "loss": 0.0242, "step": 3682 }, { "epoch": 2.4918809201623815, "grad_norm": 0.29890344422040294, "learning_rate": 8.48950747858816e-07, "loss": 0.017, "step": 3683 }, { "epoch": 2.49255751014885, "grad_norm": 0.3273187322016261, "learning_rate": 8.46757443408886e-07, "loss": 0.0144, "step": 3684 }, { "epoch": 2.493234100135318, "grad_norm": 0.3509080983687528, "learning_rate": 8.44566713745476e-07, "loss": 0.0186, "step": 3685 }, { "epoch": 2.493910690121786, "grad_norm": 0.36309102824738165, "learning_rate": 8.42378560226722e-07, "loss": 0.0262, "step": 3686 }, { "epoch": 2.4945872801082545, "grad_norm": 0.3209157502033997, "learning_rate": 8.401929842091616e-07, "loss": 0.0203, "step": 3687 }, { "epoch": 2.4952638700947225, "grad_norm": 0.33224393104659095, "learning_rate": 8.380099870477321e-07, "loss": 0.0241, "step": 3688 }, { "epoch": 2.495940460081191, "grad_norm": 0.2850066777570733, "learning_rate": 8.358295700957753e-07, "loss": 0.0188, "step": 3689 }, { "epoch": 2.496617050067659, "grad_norm": 0.45238747277186764, "learning_rate": 8.336517347050327e-07, "loss": 0.036, "step": 3690 }, { "epoch": 2.497293640054127, "grad_norm": 0.31987116564388407, "learning_rate": 8.314764822256465e-07, "loss": 0.0178, "step": 3691 }, { "epoch": 2.4979702300405955, "grad_norm": 0.29664999198584385, "learning_rate": 8.293038140061516e-07, "loss": 0.0207, "step": 3692 }, { "epoch": 2.4986468200270635, "grad_norm": 0.29991950939417333, "learning_rate": 8.271337313934869e-07, "loss": 0.023, "step": 3693 }, { "epoch": 2.499323410013532, "grad_norm": 0.3175573012633243, "learning_rate": 8.24966235732988e-07, "loss": 0.0228, "step": 3694 }, { "epoch": 2.5, "grad_norm": 0.2579720120488598, "learning_rate": 8.22801328368385e-07, "loss": 0.0133, "step": 3695 }, { "epoch": 2.500676589986468, "grad_norm": 0.41447195630641864, "learning_rate": 8.206390106418028e-07, "loss": 0.0287, "step": 3696 }, { "epoch": 2.5013531799729365, "grad_norm": 0.26294665783938886, "learning_rate": 8.184792838937633e-07, "loss": 0.013, "step": 3697 }, { "epoch": 2.5020297699594045, "grad_norm": 0.3487304987438679, "learning_rate": 8.163221494631785e-07, "loss": 0.0169, "step": 3698 }, { "epoch": 2.502706359945873, "grad_norm": 0.3363029598186698, "learning_rate": 8.141676086873574e-07, "loss": 0.0138, "step": 3699 }, { "epoch": 2.503382949932341, "grad_norm": 0.33537386953847986, "learning_rate": 8.120156629019987e-07, "loss": 0.0162, "step": 3700 }, { "epoch": 2.504059539918809, "grad_norm": 0.30018568109620863, "learning_rate": 8.098663134411922e-07, "loss": 0.0219, "step": 3701 }, { "epoch": 2.5047361299052775, "grad_norm": 0.2736818891809169, "learning_rate": 8.077195616374184e-07, "loss": 0.014, "step": 3702 }, { "epoch": 2.5054127198917455, "grad_norm": 0.2916938971910028, "learning_rate": 8.055754088215501e-07, "loss": 0.0167, "step": 3703 }, { "epoch": 2.506089309878214, "grad_norm": 0.3742103765871252, "learning_rate": 8.03433856322845e-07, "loss": 0.0221, "step": 3704 }, { "epoch": 2.506765899864682, "grad_norm": 0.2853029902243529, "learning_rate": 8.012949054689484e-07, "loss": 0.017, "step": 3705 }, { "epoch": 2.50744248985115, "grad_norm": 0.2780046433962602, "learning_rate": 7.991585575858962e-07, "loss": 0.0129, "step": 3706 }, { "epoch": 2.5081190798376185, "grad_norm": 0.34726562537663525, "learning_rate": 7.970248139981091e-07, "loss": 0.0229, "step": 3707 }, { "epoch": 2.5087956698240865, "grad_norm": 0.3124849687652771, "learning_rate": 7.948936760283937e-07, "loss": 0.0166, "step": 3708 }, { "epoch": 2.509472259810555, "grad_norm": 0.37500088438852763, "learning_rate": 7.92765144997939e-07, "loss": 0.0177, "step": 3709 }, { "epoch": 2.510148849797023, "grad_norm": 0.3044971725677902, "learning_rate": 7.906392222263199e-07, "loss": 0.0166, "step": 3710 }, { "epoch": 2.510825439783491, "grad_norm": 0.2617148716258878, "learning_rate": 7.885159090314959e-07, "loss": 0.0117, "step": 3711 }, { "epoch": 2.5115020297699595, "grad_norm": 0.34720228744816334, "learning_rate": 7.863952067298042e-07, "loss": 0.0178, "step": 3712 }, { "epoch": 2.5121786197564275, "grad_norm": 0.31964201336305254, "learning_rate": 7.842771166359681e-07, "loss": 0.0148, "step": 3713 }, { "epoch": 2.512855209742896, "grad_norm": 0.30802601534510443, "learning_rate": 7.821616400630866e-07, "loss": 0.0169, "step": 3714 }, { "epoch": 2.513531799729364, "grad_norm": 0.2899769362573107, "learning_rate": 7.80048778322643e-07, "loss": 0.0147, "step": 3715 }, { "epoch": 2.514208389715832, "grad_norm": 0.3276382558417513, "learning_rate": 7.779385327244987e-07, "loss": 0.0155, "step": 3716 }, { "epoch": 2.5148849797023005, "grad_norm": 0.41759341270109446, "learning_rate": 7.758309045768908e-07, "loss": 0.0216, "step": 3717 }, { "epoch": 2.5155615696887685, "grad_norm": 0.38468041223551724, "learning_rate": 7.737258951864341e-07, "loss": 0.0257, "step": 3718 }, { "epoch": 2.516238159675237, "grad_norm": 0.3883977011172923, "learning_rate": 7.716235058581218e-07, "loss": 0.0194, "step": 3719 }, { "epoch": 2.516914749661705, "grad_norm": 0.35220631420106524, "learning_rate": 7.695237378953224e-07, "loss": 0.0149, "step": 3720 }, { "epoch": 2.517591339648173, "grad_norm": 0.3119448090893277, "learning_rate": 7.674265925997804e-07, "loss": 0.0256, "step": 3721 }, { "epoch": 2.5182679296346415, "grad_norm": 0.31443390805581484, "learning_rate": 7.653320712716095e-07, "loss": 0.0208, "step": 3722 }, { "epoch": 2.5189445196211095, "grad_norm": 0.41807724934513535, "learning_rate": 7.632401752093016e-07, "loss": 0.0214, "step": 3723 }, { "epoch": 2.519621109607578, "grad_norm": 0.28393848318928605, "learning_rate": 7.611509057097211e-07, "loss": 0.0161, "step": 3724 }, { "epoch": 2.520297699594046, "grad_norm": 0.3403491041134632, "learning_rate": 7.590642640681012e-07, "loss": 0.0177, "step": 3725 }, { "epoch": 2.520974289580514, "grad_norm": 0.3331428333939334, "learning_rate": 7.569802515780455e-07, "loss": 0.0146, "step": 3726 }, { "epoch": 2.5216508795669825, "grad_norm": 0.3760862041467032, "learning_rate": 7.548988695315313e-07, "loss": 0.0306, "step": 3727 }, { "epoch": 2.5223274695534506, "grad_norm": 0.3027211125719161, "learning_rate": 7.528201192189028e-07, "loss": 0.0175, "step": 3728 }, { "epoch": 2.523004059539919, "grad_norm": 0.3191198087320616, "learning_rate": 7.507440019288742e-07, "loss": 0.0195, "step": 3729 }, { "epoch": 2.523680649526387, "grad_norm": 0.41889847322413687, "learning_rate": 7.486705189485243e-07, "loss": 0.021, "step": 3730 }, { "epoch": 2.524357239512855, "grad_norm": 0.4182492998268176, "learning_rate": 7.465996715633028e-07, "loss": 0.0211, "step": 3731 }, { "epoch": 2.5250338294993235, "grad_norm": 0.347004273713614, "learning_rate": 7.44531461057022e-07, "loss": 0.0205, "step": 3732 }, { "epoch": 2.5257104194857916, "grad_norm": 0.34847214537558846, "learning_rate": 7.424658887118613e-07, "loss": 0.0182, "step": 3733 }, { "epoch": 2.52638700947226, "grad_norm": 0.34440728730388054, "learning_rate": 7.404029558083653e-07, "loss": 0.0172, "step": 3734 }, { "epoch": 2.527063599458728, "grad_norm": 0.2578896878374381, "learning_rate": 7.383426636254392e-07, "loss": 0.015, "step": 3735 }, { "epoch": 2.527740189445196, "grad_norm": 0.33215378511702953, "learning_rate": 7.362850134403543e-07, "loss": 0.0127, "step": 3736 }, { "epoch": 2.5284167794316645, "grad_norm": 0.3640017273800162, "learning_rate": 7.342300065287439e-07, "loss": 0.0205, "step": 3737 }, { "epoch": 2.5290933694181326, "grad_norm": 0.4096276659255588, "learning_rate": 7.321776441646001e-07, "loss": 0.0352, "step": 3738 }, { "epoch": 2.529769959404601, "grad_norm": 0.2871912767415467, "learning_rate": 7.301279276202761e-07, "loss": 0.018, "step": 3739 }, { "epoch": 2.530446549391069, "grad_norm": 0.3251713139436743, "learning_rate": 7.280808581664866e-07, "loss": 0.0243, "step": 3740 }, { "epoch": 2.531123139377537, "grad_norm": 0.3780638753045574, "learning_rate": 7.260364370723044e-07, "loss": 0.0198, "step": 3741 }, { "epoch": 2.5317997293640055, "grad_norm": 0.3513956004069086, "learning_rate": 7.239946656051622e-07, "loss": 0.0224, "step": 3742 }, { "epoch": 2.5324763193504736, "grad_norm": 0.34038329609714074, "learning_rate": 7.219555450308446e-07, "loss": 0.0158, "step": 3743 }, { "epoch": 2.533152909336942, "grad_norm": 0.34755681876113453, "learning_rate": 7.199190766135001e-07, "loss": 0.0169, "step": 3744 }, { "epoch": 2.53382949932341, "grad_norm": 0.29591709662706916, "learning_rate": 7.178852616156262e-07, "loss": 0.0223, "step": 3745 }, { "epoch": 2.534506089309878, "grad_norm": 0.38133263415756624, "learning_rate": 7.158541012980813e-07, "loss": 0.0189, "step": 3746 }, { "epoch": 2.5351826792963466, "grad_norm": 0.4194238768991091, "learning_rate": 7.138255969200724e-07, "loss": 0.0273, "step": 3747 }, { "epoch": 2.5358592692828146, "grad_norm": 0.27822039690481304, "learning_rate": 7.117997497391648e-07, "loss": 0.0168, "step": 3748 }, { "epoch": 2.536535859269283, "grad_norm": 0.39759140555950817, "learning_rate": 7.097765610112745e-07, "loss": 0.023, "step": 3749 }, { "epoch": 2.537212449255751, "grad_norm": 0.4016877400509333, "learning_rate": 7.077560319906696e-07, "loss": 0.0283, "step": 3750 }, { "epoch": 2.537889039242219, "grad_norm": 0.3758635653520881, "learning_rate": 7.057381639299693e-07, "loss": 0.0157, "step": 3751 }, { "epoch": 2.5385656292286876, "grad_norm": 0.2847335457232802, "learning_rate": 7.037229580801414e-07, "loss": 0.0133, "step": 3752 }, { "epoch": 2.5392422192151556, "grad_norm": 0.38036657443586286, "learning_rate": 7.017104156905058e-07, "loss": 0.0249, "step": 3753 }, { "epoch": 2.539918809201624, "grad_norm": 0.3057178712858743, "learning_rate": 6.997005380087301e-07, "loss": 0.0214, "step": 3754 }, { "epoch": 2.540595399188092, "grad_norm": 0.35050630154278756, "learning_rate": 6.976933262808322e-07, "loss": 0.0186, "step": 3755 }, { "epoch": 2.54127198917456, "grad_norm": 0.24373961201498015, "learning_rate": 6.95688781751172e-07, "loss": 0.0117, "step": 3756 }, { "epoch": 2.5419485791610286, "grad_norm": 0.3235327242981872, "learning_rate": 6.936869056624623e-07, "loss": 0.0157, "step": 3757 }, { "epoch": 2.5426251691474966, "grad_norm": 0.30609144696792684, "learning_rate": 6.916876992557553e-07, "loss": 0.017, "step": 3758 }, { "epoch": 2.543301759133965, "grad_norm": 0.22594102548472556, "learning_rate": 6.896911637704534e-07, "loss": 0.0132, "step": 3759 }, { "epoch": 2.543978349120433, "grad_norm": 0.5191390673623143, "learning_rate": 6.876973004442988e-07, "loss": 0.0216, "step": 3760 }, { "epoch": 2.544654939106901, "grad_norm": 0.2796887288244487, "learning_rate": 6.85706110513381e-07, "loss": 0.0206, "step": 3761 }, { "epoch": 2.5453315290933696, "grad_norm": 0.3677716090530408, "learning_rate": 6.837175952121305e-07, "loss": 0.0214, "step": 3762 }, { "epoch": 2.5460081190798376, "grad_norm": 0.32068769101337385, "learning_rate": 6.8173175577332e-07, "loss": 0.0186, "step": 3763 }, { "epoch": 2.546684709066306, "grad_norm": 0.41819358878408824, "learning_rate": 6.797485934280618e-07, "loss": 0.0238, "step": 3764 }, { "epoch": 2.547361299052774, "grad_norm": 0.3896390142288087, "learning_rate": 6.777681094058087e-07, "loss": 0.0214, "step": 3765 }, { "epoch": 2.548037889039242, "grad_norm": 0.30481597134225336, "learning_rate": 6.757903049343556e-07, "loss": 0.0136, "step": 3766 }, { "epoch": 2.5487144790257106, "grad_norm": 0.30709409095919515, "learning_rate": 6.738151812398353e-07, "loss": 0.0186, "step": 3767 }, { "epoch": 2.5493910690121786, "grad_norm": 0.2751161121270378, "learning_rate": 6.718427395467165e-07, "loss": 0.0136, "step": 3768 }, { "epoch": 2.550067658998647, "grad_norm": 0.36730811831170196, "learning_rate": 6.698729810778065e-07, "loss": 0.0229, "step": 3769 }, { "epoch": 2.550744248985115, "grad_norm": 0.3901367916758139, "learning_rate": 6.67905907054251e-07, "loss": 0.0196, "step": 3770 }, { "epoch": 2.551420838971583, "grad_norm": 0.3218881770971386, "learning_rate": 6.659415186955298e-07, "loss": 0.0166, "step": 3771 }, { "epoch": 2.5520974289580516, "grad_norm": 0.2959047106763243, "learning_rate": 6.639798172194567e-07, "loss": 0.0152, "step": 3772 }, { "epoch": 2.5527740189445196, "grad_norm": 0.41578956702986747, "learning_rate": 6.620208038421805e-07, "loss": 0.0217, "step": 3773 }, { "epoch": 2.553450608930988, "grad_norm": 0.29811809542579076, "learning_rate": 6.600644797781847e-07, "loss": 0.0214, "step": 3774 }, { "epoch": 2.554127198917456, "grad_norm": 0.364721312291806, "learning_rate": 6.581108462402847e-07, "loss": 0.0303, "step": 3775 }, { "epoch": 2.554803788903924, "grad_norm": 0.3529172118128871, "learning_rate": 6.561599044396288e-07, "loss": 0.0184, "step": 3776 }, { "epoch": 2.555480378890392, "grad_norm": 0.3152404408119672, "learning_rate": 6.542116555856953e-07, "loss": 0.0189, "step": 3777 }, { "epoch": 2.5561569688768606, "grad_norm": 0.37995639828497735, "learning_rate": 6.522661008862918e-07, "loss": 0.0258, "step": 3778 }, { "epoch": 2.556833558863329, "grad_norm": 0.34963033118812814, "learning_rate": 6.503232415475591e-07, "loss": 0.0241, "step": 3779 }, { "epoch": 2.557510148849797, "grad_norm": 0.26705100509969, "learning_rate": 6.483830787739659e-07, "loss": 0.0154, "step": 3780 }, { "epoch": 2.558186738836265, "grad_norm": 0.3153615336651123, "learning_rate": 6.464456137683061e-07, "loss": 0.019, "step": 3781 }, { "epoch": 2.558863328822733, "grad_norm": 0.32949469173911516, "learning_rate": 6.445108477317046e-07, "loss": 0.0228, "step": 3782 }, { "epoch": 2.5595399188092016, "grad_norm": 0.299929681374004, "learning_rate": 6.425787818636131e-07, "loss": 0.0133, "step": 3783 }, { "epoch": 2.56021650879567, "grad_norm": 0.34799625123696276, "learning_rate": 6.406494173618083e-07, "loss": 0.0209, "step": 3784 }, { "epoch": 2.560893098782138, "grad_norm": 0.33894449678548877, "learning_rate": 6.387227554223918e-07, "loss": 0.0203, "step": 3785 }, { "epoch": 2.561569688768606, "grad_norm": 0.3136589437938693, "learning_rate": 6.367987972397887e-07, "loss": 0.0157, "step": 3786 }, { "epoch": 2.562246278755074, "grad_norm": 0.3658814669623945, "learning_rate": 6.348775440067507e-07, "loss": 0.0233, "step": 3787 }, { "epoch": 2.5629228687415426, "grad_norm": 0.302550822795212, "learning_rate": 6.329589969143518e-07, "loss": 0.0167, "step": 3788 }, { "epoch": 2.563599458728011, "grad_norm": 0.44812161415334467, "learning_rate": 6.310431571519865e-07, "loss": 0.0174, "step": 3789 }, { "epoch": 2.564276048714479, "grad_norm": 0.33489522778398995, "learning_rate": 6.291300259073724e-07, "loss": 0.0198, "step": 3790 }, { "epoch": 2.564952638700947, "grad_norm": 0.39341720529773727, "learning_rate": 6.27219604366549e-07, "loss": 0.026, "step": 3791 }, { "epoch": 2.565629228687415, "grad_norm": 0.30574329808242695, "learning_rate": 6.25311893713873e-07, "loss": 0.016, "step": 3792 }, { "epoch": 2.5663058186738836, "grad_norm": 0.23251544683637546, "learning_rate": 6.234068951320243e-07, "loss": 0.0107, "step": 3793 }, { "epoch": 2.566982408660352, "grad_norm": 0.3135740981894931, "learning_rate": 6.215046098019967e-07, "loss": 0.016, "step": 3794 }, { "epoch": 2.56765899864682, "grad_norm": 0.24331517761875804, "learning_rate": 6.196050389031061e-07, "loss": 0.0119, "step": 3795 }, { "epoch": 2.568335588633288, "grad_norm": 0.25651413300338116, "learning_rate": 6.177081836129833e-07, "loss": 0.0171, "step": 3796 }, { "epoch": 2.569012178619756, "grad_norm": 0.30903109530849543, "learning_rate": 6.158140451075794e-07, "loss": 0.0196, "step": 3797 }, { "epoch": 2.5696887686062246, "grad_norm": 0.28212594949742376, "learning_rate": 6.139226245611535e-07, "loss": 0.0177, "step": 3798 }, { "epoch": 2.5703653585926927, "grad_norm": 0.3398177304051992, "learning_rate": 6.120339231462862e-07, "loss": 0.0329, "step": 3799 }, { "epoch": 2.571041948579161, "grad_norm": 0.35122524012297607, "learning_rate": 6.101479420338713e-07, "loss": 0.019, "step": 3800 }, { "epoch": 2.571718538565629, "grad_norm": 0.30860813441030255, "learning_rate": 6.082646823931165e-07, "loss": 0.0191, "step": 3801 }, { "epoch": 2.572395128552097, "grad_norm": 0.31340422768397574, "learning_rate": 6.063841453915381e-07, "loss": 0.0222, "step": 3802 }, { "epoch": 2.5730717185385656, "grad_norm": 0.28261952362490333, "learning_rate": 6.045063321949696e-07, "loss": 0.018, "step": 3803 }, { "epoch": 2.5737483085250337, "grad_norm": 0.3929177836615564, "learning_rate": 6.026312439675553e-07, "loss": 0.0268, "step": 3804 }, { "epoch": 2.574424898511502, "grad_norm": 0.3149803959614211, "learning_rate": 6.007588818717458e-07, "loss": 0.016, "step": 3805 }, { "epoch": 2.57510148849797, "grad_norm": 0.2584046503682228, "learning_rate": 5.988892470683072e-07, "loss": 0.0136, "step": 3806 }, { "epoch": 2.575778078484438, "grad_norm": 0.24415284472503188, "learning_rate": 5.9702234071631e-07, "loss": 0.0111, "step": 3807 }, { "epoch": 2.5764546684709067, "grad_norm": 0.3517556331948916, "learning_rate": 5.951581639731374e-07, "loss": 0.0215, "step": 3808 }, { "epoch": 2.5771312584573747, "grad_norm": 0.37205934687573333, "learning_rate": 5.932967179944788e-07, "loss": 0.0147, "step": 3809 }, { "epoch": 2.577807848443843, "grad_norm": 0.36745162922669494, "learning_rate": 5.914380039343281e-07, "loss": 0.0244, "step": 3810 }, { "epoch": 2.578484438430311, "grad_norm": 0.3069121104010219, "learning_rate": 5.895820229449906e-07, "loss": 0.0162, "step": 3811 }, { "epoch": 2.579161028416779, "grad_norm": 0.39617489864023986, "learning_rate": 5.877287761770717e-07, "loss": 0.0178, "step": 3812 }, { "epoch": 2.5798376184032477, "grad_norm": 0.23492464813043765, "learning_rate": 5.858782647794864e-07, "loss": 0.0132, "step": 3813 }, { "epoch": 2.5805142083897157, "grad_norm": 0.3715920389620766, "learning_rate": 5.84030489899452e-07, "loss": 0.021, "step": 3814 }, { "epoch": 2.581190798376184, "grad_norm": 0.3444343101060036, "learning_rate": 5.821854526824883e-07, "loss": 0.0136, "step": 3815 }, { "epoch": 2.581867388362652, "grad_norm": 0.32409626370097167, "learning_rate": 5.803431542724192e-07, "loss": 0.0211, "step": 3816 }, { "epoch": 2.58254397834912, "grad_norm": 0.2916593852312832, "learning_rate": 5.785035958113717e-07, "loss": 0.019, "step": 3817 }, { "epoch": 2.5832205683355887, "grad_norm": 0.3729373264065649, "learning_rate": 5.766667784397706e-07, "loss": 0.0155, "step": 3818 }, { "epoch": 2.5838971583220567, "grad_norm": 0.30098753602743283, "learning_rate": 5.748327032963464e-07, "loss": 0.0118, "step": 3819 }, { "epoch": 2.584573748308525, "grad_norm": 0.3989803538788793, "learning_rate": 5.730013715181238e-07, "loss": 0.0268, "step": 3820 }, { "epoch": 2.585250338294993, "grad_norm": 0.40759638105888774, "learning_rate": 5.711727842404319e-07, "loss": 0.0216, "step": 3821 }, { "epoch": 2.585926928281461, "grad_norm": 0.4623888636214892, "learning_rate": 5.693469425968962e-07, "loss": 0.0176, "step": 3822 }, { "epoch": 2.5866035182679297, "grad_norm": 0.2533420398951006, "learning_rate": 5.675238477194389e-07, "loss": 0.0128, "step": 3823 }, { "epoch": 2.5872801082543977, "grad_norm": 0.2778696966059566, "learning_rate": 5.657035007382822e-07, "loss": 0.0183, "step": 3824 }, { "epoch": 2.587956698240866, "grad_norm": 0.2707644682041514, "learning_rate": 5.63885902781941e-07, "loss": 0.0145, "step": 3825 }, { "epoch": 2.588633288227334, "grad_norm": 0.319753787554978, "learning_rate": 5.620710549772295e-07, "loss": 0.0153, "step": 3826 }, { "epoch": 2.589309878213802, "grad_norm": 0.2499952247532181, "learning_rate": 5.602589584492563e-07, "loss": 0.0133, "step": 3827 }, { "epoch": 2.5899864682002707, "grad_norm": 0.32337333650405664, "learning_rate": 5.584496143214213e-07, "loss": 0.016, "step": 3828 }, { "epoch": 2.5906630581867387, "grad_norm": 0.3326756489389962, "learning_rate": 5.566430237154219e-07, "loss": 0.017, "step": 3829 }, { "epoch": 2.591339648173207, "grad_norm": 0.30600092409383467, "learning_rate": 5.548391877512471e-07, "loss": 0.0146, "step": 3830 }, { "epoch": 2.592016238159675, "grad_norm": 0.2859905917314146, "learning_rate": 5.530381075471775e-07, "loss": 0.0138, "step": 3831 }, { "epoch": 2.592692828146143, "grad_norm": 0.43210430557751284, "learning_rate": 5.512397842197847e-07, "loss": 0.0199, "step": 3832 }, { "epoch": 2.5933694181326117, "grad_norm": 0.2575695695965481, "learning_rate": 5.494442188839333e-07, "loss": 0.014, "step": 3833 }, { "epoch": 2.5940460081190797, "grad_norm": 0.4279612494651425, "learning_rate": 5.476514126527771e-07, "loss": 0.0194, "step": 3834 }, { "epoch": 2.594722598105548, "grad_norm": 0.30195321132602454, "learning_rate": 5.458613666377599e-07, "loss": 0.0209, "step": 3835 }, { "epoch": 2.595399188092016, "grad_norm": 0.6498733864084568, "learning_rate": 5.440740819486123e-07, "loss": 0.0294, "step": 3836 }, { "epoch": 2.596075778078484, "grad_norm": 0.2808968561263005, "learning_rate": 5.422895596933559e-07, "loss": 0.0137, "step": 3837 }, { "epoch": 2.5967523680649527, "grad_norm": 0.3737479669148715, "learning_rate": 5.405078009782966e-07, "loss": 0.0169, "step": 3838 }, { "epoch": 2.5974289580514207, "grad_norm": 0.4120158920436347, "learning_rate": 5.387288069080298e-07, "loss": 0.0254, "step": 3839 }, { "epoch": 2.598105548037889, "grad_norm": 0.2961257258795956, "learning_rate": 5.369525785854368e-07, "loss": 0.0166, "step": 3840 }, { "epoch": 2.598782138024357, "grad_norm": 0.32195093334073316, "learning_rate": 5.351791171116815e-07, "loss": 0.0178, "step": 3841 }, { "epoch": 2.5994587280108252, "grad_norm": 0.36623583464451537, "learning_rate": 5.334084235862158e-07, "loss": 0.0213, "step": 3842 }, { "epoch": 2.6001353179972937, "grad_norm": 0.30841867725387645, "learning_rate": 5.316404991067747e-07, "loss": 0.0191, "step": 3843 }, { "epoch": 2.6008119079837617, "grad_norm": 0.4489018552982301, "learning_rate": 5.29875344769375e-07, "loss": 0.0266, "step": 3844 }, { "epoch": 2.60148849797023, "grad_norm": 0.3270851138595341, "learning_rate": 5.281129616683167e-07, "loss": 0.0178, "step": 3845 }, { "epoch": 2.602165087956698, "grad_norm": 0.26661414593490385, "learning_rate": 5.263533508961827e-07, "loss": 0.0123, "step": 3846 }, { "epoch": 2.6028416779431662, "grad_norm": 0.4057171635880116, "learning_rate": 5.24596513543838e-07, "loss": 0.0269, "step": 3847 }, { "epoch": 2.6035182679296347, "grad_norm": 0.24243674863329226, "learning_rate": 5.228424507004265e-07, "loss": 0.0125, "step": 3848 }, { "epoch": 2.6041948579161027, "grad_norm": 0.2859036518229947, "learning_rate": 5.210911634533722e-07, "loss": 0.0176, "step": 3849 }, { "epoch": 2.604871447902571, "grad_norm": 0.30940377964776217, "learning_rate": 5.193426528883788e-07, "loss": 0.0184, "step": 3850 }, { "epoch": 2.605548037889039, "grad_norm": 0.41125207699366584, "learning_rate": 5.175969200894293e-07, "loss": 0.0283, "step": 3851 }, { "epoch": 2.6062246278755072, "grad_norm": 0.24518511219965616, "learning_rate": 5.15853966138784e-07, "loss": 0.0111, "step": 3852 }, { "epoch": 2.6069012178619757, "grad_norm": 0.28153827896785844, "learning_rate": 5.141137921169792e-07, "loss": 0.0165, "step": 3853 }, { "epoch": 2.6075778078484437, "grad_norm": 0.5510704796068311, "learning_rate": 5.123763991028291e-07, "loss": 0.0177, "step": 3854 }, { "epoch": 2.608254397834912, "grad_norm": 0.33342472568412335, "learning_rate": 5.106417881734244e-07, "loss": 0.0212, "step": 3855 }, { "epoch": 2.60893098782138, "grad_norm": 0.3150637932340437, "learning_rate": 5.089099604041314e-07, "loss": 0.0154, "step": 3856 }, { "epoch": 2.6096075778078482, "grad_norm": 0.30085535857761864, "learning_rate": 5.071809168685887e-07, "loss": 0.0185, "step": 3857 }, { "epoch": 2.6102841677943167, "grad_norm": 0.28791112855982204, "learning_rate": 5.054546586387093e-07, "loss": 0.0176, "step": 3858 }, { "epoch": 2.6109607577807847, "grad_norm": 0.34472367636173984, "learning_rate": 5.037311867846817e-07, "loss": 0.019, "step": 3859 }, { "epoch": 2.611637347767253, "grad_norm": 0.33585988543813833, "learning_rate": 5.020105023749644e-07, "loss": 0.0203, "step": 3860 }, { "epoch": 2.6123139377537212, "grad_norm": 0.34803229829542065, "learning_rate": 5.002926064762908e-07, "loss": 0.0181, "step": 3861 }, { "epoch": 2.6129905277401893, "grad_norm": 0.3991988805851519, "learning_rate": 4.985775001536619e-07, "loss": 0.0209, "step": 3862 }, { "epoch": 2.6136671177266577, "grad_norm": 0.40529106287499, "learning_rate": 4.968651844703514e-07, "loss": 0.0218, "step": 3863 }, { "epoch": 2.6143437077131257, "grad_norm": 0.32193538848448844, "learning_rate": 4.951556604879049e-07, "loss": 0.0202, "step": 3864 }, { "epoch": 2.615020297699594, "grad_norm": 0.30364844266341456, "learning_rate": 4.934489292661326e-07, "loss": 0.0178, "step": 3865 }, { "epoch": 2.6156968876860622, "grad_norm": 0.29246687146878775, "learning_rate": 4.917449918631162e-07, "loss": 0.016, "step": 3866 }, { "epoch": 2.6163734776725303, "grad_norm": 0.34378482284101136, "learning_rate": 4.900438493352056e-07, "loss": 0.0212, "step": 3867 }, { "epoch": 2.6170500676589987, "grad_norm": 0.3659490933303389, "learning_rate": 4.883455027370171e-07, "loss": 0.0207, "step": 3868 }, { "epoch": 2.6177266576454667, "grad_norm": 0.23131728722454328, "learning_rate": 4.866499531214353e-07, "loss": 0.01, "step": 3869 }, { "epoch": 2.618403247631935, "grad_norm": 0.2980442039402375, "learning_rate": 4.849572015396081e-07, "loss": 0.0172, "step": 3870 }, { "epoch": 2.6190798376184032, "grad_norm": 0.39373854818566745, "learning_rate": 4.832672490409513e-07, "loss": 0.0227, "step": 3871 }, { "epoch": 2.6197564276048713, "grad_norm": 0.29211987408626305, "learning_rate": 4.815800966731432e-07, "loss": 0.0125, "step": 3872 }, { "epoch": 2.6204330175913397, "grad_norm": 0.37663135841547213, "learning_rate": 4.798957454821285e-07, "loss": 0.018, "step": 3873 }, { "epoch": 2.6211096075778078, "grad_norm": 0.3709047479395079, "learning_rate": 4.782141965121129e-07, "loss": 0.0223, "step": 3874 }, { "epoch": 2.621786197564276, "grad_norm": 0.3376669800648237, "learning_rate": 4.7653545080556694e-07, "loss": 0.0216, "step": 3875 }, { "epoch": 2.6224627875507442, "grad_norm": 0.40515608442775186, "learning_rate": 4.748595094032221e-07, "loss": 0.0141, "step": 3876 }, { "epoch": 2.6231393775372123, "grad_norm": 0.3988322929438632, "learning_rate": 4.7318637334407335e-07, "loss": 0.018, "step": 3877 }, { "epoch": 2.6238159675236807, "grad_norm": 0.3295221068990801, "learning_rate": 4.715160436653732e-07, "loss": 0.0205, "step": 3878 }, { "epoch": 2.6244925575101488, "grad_norm": 0.3369310092609611, "learning_rate": 4.698485214026349e-07, "loss": 0.0233, "step": 3879 }, { "epoch": 2.6251691474966172, "grad_norm": 0.4172184697329247, "learning_rate": 4.6818380758963445e-07, "loss": 0.0147, "step": 3880 }, { "epoch": 2.6258457374830853, "grad_norm": 0.2769336173766099, "learning_rate": 4.6652190325840396e-07, "loss": 0.0141, "step": 3881 }, { "epoch": 2.6265223274695533, "grad_norm": 0.34232015925707115, "learning_rate": 4.6486280943923547e-07, "loss": 0.0176, "step": 3882 }, { "epoch": 2.6271989174560217, "grad_norm": 0.3316339782534317, "learning_rate": 4.632065271606756e-07, "loss": 0.0182, "step": 3883 }, { "epoch": 2.6278755074424898, "grad_norm": 0.2813458272506469, "learning_rate": 4.615530574495325e-07, "loss": 0.0155, "step": 3884 }, { "epoch": 2.6285520974289582, "grad_norm": 0.28417227930603656, "learning_rate": 4.5990240133086617e-07, "loss": 0.0151, "step": 3885 }, { "epoch": 2.6292286874154263, "grad_norm": 0.2888775215342934, "learning_rate": 4.582545598279964e-07, "loss": 0.0132, "step": 3886 }, { "epoch": 2.6299052774018943, "grad_norm": 0.33495339114295725, "learning_rate": 4.566095339624943e-07, "loss": 0.0135, "step": 3887 }, { "epoch": 2.6305818673883627, "grad_norm": 0.3677816456633802, "learning_rate": 4.549673247541875e-07, "loss": 0.0213, "step": 3888 }, { "epoch": 2.6312584573748308, "grad_norm": 0.2877152983626595, "learning_rate": 4.533279332211582e-07, "loss": 0.0147, "step": 3889 }, { "epoch": 2.6319350473612992, "grad_norm": 0.28540870328855866, "learning_rate": 4.516913603797407e-07, "loss": 0.0197, "step": 3890 }, { "epoch": 2.6326116373477673, "grad_norm": 0.5090477039481519, "learning_rate": 4.5005760724452173e-07, "loss": 0.0226, "step": 3891 }, { "epoch": 2.6332882273342353, "grad_norm": 0.3120565437034527, "learning_rate": 4.484266748283389e-07, "loss": 0.0211, "step": 3892 }, { "epoch": 2.6339648173207038, "grad_norm": 0.2577547985077275, "learning_rate": 4.4679856414228394e-07, "loss": 0.0113, "step": 3893 }, { "epoch": 2.634641407307172, "grad_norm": 0.3646085325861183, "learning_rate": 4.4517327619569784e-07, "loss": 0.0221, "step": 3894 }, { "epoch": 2.6353179972936402, "grad_norm": 0.3798157236233289, "learning_rate": 4.435508119961701e-07, "loss": 0.0196, "step": 3895 }, { "epoch": 2.6359945872801083, "grad_norm": 0.44490105371403355, "learning_rate": 4.4193117254954174e-07, "loss": 0.0181, "step": 3896 }, { "epoch": 2.6366711772665763, "grad_norm": 0.2954257625147546, "learning_rate": 4.403143588599029e-07, "loss": 0.0118, "step": 3897 }, { "epoch": 2.6373477672530448, "grad_norm": 0.28004517966788645, "learning_rate": 4.387003719295896e-07, "loss": 0.0149, "step": 3898 }, { "epoch": 2.638024357239513, "grad_norm": 0.33120734676245644, "learning_rate": 4.37089212759188e-07, "loss": 0.0235, "step": 3899 }, { "epoch": 2.6387009472259813, "grad_norm": 0.3608183612599865, "learning_rate": 4.3548088234752814e-07, "loss": 0.02, "step": 3900 }, { "epoch": 2.6393775372124493, "grad_norm": 0.32828570395822737, "learning_rate": 4.3387538169168905e-07, "loss": 0.0151, "step": 3901 }, { "epoch": 2.6400541271989173, "grad_norm": 0.3096144338983367, "learning_rate": 4.322727117869951e-07, "loss": 0.0247, "step": 3902 }, { "epoch": 2.6407307171853858, "grad_norm": 0.36095470417773273, "learning_rate": 4.3067287362701606e-07, "loss": 0.0225, "step": 3903 }, { "epoch": 2.641407307171854, "grad_norm": 0.3319403506732052, "learning_rate": 4.2907586820356337e-07, "loss": 0.0246, "step": 3904 }, { "epoch": 2.6420838971583223, "grad_norm": 0.32482972799388027, "learning_rate": 4.2748169650669524e-07, "loss": 0.0174, "step": 3905 }, { "epoch": 2.6427604871447903, "grad_norm": 0.32913880251857697, "learning_rate": 4.258903595247116e-07, "loss": 0.0176, "step": 3906 }, { "epoch": 2.6434370771312583, "grad_norm": 0.2908676502386541, "learning_rate": 4.2430185824415717e-07, "loss": 0.0155, "step": 3907 }, { "epoch": 2.6441136671177268, "grad_norm": 0.3098953513324127, "learning_rate": 4.2271619364981474e-07, "loss": 0.0181, "step": 3908 }, { "epoch": 2.644790257104195, "grad_norm": 0.27277513724138214, "learning_rate": 4.211333667247125e-07, "loss": 0.0159, "step": 3909 }, { "epoch": 2.6454668470906633, "grad_norm": 0.33789192453975786, "learning_rate": 4.195533784501177e-07, "loss": 0.0203, "step": 3910 }, { "epoch": 2.6461434370771313, "grad_norm": 0.35939308429289896, "learning_rate": 4.179762298055384e-07, "loss": 0.0211, "step": 3911 }, { "epoch": 2.6468200270635993, "grad_norm": 0.320411301639234, "learning_rate": 4.164019217687215e-07, "loss": 0.0146, "step": 3912 }, { "epoch": 2.647496617050068, "grad_norm": 0.2362825884117557, "learning_rate": 4.1483045531565183e-07, "loss": 0.0091, "step": 3913 }, { "epoch": 2.648173207036536, "grad_norm": 0.28708167376221655, "learning_rate": 4.132618314205544e-07, "loss": 0.0143, "step": 3914 }, { "epoch": 2.6488497970230043, "grad_norm": 0.38186143626136787, "learning_rate": 4.1169605105589315e-07, "loss": 0.0321, "step": 3915 }, { "epoch": 2.6495263870094723, "grad_norm": 0.3484998324938573, "learning_rate": 4.101331151923649e-07, "loss": 0.019, "step": 3916 }, { "epoch": 2.6502029769959403, "grad_norm": 0.3209783144879589, "learning_rate": 4.085730247989078e-07, "loss": 0.0159, "step": 3917 }, { "epoch": 2.650879566982409, "grad_norm": 0.27431039425284287, "learning_rate": 4.070157808426928e-07, "loss": 0.0142, "step": 3918 }, { "epoch": 2.651556156968877, "grad_norm": 0.3111461728880971, "learning_rate": 4.0546138428912694e-07, "loss": 0.0211, "step": 3919 }, { "epoch": 2.6522327469553453, "grad_norm": 0.337765663300153, "learning_rate": 4.039098361018534e-07, "loss": 0.018, "step": 3920 }, { "epoch": 2.6529093369418133, "grad_norm": 0.2841292113821926, "learning_rate": 4.0236113724274716e-07, "loss": 0.0143, "step": 3921 }, { "epoch": 2.6535859269282813, "grad_norm": 0.2911658918869312, "learning_rate": 4.0081528867191854e-07, "loss": 0.0165, "step": 3922 }, { "epoch": 2.65426251691475, "grad_norm": 0.27203809441422655, "learning_rate": 3.992722913477104e-07, "loss": 0.0136, "step": 3923 }, { "epoch": 2.654939106901218, "grad_norm": 0.5587079487321519, "learning_rate": 3.9773214622669974e-07, "loss": 0.0282, "step": 3924 }, { "epoch": 2.6556156968876863, "grad_norm": 0.322198990516875, "learning_rate": 3.9619485426369007e-07, "loss": 0.0163, "step": 3925 }, { "epoch": 2.6562922868741543, "grad_norm": 0.31230145293933687, "learning_rate": 3.9466041641172126e-07, "loss": 0.0214, "step": 3926 }, { "epoch": 2.6569688768606223, "grad_norm": 0.31695084783290833, "learning_rate": 3.9312883362206177e-07, "loss": 0.0203, "step": 3927 }, { "epoch": 2.657645466847091, "grad_norm": 0.42114465466333423, "learning_rate": 3.916001068442116e-07, "loss": 0.0176, "step": 3928 }, { "epoch": 2.658322056833559, "grad_norm": 0.3724269800045378, "learning_rate": 3.90074237025897e-07, "loss": 0.0244, "step": 3929 }, { "epoch": 2.6589986468200273, "grad_norm": 0.34608745347403674, "learning_rate": 3.885512251130763e-07, "loss": 0.0187, "step": 3930 }, { "epoch": 2.6596752368064953, "grad_norm": 0.28587571147229535, "learning_rate": 3.870310720499354e-07, "loss": 0.0137, "step": 3931 }, { "epoch": 2.6603518267929633, "grad_norm": 0.33921847814139083, "learning_rate": 3.8551377877888487e-07, "loss": 0.0127, "step": 3932 }, { "epoch": 2.661028416779432, "grad_norm": 0.267265988150043, "learning_rate": 3.839993462405678e-07, "loss": 0.0176, "step": 3933 }, { "epoch": 2.6617050067659, "grad_norm": 0.3617449862708202, "learning_rate": 3.8248777537384763e-07, "loss": 0.0223, "step": 3934 }, { "epoch": 2.6623815967523683, "grad_norm": 0.8014209907018301, "learning_rate": 3.8097906711581864e-07, "loss": 0.025, "step": 3935 }, { "epoch": 2.6630581867388363, "grad_norm": 0.3736165538060828, "learning_rate": 3.794732224017994e-07, "loss": 0.0193, "step": 3936 }, { "epoch": 2.6637347767253043, "grad_norm": 0.4260184161980021, "learning_rate": 3.7797024216533143e-07, "loss": 0.0242, "step": 3937 }, { "epoch": 2.664411366711773, "grad_norm": 0.3472416873892331, "learning_rate": 3.764701273381799e-07, "loss": 0.0188, "step": 3938 }, { "epoch": 2.665087956698241, "grad_norm": 0.35323929395202686, "learning_rate": 3.7497287885033763e-07, "loss": 0.0217, "step": 3939 }, { "epoch": 2.6657645466847093, "grad_norm": 0.40594597062584553, "learning_rate": 3.734784976300165e-07, "loss": 0.0269, "step": 3940 }, { "epoch": 2.6664411366711773, "grad_norm": 0.44665833491750784, "learning_rate": 3.719869846036539e-07, "loss": 0.0305, "step": 3941 }, { "epoch": 2.6671177266576453, "grad_norm": 0.30024012099265956, "learning_rate": 3.7049834069590507e-07, "loss": 0.0182, "step": 3942 }, { "epoch": 2.667794316644114, "grad_norm": 0.3269498330187841, "learning_rate": 3.6901256682965123e-07, "loss": 0.0171, "step": 3943 }, { "epoch": 2.668470906630582, "grad_norm": 0.36432034507712474, "learning_rate": 3.675296639259912e-07, "loss": 0.0178, "step": 3944 }, { "epoch": 2.6691474966170503, "grad_norm": 0.40697105264204897, "learning_rate": 3.6604963290424453e-07, "loss": 0.028, "step": 3945 }, { "epoch": 2.6698240866035183, "grad_norm": 0.2819036122469473, "learning_rate": 3.6457247468195233e-07, "loss": 0.0144, "step": 3946 }, { "epoch": 2.6705006765899864, "grad_norm": 0.3322224073131182, "learning_rate": 3.6309819017487034e-07, "loss": 0.0139, "step": 3947 }, { "epoch": 2.671177266576455, "grad_norm": 0.2542359632629604, "learning_rate": 3.6162678029697696e-07, "loss": 0.0129, "step": 3948 }, { "epoch": 2.671853856562923, "grad_norm": 0.3018835528992928, "learning_rate": 3.60158245960468e-07, "loss": 0.0153, "step": 3949 }, { "epoch": 2.6725304465493913, "grad_norm": 0.31663925709287366, "learning_rate": 3.5869258807575414e-07, "loss": 0.0184, "step": 3950 }, { "epoch": 2.6732070365358593, "grad_norm": 0.4249759802609871, "learning_rate": 3.572298075514652e-07, "loss": 0.0277, "step": 3951 }, { "epoch": 2.6738836265223274, "grad_norm": 0.2824209969784279, "learning_rate": 3.557699052944447e-07, "loss": 0.0145, "step": 3952 }, { "epoch": 2.674560216508796, "grad_norm": 0.32105438490120597, "learning_rate": 3.5431288220975466e-07, "loss": 0.0248, "step": 3953 }, { "epoch": 2.675236806495264, "grad_norm": 0.3317638633088956, "learning_rate": 3.528587392006716e-07, "loss": 0.0146, "step": 3954 }, { "epoch": 2.6759133964817323, "grad_norm": 0.3247865928920339, "learning_rate": 3.5140747716868375e-07, "loss": 0.0161, "step": 3955 }, { "epoch": 2.6765899864682003, "grad_norm": 0.34873860460927325, "learning_rate": 3.499590970134964e-07, "loss": 0.0192, "step": 3956 }, { "epoch": 2.6772665764546684, "grad_norm": 0.3032937689202816, "learning_rate": 3.48513599633028e-07, "loss": 0.0164, "step": 3957 }, { "epoch": 2.677943166441137, "grad_norm": 0.3220608890504641, "learning_rate": 3.470709859234084e-07, "loss": 0.0145, "step": 3958 }, { "epoch": 2.678619756427605, "grad_norm": 0.2921901563624883, "learning_rate": 3.4563125677897936e-07, "loss": 0.0131, "step": 3959 }, { "epoch": 2.6792963464140733, "grad_norm": 0.35235928759794943, "learning_rate": 3.4419441309229587e-07, "loss": 0.021, "step": 3960 }, { "epoch": 2.6799729364005414, "grad_norm": 0.3240001104687845, "learning_rate": 3.427604557541242e-07, "loss": 0.0215, "step": 3961 }, { "epoch": 2.6806495263870094, "grad_norm": 0.36378012234467383, "learning_rate": 3.4132938565344054e-07, "loss": 0.0197, "step": 3962 }, { "epoch": 2.6813261163734774, "grad_norm": 0.42436223450235305, "learning_rate": 3.3990120367743074e-07, "loss": 0.0289, "step": 3963 }, { "epoch": 2.682002706359946, "grad_norm": 0.4797539008409651, "learning_rate": 3.38475910711491e-07, "loss": 0.0279, "step": 3964 }, { "epoch": 2.6826792963464143, "grad_norm": 0.5036292250663403, "learning_rate": 3.370535076392256e-07, "loss": 0.028, "step": 3965 }, { "epoch": 2.6833558863328824, "grad_norm": 0.3521912885566871, "learning_rate": 3.356339953424481e-07, "loss": 0.0175, "step": 3966 }, { "epoch": 2.6840324763193504, "grad_norm": 0.34324717810799954, "learning_rate": 3.342173747011801e-07, "loss": 0.0148, "step": 3967 }, { "epoch": 2.6847090663058184, "grad_norm": 0.2884494151212183, "learning_rate": 3.3280364659364903e-07, "loss": 0.0155, "step": 3968 }, { "epoch": 2.685385656292287, "grad_norm": 0.27871209698815347, "learning_rate": 3.313928118962906e-07, "loss": 0.0185, "step": 3969 }, { "epoch": 2.6860622462787553, "grad_norm": 0.31202318809896584, "learning_rate": 3.299848714837473e-07, "loss": 0.015, "step": 3970 }, { "epoch": 2.6867388362652234, "grad_norm": 0.30238832288143436, "learning_rate": 3.285798262288653e-07, "loss": 0.019, "step": 3971 }, { "epoch": 2.6874154262516914, "grad_norm": 0.31633936011316777, "learning_rate": 3.271776770026963e-07, "loss": 0.0196, "step": 3972 }, { "epoch": 2.6880920162381594, "grad_norm": 0.2756780328798173, "learning_rate": 3.2577842467449773e-07, "loss": 0.0188, "step": 3973 }, { "epoch": 2.688768606224628, "grad_norm": 0.27233306192939727, "learning_rate": 3.243820701117306e-07, "loss": 0.0146, "step": 3974 }, { "epoch": 2.6894451962110963, "grad_norm": 0.2768605775656786, "learning_rate": 3.229886141800609e-07, "loss": 0.0196, "step": 3975 }, { "epoch": 2.6901217861975644, "grad_norm": 0.3232662211606284, "learning_rate": 3.2159805774335364e-07, "loss": 0.0203, "step": 3976 }, { "epoch": 2.6907983761840324, "grad_norm": 0.37823678373998143, "learning_rate": 3.2021040166368145e-07, "loss": 0.0366, "step": 3977 }, { "epoch": 2.6914749661705004, "grad_norm": 0.40305361291705155, "learning_rate": 3.18825646801314e-07, "loss": 0.022, "step": 3978 }, { "epoch": 2.692151556156969, "grad_norm": 0.4273099718053193, "learning_rate": 3.174437940147268e-07, "loss": 0.0329, "step": 3979 }, { "epoch": 2.6928281461434374, "grad_norm": 0.29755498790040424, "learning_rate": 3.160648441605918e-07, "loss": 0.0162, "step": 3980 }, { "epoch": 2.6935047361299054, "grad_norm": 0.2667826288793149, "learning_rate": 3.146887980937852e-07, "loss": 0.0191, "step": 3981 }, { "epoch": 2.6941813261163734, "grad_norm": 0.2869237590396578, "learning_rate": 3.133156566673806e-07, "loss": 0.0138, "step": 3982 }, { "epoch": 2.6948579161028414, "grad_norm": 0.31340279503148366, "learning_rate": 3.119454207326533e-07, "loss": 0.0187, "step": 3983 }, { "epoch": 2.69553450608931, "grad_norm": 0.3644251122156195, "learning_rate": 3.105780911390738e-07, "loss": 0.022, "step": 3984 }, { "epoch": 2.696211096075778, "grad_norm": 0.38640727503191297, "learning_rate": 3.0921366873431337e-07, "loss": 0.0213, "step": 3985 }, { "epoch": 2.6968876860622464, "grad_norm": 0.4404023970252406, "learning_rate": 3.0785215436423986e-07, "loss": 0.0189, "step": 3986 }, { "epoch": 2.6975642760487144, "grad_norm": 0.3390027575595125, "learning_rate": 3.0649354887291927e-07, "loss": 0.0184, "step": 3987 }, { "epoch": 2.6982408660351824, "grad_norm": 0.34572300340973866, "learning_rate": 3.05137853102615e-07, "loss": 0.021, "step": 3988 }, { "epoch": 2.698917456021651, "grad_norm": 0.33657469038263754, "learning_rate": 3.037850678937831e-07, "loss": 0.018, "step": 3989 }, { "epoch": 2.699594046008119, "grad_norm": 0.3761462390392621, "learning_rate": 3.0243519408507894e-07, "loss": 0.0227, "step": 3990 }, { "epoch": 2.7002706359945874, "grad_norm": 0.2847388699465433, "learning_rate": 3.0108823251335183e-07, "loss": 0.0178, "step": 3991 }, { "epoch": 2.7009472259810554, "grad_norm": 0.32007823943767616, "learning_rate": 2.997441840136445e-07, "loss": 0.0142, "step": 3992 }, { "epoch": 2.7016238159675234, "grad_norm": 0.4164781570430055, "learning_rate": 2.984030494191942e-07, "loss": 0.0179, "step": 3993 }, { "epoch": 2.702300405953992, "grad_norm": 0.30710764810054547, "learning_rate": 2.97064829561432e-07, "loss": 0.0146, "step": 3994 }, { "epoch": 2.70297699594046, "grad_norm": 0.3725245796358553, "learning_rate": 2.957295252699832e-07, "loss": 0.0255, "step": 3995 }, { "epoch": 2.7036535859269284, "grad_norm": 0.31916801311574255, "learning_rate": 2.9439713737266504e-07, "loss": 0.021, "step": 3996 }, { "epoch": 2.7043301759133964, "grad_norm": 0.34789644252839536, "learning_rate": 2.930676666954846e-07, "loss": 0.0186, "step": 3997 }, { "epoch": 2.7050067658998644, "grad_norm": 0.3185141638540178, "learning_rate": 2.917411140626425e-07, "loss": 0.0202, "step": 3998 }, { "epoch": 2.705683355886333, "grad_norm": 0.35599833079731513, "learning_rate": 2.904174802965293e-07, "loss": 0.0184, "step": 3999 }, { "epoch": 2.706359945872801, "grad_norm": 0.3320771075498398, "learning_rate": 2.8909676621772853e-07, "loss": 0.0221, "step": 4000 }, { "epoch": 2.7070365358592694, "grad_norm": 0.26379576526571435, "learning_rate": 2.877789726450092e-07, "loss": 0.0117, "step": 4001 }, { "epoch": 2.7077131258457374, "grad_norm": 0.33358097165521583, "learning_rate": 2.864641003953339e-07, "loss": 0.0167, "step": 4002 }, { "epoch": 2.7083897158322054, "grad_norm": 0.2729659485043568, "learning_rate": 2.8515215028385223e-07, "loss": 0.0145, "step": 4003 }, { "epoch": 2.709066305818674, "grad_norm": 0.30138506793419817, "learning_rate": 2.8384312312390306e-07, "loss": 0.0179, "step": 4004 }, { "epoch": 2.709742895805142, "grad_norm": 0.350962633376903, "learning_rate": 2.8253701972701275e-07, "loss": 0.0193, "step": 4005 }, { "epoch": 2.7104194857916104, "grad_norm": 0.25997132874121065, "learning_rate": 2.8123384090289307e-07, "loss": 0.0164, "step": 4006 }, { "epoch": 2.7110960757780784, "grad_norm": 0.3182245648724455, "learning_rate": 2.799335874594461e-07, "loss": 0.015, "step": 4007 }, { "epoch": 2.7117726657645465, "grad_norm": 0.28796902178207934, "learning_rate": 2.7863626020275867e-07, "loss": 0.0169, "step": 4008 }, { "epoch": 2.712449255751015, "grad_norm": 0.25975610658489845, "learning_rate": 2.773418599371047e-07, "loss": 0.0122, "step": 4009 }, { "epoch": 2.713125845737483, "grad_norm": 0.47643332218771917, "learning_rate": 2.7605038746494063e-07, "loss": 0.0246, "step": 4010 }, { "epoch": 2.7138024357239514, "grad_norm": 0.29446278438100165, "learning_rate": 2.7476184358691206e-07, "loss": 0.0175, "step": 4011 }, { "epoch": 2.7144790257104194, "grad_norm": 0.27594452911980805, "learning_rate": 2.7347622910184445e-07, "loss": 0.0148, "step": 4012 }, { "epoch": 2.7151556156968875, "grad_norm": 0.4369159147523556, "learning_rate": 2.7219354480675144e-07, "loss": 0.0315, "step": 4013 }, { "epoch": 2.715832205683356, "grad_norm": 0.3572726531465084, "learning_rate": 2.7091379149682683e-07, "loss": 0.0263, "step": 4014 }, { "epoch": 2.716508795669824, "grad_norm": 0.3064147132424032, "learning_rate": 2.696369699654489e-07, "loss": 0.014, "step": 4015 }, { "epoch": 2.7171853856562924, "grad_norm": 0.3252035525213185, "learning_rate": 2.6836308100417874e-07, "loss": 0.0201, "step": 4016 }, { "epoch": 2.7178619756427604, "grad_norm": 0.36956768488070535, "learning_rate": 2.670921254027592e-07, "loss": 0.0278, "step": 4017 }, { "epoch": 2.7185385656292285, "grad_norm": 0.33678750438956134, "learning_rate": 2.6582410394911327e-07, "loss": 0.0177, "step": 4018 }, { "epoch": 2.719215155615697, "grad_norm": 0.3031224517435561, "learning_rate": 2.6455901742934556e-07, "loss": 0.0217, "step": 4019 }, { "epoch": 2.719891745602165, "grad_norm": 0.3040656428382453, "learning_rate": 2.6329686662774247e-07, "loss": 0.016, "step": 4020 }, { "epoch": 2.7205683355886334, "grad_norm": 0.3473243847101063, "learning_rate": 2.620376523267698e-07, "loss": 0.0206, "step": 4021 }, { "epoch": 2.7212449255751014, "grad_norm": 0.39693332106447327, "learning_rate": 2.6078137530707146e-07, "loss": 0.0172, "step": 4022 }, { "epoch": 2.7219215155615695, "grad_norm": 0.36349345616889783, "learning_rate": 2.595280363474717e-07, "loss": 0.0218, "step": 4023 }, { "epoch": 2.722598105548038, "grad_norm": 0.3510368180734231, "learning_rate": 2.582776362249739e-07, "loss": 0.0173, "step": 4024 }, { "epoch": 2.723274695534506, "grad_norm": 0.3009889770250133, "learning_rate": 2.5703017571475755e-07, "loss": 0.017, "step": 4025 }, { "epoch": 2.7239512855209744, "grad_norm": 0.3231731108995585, "learning_rate": 2.5578565559018276e-07, "loss": 0.0182, "step": 4026 }, { "epoch": 2.7246278755074425, "grad_norm": 0.425246576115556, "learning_rate": 2.545440766227825e-07, "loss": 0.0345, "step": 4027 }, { "epoch": 2.7253044654939105, "grad_norm": 0.3699676156137622, "learning_rate": 2.5330543958227036e-07, "loss": 0.0195, "step": 4028 }, { "epoch": 2.725981055480379, "grad_norm": 0.3979910221226708, "learning_rate": 2.520697452365345e-07, "loss": 0.0262, "step": 4029 }, { "epoch": 2.726657645466847, "grad_norm": 0.3519156429332919, "learning_rate": 2.508369943516387e-07, "loss": 0.026, "step": 4030 }, { "epoch": 2.7273342354533154, "grad_norm": 0.29848732385448223, "learning_rate": 2.4960718769182214e-07, "loss": 0.0186, "step": 4031 }, { "epoch": 2.7280108254397835, "grad_norm": 0.47792407734513015, "learning_rate": 2.483803260194978e-07, "loss": 0.0191, "step": 4032 }, { "epoch": 2.7286874154262515, "grad_norm": 0.4101167635055601, "learning_rate": 2.4715641009525446e-07, "loss": 0.0272, "step": 4033 }, { "epoch": 2.72936400541272, "grad_norm": 0.28252063753987144, "learning_rate": 2.459354406778547e-07, "loss": 0.0147, "step": 4034 }, { "epoch": 2.730040595399188, "grad_norm": 0.3063204855995186, "learning_rate": 2.447174185242324e-07, "loss": 0.0166, "step": 4035 }, { "epoch": 2.7307171853856564, "grad_norm": 0.2621946755270409, "learning_rate": 2.4350234438949625e-07, "loss": 0.0139, "step": 4036 }, { "epoch": 2.7313937753721245, "grad_norm": 0.6575602680490917, "learning_rate": 2.4229021902692663e-07, "loss": 0.0282, "step": 4037 }, { "epoch": 2.7320703653585925, "grad_norm": 0.2779304144145073, "learning_rate": 2.4108104318797674e-07, "loss": 0.0153, "step": 4038 }, { "epoch": 2.732746955345061, "grad_norm": 0.3329234905094126, "learning_rate": 2.3987481762226984e-07, "loss": 0.0161, "step": 4039 }, { "epoch": 2.733423545331529, "grad_norm": 0.3040250237676759, "learning_rate": 2.3867154307759986e-07, "loss": 0.014, "step": 4040 }, { "epoch": 2.7341001353179974, "grad_norm": 0.30396870746681326, "learning_rate": 2.3747122029993296e-07, "loss": 0.0195, "step": 4041 }, { "epoch": 2.7347767253044655, "grad_norm": 0.30697060065287424, "learning_rate": 2.3627385003340552e-07, "loss": 0.0164, "step": 4042 }, { "epoch": 2.7354533152909335, "grad_norm": 0.32791553905501825, "learning_rate": 2.3507943302032045e-07, "loss": 0.0163, "step": 4043 }, { "epoch": 2.736129905277402, "grad_norm": 0.38213003884367064, "learning_rate": 2.3388797000115427e-07, "loss": 0.0187, "step": 4044 }, { "epoch": 2.73680649526387, "grad_norm": 0.22295909082208143, "learning_rate": 2.3269946171454727e-07, "loss": 0.0104, "step": 4045 }, { "epoch": 2.7374830852503385, "grad_norm": 0.34977044871127216, "learning_rate": 2.3151390889731285e-07, "loss": 0.02, "step": 4046 }, { "epoch": 2.7381596752368065, "grad_norm": 0.3241616477307261, "learning_rate": 2.3033131228442863e-07, "loss": 0.0232, "step": 4047 }, { "epoch": 2.7388362652232745, "grad_norm": 0.3648919069938753, "learning_rate": 2.2915167260904092e-07, "loss": 0.0242, "step": 4048 }, { "epoch": 2.739512855209743, "grad_norm": 0.3734339835125443, "learning_rate": 2.2797499060246253e-07, "loss": 0.0214, "step": 4049 }, { "epoch": 2.740189445196211, "grad_norm": 0.3434765124359413, "learning_rate": 2.2680126699417383e-07, "loss": 0.0235, "step": 4050 }, { "epoch": 2.7408660351826795, "grad_norm": 0.32959240028144116, "learning_rate": 2.256305025118194e-07, "loss": 0.0213, "step": 4051 }, { "epoch": 2.7415426251691475, "grad_norm": 0.3160533452250457, "learning_rate": 2.244626978812109e-07, "loss": 0.0169, "step": 4052 }, { "epoch": 2.7422192151556155, "grad_norm": 0.8114519106917566, "learning_rate": 2.2329785382632253e-07, "loss": 0.0345, "step": 4053 }, { "epoch": 2.742895805142084, "grad_norm": 0.3152431991332842, "learning_rate": 2.2213597106929608e-07, "loss": 0.0161, "step": 4054 }, { "epoch": 2.743572395128552, "grad_norm": 0.3065261676246775, "learning_rate": 2.2097705033043703e-07, "loss": 0.0147, "step": 4055 }, { "epoch": 2.7442489851150205, "grad_norm": 0.3535460570128464, "learning_rate": 2.198210923282118e-07, "loss": 0.0174, "step": 4056 }, { "epoch": 2.7449255751014885, "grad_norm": 0.3520352465737316, "learning_rate": 2.1866809777925323e-07, "loss": 0.0177, "step": 4057 }, { "epoch": 2.7456021650879565, "grad_norm": 0.33604005886941474, "learning_rate": 2.1751806739835624e-07, "loss": 0.0183, "step": 4058 }, { "epoch": 2.746278755074425, "grad_norm": 0.26371486220756835, "learning_rate": 2.163710018984766e-07, "loss": 0.0132, "step": 4059 }, { "epoch": 2.746955345060893, "grad_norm": 0.3041056847418416, "learning_rate": 2.1522690199073382e-07, "loss": 0.014, "step": 4060 }, { "epoch": 2.7476319350473615, "grad_norm": 0.35208989016331793, "learning_rate": 2.140857683844072e-07, "loss": 0.0186, "step": 4061 }, { "epoch": 2.7483085250338295, "grad_norm": 0.33364142321908025, "learning_rate": 2.1294760178693918e-07, "loss": 0.0176, "step": 4062 }, { "epoch": 2.7489851150202975, "grad_norm": 0.2958162686569277, "learning_rate": 2.118124029039309e-07, "loss": 0.017, "step": 4063 }, { "epoch": 2.749661705006766, "grad_norm": 0.2805006933993456, "learning_rate": 2.1068017243914663e-07, "loss": 0.0167, "step": 4064 }, { "epoch": 2.750338294993234, "grad_norm": 0.30392576671577776, "learning_rate": 2.0955091109450488e-07, "loss": 0.0152, "step": 4065 }, { "epoch": 2.7510148849797025, "grad_norm": 0.3334875369482922, "learning_rate": 2.0842461957008841e-07, "loss": 0.0226, "step": 4066 }, { "epoch": 2.7516914749661705, "grad_norm": 0.262672594488508, "learning_rate": 2.0730129856413705e-07, "loss": 0.0178, "step": 4067 }, { "epoch": 2.7523680649526385, "grad_norm": 0.3303580177380507, "learning_rate": 2.061809487730504e-07, "loss": 0.0224, "step": 4068 }, { "epoch": 2.753044654939107, "grad_norm": 0.3283166808566351, "learning_rate": 2.050635708913834e-07, "loss": 0.0188, "step": 4069 }, { "epoch": 2.753721244925575, "grad_norm": 0.31562572929572663, "learning_rate": 2.0394916561185085e-07, "loss": 0.0176, "step": 4070 }, { "epoch": 2.7543978349120435, "grad_norm": 0.388767555169842, "learning_rate": 2.0283773362532455e-07, "loss": 0.0169, "step": 4071 }, { "epoch": 2.7550744248985115, "grad_norm": 0.3048219874973848, "learning_rate": 2.0172927562083056e-07, "loss": 0.02, "step": 4072 }, { "epoch": 2.7557510148849795, "grad_norm": 0.274060625739595, "learning_rate": 2.006237922855553e-07, "loss": 0.0133, "step": 4073 }, { "epoch": 2.756427604871448, "grad_norm": 0.3099779178997045, "learning_rate": 1.9952128430483718e-07, "loss": 0.0218, "step": 4074 }, { "epoch": 2.757104194857916, "grad_norm": 0.2725512711461539, "learning_rate": 1.9842175236217176e-07, "loss": 0.0143, "step": 4075 }, { "epoch": 2.7577807848443845, "grad_norm": 0.39468192117410833, "learning_rate": 1.973251971392115e-07, "loss": 0.0181, "step": 4076 }, { "epoch": 2.7584573748308525, "grad_norm": 0.39116902292312916, "learning_rate": 1.962316193157593e-07, "loss": 0.0205, "step": 4077 }, { "epoch": 2.7591339648173205, "grad_norm": 0.33392762399766013, "learning_rate": 1.9514101956977617e-07, "loss": 0.0212, "step": 4078 }, { "epoch": 2.759810554803789, "grad_norm": 0.3625373916301655, "learning_rate": 1.9405339857737348e-07, "loss": 0.021, "step": 4079 }, { "epoch": 2.760487144790257, "grad_norm": 0.29297155837592603, "learning_rate": 1.9296875701281858e-07, "loss": 0.0142, "step": 4080 }, { "epoch": 2.7611637347767255, "grad_norm": 0.37434360044098514, "learning_rate": 1.9188709554853137e-07, "loss": 0.0131, "step": 4081 }, { "epoch": 2.7618403247631935, "grad_norm": 0.32840018770567087, "learning_rate": 1.9080841485508205e-07, "loss": 0.0178, "step": 4082 }, { "epoch": 2.7625169147496615, "grad_norm": 0.2732008471951929, "learning_rate": 1.8973271560119576e-07, "loss": 0.0166, "step": 4083 }, { "epoch": 2.76319350473613, "grad_norm": 0.304425593920917, "learning_rate": 1.8865999845374794e-07, "loss": 0.0173, "step": 4084 }, { "epoch": 2.763870094722598, "grad_norm": 0.3357367881595279, "learning_rate": 1.8759026407776605e-07, "loss": 0.0189, "step": 4085 }, { "epoch": 2.7645466847090665, "grad_norm": 0.37373448162813294, "learning_rate": 1.8652351313642568e-07, "loss": 0.0187, "step": 4086 }, { "epoch": 2.7652232746955345, "grad_norm": 0.29380493140749553, "learning_rate": 1.8545974629105624e-07, "loss": 0.0184, "step": 4087 }, { "epoch": 2.7658998646820026, "grad_norm": 0.36995227087745874, "learning_rate": 1.8439896420113569e-07, "loss": 0.0224, "step": 4088 }, { "epoch": 2.766576454668471, "grad_norm": 0.3713994600818487, "learning_rate": 1.8334116752429243e-07, "loss": 0.0192, "step": 4089 }, { "epoch": 2.767253044654939, "grad_norm": 0.29526245991345584, "learning_rate": 1.8228635691630191e-07, "loss": 0.0149, "step": 4090 }, { "epoch": 2.7679296346414075, "grad_norm": 0.28427182057613726, "learning_rate": 1.812345330310916e-07, "loss": 0.0122, "step": 4091 }, { "epoch": 2.7686062246278755, "grad_norm": 0.28069488963185774, "learning_rate": 1.801856965207338e-07, "loss": 0.0125, "step": 4092 }, { "epoch": 2.7692828146143436, "grad_norm": 0.35868479336121434, "learning_rate": 1.791398480354517e-07, "loss": 0.0232, "step": 4093 }, { "epoch": 2.769959404600812, "grad_norm": 0.3313312477344652, "learning_rate": 1.78096988223615e-07, "loss": 0.0186, "step": 4094 }, { "epoch": 2.77063599458728, "grad_norm": 0.326954198352059, "learning_rate": 1.770571177317404e-07, "loss": 0.0166, "step": 4095 }, { "epoch": 2.7713125845737485, "grad_norm": 0.38009409106856973, "learning_rate": 1.7602023720449114e-07, "loss": 0.0212, "step": 4096 }, { "epoch": 2.7719891745602165, "grad_norm": 0.2556002151284434, "learning_rate": 1.74986347284678e-07, "loss": 0.0114, "step": 4097 }, { "epoch": 2.7726657645466846, "grad_norm": 0.2925961625184005, "learning_rate": 1.7395544861325718e-07, "loss": 0.0212, "step": 4098 }, { "epoch": 2.773342354533153, "grad_norm": 0.2905471849559168, "learning_rate": 1.7292754182932914e-07, "loss": 0.0136, "step": 4099 }, { "epoch": 2.774018944519621, "grad_norm": 0.3106757077951478, "learning_rate": 1.7190262757014076e-07, "loss": 0.0144, "step": 4100 }, { "epoch": 2.7746955345060895, "grad_norm": 0.3014654925650417, "learning_rate": 1.7088070647108433e-07, "loss": 0.0142, "step": 4101 }, { "epoch": 2.7753721244925575, "grad_norm": 0.37528508507028097, "learning_rate": 1.6986177916569646e-07, "loss": 0.018, "step": 4102 }, { "epoch": 2.7760487144790256, "grad_norm": 0.3515124685348501, "learning_rate": 1.688458462856557e-07, "loss": 0.0213, "step": 4103 }, { "epoch": 2.776725304465494, "grad_norm": 0.32662458304207914, "learning_rate": 1.6783290846078714e-07, "loss": 0.0205, "step": 4104 }, { "epoch": 2.777401894451962, "grad_norm": 0.23417014264730251, "learning_rate": 1.6682296631905626e-07, "loss": 0.0107, "step": 4105 }, { "epoch": 2.7780784844384305, "grad_norm": 0.30065525384623853, "learning_rate": 1.6581602048657387e-07, "loss": 0.0172, "step": 4106 }, { "epoch": 2.7787550744248986, "grad_norm": 0.3665657592493083, "learning_rate": 1.648120715875906e-07, "loss": 0.0203, "step": 4107 }, { "epoch": 2.7794316644113666, "grad_norm": 0.37673042416747315, "learning_rate": 1.6381112024450196e-07, "loss": 0.0178, "step": 4108 }, { "epoch": 2.780108254397835, "grad_norm": 0.34082547617953735, "learning_rate": 1.6281316707784377e-07, "loss": 0.0172, "step": 4109 }, { "epoch": 2.780784844384303, "grad_norm": 0.3240317627049622, "learning_rate": 1.618182127062934e-07, "loss": 0.0225, "step": 4110 }, { "epoch": 2.7814614343707715, "grad_norm": 0.3259841086009218, "learning_rate": 1.6082625774666793e-07, "loss": 0.0171, "step": 4111 }, { "epoch": 2.7821380243572396, "grad_norm": 0.32598710617131105, "learning_rate": 1.5983730281392663e-07, "loss": 0.0198, "step": 4112 }, { "epoch": 2.7828146143437076, "grad_norm": 0.2709223470740636, "learning_rate": 1.588513485211679e-07, "loss": 0.0164, "step": 4113 }, { "epoch": 2.783491204330176, "grad_norm": 0.32928346289954025, "learning_rate": 1.5786839547963008e-07, "loss": 0.0189, "step": 4114 }, { "epoch": 2.784167794316644, "grad_norm": 0.30672617000876923, "learning_rate": 1.5688844429869232e-07, "loss": 0.0132, "step": 4115 }, { "epoch": 2.7848443843031125, "grad_norm": 0.25503069071354534, "learning_rate": 1.5591149558587037e-07, "loss": 0.0154, "step": 4116 }, { "epoch": 2.7855209742895806, "grad_norm": 0.31228690440274604, "learning_rate": 1.5493754994681977e-07, "loss": 0.0154, "step": 4117 }, { "epoch": 2.7861975642760486, "grad_norm": 0.35886890995941473, "learning_rate": 1.539666079853358e-07, "loss": 0.0189, "step": 4118 }, { "epoch": 2.786874154262517, "grad_norm": 0.29012322934131934, "learning_rate": 1.5299867030334815e-07, "loss": 0.0138, "step": 4119 }, { "epoch": 2.787550744248985, "grad_norm": 0.29350351400257524, "learning_rate": 1.5203373750092676e-07, "loss": 0.018, "step": 4120 }, { "epoch": 2.7882273342354535, "grad_norm": 0.3287251860029488, "learning_rate": 1.5107181017627813e-07, "loss": 0.0217, "step": 4121 }, { "epoch": 2.7889039242219216, "grad_norm": 0.3509188190630282, "learning_rate": 1.5011288892574526e-07, "loss": 0.017, "step": 4122 }, { "epoch": 2.7895805142083896, "grad_norm": 0.3343502518240095, "learning_rate": 1.4915697434380816e-07, "loss": 0.0125, "step": 4123 }, { "epoch": 2.790257104194858, "grad_norm": 0.25046203213359464, "learning_rate": 1.4820406702308165e-07, "loss": 0.0138, "step": 4124 }, { "epoch": 2.790933694181326, "grad_norm": 0.33263767132269406, "learning_rate": 1.4725416755431655e-07, "loss": 0.0247, "step": 4125 }, { "epoch": 2.7916102841677946, "grad_norm": 0.3691855820101089, "learning_rate": 1.463072765264001e-07, "loss": 0.0264, "step": 4126 }, { "epoch": 2.7922868741542626, "grad_norm": 0.2736780190443737, "learning_rate": 1.4536339452635385e-07, "loss": 0.0154, "step": 4127 }, { "epoch": 2.7929634641407306, "grad_norm": 0.3061179819121896, "learning_rate": 1.444225221393325e-07, "loss": 0.0196, "step": 4128 }, { "epoch": 2.793640054127199, "grad_norm": 0.36039938771301444, "learning_rate": 1.4348465994862782e-07, "loss": 0.0207, "step": 4129 }, { "epoch": 2.794316644113667, "grad_norm": 0.4566606225699603, "learning_rate": 1.4254980853566248e-07, "loss": 0.0261, "step": 4130 }, { "epoch": 2.7949932341001356, "grad_norm": 0.34136881554932963, "learning_rate": 1.4161796847999566e-07, "loss": 0.0238, "step": 4131 }, { "epoch": 2.7956698240866036, "grad_norm": 0.3282136685011032, "learning_rate": 1.4068914035931635e-07, "loss": 0.0219, "step": 4132 }, { "epoch": 2.7963464140730716, "grad_norm": 0.296902671355774, "learning_rate": 1.3976332474944842e-07, "loss": 0.0169, "step": 4133 }, { "epoch": 2.79702300405954, "grad_norm": 0.3690960315227845, "learning_rate": 1.388405222243472e-07, "loss": 0.0298, "step": 4134 }, { "epoch": 2.797699594046008, "grad_norm": 0.3073177677940477, "learning_rate": 1.3792073335610111e-07, "loss": 0.0179, "step": 4135 }, { "epoch": 2.7983761840324766, "grad_norm": 0.3274661142774267, "learning_rate": 1.3700395871493023e-07, "loss": 0.0211, "step": 4136 }, { "epoch": 2.7990527740189446, "grad_norm": 0.33622335014898314, "learning_rate": 1.360901988691843e-07, "loss": 0.021, "step": 4137 }, { "epoch": 2.7997293640054126, "grad_norm": 0.33537010188096567, "learning_rate": 1.3517945438534629e-07, "loss": 0.0173, "step": 4138 }, { "epoch": 2.800405953991881, "grad_norm": 0.32687748121447535, "learning_rate": 1.342717258280274e-07, "loss": 0.0186, "step": 4139 }, { "epoch": 2.801082543978349, "grad_norm": 0.3690251150778055, "learning_rate": 1.333670137599713e-07, "loss": 0.0176, "step": 4140 }, { "epoch": 2.8017591339648176, "grad_norm": 0.34519815737695225, "learning_rate": 1.3246531874204994e-07, "loss": 0.0257, "step": 4141 }, { "epoch": 2.8024357239512856, "grad_norm": 0.24481152014945498, "learning_rate": 1.3156664133326614e-07, "loss": 0.0126, "step": 4142 }, { "epoch": 2.8031123139377536, "grad_norm": 0.3739915472938706, "learning_rate": 1.3067098209075202e-07, "loss": 0.0175, "step": 4143 }, { "epoch": 2.803788903924222, "grad_norm": 0.347442694922643, "learning_rate": 1.2977834156976733e-07, "loss": 0.0212, "step": 4144 }, { "epoch": 2.80446549391069, "grad_norm": 0.2839483018207203, "learning_rate": 1.2888872032370103e-07, "loss": 0.0114, "step": 4145 }, { "epoch": 2.8051420838971586, "grad_norm": 0.32521226637344836, "learning_rate": 1.280021189040709e-07, "loss": 0.0181, "step": 4146 }, { "epoch": 2.8058186738836266, "grad_norm": 0.2842628100028989, "learning_rate": 1.2711853786052108e-07, "loss": 0.0131, "step": 4147 }, { "epoch": 2.8064952638700946, "grad_norm": 0.5208399257736981, "learning_rate": 1.2623797774082514e-07, "loss": 0.019, "step": 4148 }, { "epoch": 2.8071718538565626, "grad_norm": 0.3550062429672941, "learning_rate": 1.253604390908819e-07, "loss": 0.0207, "step": 4149 }, { "epoch": 2.807848443843031, "grad_norm": 0.3540559434553578, "learning_rate": 1.2448592245471903e-07, "loss": 0.0212, "step": 4150 }, { "epoch": 2.8085250338294996, "grad_norm": 0.3446182176537769, "learning_rate": 1.2361442837449e-07, "loss": 0.0199, "step": 4151 }, { "epoch": 2.8092016238159676, "grad_norm": 0.3930861168869538, "learning_rate": 1.2274595739047267e-07, "loss": 0.02, "step": 4152 }, { "epoch": 2.8098782138024356, "grad_norm": 0.331933688100313, "learning_rate": 1.2188051004107305e-07, "loss": 0.0169, "step": 4153 }, { "epoch": 2.8105548037889037, "grad_norm": 0.2710769268242476, "learning_rate": 1.210180868628219e-07, "loss": 0.0146, "step": 4154 }, { "epoch": 2.811231393775372, "grad_norm": 0.291195714945549, "learning_rate": 1.2015868839037492e-07, "loss": 0.0162, "step": 4155 }, { "epoch": 2.8119079837618406, "grad_norm": 0.362413962982955, "learning_rate": 1.1930231515651313e-07, "loss": 0.0288, "step": 4156 }, { "epoch": 2.8125845737483086, "grad_norm": 0.2807957681518928, "learning_rate": 1.1844896769214187e-07, "loss": 0.0128, "step": 4157 }, { "epoch": 2.8132611637347766, "grad_norm": 0.3585190271177819, "learning_rate": 1.1759864652629072e-07, "loss": 0.0175, "step": 4158 }, { "epoch": 2.8139377537212447, "grad_norm": 0.23158674845334967, "learning_rate": 1.1675135218611188e-07, "loss": 0.0126, "step": 4159 }, { "epoch": 2.814614343707713, "grad_norm": 0.3380125058541794, "learning_rate": 1.1590708519688243e-07, "loss": 0.0185, "step": 4160 }, { "epoch": 2.8152909336941816, "grad_norm": 0.35484980017854956, "learning_rate": 1.1506584608200366e-07, "loss": 0.0171, "step": 4161 }, { "epoch": 2.8159675236806496, "grad_norm": 0.3041795775691457, "learning_rate": 1.142276353629973e-07, "loss": 0.0167, "step": 4162 }, { "epoch": 2.8166441136671176, "grad_norm": 0.2883595137984694, "learning_rate": 1.1339245355950934e-07, "loss": 0.0152, "step": 4163 }, { "epoch": 2.8173207036535857, "grad_norm": 0.3345296906089819, "learning_rate": 1.1256030118930727e-07, "loss": 0.0194, "step": 4164 }, { "epoch": 2.817997293640054, "grad_norm": 0.35993936957584743, "learning_rate": 1.1173117876828066e-07, "loss": 0.0187, "step": 4165 }, { "epoch": 2.8186738836265226, "grad_norm": 0.3673734361345342, "learning_rate": 1.1090508681044055e-07, "loss": 0.0238, "step": 4166 }, { "epoch": 2.8193504736129906, "grad_norm": 0.40161920530572387, "learning_rate": 1.1008202582792005e-07, "loss": 0.0311, "step": 4167 }, { "epoch": 2.8200270635994586, "grad_norm": 0.3802887760171123, "learning_rate": 1.0926199633097156e-07, "loss": 0.0201, "step": 4168 }, { "epoch": 2.8207036535859267, "grad_norm": 0.4511150041871571, "learning_rate": 1.0844499882797011e-07, "loss": 0.0235, "step": 4169 }, { "epoch": 2.821380243572395, "grad_norm": 0.3747750908844225, "learning_rate": 1.0763103382541052e-07, "loss": 0.0179, "step": 4170 }, { "epoch": 2.822056833558863, "grad_norm": 0.35174672555112185, "learning_rate": 1.0682010182790637e-07, "loss": 0.0196, "step": 4171 }, { "epoch": 2.8227334235453316, "grad_norm": 0.27613717626728473, "learning_rate": 1.0601220333819162e-07, "loss": 0.0153, "step": 4172 }, { "epoch": 2.8234100135317997, "grad_norm": 0.36129890907296824, "learning_rate": 1.0520733885712008e-07, "loss": 0.0148, "step": 4173 }, { "epoch": 2.8240866035182677, "grad_norm": 0.3063260499050128, "learning_rate": 1.0440550888366485e-07, "loss": 0.0202, "step": 4174 }, { "epoch": 2.824763193504736, "grad_norm": 0.26862879142559287, "learning_rate": 1.0360671391491606e-07, "loss": 0.0155, "step": 4175 }, { "epoch": 2.825439783491204, "grad_norm": 0.3108379329412415, "learning_rate": 1.0281095444608425e-07, "loss": 0.0141, "step": 4176 }, { "epoch": 2.8261163734776726, "grad_norm": 0.31685624074629615, "learning_rate": 1.0201823097049812e-07, "loss": 0.0209, "step": 4177 }, { "epoch": 2.8267929634641407, "grad_norm": 0.2674887247006238, "learning_rate": 1.0122854397960292e-07, "loss": 0.0153, "step": 4178 }, { "epoch": 2.8274695534506087, "grad_norm": 0.36815715043709984, "learning_rate": 1.0044189396296144e-07, "loss": 0.0195, "step": 4179 }, { "epoch": 2.828146143437077, "grad_norm": 0.45243847055235403, "learning_rate": 9.965828140825529e-08, "loss": 0.0322, "step": 4180 }, { "epoch": 2.828822733423545, "grad_norm": 0.32561370294205155, "learning_rate": 9.887770680128083e-08, "loss": 0.0201, "step": 4181 }, { "epoch": 2.8294993234100136, "grad_norm": 0.378921279145927, "learning_rate": 9.810017062595322e-08, "loss": 0.0185, "step": 4182 }, { "epoch": 2.8301759133964817, "grad_norm": 0.38997410309334873, "learning_rate": 9.732567336430298e-08, "loss": 0.0264, "step": 4183 }, { "epoch": 2.8308525033829497, "grad_norm": 0.33019969569208485, "learning_rate": 9.655421549647603e-08, "loss": 0.0171, "step": 4184 }, { "epoch": 2.831529093369418, "grad_norm": 0.32296767331333515, "learning_rate": 9.57857975007348e-08, "loss": 0.018, "step": 4185 }, { "epoch": 2.832205683355886, "grad_norm": 0.24066758014289052, "learning_rate": 9.502041985345766e-08, "loss": 0.0141, "step": 4186 }, { "epoch": 2.8328822733423547, "grad_norm": 0.4270692892252477, "learning_rate": 9.42580830291373e-08, "loss": 0.0303, "step": 4187 }, { "epoch": 2.8335588633288227, "grad_norm": 0.46981352624557154, "learning_rate": 9.349878750038067e-08, "loss": 0.0166, "step": 4188 }, { "epoch": 2.8342354533152907, "grad_norm": 0.32867190206042646, "learning_rate": 9.274253373791064e-08, "loss": 0.0172, "step": 4189 }, { "epoch": 2.834912043301759, "grad_norm": 0.26081705893631607, "learning_rate": 9.198932221056333e-08, "loss": 0.0183, "step": 4190 }, { "epoch": 2.835588633288227, "grad_norm": 0.3046085644654613, "learning_rate": 9.123915338529132e-08, "loss": 0.0188, "step": 4191 }, { "epoch": 2.8362652232746957, "grad_norm": 0.45431772838139406, "learning_rate": 9.049202772715593e-08, "loss": 0.0327, "step": 4192 }, { "epoch": 2.8369418132611637, "grad_norm": 0.2873985175221937, "learning_rate": 8.974794569933609e-08, "loss": 0.0159, "step": 4193 }, { "epoch": 2.8376184032476317, "grad_norm": 0.31784620721086676, "learning_rate": 8.900690776312282e-08, "loss": 0.0197, "step": 4194 }, { "epoch": 2.8382949932341, "grad_norm": 0.30041381746593987, "learning_rate": 8.826891437791974e-08, "loss": 0.0169, "step": 4195 }, { "epoch": 2.838971583220568, "grad_norm": 0.39558605785565415, "learning_rate": 8.753396600124254e-08, "loss": 0.0289, "step": 4196 }, { "epoch": 2.8396481732070367, "grad_norm": 0.37784200747162283, "learning_rate": 8.680206308871953e-08, "loss": 0.0231, "step": 4197 }, { "epoch": 2.8403247631935047, "grad_norm": 0.3790069222464381, "learning_rate": 8.607320609409165e-08, "loss": 0.0219, "step": 4198 }, { "epoch": 2.8410013531799727, "grad_norm": 0.3142251266132667, "learning_rate": 8.534739546921023e-08, "loss": 0.0165, "step": 4199 }, { "epoch": 2.841677943166441, "grad_norm": 0.35237740062774736, "learning_rate": 8.462463166403978e-08, "loss": 0.0241, "step": 4200 }, { "epoch": 2.842354533152909, "grad_norm": 0.2665480806466482, "learning_rate": 8.390491512665355e-08, "loss": 0.0169, "step": 4201 }, { "epoch": 2.8430311231393777, "grad_norm": 0.3091381157886086, "learning_rate": 8.318824630323741e-08, "loss": 0.0182, "step": 4202 }, { "epoch": 2.8437077131258457, "grad_norm": 0.3159771346997211, "learning_rate": 8.247462563808816e-08, "loss": 0.0147, "step": 4203 }, { "epoch": 2.8443843031123137, "grad_norm": 0.25437533995979045, "learning_rate": 8.176405357361194e-08, "loss": 0.0145, "step": 4204 }, { "epoch": 2.845060893098782, "grad_norm": 0.3153744422734466, "learning_rate": 8.105653055032415e-08, "loss": 0.0158, "step": 4205 }, { "epoch": 2.84573748308525, "grad_norm": 0.4315609203813294, "learning_rate": 8.035205700685167e-08, "loss": 0.0321, "step": 4206 }, { "epoch": 2.8464140730717187, "grad_norm": 0.29239149692314864, "learning_rate": 7.965063337993018e-08, "loss": 0.0147, "step": 4207 }, { "epoch": 2.8470906630581867, "grad_norm": 0.32624058008096235, "learning_rate": 7.89522601044046e-08, "loss": 0.0229, "step": 4208 }, { "epoch": 2.8477672530446547, "grad_norm": 0.30141679336917077, "learning_rate": 7.825693761322861e-08, "loss": 0.0154, "step": 4209 }, { "epoch": 2.848443843031123, "grad_norm": 0.33653125023671926, "learning_rate": 7.756466633746407e-08, "loss": 0.0184, "step": 4210 }, { "epoch": 2.849120433017591, "grad_norm": 0.3675320849851784, "learning_rate": 7.687544670628267e-08, "loss": 0.0181, "step": 4211 }, { "epoch": 2.8497970230040597, "grad_norm": 0.30627731914656653, "learning_rate": 7.618927914696372e-08, "loss": 0.0151, "step": 4212 }, { "epoch": 2.8504736129905277, "grad_norm": 0.5556851057427106, "learning_rate": 7.550616408489253e-08, "loss": 0.0264, "step": 4213 }, { "epoch": 2.8511502029769957, "grad_norm": 0.25391006207421374, "learning_rate": 7.482610194356477e-08, "loss": 0.0126, "step": 4214 }, { "epoch": 2.851826792963464, "grad_norm": 0.3823095849765143, "learning_rate": 7.414909314458263e-08, "loss": 0.0241, "step": 4215 }, { "epoch": 2.852503382949932, "grad_norm": 0.3234817129922742, "learning_rate": 7.347513810765427e-08, "loss": 0.0222, "step": 4216 }, { "epoch": 2.8531799729364007, "grad_norm": 0.3765176485465511, "learning_rate": 7.280423725059604e-08, "loss": 0.0166, "step": 4217 }, { "epoch": 2.8538565629228687, "grad_norm": 0.30554992343241966, "learning_rate": 7.213639098933022e-08, "loss": 0.0158, "step": 4218 }, { "epoch": 2.8545331529093367, "grad_norm": 0.46525063222991864, "learning_rate": 7.147159973788508e-08, "loss": 0.0394, "step": 4219 }, { "epoch": 2.855209742895805, "grad_norm": 0.30322183391296076, "learning_rate": 7.080986390839539e-08, "loss": 0.0164, "step": 4220 }, { "epoch": 2.8558863328822732, "grad_norm": 0.37710971981176267, "learning_rate": 7.015118391110299e-08, "loss": 0.0235, "step": 4221 }, { "epoch": 2.8565629228687417, "grad_norm": 0.2656839806725427, "learning_rate": 6.949556015435178e-08, "loss": 0.0162, "step": 4222 }, { "epoch": 2.8572395128552097, "grad_norm": 0.36277813108526075, "learning_rate": 6.884299304459497e-08, "loss": 0.017, "step": 4223 }, { "epoch": 2.8579161028416777, "grad_norm": 0.39673098082738284, "learning_rate": 6.819348298638839e-08, "loss": 0.0302, "step": 4224 }, { "epoch": 2.858592692828146, "grad_norm": 0.31371828741325075, "learning_rate": 6.75470303823933e-08, "loss": 0.0188, "step": 4225 }, { "epoch": 2.8592692828146142, "grad_norm": 0.26473994958768376, "learning_rate": 6.690363563337466e-08, "loss": 0.018, "step": 4226 }, { "epoch": 2.8599458728010827, "grad_norm": 0.2767737819584196, "learning_rate": 6.626329913820339e-08, "loss": 0.0152, "step": 4227 }, { "epoch": 2.8606224627875507, "grad_norm": 0.273429670336525, "learning_rate": 6.562602129385365e-08, "loss": 0.0171, "step": 4228 }, { "epoch": 2.8612990527740187, "grad_norm": 0.27272266057380484, "learning_rate": 6.499180249540382e-08, "loss": 0.0142, "step": 4229 }, { "epoch": 2.861975642760487, "grad_norm": 0.32846334676597794, "learning_rate": 6.436064313603385e-08, "loss": 0.0159, "step": 4230 }, { "epoch": 2.8626522327469552, "grad_norm": 0.3317592781453487, "learning_rate": 6.373254360703019e-08, "loss": 0.0175, "step": 4231 }, { "epoch": 2.8633288227334237, "grad_norm": 0.3523062154902482, "learning_rate": 6.310750429777912e-08, "loss": 0.0279, "step": 4232 }, { "epoch": 2.8640054127198917, "grad_norm": 0.33862087809046665, "learning_rate": 6.248552559577292e-08, "loss": 0.0155, "step": 4233 }, { "epoch": 2.8646820027063598, "grad_norm": 0.30863870798026977, "learning_rate": 6.186660788660315e-08, "loss": 0.024, "step": 4234 }, { "epoch": 2.865358592692828, "grad_norm": 0.43530468617613666, "learning_rate": 6.125075155396675e-08, "loss": 0.0249, "step": 4235 }, { "epoch": 2.8660351826792962, "grad_norm": 0.33410857146880635, "learning_rate": 6.063795697966057e-08, "loss": 0.0182, "step": 4236 }, { "epoch": 2.8667117726657647, "grad_norm": 0.2581141263823941, "learning_rate": 6.00282245435857e-08, "loss": 0.013, "step": 4237 }, { "epoch": 2.8673883626522327, "grad_norm": 0.26336019085508233, "learning_rate": 5.9421554623742e-08, "loss": 0.0199, "step": 4238 }, { "epoch": 2.8680649526387008, "grad_norm": 0.3068932902803712, "learning_rate": 5.881794759623194e-08, "loss": 0.019, "step": 4239 }, { "epoch": 2.8687415426251692, "grad_norm": 0.30334602603987404, "learning_rate": 5.8217403835260086e-08, "loss": 0.017, "step": 4240 }, { "epoch": 2.8694181326116373, "grad_norm": 0.36373523194783963, "learning_rate": 5.7619923713130857e-08, "loss": 0.0218, "step": 4241 }, { "epoch": 2.8700947225981057, "grad_norm": 0.28776157462866214, "learning_rate": 5.7025507600250165e-08, "loss": 0.0168, "step": 4242 }, { "epoch": 2.8707713125845737, "grad_norm": 0.3741713230544328, "learning_rate": 5.643415586512324e-08, "loss": 0.0183, "step": 4243 }, { "epoch": 2.8714479025710418, "grad_norm": 0.35753259645318475, "learning_rate": 5.584586887435739e-08, "loss": 0.0165, "step": 4244 }, { "epoch": 2.8721244925575102, "grad_norm": 0.4091639225039315, "learning_rate": 5.526064699265754e-08, "loss": 0.0221, "step": 4245 }, { "epoch": 2.8728010825439783, "grad_norm": 0.4310266598775968, "learning_rate": 5.4678490582830704e-08, "loss": 0.0161, "step": 4246 }, { "epoch": 2.8734776725304467, "grad_norm": 0.4589657330215928, "learning_rate": 5.409940000578207e-08, "loss": 0.025, "step": 4247 }, { "epoch": 2.8741542625169147, "grad_norm": 0.39574053407186405, "learning_rate": 5.352337562051613e-08, "loss": 0.0205, "step": 4248 }, { "epoch": 2.8748308525033828, "grad_norm": 0.31761749193169986, "learning_rate": 5.2950417784137785e-08, "loss": 0.0135, "step": 4249 }, { "epoch": 2.8755074424898512, "grad_norm": 0.3288942339427781, "learning_rate": 5.2380526851850135e-08, "loss": 0.0119, "step": 4250 }, { "epoch": 2.8761840324763193, "grad_norm": 0.2709022082033616, "learning_rate": 5.181370317695389e-08, "loss": 0.0117, "step": 4251 }, { "epoch": 2.8768606224627877, "grad_norm": 0.33017093317214713, "learning_rate": 5.124994711084963e-08, "loss": 0.0164, "step": 4252 }, { "epoch": 2.8775372124492558, "grad_norm": 0.41359223837025066, "learning_rate": 5.0689259003035566e-08, "loss": 0.0244, "step": 4253 }, { "epoch": 2.878213802435724, "grad_norm": 0.33650748628730026, "learning_rate": 5.013163920110864e-08, "loss": 0.0211, "step": 4254 }, { "epoch": 2.8788903924221922, "grad_norm": 0.34398603125699145, "learning_rate": 4.9577088050762337e-08, "loss": 0.0207, "step": 4255 }, { "epoch": 2.8795669824086603, "grad_norm": 0.33198254647461567, "learning_rate": 4.9025605895788867e-08, "loss": 0.0218, "step": 4256 }, { "epoch": 2.8802435723951287, "grad_norm": 0.35877004905417115, "learning_rate": 4.847719307807752e-08, "loss": 0.0189, "step": 4257 }, { "epoch": 2.8809201623815968, "grad_norm": 0.3116332685026539, "learning_rate": 4.793184993761468e-08, "loss": 0.0156, "step": 4258 }, { "epoch": 2.881596752368065, "grad_norm": 0.3082410344380756, "learning_rate": 4.73895768124838e-08, "loss": 0.0148, "step": 4259 }, { "epoch": 2.8822733423545333, "grad_norm": 0.3041206173305893, "learning_rate": 4.685037403886483e-08, "loss": 0.02, "step": 4260 }, { "epoch": 2.8829499323410013, "grad_norm": 0.2693770911865805, "learning_rate": 4.631424195103373e-08, "loss": 0.0117, "step": 4261 }, { "epoch": 2.8836265223274697, "grad_norm": 0.3201209591829607, "learning_rate": 4.578118088136463e-08, "loss": 0.0166, "step": 4262 }, { "epoch": 2.8843031123139378, "grad_norm": 0.36151687197068305, "learning_rate": 4.52511911603265e-08, "loss": 0.0237, "step": 4263 }, { "epoch": 2.884979702300406, "grad_norm": 0.30576819613758666, "learning_rate": 4.4724273116483754e-08, "loss": 0.0175, "step": 4264 }, { "epoch": 2.8856562922868743, "grad_norm": 0.4126017261557132, "learning_rate": 4.42004270764973e-08, "loss": 0.0232, "step": 4265 }, { "epoch": 2.8863328822733423, "grad_norm": 0.28007083278233813, "learning_rate": 4.367965336512403e-08, "loss": 0.0172, "step": 4266 }, { "epoch": 2.8870094722598107, "grad_norm": 0.29123182114067986, "learning_rate": 4.316195230521514e-08, "loss": 0.0172, "step": 4267 }, { "epoch": 2.8876860622462788, "grad_norm": 0.41340741961638583, "learning_rate": 4.264732421771722e-08, "loss": 0.0241, "step": 4268 }, { "epoch": 2.888362652232747, "grad_norm": 0.21205094059444654, "learning_rate": 4.21357694216723e-08, "loss": 0.0104, "step": 4269 }, { "epoch": 2.8890392422192153, "grad_norm": 0.32319959609921306, "learning_rate": 4.162728823421669e-08, "loss": 0.015, "step": 4270 }, { "epoch": 2.8897158322056833, "grad_norm": 0.3290901086365611, "learning_rate": 4.112188097058156e-08, "loss": 0.0235, "step": 4271 }, { "epoch": 2.8903924221921518, "grad_norm": 0.25413269095569097, "learning_rate": 4.061954794409184e-08, "loss": 0.0127, "step": 4272 }, { "epoch": 2.89106901217862, "grad_norm": 0.30256805738488934, "learning_rate": 4.0120289466166754e-08, "loss": 0.0164, "step": 4273 }, { "epoch": 2.891745602165088, "grad_norm": 0.38270235641862727, "learning_rate": 3.9624105846319813e-08, "loss": 0.0171, "step": 4274 }, { "epoch": 2.8924221921515563, "grad_norm": 0.3169592219433915, "learning_rate": 3.9130997392157756e-08, "loss": 0.0193, "step": 4275 }, { "epoch": 2.8930987821380243, "grad_norm": 0.37530967308090235, "learning_rate": 3.86409644093827e-08, "loss": 0.0227, "step": 4276 }, { "epoch": 2.8937753721244928, "grad_norm": 0.3727951648012234, "learning_rate": 3.8154007201787194e-08, "loss": 0.0213, "step": 4277 }, { "epoch": 2.894451962110961, "grad_norm": 0.2400579021593019, "learning_rate": 3.7670126071259194e-08, "loss": 0.0119, "step": 4278 }, { "epoch": 2.895128552097429, "grad_norm": 0.491117410386177, "learning_rate": 3.718932131777819e-08, "loss": 0.0221, "step": 4279 }, { "epoch": 2.8958051420838973, "grad_norm": 0.3236535062948911, "learning_rate": 3.6711593239417976e-08, "loss": 0.0154, "step": 4280 }, { "epoch": 2.8964817320703653, "grad_norm": 0.26659002110625496, "learning_rate": 3.62369421323433e-08, "loss": 0.0184, "step": 4281 }, { "epoch": 2.8971583220568338, "grad_norm": 0.3384157344158936, "learning_rate": 3.576536829081323e-08, "loss": 0.0226, "step": 4282 }, { "epoch": 2.897834912043302, "grad_norm": 0.4754787105293215, "learning_rate": 3.52968720071778e-08, "loss": 0.0222, "step": 4283 }, { "epoch": 2.89851150202977, "grad_norm": 0.34755706930405583, "learning_rate": 3.483145357187967e-08, "loss": 0.0179, "step": 4284 }, { "epoch": 2.8991880920162383, "grad_norm": 0.325935012514578, "learning_rate": 3.436911327345305e-08, "loss": 0.0144, "step": 4285 }, { "epoch": 2.8998646820027063, "grad_norm": 0.3327741788248334, "learning_rate": 3.3909851398523654e-08, "loss": 0.0132, "step": 4286 }, { "epoch": 2.9005412719891748, "grad_norm": 0.2861945720917734, "learning_rate": 3.345366823180929e-08, "loss": 0.0157, "step": 4287 }, { "epoch": 2.901217861975643, "grad_norm": 0.3665978352277033, "learning_rate": 3.300056405611873e-08, "loss": 0.0154, "step": 4288 }, { "epoch": 2.901894451962111, "grad_norm": 0.3339163588510717, "learning_rate": 3.2550539152352845e-08, "loss": 0.0156, "step": 4289 }, { "epoch": 2.9025710419485793, "grad_norm": 0.4395250423334691, "learning_rate": 3.2103593799501786e-08, "loss": 0.0272, "step": 4290 }, { "epoch": 2.9032476319350473, "grad_norm": 0.2962692895914033, "learning_rate": 3.165972827464892e-08, "loss": 0.0121, "step": 4291 }, { "epoch": 2.903924221921516, "grad_norm": 0.30857554868165343, "learning_rate": 3.1218942852965226e-08, "loss": 0.0186, "step": 4292 }, { "epoch": 2.904600811907984, "grad_norm": 0.3379736851327122, "learning_rate": 3.078123780771602e-08, "loss": 0.0178, "step": 4293 }, { "epoch": 2.905277401894452, "grad_norm": 0.3347150338757051, "learning_rate": 3.034661341025258e-08, "loss": 0.0127, "step": 4294 }, { "epoch": 2.9059539918809203, "grad_norm": 0.47547131888115907, "learning_rate": 2.9915069930019914e-08, "loss": 0.0239, "step": 4295 }, { "epoch": 2.9066305818673883, "grad_norm": 0.34846448209053205, "learning_rate": 2.94866076345518e-08, "loss": 0.0203, "step": 4296 }, { "epoch": 2.907307171853857, "grad_norm": 0.3472791435166466, "learning_rate": 2.9061226789471874e-08, "loss": 0.0217, "step": 4297 }, { "epoch": 2.907983761840325, "grad_norm": 0.2908861983310066, "learning_rate": 2.863892765849252e-08, "loss": 0.0167, "step": 4298 }, { "epoch": 2.908660351826793, "grad_norm": 0.27631325099648174, "learning_rate": 2.8219710503416543e-08, "loss": 0.015, "step": 4299 }, { "epoch": 2.9093369418132613, "grad_norm": 0.4381992878386518, "learning_rate": 2.78035755841366e-08, "loss": 0.0182, "step": 4300 }, { "epoch": 2.9100135317997293, "grad_norm": 0.3772522330202533, "learning_rate": 2.7390523158633552e-08, "loss": 0.03, "step": 4301 }, { "epoch": 2.910690121786198, "grad_norm": 0.39301401342529896, "learning_rate": 2.6980553482977566e-08, "loss": 0.0215, "step": 4302 }, { "epoch": 2.911366711772666, "grad_norm": 0.34538212284144815, "learning_rate": 2.657366681132756e-08, "loss": 0.0252, "step": 4303 }, { "epoch": 2.912043301759134, "grad_norm": 0.3033583250639276, "learning_rate": 2.6169863395932304e-08, "loss": 0.0151, "step": 4304 }, { "epoch": 2.9127198917456023, "grad_norm": 0.4348684049174322, "learning_rate": 2.5769143487127113e-08, "loss": 0.0244, "step": 4305 }, { "epoch": 2.9133964817320703, "grad_norm": 0.357439540770848, "learning_rate": 2.5371507333337153e-08, "loss": 0.0199, "step": 4306 }, { "epoch": 2.914073071718539, "grad_norm": 0.2885648400552308, "learning_rate": 2.497695518107579e-08, "loss": 0.0169, "step": 4307 }, { "epoch": 2.914749661705007, "grad_norm": 0.25282593387606855, "learning_rate": 2.4585487274942922e-08, "loss": 0.0128, "step": 4308 }, { "epoch": 2.915426251691475, "grad_norm": 0.2277673314759974, "learning_rate": 2.4197103857628858e-08, "loss": 0.0124, "step": 4309 }, { "epoch": 2.9161028416779433, "grad_norm": 0.30799563707363364, "learning_rate": 2.381180516990933e-08, "loss": 0.0189, "step": 4310 }, { "epoch": 2.9167794316644113, "grad_norm": 0.402521176810984, "learning_rate": 2.3429591450649934e-08, "loss": 0.0165, "step": 4311 }, { "epoch": 2.91745602165088, "grad_norm": 0.30582298260297197, "learning_rate": 2.305046293680113e-08, "loss": 0.0158, "step": 4312 }, { "epoch": 2.918132611637348, "grad_norm": 0.3068926391837632, "learning_rate": 2.267441986340324e-08, "loss": 0.0143, "step": 4313 }, { "epoch": 2.918809201623816, "grad_norm": 0.30115489166458353, "learning_rate": 2.230146246358256e-08, "loss": 0.0157, "step": 4314 }, { "epoch": 2.9194857916102843, "grad_norm": 0.42883998913216026, "learning_rate": 2.193159096855191e-08, "loss": 0.0289, "step": 4315 }, { "epoch": 2.9201623815967523, "grad_norm": 0.5385956885172357, "learning_rate": 2.1564805607612317e-08, "loss": 0.0269, "step": 4316 }, { "epoch": 2.920838971583221, "grad_norm": 0.26155103827421755, "learning_rate": 2.120110660815078e-08, "loss": 0.0151, "step": 4317 }, { "epoch": 2.921515561569689, "grad_norm": 0.39640151390909856, "learning_rate": 2.0840494195641382e-08, "loss": 0.0193, "step": 4318 }, { "epoch": 2.922192151556157, "grad_norm": 0.27912308164286453, "learning_rate": 2.0482968593643625e-08, "loss": 0.013, "step": 4319 }, { "epoch": 2.9228687415426253, "grad_norm": 0.3229440425464994, "learning_rate": 2.012853002380466e-08, "loss": 0.0141, "step": 4320 }, { "epoch": 2.9235453315290933, "grad_norm": 0.28732500317070214, "learning_rate": 1.97771787058576e-08, "loss": 0.02, "step": 4321 }, { "epoch": 2.924221921515562, "grad_norm": 0.3558103778800274, "learning_rate": 1.942891485762044e-08, "loss": 0.0276, "step": 4322 }, { "epoch": 2.92489851150203, "grad_norm": 0.40707033045057484, "learning_rate": 1.9083738694998798e-08, "loss": 0.0212, "step": 4323 }, { "epoch": 2.925575101488498, "grad_norm": 0.2882245457459522, "learning_rate": 1.8741650431982615e-08, "loss": 0.0135, "step": 4324 }, { "epoch": 2.9262516914749663, "grad_norm": 0.3716334418961407, "learning_rate": 1.8402650280648916e-08, "loss": 0.0215, "step": 4325 }, { "epoch": 2.9269282814614344, "grad_norm": 0.3709921645244406, "learning_rate": 1.8066738451159028e-08, "loss": 0.0262, "step": 4326 }, { "epoch": 2.927604871447903, "grad_norm": 0.3314299162218466, "learning_rate": 1.773391515176026e-08, "loss": 0.0166, "step": 4327 }, { "epoch": 2.928281461434371, "grad_norm": 0.3393260454830498, "learning_rate": 1.740418058878479e-08, "loss": 0.0197, "step": 4328 }, { "epoch": 2.928958051420839, "grad_norm": 0.3084975827429294, "learning_rate": 1.7077534966650767e-08, "loss": 0.0202, "step": 4329 }, { "epoch": 2.9296346414073073, "grad_norm": 0.3019682415559219, "learning_rate": 1.6753978487860645e-08, "loss": 0.0183, "step": 4330 }, { "epoch": 2.9303112313937754, "grad_norm": 0.3057852451626844, "learning_rate": 1.6433511353002863e-08, "loss": 0.0149, "step": 4331 }, { "epoch": 2.930987821380244, "grad_norm": 0.35440586101875543, "learning_rate": 1.6116133760747944e-08, "loss": 0.02, "step": 4332 }, { "epoch": 2.931664411366712, "grad_norm": 0.396189232384318, "learning_rate": 1.5801845907854606e-08, "loss": 0.021, "step": 4333 }, { "epoch": 2.93234100135318, "grad_norm": 0.383055252792876, "learning_rate": 1.549064798916311e-08, "loss": 0.0334, "step": 4334 }, { "epoch": 2.933017591339648, "grad_norm": 0.5205310542121773, "learning_rate": 1.5182540197600237e-08, "loss": 0.0205, "step": 4335 }, { "epoch": 2.9336941813261164, "grad_norm": 0.2664001445596687, "learning_rate": 1.4877522724175974e-08, "loss": 0.013, "step": 4336 }, { "epoch": 2.934370771312585, "grad_norm": 0.3788167280191553, "learning_rate": 1.4575595757985172e-08, "loss": 0.0224, "step": 4337 }, { "epoch": 2.935047361299053, "grad_norm": 0.3201633152950062, "learning_rate": 1.4276759486205328e-08, "loss": 0.0199, "step": 4338 }, { "epoch": 2.935723951285521, "grad_norm": 0.5406506993061142, "learning_rate": 1.3981014094099354e-08, "loss": 0.017, "step": 4339 }, { "epoch": 2.936400541271989, "grad_norm": 0.2973005758907856, "learning_rate": 1.368835976501337e-08, "loss": 0.0176, "step": 4340 }, { "epoch": 2.9370771312584574, "grad_norm": 0.27892407613947634, "learning_rate": 1.3398796680377245e-08, "loss": 0.0142, "step": 4341 }, { "epoch": 2.937753721244926, "grad_norm": 0.38503686951243543, "learning_rate": 1.3112325019704608e-08, "loss": 0.021, "step": 4342 }, { "epoch": 2.938430311231394, "grad_norm": 0.3339655848594916, "learning_rate": 1.2828944960592837e-08, "loss": 0.0176, "step": 4343 }, { "epoch": 2.939106901217862, "grad_norm": 0.316123728556445, "learning_rate": 1.2548656678721404e-08, "loss": 0.0164, "step": 4344 }, { "epoch": 2.93978349120433, "grad_norm": 0.36367628136946084, "learning_rate": 1.2271460347854091e-08, "loss": 0.0179, "step": 4345 }, { "epoch": 2.9404600811907984, "grad_norm": 0.3488339117202152, "learning_rate": 1.1997356139838434e-08, "loss": 0.0213, "step": 4346 }, { "epoch": 2.941136671177267, "grad_norm": 0.35022374428722575, "learning_rate": 1.1726344224603504e-08, "loss": 0.02, "step": 4347 }, { "epoch": 2.941813261163735, "grad_norm": 0.39506691018080775, "learning_rate": 1.145842477016268e-08, "loss": 0.0225, "step": 4348 }, { "epoch": 2.942489851150203, "grad_norm": 0.3458942447882672, "learning_rate": 1.119359794261088e-08, "loss": 0.0203, "step": 4349 }, { "epoch": 2.943166441136671, "grad_norm": 0.31112225633978385, "learning_rate": 1.0931863906127327e-08, "loss": 0.0179, "step": 4350 }, { "epoch": 2.9438430311231394, "grad_norm": 0.3632591345807602, "learning_rate": 1.0673222822972229e-08, "loss": 0.021, "step": 4351 }, { "epoch": 2.944519621109608, "grad_norm": 0.26889010243432787, "learning_rate": 1.0417674853489545e-08, "loss": 0.0134, "step": 4352 }, { "epoch": 2.945196211096076, "grad_norm": 0.33497895902046243, "learning_rate": 1.0165220156105326e-08, "loss": 0.0233, "step": 4353 }, { "epoch": 2.945872801082544, "grad_norm": 0.3273435086703643, "learning_rate": 9.915858887327157e-09, "loss": 0.0153, "step": 4354 }, { "epoch": 2.946549391069012, "grad_norm": 0.3195513135185016, "learning_rate": 9.669591201746375e-09, "loss": 0.0209, "step": 4355 }, { "epoch": 2.9472259810554804, "grad_norm": 0.3509855885167185, "learning_rate": 9.426417252035858e-09, "loss": 0.0196, "step": 4356 }, { "epoch": 2.9479025710419484, "grad_norm": 0.3005686980471101, "learning_rate": 9.186337188949456e-09, "loss": 0.0136, "step": 4357 }, { "epoch": 2.948579161028417, "grad_norm": 0.35888684219728445, "learning_rate": 8.949351161324227e-09, "loss": 0.0213, "step": 4358 }, { "epoch": 2.949255751014885, "grad_norm": 0.2688442947169199, "learning_rate": 8.715459316078756e-09, "loss": 0.0163, "step": 4359 }, { "epoch": 2.949932341001353, "grad_norm": 0.24857245333319675, "learning_rate": 8.484661798213723e-09, "loss": 0.013, "step": 4360 }, { "epoch": 2.9506089309878214, "grad_norm": 0.35704831755136823, "learning_rate": 8.256958750810784e-09, "loss": 0.015, "step": 4361 }, { "epoch": 2.9512855209742894, "grad_norm": 0.3010921743036329, "learning_rate": 8.032350315033688e-09, "loss": 0.0146, "step": 4362 }, { "epoch": 2.951962110960758, "grad_norm": 0.3536846063831116, "learning_rate": 7.810836630127717e-09, "loss": 0.0157, "step": 4363 }, { "epoch": 2.952638700947226, "grad_norm": 0.40101883380471337, "learning_rate": 7.59241783341913e-09, "loss": 0.0199, "step": 4364 }, { "epoch": 2.953315290933694, "grad_norm": 0.40459086470998656, "learning_rate": 7.377094060315726e-09, "loss": 0.0202, "step": 4365 }, { "epoch": 2.9539918809201624, "grad_norm": 0.4537988531466028, "learning_rate": 7.164865444306834e-09, "loss": 0.0203, "step": 4366 }, { "epoch": 2.9546684709066304, "grad_norm": 0.32286109225660575, "learning_rate": 6.9557321169622105e-09, "loss": 0.0247, "step": 4367 }, { "epoch": 2.955345060893099, "grad_norm": 1.0824472983870232, "learning_rate": 6.7496942079342546e-09, "loss": 0.0195, "step": 4368 }, { "epoch": 2.956021650879567, "grad_norm": 0.5544590267783411, "learning_rate": 6.546751844955235e-09, "loss": 0.0225, "step": 4369 }, { "epoch": 2.956698240866035, "grad_norm": 0.22849275168826153, "learning_rate": 6.346905153837291e-09, "loss": 0.0102, "step": 4370 }, { "epoch": 2.9573748308525034, "grad_norm": 0.34317849125388905, "learning_rate": 6.150154258476315e-09, "loss": 0.017, "step": 4371 }, { "epoch": 2.9580514208389714, "grad_norm": 0.29587388142693405, "learning_rate": 5.956499280845851e-09, "loss": 0.0148, "step": 4372 }, { "epoch": 2.95872801082544, "grad_norm": 0.3199452788834128, "learning_rate": 5.765940341002085e-09, "loss": 0.0208, "step": 4373 }, { "epoch": 2.959404600811908, "grad_norm": 0.28730714447598094, "learning_rate": 5.578477557081074e-09, "loss": 0.0106, "step": 4374 }, { "epoch": 2.960081190798376, "grad_norm": 0.31094536156736957, "learning_rate": 5.394111045299855e-09, "loss": 0.0204, "step": 4375 }, { "epoch": 2.9607577807848444, "grad_norm": 0.26448000549333334, "learning_rate": 5.212840919955886e-09, "loss": 0.0156, "step": 4376 }, { "epoch": 2.9614343707713124, "grad_norm": 0.36121195162214426, "learning_rate": 5.034667293427053e-09, "loss": 0.0221, "step": 4377 }, { "epoch": 2.962110960757781, "grad_norm": 0.3919679143121755, "learning_rate": 4.859590276170556e-09, "loss": 0.0236, "step": 4378 }, { "epoch": 2.962787550744249, "grad_norm": 0.31466905088752206, "learning_rate": 4.687609976725127e-09, "loss": 0.0207, "step": 4379 }, { "epoch": 2.963464140730717, "grad_norm": 0.276137358393101, "learning_rate": 4.51872650170937e-09, "loss": 0.0159, "step": 4380 }, { "epoch": 2.9641407307171854, "grad_norm": 0.27536454716675796, "learning_rate": 4.352939955822311e-09, "loss": 0.0136, "step": 4381 }, { "epoch": 2.9648173207036534, "grad_norm": 0.3095208723601742, "learning_rate": 4.190250441841737e-09, "loss": 0.0147, "step": 4382 }, { "epoch": 2.965493910690122, "grad_norm": 0.36776435927075146, "learning_rate": 4.030658060626969e-09, "loss": 0.0191, "step": 4383 }, { "epoch": 2.96617050067659, "grad_norm": 0.2839355716419772, "learning_rate": 3.874162911117196e-09, "loss": 0.013, "step": 4384 }, { "epoch": 2.966847090663058, "grad_norm": 0.32540217483087935, "learning_rate": 3.7207650903298143e-09, "loss": 0.0149, "step": 4385 }, { "epoch": 2.9675236806495264, "grad_norm": 0.3140991526101512, "learning_rate": 3.570464693364306e-09, "loss": 0.0215, "step": 4386 }, { "epoch": 2.9682002706359945, "grad_norm": 0.30812042457101607, "learning_rate": 3.4232618133978044e-09, "loss": 0.0153, "step": 4387 }, { "epoch": 2.968876860622463, "grad_norm": 0.33732578138585134, "learning_rate": 3.279156541688422e-09, "loss": 0.0229, "step": 4388 }, { "epoch": 2.969553450608931, "grad_norm": 0.2377707525395033, "learning_rate": 3.1381489675746946e-09, "loss": 0.0087, "step": 4389 }, { "epoch": 2.970230040595399, "grad_norm": 0.3067346004371049, "learning_rate": 3.000239178472253e-09, "loss": 0.0134, "step": 4390 }, { "epoch": 2.9709066305818674, "grad_norm": 0.4933288385571862, "learning_rate": 2.8654272598788167e-09, "loss": 0.0411, "step": 4391 }, { "epoch": 2.9715832205683355, "grad_norm": 0.37518135491918647, "learning_rate": 2.7337132953697555e-09, "loss": 0.0168, "step": 4392 }, { "epoch": 2.972259810554804, "grad_norm": 0.300150884049802, "learning_rate": 2.605097366601417e-09, "loss": 0.0146, "step": 4393 }, { "epoch": 2.972936400541272, "grad_norm": 0.2936469448510459, "learning_rate": 2.479579553307798e-09, "loss": 0.0167, "step": 4394 }, { "epoch": 2.97361299052774, "grad_norm": 0.297790522107297, "learning_rate": 2.3571599333038765e-09, "loss": 0.0152, "step": 4395 }, { "epoch": 2.9742895805142084, "grad_norm": 0.3821502282272499, "learning_rate": 2.237838582483387e-09, "loss": 0.0183, "step": 4396 }, { "epoch": 2.9749661705006765, "grad_norm": 0.29273770886435896, "learning_rate": 2.12161557481827e-09, "loss": 0.0195, "step": 4397 }, { "epoch": 2.975642760487145, "grad_norm": 0.2801009993689134, "learning_rate": 2.008490982360889e-09, "loss": 0.0114, "step": 4398 }, { "epoch": 2.976319350473613, "grad_norm": 0.38089038697206073, "learning_rate": 1.8984648752429222e-09, "loss": 0.0179, "step": 4399 }, { "epoch": 2.976995940460081, "grad_norm": 0.2877555876007096, "learning_rate": 1.7915373216742527e-09, "loss": 0.015, "step": 4400 }, { "epoch": 2.9776725304465494, "grad_norm": 0.3755693582452172, "learning_rate": 1.687708387944076e-09, "loss": 0.0227, "step": 4401 }, { "epoch": 2.9783491204330175, "grad_norm": 0.2785422941954074, "learning_rate": 1.5869781384203475e-09, "loss": 0.0128, "step": 4402 }, { "epoch": 2.979025710419486, "grad_norm": 0.44204489971832406, "learning_rate": 1.4893466355514474e-09, "loss": 0.0175, "step": 4403 }, { "epoch": 2.979702300405954, "grad_norm": 0.3323320010546501, "learning_rate": 1.3948139398628492e-09, "loss": 0.0181, "step": 4404 }, { "epoch": 2.980378890392422, "grad_norm": 0.27961205538820144, "learning_rate": 1.3033801099598954e-09, "loss": 0.0131, "step": 4405 }, { "epoch": 2.9810554803788905, "grad_norm": 0.33412975393517885, "learning_rate": 1.215045202527243e-09, "loss": 0.0171, "step": 4406 }, { "epoch": 2.9817320703653585, "grad_norm": 0.2890974529256528, "learning_rate": 1.1298092723266429e-09, "loss": 0.0144, "step": 4407 }, { "epoch": 2.982408660351827, "grad_norm": 0.33997638992605866, "learning_rate": 1.0476723722002702e-09, "loss": 0.0296, "step": 4408 }, { "epoch": 2.983085250338295, "grad_norm": 0.281155159120715, "learning_rate": 9.686345530690589e-10, "loss": 0.0129, "step": 4409 }, { "epoch": 2.983761840324763, "grad_norm": 0.433244080656716, "learning_rate": 8.926958639315919e-10, "loss": 0.0216, "step": 4410 }, { "epoch": 2.9844384303112315, "grad_norm": 0.3665508858483157, "learning_rate": 8.198563518657665e-10, "loss": 0.0163, "step": 4411 }, { "epoch": 2.9851150202976995, "grad_norm": 0.31428569622042574, "learning_rate": 7.50116062028794e-10, "loss": 0.0143, "step": 4412 }, { "epoch": 2.985791610284168, "grad_norm": 0.339015937564257, "learning_rate": 6.834750376549793e-10, "loss": 0.0215, "step": 4413 }, { "epoch": 2.986468200270636, "grad_norm": 0.3381678729181359, "learning_rate": 6.199333200590519e-10, "loss": 0.0161, "step": 4414 }, { "epoch": 2.987144790257104, "grad_norm": 0.2771287078950449, "learning_rate": 5.594909486328348e-10, "loss": 0.0153, "step": 4415 }, { "epoch": 2.9878213802435725, "grad_norm": 0.3394705935867139, "learning_rate": 5.021479608474655e-10, "loss": 0.0193, "step": 4416 }, { "epoch": 2.9884979702300405, "grad_norm": 0.3644892330047543, "learning_rate": 4.4790439225284034e-10, "loss": 0.0242, "step": 4417 }, { "epoch": 2.989174560216509, "grad_norm": 0.3136943948894721, "learning_rate": 3.967602764770595e-10, "loss": 0.0184, "step": 4418 }, { "epoch": 2.989851150202977, "grad_norm": 0.387996779344426, "learning_rate": 3.487156452258722e-10, "loss": 0.0198, "step": 4419 }, { "epoch": 2.990527740189445, "grad_norm": 0.29577891501208836, "learning_rate": 3.0377052828489684e-10, "loss": 0.0163, "step": 4420 }, { "epoch": 2.9912043301759135, "grad_norm": 0.3113922493751291, "learning_rate": 2.6192495351795576e-10, "loss": 0.0143, "step": 4421 }, { "epoch": 2.9918809201623815, "grad_norm": 0.329045522456078, "learning_rate": 2.231789468670753e-10, "loss": 0.0212, "step": 4422 }, { "epoch": 2.99255751014885, "grad_norm": 0.3697015023651616, "learning_rate": 1.8753253235248568e-10, "loss": 0.0277, "step": 4423 }, { "epoch": 2.993234100135318, "grad_norm": 0.3461954717802595, "learning_rate": 1.5498573207262112e-10, "loss": 0.0249, "step": 4424 }, { "epoch": 2.993910690121786, "grad_norm": 0.33534009206015153, "learning_rate": 1.2553856620522997e-10, "loss": 0.0212, "step": 4425 }, { "epoch": 2.9945872801082545, "grad_norm": 0.29711609164962155, "learning_rate": 9.919105300570941e-11, "loss": 0.0109, "step": 4426 }, { "epoch": 2.9952638700947225, "grad_norm": 0.2793411841570855, "learning_rate": 7.59432088082157e-11, "loss": 0.0121, "step": 4427 }, { "epoch": 2.995940460081191, "grad_norm": 0.3296633975783328, "learning_rate": 5.579504802566416e-11, "loss": 0.0213, "step": 4428 }, { "epoch": 2.996617050067659, "grad_norm": 0.45326530102336376, "learning_rate": 3.8746583148063786e-11, "loss": 0.0231, "step": 4429 }, { "epoch": 2.997293640054127, "grad_norm": 0.29331055135595124, "learning_rate": 2.4797824744737797e-11, "loss": 0.0153, "step": 4430 }, { "epoch": 2.9979702300405955, "grad_norm": 0.2800545169223398, "learning_rate": 1.3948781463213324e-11, "loss": 0.0119, "step": 4431 }, { "epoch": 2.9986468200270635, "grad_norm": 0.3171020771903546, "learning_rate": 6.199460029221449e-12, "loss": 0.0209, "step": 4432 }, { "epoch": 2.999323410013532, "grad_norm": 0.6336304879901102, "learning_rate": 1.549865247807425e-12, "loss": 0.0254, "step": 4433 }, { "epoch": 3.0, "grad_norm": 0.3215913587648698, "learning_rate": 0.0, "loss": 0.014, "step": 4434 }, { "epoch": 3.0, "eval_loss": 0.03193313255906105, "eval_runtime": 234.4462, "eval_samples_per_second": 42.462, "eval_steps_per_second": 1.331, "step": 4434 }, { "epoch": 3.0, "step": 4434, "total_flos": 1.381053238219899e+18, "train_loss": 0.03611092418570863, "train_runtime": 47401.0192, "train_samples_per_second": 11.969, "train_steps_per_second": 0.094 } ], "logging_steps": 1, "max_steps": 4434, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.381053238219899e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }