diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31104 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 4434, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006765899864682003, + "grad_norm": 12.109418480944685, + "learning_rate": 2.2522522522522524e-08, + "loss": 0.2806, + "step": 1 + }, + { + "epoch": 0.0013531799729364006, + "grad_norm": 11.05123491573913, + "learning_rate": 4.504504504504505e-08, + "loss": 0.2493, + "step": 2 + }, + { + "epoch": 0.0020297699594046007, + "grad_norm": 11.568626476471813, + "learning_rate": 6.756756756756757e-08, + "loss": 0.2796, + "step": 3 + }, + { + "epoch": 0.0027063599458728013, + "grad_norm": 15.330906430777253, + "learning_rate": 9.00900900900901e-08, + "loss": 0.3237, + "step": 4 + }, + { + "epoch": 0.0033829499323410014, + "grad_norm": 11.80893118713706, + "learning_rate": 1.1261261261261262e-07, + "loss": 0.2768, + "step": 5 + }, + { + "epoch": 0.0040595399188092015, + "grad_norm": 12.288565128357796, + "learning_rate": 1.3513513513513515e-07, + "loss": 0.278, + "step": 6 + }, + { + "epoch": 0.004736129905277402, + "grad_norm": 11.056614459254297, + "learning_rate": 1.5765765765765766e-07, + "loss": 0.2614, + "step": 7 + }, + { + "epoch": 0.005412719891745603, + "grad_norm": 11.644727224551765, + "learning_rate": 1.801801801801802e-07, + "loss": 0.2835, + "step": 8 + }, + { + "epoch": 0.006089309878213802, + "grad_norm": 10.897140250186142, + "learning_rate": 2.0270270270270273e-07, + "loss": 0.2622, + "step": 9 + }, + { + "epoch": 0.006765899864682003, + "grad_norm": 12.757274251949218, + "learning_rate": 2.2522522522522524e-07, + "loss": 0.3088, + "step": 10 + }, + { + "epoch": 0.007442489851150203, + "grad_norm": 11.771844271594402, + "learning_rate": 2.477477477477478e-07, + "loss": 0.2771, + "step": 11 + }, + { + "epoch": 0.008119079837618403, + "grad_norm": 10.462073634910523, + "learning_rate": 2.702702702702703e-07, + "loss": 0.2459, + "step": 12 + }, + { + "epoch": 0.008795669824086604, + "grad_norm": 11.330093340150974, + "learning_rate": 2.927927927927928e-07, + "loss": 0.2485, + "step": 13 + }, + { + "epoch": 0.009472259810554804, + "grad_norm": 11.045487141403331, + "learning_rate": 3.153153153153153e-07, + "loss": 0.2496, + "step": 14 + }, + { + "epoch": 0.010148849797023005, + "grad_norm": 9.006747510455144, + "learning_rate": 3.378378378378379e-07, + "loss": 0.2169, + "step": 15 + }, + { + "epoch": 0.010825439783491205, + "grad_norm": 8.513474092378251, + "learning_rate": 3.603603603603604e-07, + "loss": 0.1941, + "step": 16 + }, + { + "epoch": 0.011502029769959404, + "grad_norm": 8.985361626419866, + "learning_rate": 3.828828828828829e-07, + "loss": 0.2209, + "step": 17 + }, + { + "epoch": 0.012178619756427604, + "grad_norm": 7.4644108769734885, + "learning_rate": 4.0540540540540546e-07, + "loss": 0.2173, + "step": 18 + }, + { + "epoch": 0.012855209742895805, + "grad_norm": 8.138660456856027, + "learning_rate": 4.27927927927928e-07, + "loss": 0.1994, + "step": 19 + }, + { + "epoch": 0.013531799729364006, + "grad_norm": 6.463475276121953, + "learning_rate": 4.504504504504505e-07, + "loss": 0.1617, + "step": 20 + }, + { + "epoch": 0.014208389715832206, + "grad_norm": 4.67755763633634, + "learning_rate": 4.7297297297297305e-07, + "loss": 0.1528, + "step": 21 + }, + { + "epoch": 0.014884979702300407, + "grad_norm": 5.102687813043612, + "learning_rate": 4.954954954954956e-07, + "loss": 0.1559, + "step": 22 + }, + { + "epoch": 0.015561569688768605, + "grad_norm": 4.450782219252275, + "learning_rate": 5.180180180180181e-07, + "loss": 0.1432, + "step": 23 + }, + { + "epoch": 0.016238159675236806, + "grad_norm": 3.8628913682582757, + "learning_rate": 5.405405405405406e-07, + "loss": 0.1566, + "step": 24 + }, + { + "epoch": 0.016914749661705007, + "grad_norm": 3.765429189029888, + "learning_rate": 5.630630630630631e-07, + "loss": 0.1539, + "step": 25 + }, + { + "epoch": 0.017591339648173207, + "grad_norm": 3.4747070187389206, + "learning_rate": 5.855855855855856e-07, + "loss": 0.156, + "step": 26 + }, + { + "epoch": 0.018267929634641408, + "grad_norm": 2.7591405300352077, + "learning_rate": 6.081081081081082e-07, + "loss": 0.1504, + "step": 27 + }, + { + "epoch": 0.018944519621109608, + "grad_norm": 2.5110411566324258, + "learning_rate": 6.306306306306306e-07, + "loss": 0.1226, + "step": 28 + }, + { + "epoch": 0.01962110960757781, + "grad_norm": 2.147016347481153, + "learning_rate": 6.531531531531532e-07, + "loss": 0.0942, + "step": 29 + }, + { + "epoch": 0.02029769959404601, + "grad_norm": 1.8191888485653491, + "learning_rate": 6.756756756756758e-07, + "loss": 0.1346, + "step": 30 + }, + { + "epoch": 0.02097428958051421, + "grad_norm": 2.4427111665888233, + "learning_rate": 6.981981981981982e-07, + "loss": 0.1226, + "step": 31 + }, + { + "epoch": 0.02165087956698241, + "grad_norm": 1.515634001454101, + "learning_rate": 7.207207207207208e-07, + "loss": 0.1013, + "step": 32 + }, + { + "epoch": 0.022327469553450607, + "grad_norm": 1.4537092544960724, + "learning_rate": 7.432432432432434e-07, + "loss": 0.11, + "step": 33 + }, + { + "epoch": 0.023004059539918808, + "grad_norm": 2.478946984359515, + "learning_rate": 7.657657657657658e-07, + "loss": 0.1188, + "step": 34 + }, + { + "epoch": 0.02368064952638701, + "grad_norm": 1.6796456542643035, + "learning_rate": 7.882882882882883e-07, + "loss": 0.1119, + "step": 35 + }, + { + "epoch": 0.02435723951285521, + "grad_norm": 1.60436577837702, + "learning_rate": 8.108108108108109e-07, + "loss": 0.1056, + "step": 36 + }, + { + "epoch": 0.02503382949932341, + "grad_norm": 1.5649369548638523, + "learning_rate": 8.333333333333333e-07, + "loss": 0.1106, + "step": 37 + }, + { + "epoch": 0.02571041948579161, + "grad_norm": 2.0561242436687373, + "learning_rate": 8.55855855855856e-07, + "loss": 0.0859, + "step": 38 + }, + { + "epoch": 0.02638700947225981, + "grad_norm": 1.8755061475179144, + "learning_rate": 8.783783783783785e-07, + "loss": 0.1142, + "step": 39 + }, + { + "epoch": 0.02706359945872801, + "grad_norm": 1.846342223086991, + "learning_rate": 9.00900900900901e-07, + "loss": 0.0968, + "step": 40 + }, + { + "epoch": 0.02774018944519621, + "grad_norm": 1.6330756001827755, + "learning_rate": 9.234234234234235e-07, + "loss": 0.1053, + "step": 41 + }, + { + "epoch": 0.028416779431664412, + "grad_norm": 1.566099881914172, + "learning_rate": 9.459459459459461e-07, + "loss": 0.1067, + "step": 42 + }, + { + "epoch": 0.029093369418132613, + "grad_norm": 1.698567379119881, + "learning_rate": 9.684684684684686e-07, + "loss": 0.091, + "step": 43 + }, + { + "epoch": 0.029769959404600813, + "grad_norm": 1.6125714900139525, + "learning_rate": 9.909909909909911e-07, + "loss": 0.076, + "step": 44 + }, + { + "epoch": 0.030446549391069014, + "grad_norm": 2.115521381565484, + "learning_rate": 1.0135135135135136e-06, + "loss": 0.1054, + "step": 45 + }, + { + "epoch": 0.03112313937753721, + "grad_norm": 2.650889695337764, + "learning_rate": 1.0360360360360361e-06, + "loss": 0.1047, + "step": 46 + }, + { + "epoch": 0.031799729364005415, + "grad_norm": 1.389331225829285, + "learning_rate": 1.0585585585585587e-06, + "loss": 0.0897, + "step": 47 + }, + { + "epoch": 0.03247631935047361, + "grad_norm": 1.4231867009481125, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.0995, + "step": 48 + }, + { + "epoch": 0.033152909336941816, + "grad_norm": 1.2285847771352059, + "learning_rate": 1.1036036036036037e-06, + "loss": 0.0717, + "step": 49 + }, + { + "epoch": 0.03382949932341001, + "grad_norm": 1.3024523733871285, + "learning_rate": 1.1261261261261262e-06, + "loss": 0.0935, + "step": 50 + }, + { + "epoch": 0.03450608930987822, + "grad_norm": 1.3670113610830439, + "learning_rate": 1.148648648648649e-06, + "loss": 0.0791, + "step": 51 + }, + { + "epoch": 0.035182679296346414, + "grad_norm": 1.7510664364205482, + "learning_rate": 1.1711711711711712e-06, + "loss": 0.1211, + "step": 52 + }, + { + "epoch": 0.03585926928281461, + "grad_norm": 2.58368446347696, + "learning_rate": 1.1936936936936937e-06, + "loss": 0.1098, + "step": 53 + }, + { + "epoch": 0.036535859269282815, + "grad_norm": 1.1546683198894498, + "learning_rate": 1.2162162162162164e-06, + "loss": 0.0944, + "step": 54 + }, + { + "epoch": 0.03721244925575101, + "grad_norm": 1.0262303684513803, + "learning_rate": 1.2387387387387387e-06, + "loss": 0.0697, + "step": 55 + }, + { + "epoch": 0.037889039242219216, + "grad_norm": 2.0466850231277793, + "learning_rate": 1.2612612612612613e-06, + "loss": 0.0939, + "step": 56 + }, + { + "epoch": 0.03856562922868741, + "grad_norm": 2.0547227253053095, + "learning_rate": 1.2837837837837838e-06, + "loss": 0.0935, + "step": 57 + }, + { + "epoch": 0.03924221921515562, + "grad_norm": 1.2229081913467617, + "learning_rate": 1.3063063063063065e-06, + "loss": 0.0994, + "step": 58 + }, + { + "epoch": 0.039918809201623814, + "grad_norm": 1.3020824069826142, + "learning_rate": 1.328828828828829e-06, + "loss": 0.0712, + "step": 59 + }, + { + "epoch": 0.04059539918809202, + "grad_norm": 1.4084130139999327, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.0828, + "step": 60 + }, + { + "epoch": 0.041271989174560215, + "grad_norm": 1.3605355132053356, + "learning_rate": 1.373873873873874e-06, + "loss": 0.0767, + "step": 61 + }, + { + "epoch": 0.04194857916102842, + "grad_norm": 1.190756668990836, + "learning_rate": 1.3963963963963963e-06, + "loss": 0.0862, + "step": 62 + }, + { + "epoch": 0.04262516914749662, + "grad_norm": 1.609471879933888, + "learning_rate": 1.418918918918919e-06, + "loss": 0.1112, + "step": 63 + }, + { + "epoch": 0.04330175913396482, + "grad_norm": 1.19760034159579, + "learning_rate": 1.4414414414414416e-06, + "loss": 0.0988, + "step": 64 + }, + { + "epoch": 0.04397834912043302, + "grad_norm": 1.2167332141698486, + "learning_rate": 1.463963963963964e-06, + "loss": 0.0616, + "step": 65 + }, + { + "epoch": 0.044654939106901215, + "grad_norm": 1.1643059346788907, + "learning_rate": 1.4864864864864868e-06, + "loss": 0.0749, + "step": 66 + }, + { + "epoch": 0.04533152909336942, + "grad_norm": 1.756367232451425, + "learning_rate": 1.5090090090090093e-06, + "loss": 0.0883, + "step": 67 + }, + { + "epoch": 0.046008119079837616, + "grad_norm": 1.1816077677665917, + "learning_rate": 1.5315315315315316e-06, + "loss": 0.0925, + "step": 68 + }, + { + "epoch": 0.04668470906630582, + "grad_norm": 1.3497747965548728, + "learning_rate": 1.5540540540540541e-06, + "loss": 0.104, + "step": 69 + }, + { + "epoch": 0.04736129905277402, + "grad_norm": 1.3274120607510551, + "learning_rate": 1.5765765765765766e-06, + "loss": 0.0849, + "step": 70 + }, + { + "epoch": 0.04803788903924222, + "grad_norm": 1.549606581695767, + "learning_rate": 1.5990990990990993e-06, + "loss": 0.0889, + "step": 71 + }, + { + "epoch": 0.04871447902571042, + "grad_norm": 1.4493395352159697, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.0944, + "step": 72 + }, + { + "epoch": 0.04939106901217862, + "grad_norm": 1.0994319556111298, + "learning_rate": 1.6441441441441444e-06, + "loss": 0.0759, + "step": 73 + }, + { + "epoch": 0.05006765899864682, + "grad_norm": 1.2837499508543788, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0698, + "step": 74 + }, + { + "epoch": 0.05074424898511502, + "grad_norm": 1.1341275770781738, + "learning_rate": 1.6891891891891894e-06, + "loss": 0.107, + "step": 75 + }, + { + "epoch": 0.05142083897158322, + "grad_norm": 1.0646003246593085, + "learning_rate": 1.711711711711712e-06, + "loss": 0.0877, + "step": 76 + }, + { + "epoch": 0.052097428958051424, + "grad_norm": 1.0705597995328733, + "learning_rate": 1.7342342342342344e-06, + "loss": 0.0938, + "step": 77 + }, + { + "epoch": 0.05277401894451962, + "grad_norm": 1.8324753250262114, + "learning_rate": 1.756756756756757e-06, + "loss": 0.098, + "step": 78 + }, + { + "epoch": 0.05345060893098782, + "grad_norm": 1.6052106663983605, + "learning_rate": 1.7792792792792792e-06, + "loss": 0.0724, + "step": 79 + }, + { + "epoch": 0.05412719891745602, + "grad_norm": 1.0519773994845274, + "learning_rate": 1.801801801801802e-06, + "loss": 0.0643, + "step": 80 + }, + { + "epoch": 0.05480378890392422, + "grad_norm": 1.2333488233321814, + "learning_rate": 1.8243243243243245e-06, + "loss": 0.0835, + "step": 81 + }, + { + "epoch": 0.05548037889039242, + "grad_norm": 1.4998932144723107, + "learning_rate": 1.846846846846847e-06, + "loss": 0.0748, + "step": 82 + }, + { + "epoch": 0.05615696887686062, + "grad_norm": 1.389620534415091, + "learning_rate": 1.8693693693693697e-06, + "loss": 0.0819, + "step": 83 + }, + { + "epoch": 0.056833558863328824, + "grad_norm": 1.2388704868409917, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.0648, + "step": 84 + }, + { + "epoch": 0.05751014884979702, + "grad_norm": 0.8506120195949848, + "learning_rate": 1.9144144144144145e-06, + "loss": 0.0576, + "step": 85 + }, + { + "epoch": 0.058186738836265225, + "grad_norm": 1.2780645850137915, + "learning_rate": 1.9369369369369372e-06, + "loss": 0.0796, + "step": 86 + }, + { + "epoch": 0.05886332882273342, + "grad_norm": 1.132956937818961, + "learning_rate": 1.9594594594594595e-06, + "loss": 0.0786, + "step": 87 + }, + { + "epoch": 0.05953991880920163, + "grad_norm": 1.4862959937967373, + "learning_rate": 1.9819819819819822e-06, + "loss": 0.0655, + "step": 88 + }, + { + "epoch": 0.060216508795669824, + "grad_norm": 1.5069955438493359, + "learning_rate": 2.0045045045045045e-06, + "loss": 0.0789, + "step": 89 + }, + { + "epoch": 0.06089309878213803, + "grad_norm": 1.1028032292446677, + "learning_rate": 2.0270270270270273e-06, + "loss": 0.0663, + "step": 90 + }, + { + "epoch": 0.061569688768606225, + "grad_norm": 1.853101169509201, + "learning_rate": 2.0495495495495496e-06, + "loss": 0.1001, + "step": 91 + }, + { + "epoch": 0.06224627875507442, + "grad_norm": 1.1764131955694261, + "learning_rate": 2.0720720720720723e-06, + "loss": 0.0677, + "step": 92 + }, + { + "epoch": 0.06292286874154263, + "grad_norm": 1.270501762567475, + "learning_rate": 2.0945945945945946e-06, + "loss": 0.0809, + "step": 93 + }, + { + "epoch": 0.06359945872801083, + "grad_norm": 1.08651500253901, + "learning_rate": 2.1171171171171173e-06, + "loss": 0.0701, + "step": 94 + }, + { + "epoch": 0.06427604871447902, + "grad_norm": 1.171577214767396, + "learning_rate": 2.13963963963964e-06, + "loss": 0.0658, + "step": 95 + }, + { + "epoch": 0.06495263870094722, + "grad_norm": 0.9993389865692777, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.0697, + "step": 96 + }, + { + "epoch": 0.06562922868741543, + "grad_norm": 1.1647667270803757, + "learning_rate": 2.1846846846846846e-06, + "loss": 0.068, + "step": 97 + }, + { + "epoch": 0.06630581867388363, + "grad_norm": 1.0815909952207134, + "learning_rate": 2.2072072072072073e-06, + "loss": 0.0586, + "step": 98 + }, + { + "epoch": 0.06698240866035182, + "grad_norm": 1.4432815661843537, + "learning_rate": 2.22972972972973e-06, + "loss": 0.0849, + "step": 99 + }, + { + "epoch": 0.06765899864682003, + "grad_norm": 1.2850237284528643, + "learning_rate": 2.2522522522522524e-06, + "loss": 0.0803, + "step": 100 + }, + { + "epoch": 0.06833558863328823, + "grad_norm": 1.3086998007860184, + "learning_rate": 2.274774774774775e-06, + "loss": 0.0893, + "step": 101 + }, + { + "epoch": 0.06901217861975643, + "grad_norm": 1.3811854632746974, + "learning_rate": 2.297297297297298e-06, + "loss": 0.0787, + "step": 102 + }, + { + "epoch": 0.06968876860622462, + "grad_norm": 0.8496140325256104, + "learning_rate": 2.31981981981982e-06, + "loss": 0.0552, + "step": 103 + }, + { + "epoch": 0.07036535859269283, + "grad_norm": 1.1250516677643785, + "learning_rate": 2.3423423423423424e-06, + "loss": 0.0674, + "step": 104 + }, + { + "epoch": 0.07104194857916103, + "grad_norm": 1.5188591494569172, + "learning_rate": 2.364864864864865e-06, + "loss": 0.0853, + "step": 105 + }, + { + "epoch": 0.07171853856562922, + "grad_norm": 0.9166081492102163, + "learning_rate": 2.3873873873873874e-06, + "loss": 0.0691, + "step": 106 + }, + { + "epoch": 0.07239512855209743, + "grad_norm": 1.4406911354239935, + "learning_rate": 2.40990990990991e-06, + "loss": 0.0764, + "step": 107 + }, + { + "epoch": 0.07307171853856563, + "grad_norm": 1.2837903616374027, + "learning_rate": 2.432432432432433e-06, + "loss": 0.0686, + "step": 108 + }, + { + "epoch": 0.07374830852503383, + "grad_norm": 1.1827381468628524, + "learning_rate": 2.454954954954955e-06, + "loss": 0.0835, + "step": 109 + }, + { + "epoch": 0.07442489851150202, + "grad_norm": 1.0842278836488644, + "learning_rate": 2.4774774774774775e-06, + "loss": 0.0847, + "step": 110 + }, + { + "epoch": 0.07510148849797023, + "grad_norm": 0.7826209232029521, + "learning_rate": 2.5e-06, + "loss": 0.0611, + "step": 111 + }, + { + "epoch": 0.07577807848443843, + "grad_norm": 1.0514720604523498, + "learning_rate": 2.5225225225225225e-06, + "loss": 0.0662, + "step": 112 + }, + { + "epoch": 0.07645466847090664, + "grad_norm": 1.1210491354297492, + "learning_rate": 2.5450450450450452e-06, + "loss": 0.0668, + "step": 113 + }, + { + "epoch": 0.07713125845737483, + "grad_norm": 1.081267730408898, + "learning_rate": 2.5675675675675675e-06, + "loss": 0.0799, + "step": 114 + }, + { + "epoch": 0.07780784844384303, + "grad_norm": 1.404599973743778, + "learning_rate": 2.5900900900900907e-06, + "loss": 0.0726, + "step": 115 + }, + { + "epoch": 0.07848443843031123, + "grad_norm": 0.796717754084093, + "learning_rate": 2.612612612612613e-06, + "loss": 0.0514, + "step": 116 + }, + { + "epoch": 0.07916102841677942, + "grad_norm": 1.2692775386997248, + "learning_rate": 2.6351351351351353e-06, + "loss": 0.0764, + "step": 117 + }, + { + "epoch": 0.07983761840324763, + "grad_norm": 0.8178369672005279, + "learning_rate": 2.657657657657658e-06, + "loss": 0.052, + "step": 118 + }, + { + "epoch": 0.08051420838971583, + "grad_norm": 1.1124558159045197, + "learning_rate": 2.6801801801801803e-06, + "loss": 0.0699, + "step": 119 + }, + { + "epoch": 0.08119079837618404, + "grad_norm": 1.8258037622705916, + "learning_rate": 2.702702702702703e-06, + "loss": 0.086, + "step": 120 + }, + { + "epoch": 0.08186738836265223, + "grad_norm": 0.9911638914541022, + "learning_rate": 2.7252252252252253e-06, + "loss": 0.0716, + "step": 121 + }, + { + "epoch": 0.08254397834912043, + "grad_norm": 0.7981140741449456, + "learning_rate": 2.747747747747748e-06, + "loss": 0.0653, + "step": 122 + }, + { + "epoch": 0.08322056833558863, + "grad_norm": 0.9891583556512792, + "learning_rate": 2.7702702702702703e-06, + "loss": 0.0626, + "step": 123 + }, + { + "epoch": 0.08389715832205684, + "grad_norm": 0.9311177301850705, + "learning_rate": 2.7927927927927926e-06, + "loss": 0.0676, + "step": 124 + }, + { + "epoch": 0.08457374830852503, + "grad_norm": 0.8213289961419631, + "learning_rate": 2.8153153153153158e-06, + "loss": 0.065, + "step": 125 + }, + { + "epoch": 0.08525033829499323, + "grad_norm": 1.4753798779127683, + "learning_rate": 2.837837837837838e-06, + "loss": 0.0847, + "step": 126 + }, + { + "epoch": 0.08592692828146144, + "grad_norm": 1.018664096046297, + "learning_rate": 2.860360360360361e-06, + "loss": 0.0689, + "step": 127 + }, + { + "epoch": 0.08660351826792964, + "grad_norm": 0.885299703826496, + "learning_rate": 2.882882882882883e-06, + "loss": 0.0513, + "step": 128 + }, + { + "epoch": 0.08728010825439783, + "grad_norm": 1.2654835363026304, + "learning_rate": 2.9054054054054054e-06, + "loss": 0.0762, + "step": 129 + }, + { + "epoch": 0.08795669824086604, + "grad_norm": 1.3878112144001065, + "learning_rate": 2.927927927927928e-06, + "loss": 0.118, + "step": 130 + }, + { + "epoch": 0.08863328822733424, + "grad_norm": 1.6327600361262127, + "learning_rate": 2.9504504504504504e-06, + "loss": 0.077, + "step": 131 + }, + { + "epoch": 0.08930987821380243, + "grad_norm": 1.42128938356837, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.0674, + "step": 132 + }, + { + "epoch": 0.08998646820027063, + "grad_norm": 1.2993447107485385, + "learning_rate": 2.995495495495496e-06, + "loss": 0.0678, + "step": 133 + }, + { + "epoch": 0.09066305818673884, + "grad_norm": 0.9176485862076669, + "learning_rate": 3.0180180180180186e-06, + "loss": 0.0528, + "step": 134 + }, + { + "epoch": 0.09133964817320704, + "grad_norm": 1.2537634334042256, + "learning_rate": 3.040540540540541e-06, + "loss": 0.0772, + "step": 135 + }, + { + "epoch": 0.09201623815967523, + "grad_norm": 0.8837089615432548, + "learning_rate": 3.063063063063063e-06, + "loss": 0.0565, + "step": 136 + }, + { + "epoch": 0.09269282814614344, + "grad_norm": 0.949361425538162, + "learning_rate": 3.085585585585586e-06, + "loss": 0.0637, + "step": 137 + }, + { + "epoch": 0.09336941813261164, + "grad_norm": 0.7767836874416418, + "learning_rate": 3.1081081081081082e-06, + "loss": 0.0537, + "step": 138 + }, + { + "epoch": 0.09404600811907984, + "grad_norm": 1.0620072007006445, + "learning_rate": 3.130630630630631e-06, + "loss": 0.0821, + "step": 139 + }, + { + "epoch": 0.09472259810554803, + "grad_norm": 0.8330908041619831, + "learning_rate": 3.1531531531531532e-06, + "loss": 0.0672, + "step": 140 + }, + { + "epoch": 0.09539918809201624, + "grad_norm": 0.8709290571325103, + "learning_rate": 3.1756756756756755e-06, + "loss": 0.0582, + "step": 141 + }, + { + "epoch": 0.09607577807848444, + "grad_norm": 0.9576748527480916, + "learning_rate": 3.1981981981981987e-06, + "loss": 0.0804, + "step": 142 + }, + { + "epoch": 0.09675236806495263, + "grad_norm": 2.139408740627896, + "learning_rate": 3.220720720720721e-06, + "loss": 0.0788, + "step": 143 + }, + { + "epoch": 0.09742895805142084, + "grad_norm": 0.92133608056852, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.0595, + "step": 144 + }, + { + "epoch": 0.09810554803788904, + "grad_norm": 0.9925385605152662, + "learning_rate": 3.265765765765766e-06, + "loss": 0.0603, + "step": 145 + }, + { + "epoch": 0.09878213802435724, + "grad_norm": 1.0763586483912095, + "learning_rate": 3.2882882882882887e-06, + "loss": 0.0804, + "step": 146 + }, + { + "epoch": 0.09945872801082543, + "grad_norm": 1.7871971536480755, + "learning_rate": 3.310810810810811e-06, + "loss": 0.0841, + "step": 147 + }, + { + "epoch": 0.10013531799729364, + "grad_norm": 1.202388521210232, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0912, + "step": 148 + }, + { + "epoch": 0.10081190798376184, + "grad_norm": 0.9232872486608801, + "learning_rate": 3.3558558558558565e-06, + "loss": 0.0874, + "step": 149 + }, + { + "epoch": 0.10148849797023005, + "grad_norm": 1.562181686800407, + "learning_rate": 3.3783783783783788e-06, + "loss": 0.0796, + "step": 150 + }, + { + "epoch": 0.10216508795669824, + "grad_norm": 1.0582165709303402, + "learning_rate": 3.4009009009009015e-06, + "loss": 0.0611, + "step": 151 + }, + { + "epoch": 0.10284167794316644, + "grad_norm": 1.1869252064389257, + "learning_rate": 3.423423423423424e-06, + "loss": 0.0778, + "step": 152 + }, + { + "epoch": 0.10351826792963464, + "grad_norm": 1.0882063117012855, + "learning_rate": 3.445945945945946e-06, + "loss": 0.0704, + "step": 153 + }, + { + "epoch": 0.10419485791610285, + "grad_norm": 1.684218554074011, + "learning_rate": 3.468468468468469e-06, + "loss": 0.0739, + "step": 154 + }, + { + "epoch": 0.10487144790257104, + "grad_norm": 1.2879493411494656, + "learning_rate": 3.490990990990991e-06, + "loss": 0.0706, + "step": 155 + }, + { + "epoch": 0.10554803788903924, + "grad_norm": 1.180667462102325, + "learning_rate": 3.513513513513514e-06, + "loss": 0.0516, + "step": 156 + }, + { + "epoch": 0.10622462787550745, + "grad_norm": 0.9173666672150224, + "learning_rate": 3.536036036036036e-06, + "loss": 0.0597, + "step": 157 + }, + { + "epoch": 0.10690121786197564, + "grad_norm": 0.9824869826881204, + "learning_rate": 3.5585585585585584e-06, + "loss": 0.0618, + "step": 158 + }, + { + "epoch": 0.10757780784844384, + "grad_norm": 0.8372326102844767, + "learning_rate": 3.5810810810810816e-06, + "loss": 0.055, + "step": 159 + }, + { + "epoch": 0.10825439783491204, + "grad_norm": 0.6863999562308667, + "learning_rate": 3.603603603603604e-06, + "loss": 0.0453, + "step": 160 + }, + { + "epoch": 0.10893098782138025, + "grad_norm": 1.7300411271145275, + "learning_rate": 3.6261261261261266e-06, + "loss": 0.078, + "step": 161 + }, + { + "epoch": 0.10960757780784844, + "grad_norm": 0.8323956752494743, + "learning_rate": 3.648648648648649e-06, + "loss": 0.0671, + "step": 162 + }, + { + "epoch": 0.11028416779431664, + "grad_norm": 0.8816106059707829, + "learning_rate": 3.6711711711711716e-06, + "loss": 0.0677, + "step": 163 + }, + { + "epoch": 0.11096075778078485, + "grad_norm": 1.291090966278352, + "learning_rate": 3.693693693693694e-06, + "loss": 0.0703, + "step": 164 + }, + { + "epoch": 0.11163734776725305, + "grad_norm": 1.4380603085453538, + "learning_rate": 3.7162162162162162e-06, + "loss": 0.081, + "step": 165 + }, + { + "epoch": 0.11231393775372124, + "grad_norm": 1.1279173560250253, + "learning_rate": 3.7387387387387394e-06, + "loss": 0.0626, + "step": 166 + }, + { + "epoch": 0.11299052774018944, + "grad_norm": 1.1190351240265553, + "learning_rate": 3.7612612612612612e-06, + "loss": 0.0803, + "step": 167 + }, + { + "epoch": 0.11366711772665765, + "grad_norm": 1.0861817886530531, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.0531, + "step": 168 + }, + { + "epoch": 0.11434370771312584, + "grad_norm": 1.1450975829292371, + "learning_rate": 3.8063063063063067e-06, + "loss": 0.0657, + "step": 169 + }, + { + "epoch": 0.11502029769959404, + "grad_norm": 1.1837341458804667, + "learning_rate": 3.828828828828829e-06, + "loss": 0.0779, + "step": 170 + }, + { + "epoch": 0.11569688768606225, + "grad_norm": 0.9705664284135157, + "learning_rate": 3.851351351351352e-06, + "loss": 0.0717, + "step": 171 + }, + { + "epoch": 0.11637347767253045, + "grad_norm": 0.9067677836299134, + "learning_rate": 3.8738738738738744e-06, + "loss": 0.0641, + "step": 172 + }, + { + "epoch": 0.11705006765899864, + "grad_norm": 0.7795707970035339, + "learning_rate": 3.896396396396397e-06, + "loss": 0.0539, + "step": 173 + }, + { + "epoch": 0.11772665764546685, + "grad_norm": 1.1543725315639926, + "learning_rate": 3.918918918918919e-06, + "loss": 0.07, + "step": 174 + }, + { + "epoch": 0.11840324763193505, + "grad_norm": 0.8574092898862513, + "learning_rate": 3.941441441441442e-06, + "loss": 0.056, + "step": 175 + }, + { + "epoch": 0.11907983761840325, + "grad_norm": 1.1116892070356739, + "learning_rate": 3.9639639639639645e-06, + "loss": 0.0667, + "step": 176 + }, + { + "epoch": 0.11975642760487144, + "grad_norm": 1.0421056323865376, + "learning_rate": 3.986486486486487e-06, + "loss": 0.0699, + "step": 177 + }, + { + "epoch": 0.12043301759133965, + "grad_norm": 1.0065865415244708, + "learning_rate": 4.009009009009009e-06, + "loss": 0.0973, + "step": 178 + }, + { + "epoch": 0.12110960757780785, + "grad_norm": 0.8365657611328879, + "learning_rate": 4.031531531531531e-06, + "loss": 0.0617, + "step": 179 + }, + { + "epoch": 0.12178619756427606, + "grad_norm": 0.9221990808220503, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.0623, + "step": 180 + }, + { + "epoch": 0.12246278755074425, + "grad_norm": 0.7072566756504902, + "learning_rate": 4.076576576576577e-06, + "loss": 0.0507, + "step": 181 + }, + { + "epoch": 0.12313937753721245, + "grad_norm": 1.0923690184288475, + "learning_rate": 4.099099099099099e-06, + "loss": 0.0611, + "step": 182 + }, + { + "epoch": 0.12381596752368065, + "grad_norm": 0.8634520196498353, + "learning_rate": 4.121621621621622e-06, + "loss": 0.0649, + "step": 183 + }, + { + "epoch": 0.12449255751014884, + "grad_norm": 0.7815477364864121, + "learning_rate": 4.1441441441441446e-06, + "loss": 0.0686, + "step": 184 + }, + { + "epoch": 0.12516914749661706, + "grad_norm": 1.034553826316958, + "learning_rate": 4.166666666666667e-06, + "loss": 0.0761, + "step": 185 + }, + { + "epoch": 0.12584573748308525, + "grad_norm": 0.8790600434907246, + "learning_rate": 4.189189189189189e-06, + "loss": 0.0541, + "step": 186 + }, + { + "epoch": 0.12652232746955344, + "grad_norm": 0.6312394412630871, + "learning_rate": 4.2117117117117115e-06, + "loss": 0.048, + "step": 187 + }, + { + "epoch": 0.12719891745602166, + "grad_norm": 1.0236694297248756, + "learning_rate": 4.234234234234235e-06, + "loss": 0.0624, + "step": 188 + }, + { + "epoch": 0.12787550744248985, + "grad_norm": 1.2415095048856977, + "learning_rate": 4.256756756756757e-06, + "loss": 0.076, + "step": 189 + }, + { + "epoch": 0.12855209742895804, + "grad_norm": 1.0440729129155921, + "learning_rate": 4.27927927927928e-06, + "loss": 0.0721, + "step": 190 + }, + { + "epoch": 0.12922868741542626, + "grad_norm": 0.9272072797636888, + "learning_rate": 4.301801801801802e-06, + "loss": 0.0636, + "step": 191 + }, + { + "epoch": 0.12990527740189445, + "grad_norm": 0.8399664093238411, + "learning_rate": 4.324324324324325e-06, + "loss": 0.0605, + "step": 192 + }, + { + "epoch": 0.13058186738836267, + "grad_norm": 0.8169119514173009, + "learning_rate": 4.346846846846847e-06, + "loss": 0.0464, + "step": 193 + }, + { + "epoch": 0.13125845737483086, + "grad_norm": 1.7056399951235797, + "learning_rate": 4.369369369369369e-06, + "loss": 0.0842, + "step": 194 + }, + { + "epoch": 0.13193504736129905, + "grad_norm": 0.8963540535989036, + "learning_rate": 4.391891891891892e-06, + "loss": 0.0631, + "step": 195 + }, + { + "epoch": 0.13261163734776726, + "grad_norm": 1.1297665202656213, + "learning_rate": 4.414414414414415e-06, + "loss": 0.0666, + "step": 196 + }, + { + "epoch": 0.13328822733423545, + "grad_norm": 1.2222656454260337, + "learning_rate": 4.436936936936938e-06, + "loss": 0.0742, + "step": 197 + }, + { + "epoch": 0.13396481732070364, + "grad_norm": 0.7830411618814591, + "learning_rate": 4.45945945945946e-06, + "loss": 0.0643, + "step": 198 + }, + { + "epoch": 0.13464140730717186, + "grad_norm": 0.9248966716591538, + "learning_rate": 4.4819819819819824e-06, + "loss": 0.056, + "step": 199 + }, + { + "epoch": 0.13531799729364005, + "grad_norm": 1.0226482316768566, + "learning_rate": 4.504504504504505e-06, + "loss": 0.0513, + "step": 200 + }, + { + "epoch": 0.13599458728010824, + "grad_norm": 2.796924711877252, + "learning_rate": 4.527027027027027e-06, + "loss": 0.0719, + "step": 201 + }, + { + "epoch": 0.13667117726657646, + "grad_norm": 1.3548336859142396, + "learning_rate": 4.54954954954955e-06, + "loss": 0.0695, + "step": 202 + }, + { + "epoch": 0.13734776725304465, + "grad_norm": 0.8975021051323848, + "learning_rate": 4.5720720720720725e-06, + "loss": 0.0503, + "step": 203 + }, + { + "epoch": 0.13802435723951287, + "grad_norm": 1.1704431625814344, + "learning_rate": 4.594594594594596e-06, + "loss": 0.0693, + "step": 204 + }, + { + "epoch": 0.13870094722598106, + "grad_norm": 1.1095813348157724, + "learning_rate": 4.617117117117118e-06, + "loss": 0.0624, + "step": 205 + }, + { + "epoch": 0.13937753721244925, + "grad_norm": 1.0994100355897811, + "learning_rate": 4.63963963963964e-06, + "loss": 0.0814, + "step": 206 + }, + { + "epoch": 0.14005412719891747, + "grad_norm": 0.9955265603973159, + "learning_rate": 4.6621621621621625e-06, + "loss": 0.0582, + "step": 207 + }, + { + "epoch": 0.14073071718538566, + "grad_norm": 0.7673711112426891, + "learning_rate": 4.684684684684685e-06, + "loss": 0.079, + "step": 208 + }, + { + "epoch": 0.14140730717185385, + "grad_norm": 1.1997536882951019, + "learning_rate": 4.707207207207208e-06, + "loss": 0.0758, + "step": 209 + }, + { + "epoch": 0.14208389715832206, + "grad_norm": 1.066850922079127, + "learning_rate": 4.72972972972973e-06, + "loss": 0.0673, + "step": 210 + }, + { + "epoch": 0.14276048714479025, + "grad_norm": 0.6981303735219595, + "learning_rate": 4.7522522522522526e-06, + "loss": 0.0515, + "step": 211 + }, + { + "epoch": 0.14343707713125844, + "grad_norm": 0.7702057119766513, + "learning_rate": 4.774774774774775e-06, + "loss": 0.0558, + "step": 212 + }, + { + "epoch": 0.14411366711772666, + "grad_norm": 0.8179829158571619, + "learning_rate": 4.797297297297297e-06, + "loss": 0.059, + "step": 213 + }, + { + "epoch": 0.14479025710419485, + "grad_norm": 0.841601120489142, + "learning_rate": 4.81981981981982e-06, + "loss": 0.0667, + "step": 214 + }, + { + "epoch": 0.14546684709066307, + "grad_norm": 0.7577151328092167, + "learning_rate": 4.842342342342343e-06, + "loss": 0.064, + "step": 215 + }, + { + "epoch": 0.14614343707713126, + "grad_norm": 0.7229776371738366, + "learning_rate": 4.864864864864866e-06, + "loss": 0.0551, + "step": 216 + }, + { + "epoch": 0.14682002706359945, + "grad_norm": 0.7548280052358195, + "learning_rate": 4.887387387387388e-06, + "loss": 0.0742, + "step": 217 + }, + { + "epoch": 0.14749661705006767, + "grad_norm": 0.7832374227672186, + "learning_rate": 4.90990990990991e-06, + "loss": 0.0585, + "step": 218 + }, + { + "epoch": 0.14817320703653586, + "grad_norm": 0.7468947916494266, + "learning_rate": 4.932432432432433e-06, + "loss": 0.0597, + "step": 219 + }, + { + "epoch": 0.14884979702300405, + "grad_norm": 0.7515844395435481, + "learning_rate": 4.954954954954955e-06, + "loss": 0.0477, + "step": 220 + }, + { + "epoch": 0.14952638700947227, + "grad_norm": 0.8072211331751639, + "learning_rate": 4.977477477477478e-06, + "loss": 0.0546, + "step": 221 + }, + { + "epoch": 0.15020297699594046, + "grad_norm": 0.8185940219640517, + "learning_rate": 5e-06, + "loss": 0.0576, + "step": 222 + }, + { + "epoch": 0.15087956698240865, + "grad_norm": 0.8997670516926375, + "learning_rate": 5.022522522522523e-06, + "loss": 0.0787, + "step": 223 + }, + { + "epoch": 0.15155615696887687, + "grad_norm": 0.812997747689852, + "learning_rate": 5.045045045045045e-06, + "loss": 0.0771, + "step": 224 + }, + { + "epoch": 0.15223274695534506, + "grad_norm": 1.6748254663318023, + "learning_rate": 5.067567567567568e-06, + "loss": 0.0783, + "step": 225 + }, + { + "epoch": 0.15290933694181327, + "grad_norm": 1.2271152074751945, + "learning_rate": 5.0900900900900905e-06, + "loss": 0.0738, + "step": 226 + }, + { + "epoch": 0.15358592692828146, + "grad_norm": 0.6945605892310552, + "learning_rate": 5.112612612612613e-06, + "loss": 0.0579, + "step": 227 + }, + { + "epoch": 0.15426251691474965, + "grad_norm": 0.8426999080994071, + "learning_rate": 5.135135135135135e-06, + "loss": 0.0557, + "step": 228 + }, + { + "epoch": 0.15493910690121787, + "grad_norm": 0.8652992733951973, + "learning_rate": 5.157657657657657e-06, + "loss": 0.0669, + "step": 229 + }, + { + "epoch": 0.15561569688768606, + "grad_norm": 0.7837956357894128, + "learning_rate": 5.180180180180181e-06, + "loss": 0.0552, + "step": 230 + }, + { + "epoch": 0.15629228687415425, + "grad_norm": 0.8690750422186453, + "learning_rate": 5.202702702702704e-06, + "loss": 0.058, + "step": 231 + }, + { + "epoch": 0.15696887686062247, + "grad_norm": 1.6243365225413129, + "learning_rate": 5.225225225225226e-06, + "loss": 0.0702, + "step": 232 + }, + { + "epoch": 0.15764546684709066, + "grad_norm": 0.9135053302800554, + "learning_rate": 5.247747747747748e-06, + "loss": 0.0557, + "step": 233 + }, + { + "epoch": 0.15832205683355885, + "grad_norm": 1.1007878772849908, + "learning_rate": 5.2702702702702705e-06, + "loss": 0.079, + "step": 234 + }, + { + "epoch": 0.15899864682002707, + "grad_norm": 0.6184680312641445, + "learning_rate": 5.292792792792794e-06, + "loss": 0.0545, + "step": 235 + }, + { + "epoch": 0.15967523680649526, + "grad_norm": 1.0609934975650073, + "learning_rate": 5.315315315315316e-06, + "loss": 0.072, + "step": 236 + }, + { + "epoch": 0.16035182679296348, + "grad_norm": 0.8713490461647541, + "learning_rate": 5.337837837837838e-06, + "loss": 0.067, + "step": 237 + }, + { + "epoch": 0.16102841677943167, + "grad_norm": 0.5902219845508051, + "learning_rate": 5.360360360360361e-06, + "loss": 0.0446, + "step": 238 + }, + { + "epoch": 0.16170500676589986, + "grad_norm": 1.649731021518868, + "learning_rate": 5.382882882882884e-06, + "loss": 0.0663, + "step": 239 + }, + { + "epoch": 0.16238159675236807, + "grad_norm": 1.0906319565432787, + "learning_rate": 5.405405405405406e-06, + "loss": 0.0658, + "step": 240 + }, + { + "epoch": 0.16305818673883626, + "grad_norm": 0.6666684555012528, + "learning_rate": 5.427927927927928e-06, + "loss": 0.0599, + "step": 241 + }, + { + "epoch": 0.16373477672530445, + "grad_norm": 1.0927066523354083, + "learning_rate": 5.450450450450451e-06, + "loss": 0.0865, + "step": 242 + }, + { + "epoch": 0.16441136671177267, + "grad_norm": 0.831763258236507, + "learning_rate": 5.472972972972973e-06, + "loss": 0.0586, + "step": 243 + }, + { + "epoch": 0.16508795669824086, + "grad_norm": 0.8704003641819794, + "learning_rate": 5.495495495495496e-06, + "loss": 0.0531, + "step": 244 + }, + { + "epoch": 0.16576454668470908, + "grad_norm": 1.0765777401013956, + "learning_rate": 5.518018018018018e-06, + "loss": 0.0623, + "step": 245 + }, + { + "epoch": 0.16644113667117727, + "grad_norm": 0.8411725489707547, + "learning_rate": 5.540540540540541e-06, + "loss": 0.0662, + "step": 246 + }, + { + "epoch": 0.16711772665764546, + "grad_norm": 0.8517356933328273, + "learning_rate": 5.563063063063063e-06, + "loss": 0.0545, + "step": 247 + }, + { + "epoch": 0.16779431664411368, + "grad_norm": 0.7124648740025268, + "learning_rate": 5.585585585585585e-06, + "loss": 0.0581, + "step": 248 + }, + { + "epoch": 0.16847090663058187, + "grad_norm": 0.6663886160451271, + "learning_rate": 5.608108108108109e-06, + "loss": 0.0595, + "step": 249 + }, + { + "epoch": 0.16914749661705006, + "grad_norm": 1.0397462129786121, + "learning_rate": 5.6306306306306316e-06, + "loss": 0.0588, + "step": 250 + }, + { + "epoch": 0.16982408660351828, + "grad_norm": 1.5997174616428276, + "learning_rate": 5.653153153153154e-06, + "loss": 0.0829, + "step": 251 + }, + { + "epoch": 0.17050067658998647, + "grad_norm": 0.7506698509283649, + "learning_rate": 5.675675675675676e-06, + "loss": 0.065, + "step": 252 + }, + { + "epoch": 0.17117726657645466, + "grad_norm": 9.512088246340134, + "learning_rate": 5.6981981981981985e-06, + "loss": 0.0999, + "step": 253 + }, + { + "epoch": 0.17185385656292287, + "grad_norm": 0.8094898813339155, + "learning_rate": 5.720720720720722e-06, + "loss": 0.064, + "step": 254 + }, + { + "epoch": 0.17253044654939106, + "grad_norm": 0.9073780252224255, + "learning_rate": 5.743243243243244e-06, + "loss": 0.0547, + "step": 255 + }, + { + "epoch": 0.17320703653585928, + "grad_norm": 0.9363636629622993, + "learning_rate": 5.765765765765766e-06, + "loss": 0.0729, + "step": 256 + }, + { + "epoch": 0.17388362652232747, + "grad_norm": 1.1763667241015792, + "learning_rate": 5.7882882882882885e-06, + "loss": 0.0623, + "step": 257 + }, + { + "epoch": 0.17456021650879566, + "grad_norm": 0.7735422761562635, + "learning_rate": 5.810810810810811e-06, + "loss": 0.0549, + "step": 258 + }, + { + "epoch": 0.17523680649526388, + "grad_norm": 1.1842795603184852, + "learning_rate": 5.833333333333334e-06, + "loss": 0.0678, + "step": 259 + }, + { + "epoch": 0.17591339648173207, + "grad_norm": 0.9880273332974672, + "learning_rate": 5.855855855855856e-06, + "loss": 0.0616, + "step": 260 + }, + { + "epoch": 0.17658998646820026, + "grad_norm": 0.9946326645978719, + "learning_rate": 5.8783783783783786e-06, + "loss": 0.067, + "step": 261 + }, + { + "epoch": 0.17726657645466848, + "grad_norm": 0.8436445730122438, + "learning_rate": 5.900900900900901e-06, + "loss": 0.0507, + "step": 262 + }, + { + "epoch": 0.17794316644113667, + "grad_norm": 1.128724554221957, + "learning_rate": 5.923423423423423e-06, + "loss": 0.0637, + "step": 263 + }, + { + "epoch": 0.17861975642760486, + "grad_norm": 0.8228539499031123, + "learning_rate": 5.945945945945947e-06, + "loss": 0.0505, + "step": 264 + }, + { + "epoch": 0.17929634641407308, + "grad_norm": 1.4221352867966524, + "learning_rate": 5.9684684684684694e-06, + "loss": 0.0669, + "step": 265 + }, + { + "epoch": 0.17997293640054127, + "grad_norm": 0.6266405186580584, + "learning_rate": 5.990990990990992e-06, + "loss": 0.0557, + "step": 266 + }, + { + "epoch": 0.18064952638700948, + "grad_norm": 1.016148575810402, + "learning_rate": 6.013513513513514e-06, + "loss": 0.0575, + "step": 267 + }, + { + "epoch": 0.18132611637347767, + "grad_norm": 1.0689203740691198, + "learning_rate": 6.036036036036037e-06, + "loss": 0.0619, + "step": 268 + }, + { + "epoch": 0.18200270635994586, + "grad_norm": 1.000862602449387, + "learning_rate": 6.0585585585585595e-06, + "loss": 0.067, + "step": 269 + }, + { + "epoch": 0.18267929634641408, + "grad_norm": 1.110717610117604, + "learning_rate": 6.081081081081082e-06, + "loss": 0.0807, + "step": 270 + }, + { + "epoch": 0.18335588633288227, + "grad_norm": 0.8781204597287579, + "learning_rate": 6.103603603603604e-06, + "loss": 0.0603, + "step": 271 + }, + { + "epoch": 0.18403247631935046, + "grad_norm": 0.8582142107483495, + "learning_rate": 6.126126126126126e-06, + "loss": 0.0625, + "step": 272 + }, + { + "epoch": 0.18470906630581868, + "grad_norm": 0.9650867483607015, + "learning_rate": 6.1486486486486495e-06, + "loss": 0.0617, + "step": 273 + }, + { + "epoch": 0.18538565629228687, + "grad_norm": 0.848479702118272, + "learning_rate": 6.171171171171172e-06, + "loss": 0.0755, + "step": 274 + }, + { + "epoch": 0.18606224627875506, + "grad_norm": 1.026330058910402, + "learning_rate": 6.193693693693694e-06, + "loss": 0.0764, + "step": 275 + }, + { + "epoch": 0.18673883626522328, + "grad_norm": 0.7526362586186429, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.0463, + "step": 276 + }, + { + "epoch": 0.18741542625169147, + "grad_norm": 1.0045694906631195, + "learning_rate": 6.238738738738739e-06, + "loss": 0.0812, + "step": 277 + }, + { + "epoch": 0.1880920162381597, + "grad_norm": 0.828489825167721, + "learning_rate": 6.261261261261262e-06, + "loss": 0.0615, + "step": 278 + }, + { + "epoch": 0.18876860622462788, + "grad_norm": 0.8573465514600265, + "learning_rate": 6.283783783783784e-06, + "loss": 0.0721, + "step": 279 + }, + { + "epoch": 0.18944519621109607, + "grad_norm": 0.6617401623803025, + "learning_rate": 6.3063063063063065e-06, + "loss": 0.0645, + "step": 280 + }, + { + "epoch": 0.19012178619756429, + "grad_norm": 0.8634458607540112, + "learning_rate": 6.328828828828829e-06, + "loss": 0.0477, + "step": 281 + }, + { + "epoch": 0.19079837618403248, + "grad_norm": 1.1310375061446956, + "learning_rate": 6.351351351351351e-06, + "loss": 0.0682, + "step": 282 + }, + { + "epoch": 0.19147496617050067, + "grad_norm": 0.8308007684531133, + "learning_rate": 6.373873873873875e-06, + "loss": 0.0743, + "step": 283 + }, + { + "epoch": 0.19215155615696888, + "grad_norm": 0.9582211460617213, + "learning_rate": 6.396396396396397e-06, + "loss": 0.0841, + "step": 284 + }, + { + "epoch": 0.19282814614343707, + "grad_norm": 0.6703558818024842, + "learning_rate": 6.41891891891892e-06, + "loss": 0.0572, + "step": 285 + }, + { + "epoch": 0.19350473612990526, + "grad_norm": 1.1896877966727823, + "learning_rate": 6.441441441441442e-06, + "loss": 0.0676, + "step": 286 + }, + { + "epoch": 0.19418132611637348, + "grad_norm": 0.9324453447728346, + "learning_rate": 6.463963963963964e-06, + "loss": 0.0538, + "step": 287 + }, + { + "epoch": 0.19485791610284167, + "grad_norm": 0.7451682540900739, + "learning_rate": 6.486486486486487e-06, + "loss": 0.061, + "step": 288 + }, + { + "epoch": 0.1955345060893099, + "grad_norm": 1.00741044422082, + "learning_rate": 6.50900900900901e-06, + "loss": 0.0621, + "step": 289 + }, + { + "epoch": 0.19621109607577808, + "grad_norm": 0.7827295409555299, + "learning_rate": 6.531531531531532e-06, + "loss": 0.0607, + "step": 290 + }, + { + "epoch": 0.19688768606224627, + "grad_norm": 1.1092699412455538, + "learning_rate": 6.554054054054054e-06, + "loss": 0.066, + "step": 291 + }, + { + "epoch": 0.1975642760487145, + "grad_norm": 0.7778480185787826, + "learning_rate": 6.5765765765765775e-06, + "loss": 0.0626, + "step": 292 + }, + { + "epoch": 0.19824086603518268, + "grad_norm": 0.9415418209335837, + "learning_rate": 6.5990990990991e-06, + "loss": 0.0702, + "step": 293 + }, + { + "epoch": 0.19891745602165087, + "grad_norm": 0.7712422958541395, + "learning_rate": 6.621621621621622e-06, + "loss": 0.077, + "step": 294 + }, + { + "epoch": 0.19959404600811909, + "grad_norm": 0.7502680597526424, + "learning_rate": 6.644144144144144e-06, + "loss": 0.0464, + "step": 295 + }, + { + "epoch": 0.20027063599458728, + "grad_norm": 1.5825005457319432, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0696, + "step": 296 + }, + { + "epoch": 0.2009472259810555, + "grad_norm": 0.6160769472482266, + "learning_rate": 6.689189189189191e-06, + "loss": 0.0489, + "step": 297 + }, + { + "epoch": 0.20162381596752368, + "grad_norm": 1.0163485751161636, + "learning_rate": 6.711711711711713e-06, + "loss": 0.0503, + "step": 298 + }, + { + "epoch": 0.20230040595399187, + "grad_norm": 0.8415280386144876, + "learning_rate": 6.734234234234235e-06, + "loss": 0.0555, + "step": 299 + }, + { + "epoch": 0.2029769959404601, + "grad_norm": 0.7533756964827386, + "learning_rate": 6.7567567567567575e-06, + "loss": 0.0553, + "step": 300 + }, + { + "epoch": 0.20365358592692828, + "grad_norm": 0.8567571847440155, + "learning_rate": 6.77927927927928e-06, + "loss": 0.0609, + "step": 301 + }, + { + "epoch": 0.20433017591339647, + "grad_norm": 0.6840058445177892, + "learning_rate": 6.801801801801803e-06, + "loss": 0.0484, + "step": 302 + }, + { + "epoch": 0.2050067658998647, + "grad_norm": 1.20849999437776, + "learning_rate": 6.824324324324325e-06, + "loss": 0.0755, + "step": 303 + }, + { + "epoch": 0.20568335588633288, + "grad_norm": 0.7179879977264311, + "learning_rate": 6.846846846846848e-06, + "loss": 0.0645, + "step": 304 + }, + { + "epoch": 0.20635994587280107, + "grad_norm": 0.7978287341375102, + "learning_rate": 6.86936936936937e-06, + "loss": 0.0606, + "step": 305 + }, + { + "epoch": 0.2070365358592693, + "grad_norm": 0.6223542351284302, + "learning_rate": 6.891891891891892e-06, + "loss": 0.0534, + "step": 306 + }, + { + "epoch": 0.20771312584573748, + "grad_norm": 0.708227113767345, + "learning_rate": 6.914414414414415e-06, + "loss": 0.0647, + "step": 307 + }, + { + "epoch": 0.2083897158322057, + "grad_norm": 0.766161531560619, + "learning_rate": 6.936936936936938e-06, + "loss": 0.0536, + "step": 308 + }, + { + "epoch": 0.2090663058186739, + "grad_norm": 0.6538609527519975, + "learning_rate": 6.95945945945946e-06, + "loss": 0.053, + "step": 309 + }, + { + "epoch": 0.20974289580514208, + "grad_norm": 0.5192620602709809, + "learning_rate": 6.981981981981982e-06, + "loss": 0.0521, + "step": 310 + }, + { + "epoch": 0.2104194857916103, + "grad_norm": 0.8855494776836502, + "learning_rate": 7.0045045045045045e-06, + "loss": 0.0803, + "step": 311 + }, + { + "epoch": 0.21109607577807848, + "grad_norm": 0.7105341285767813, + "learning_rate": 7.027027027027028e-06, + "loss": 0.0552, + "step": 312 + }, + { + "epoch": 0.21177266576454667, + "grad_norm": 0.760756554196171, + "learning_rate": 7.04954954954955e-06, + "loss": 0.0523, + "step": 313 + }, + { + "epoch": 0.2124492557510149, + "grad_norm": 0.6833298158589686, + "learning_rate": 7.072072072072072e-06, + "loss": 0.0649, + "step": 314 + }, + { + "epoch": 0.21312584573748308, + "grad_norm": 0.6605226531267566, + "learning_rate": 7.0945945945945946e-06, + "loss": 0.0562, + "step": 315 + }, + { + "epoch": 0.21380243572395127, + "grad_norm": 0.7182781506178565, + "learning_rate": 7.117117117117117e-06, + "loss": 0.0619, + "step": 316 + }, + { + "epoch": 0.2144790257104195, + "grad_norm": 0.9039606730344303, + "learning_rate": 7.139639639639641e-06, + "loss": 0.0678, + "step": 317 + }, + { + "epoch": 0.21515561569688768, + "grad_norm": 1.0672408603590426, + "learning_rate": 7.162162162162163e-06, + "loss": 0.062, + "step": 318 + }, + { + "epoch": 0.2158322056833559, + "grad_norm": 0.7825066978725418, + "learning_rate": 7.1846846846846855e-06, + "loss": 0.0631, + "step": 319 + }, + { + "epoch": 0.2165087956698241, + "grad_norm": 0.6986235049014473, + "learning_rate": 7.207207207207208e-06, + "loss": 0.0522, + "step": 320 + }, + { + "epoch": 0.21718538565629228, + "grad_norm": 0.7970237099290051, + "learning_rate": 7.229729729729731e-06, + "loss": 0.0568, + "step": 321 + }, + { + "epoch": 0.2178619756427605, + "grad_norm": 0.713437897281833, + "learning_rate": 7.252252252252253e-06, + "loss": 0.0606, + "step": 322 + }, + { + "epoch": 0.2185385656292287, + "grad_norm": 0.6538301705889707, + "learning_rate": 7.2747747747747755e-06, + "loss": 0.0592, + "step": 323 + }, + { + "epoch": 0.21921515561569688, + "grad_norm": 0.7333839996698983, + "learning_rate": 7.297297297297298e-06, + "loss": 0.0506, + "step": 324 + }, + { + "epoch": 0.2198917456021651, + "grad_norm": 0.7219854833615129, + "learning_rate": 7.31981981981982e-06, + "loss": 0.0575, + "step": 325 + }, + { + "epoch": 0.22056833558863329, + "grad_norm": 0.5823539428141844, + "learning_rate": 7.342342342342343e-06, + "loss": 0.0452, + "step": 326 + }, + { + "epoch": 0.22124492557510148, + "grad_norm": 0.7073237029424214, + "learning_rate": 7.3648648648648655e-06, + "loss": 0.0692, + "step": 327 + }, + { + "epoch": 0.2219215155615697, + "grad_norm": 0.5163380360026765, + "learning_rate": 7.387387387387388e-06, + "loss": 0.0542, + "step": 328 + }, + { + "epoch": 0.22259810554803788, + "grad_norm": 0.8531706542539876, + "learning_rate": 7.40990990990991e-06, + "loss": 0.0643, + "step": 329 + }, + { + "epoch": 0.2232746955345061, + "grad_norm": 0.5916938857991, + "learning_rate": 7.4324324324324324e-06, + "loss": 0.056, + "step": 330 + }, + { + "epoch": 0.2239512855209743, + "grad_norm": 0.7178043475648989, + "learning_rate": 7.4549549549549564e-06, + "loss": 0.0586, + "step": 331 + }, + { + "epoch": 0.22462787550744248, + "grad_norm": 1.1038858368742637, + "learning_rate": 7.477477477477479e-06, + "loss": 0.087, + "step": 332 + }, + { + "epoch": 0.2253044654939107, + "grad_norm": 1.0433511825553878, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0709, + "step": 333 + }, + { + "epoch": 0.2259810554803789, + "grad_norm": 0.7610852262597415, + "learning_rate": 7.5225225225225225e-06, + "loss": 0.0632, + "step": 334 + }, + { + "epoch": 0.22665764546684708, + "grad_norm": 0.9203107705922047, + "learning_rate": 7.545045045045045e-06, + "loss": 0.0625, + "step": 335 + }, + { + "epoch": 0.2273342354533153, + "grad_norm": 0.7007805814883743, + "learning_rate": 7.567567567567569e-06, + "loss": 0.0609, + "step": 336 + }, + { + "epoch": 0.2280108254397835, + "grad_norm": 0.7470133530462661, + "learning_rate": 7.590090090090091e-06, + "loss": 0.0546, + "step": 337 + }, + { + "epoch": 0.22868741542625168, + "grad_norm": 0.9299383482954492, + "learning_rate": 7.612612612612613e-06, + "loss": 0.0714, + "step": 338 + }, + { + "epoch": 0.2293640054127199, + "grad_norm": 0.7919735328109295, + "learning_rate": 7.635135135135135e-06, + "loss": 0.0674, + "step": 339 + }, + { + "epoch": 0.23004059539918809, + "grad_norm": 1.214035621679479, + "learning_rate": 7.657657657657658e-06, + "loss": 0.0879, + "step": 340 + }, + { + "epoch": 0.2307171853856563, + "grad_norm": 0.7465558095516062, + "learning_rate": 7.680180180180181e-06, + "loss": 0.0394, + "step": 341 + }, + { + "epoch": 0.2313937753721245, + "grad_norm": 0.7374121088630315, + "learning_rate": 7.702702702702704e-06, + "loss": 0.0547, + "step": 342 + }, + { + "epoch": 0.23207036535859268, + "grad_norm": 0.6071010914405474, + "learning_rate": 7.725225225225226e-06, + "loss": 0.0591, + "step": 343 + }, + { + "epoch": 0.2327469553450609, + "grad_norm": 0.9006190654260152, + "learning_rate": 7.747747747747749e-06, + "loss": 0.0835, + "step": 344 + }, + { + "epoch": 0.2334235453315291, + "grad_norm": 0.6486327204147487, + "learning_rate": 7.77027027027027e-06, + "loss": 0.0479, + "step": 345 + }, + { + "epoch": 0.23410013531799728, + "grad_norm": 0.7953218164634385, + "learning_rate": 7.792792792792793e-06, + "loss": 0.0753, + "step": 346 + }, + { + "epoch": 0.2347767253044655, + "grad_norm": 0.7327000299018703, + "learning_rate": 7.815315315315317e-06, + "loss": 0.0589, + "step": 347 + }, + { + "epoch": 0.2354533152909337, + "grad_norm": 0.6166326590182014, + "learning_rate": 7.837837837837838e-06, + "loss": 0.0451, + "step": 348 + }, + { + "epoch": 0.2361299052774019, + "grad_norm": 0.6922868013628761, + "learning_rate": 7.860360360360361e-06, + "loss": 0.0608, + "step": 349 + }, + { + "epoch": 0.2368064952638701, + "grad_norm": 0.7841237536383574, + "learning_rate": 7.882882882882884e-06, + "loss": 0.0659, + "step": 350 + }, + { + "epoch": 0.2374830852503383, + "grad_norm": 0.5213615675912151, + "learning_rate": 7.905405405405406e-06, + "loss": 0.0456, + "step": 351 + }, + { + "epoch": 0.2381596752368065, + "grad_norm": 0.6972976703434131, + "learning_rate": 7.927927927927929e-06, + "loss": 0.0533, + "step": 352 + }, + { + "epoch": 0.2388362652232747, + "grad_norm": 0.6850099785760475, + "learning_rate": 7.95045045045045e-06, + "loss": 0.0487, + "step": 353 + }, + { + "epoch": 0.2395128552097429, + "grad_norm": 0.6034297504099073, + "learning_rate": 7.972972972972974e-06, + "loss": 0.0448, + "step": 354 + }, + { + "epoch": 0.2401894451962111, + "grad_norm": 0.8666472731967332, + "learning_rate": 7.995495495495497e-06, + "loss": 0.0508, + "step": 355 + }, + { + "epoch": 0.2408660351826793, + "grad_norm": 0.5232908189061367, + "learning_rate": 8.018018018018018e-06, + "loss": 0.0433, + "step": 356 + }, + { + "epoch": 0.24154262516914748, + "grad_norm": 0.7162558221507196, + "learning_rate": 8.040540540540541e-06, + "loss": 0.0635, + "step": 357 + }, + { + "epoch": 0.2422192151556157, + "grad_norm": 0.5463869132160125, + "learning_rate": 8.063063063063063e-06, + "loss": 0.0608, + "step": 358 + }, + { + "epoch": 0.2428958051420839, + "grad_norm": 0.6698741157825209, + "learning_rate": 8.085585585585586e-06, + "loss": 0.0478, + "step": 359 + }, + { + "epoch": 0.2435723951285521, + "grad_norm": 1.0667149170417045, + "learning_rate": 8.108108108108109e-06, + "loss": 0.0623, + "step": 360 + }, + { + "epoch": 0.2442489851150203, + "grad_norm": 0.8322199906613101, + "learning_rate": 8.130630630630632e-06, + "loss": 0.0632, + "step": 361 + }, + { + "epoch": 0.2449255751014885, + "grad_norm": 1.1527891020800387, + "learning_rate": 8.153153153153154e-06, + "loss": 0.0918, + "step": 362 + }, + { + "epoch": 0.2456021650879567, + "grad_norm": 1.0015311731112426, + "learning_rate": 8.175675675675677e-06, + "loss": 0.0649, + "step": 363 + }, + { + "epoch": 0.2462787550744249, + "grad_norm": 1.2822270189403457, + "learning_rate": 8.198198198198198e-06, + "loss": 0.0615, + "step": 364 + }, + { + "epoch": 0.2469553450608931, + "grad_norm": 0.7170123128195778, + "learning_rate": 8.220720720720721e-06, + "loss": 0.0637, + "step": 365 + }, + { + "epoch": 0.2476319350473613, + "grad_norm": 0.7113945589505706, + "learning_rate": 8.243243243243245e-06, + "loss": 0.0488, + "step": 366 + }, + { + "epoch": 0.2483085250338295, + "grad_norm": 1.1129075587429147, + "learning_rate": 8.265765765765766e-06, + "loss": 0.0721, + "step": 367 + }, + { + "epoch": 0.2489851150202977, + "grad_norm": 0.7052680620974655, + "learning_rate": 8.288288288288289e-06, + "loss": 0.07, + "step": 368 + }, + { + "epoch": 0.2496617050067659, + "grad_norm": 0.6560055965647756, + "learning_rate": 8.31081081081081e-06, + "loss": 0.0403, + "step": 369 + }, + { + "epoch": 0.2503382949932341, + "grad_norm": 0.5676362590161815, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0603, + "step": 370 + }, + { + "epoch": 0.2510148849797023, + "grad_norm": 0.5097872222057397, + "learning_rate": 8.355855855855857e-06, + "loss": 0.0429, + "step": 371 + }, + { + "epoch": 0.2516914749661705, + "grad_norm": 0.9286563571647155, + "learning_rate": 8.378378378378378e-06, + "loss": 0.0763, + "step": 372 + }, + { + "epoch": 0.2523680649526387, + "grad_norm": 0.6214443636461204, + "learning_rate": 8.400900900900901e-06, + "loss": 0.0628, + "step": 373 + }, + { + "epoch": 0.2530446549391069, + "grad_norm": 0.6171283354529087, + "learning_rate": 8.423423423423423e-06, + "loss": 0.0521, + "step": 374 + }, + { + "epoch": 0.25372124492557513, + "grad_norm": 1.2968833059895006, + "learning_rate": 8.445945945945948e-06, + "loss": 0.0658, + "step": 375 + }, + { + "epoch": 0.2543978349120433, + "grad_norm": 0.6343349368864437, + "learning_rate": 8.46846846846847e-06, + "loss": 0.0563, + "step": 376 + }, + { + "epoch": 0.2550744248985115, + "grad_norm": 0.9583680612830061, + "learning_rate": 8.490990990990992e-06, + "loss": 0.0611, + "step": 377 + }, + { + "epoch": 0.2557510148849797, + "grad_norm": 0.5591574769792025, + "learning_rate": 8.513513513513514e-06, + "loss": 0.0436, + "step": 378 + }, + { + "epoch": 0.2564276048714479, + "grad_norm": 0.8927505996517491, + "learning_rate": 8.536036036036037e-06, + "loss": 0.0595, + "step": 379 + }, + { + "epoch": 0.2571041948579161, + "grad_norm": 0.6036231696789058, + "learning_rate": 8.55855855855856e-06, + "loss": 0.067, + "step": 380 + }, + { + "epoch": 0.2577807848443843, + "grad_norm": 0.674926700874754, + "learning_rate": 8.581081081081082e-06, + "loss": 0.071, + "step": 381 + }, + { + "epoch": 0.2584573748308525, + "grad_norm": 0.630725065216595, + "learning_rate": 8.603603603603605e-06, + "loss": 0.0689, + "step": 382 + }, + { + "epoch": 0.2591339648173207, + "grad_norm": 0.6901017830788927, + "learning_rate": 8.626126126126126e-06, + "loss": 0.0494, + "step": 383 + }, + { + "epoch": 0.2598105548037889, + "grad_norm": 0.5403322347987723, + "learning_rate": 8.64864864864865e-06, + "loss": 0.0654, + "step": 384 + }, + { + "epoch": 0.2604871447902571, + "grad_norm": 0.6416339260999229, + "learning_rate": 8.671171171171172e-06, + "loss": 0.0461, + "step": 385 + }, + { + "epoch": 0.26116373477672533, + "grad_norm": 0.6916472928225982, + "learning_rate": 8.693693693693694e-06, + "loss": 0.051, + "step": 386 + }, + { + "epoch": 0.2618403247631935, + "grad_norm": 0.5240431354656332, + "learning_rate": 8.716216216216217e-06, + "loss": 0.0461, + "step": 387 + }, + { + "epoch": 0.2625169147496617, + "grad_norm": 0.6085975030951926, + "learning_rate": 8.738738738738739e-06, + "loss": 0.0474, + "step": 388 + }, + { + "epoch": 0.2631935047361299, + "grad_norm": 1.000054446566517, + "learning_rate": 8.761261261261262e-06, + "loss": 0.052, + "step": 389 + }, + { + "epoch": 0.2638700947225981, + "grad_norm": 0.9140253728090463, + "learning_rate": 8.783783783783785e-06, + "loss": 0.0491, + "step": 390 + }, + { + "epoch": 0.2645466847090663, + "grad_norm": 1.0777677397593741, + "learning_rate": 8.806306306306306e-06, + "loss": 0.0769, + "step": 391 + }, + { + "epoch": 0.2652232746955345, + "grad_norm": 0.6860601884142381, + "learning_rate": 8.82882882882883e-06, + "loss": 0.0612, + "step": 392 + }, + { + "epoch": 0.2658998646820027, + "grad_norm": 0.7034780660251052, + "learning_rate": 8.851351351351351e-06, + "loss": 0.0533, + "step": 393 + }, + { + "epoch": 0.2665764546684709, + "grad_norm": 0.7354148731129575, + "learning_rate": 8.873873873873876e-06, + "loss": 0.0508, + "step": 394 + }, + { + "epoch": 0.2672530446549391, + "grad_norm": 0.4680850279518108, + "learning_rate": 8.896396396396397e-06, + "loss": 0.0501, + "step": 395 + }, + { + "epoch": 0.2679296346414073, + "grad_norm": 1.188509234752549, + "learning_rate": 8.91891891891892e-06, + "loss": 0.0783, + "step": 396 + }, + { + "epoch": 0.26860622462787553, + "grad_norm": 0.5583428530563981, + "learning_rate": 8.941441441441442e-06, + "loss": 0.0575, + "step": 397 + }, + { + "epoch": 0.2692828146143437, + "grad_norm": 1.0423898739451198, + "learning_rate": 8.963963963963965e-06, + "loss": 0.0549, + "step": 398 + }, + { + "epoch": 0.2699594046008119, + "grad_norm": 0.7311091222443428, + "learning_rate": 8.986486486486488e-06, + "loss": 0.0608, + "step": 399 + }, + { + "epoch": 0.2706359945872801, + "grad_norm": 0.5225547351097861, + "learning_rate": 9.00900900900901e-06, + "loss": 0.0477, + "step": 400 + }, + { + "epoch": 0.2713125845737483, + "grad_norm": 0.8567720081871087, + "learning_rate": 9.031531531531533e-06, + "loss": 0.0521, + "step": 401 + }, + { + "epoch": 0.2719891745602165, + "grad_norm": 0.8179635254015833, + "learning_rate": 9.054054054054054e-06, + "loss": 0.047, + "step": 402 + }, + { + "epoch": 0.27266576454668473, + "grad_norm": 0.6192623432238964, + "learning_rate": 9.076576576576577e-06, + "loss": 0.0526, + "step": 403 + }, + { + "epoch": 0.2733423545331529, + "grad_norm": 0.9529732419140194, + "learning_rate": 9.0990990990991e-06, + "loss": 0.0672, + "step": 404 + }, + { + "epoch": 0.2740189445196211, + "grad_norm": 0.8832817880863134, + "learning_rate": 9.121621621621622e-06, + "loss": 0.0644, + "step": 405 + }, + { + "epoch": 0.2746955345060893, + "grad_norm": 0.9390119367020981, + "learning_rate": 9.144144144144145e-06, + "loss": 0.0716, + "step": 406 + }, + { + "epoch": 0.2753721244925575, + "grad_norm": 0.5580452118490072, + "learning_rate": 9.166666666666666e-06, + "loss": 0.0499, + "step": 407 + }, + { + "epoch": 0.27604871447902574, + "grad_norm": 0.6095275666530106, + "learning_rate": 9.189189189189191e-06, + "loss": 0.054, + "step": 408 + }, + { + "epoch": 0.2767253044654939, + "grad_norm": 0.5924692533944188, + "learning_rate": 9.211711711711713e-06, + "loss": 0.0585, + "step": 409 + }, + { + "epoch": 0.2774018944519621, + "grad_norm": 0.6491461741649488, + "learning_rate": 9.234234234234236e-06, + "loss": 0.0578, + "step": 410 + }, + { + "epoch": 0.2780784844384303, + "grad_norm": 0.6490685083671708, + "learning_rate": 9.256756756756757e-06, + "loss": 0.0811, + "step": 411 + }, + { + "epoch": 0.2787550744248985, + "grad_norm": 0.7765080161117859, + "learning_rate": 9.27927927927928e-06, + "loss": 0.0758, + "step": 412 + }, + { + "epoch": 0.2794316644113667, + "grad_norm": 0.8952768170207692, + "learning_rate": 9.301801801801804e-06, + "loss": 0.1027, + "step": 413 + }, + { + "epoch": 0.28010825439783493, + "grad_norm": 0.5544435611505211, + "learning_rate": 9.324324324324325e-06, + "loss": 0.0663, + "step": 414 + }, + { + "epoch": 0.2807848443843031, + "grad_norm": 0.8014924169525837, + "learning_rate": 9.346846846846848e-06, + "loss": 0.0473, + "step": 415 + }, + { + "epoch": 0.2814614343707713, + "grad_norm": 0.5692425054934582, + "learning_rate": 9.36936936936937e-06, + "loss": 0.0508, + "step": 416 + }, + { + "epoch": 0.2821380243572395, + "grad_norm": 0.59770195978138, + "learning_rate": 9.391891891891893e-06, + "loss": 0.0472, + "step": 417 + }, + { + "epoch": 0.2828146143437077, + "grad_norm": 0.5740539152201671, + "learning_rate": 9.414414414414416e-06, + "loss": 0.0592, + "step": 418 + }, + { + "epoch": 0.28349120433017594, + "grad_norm": 0.9301350592935055, + "learning_rate": 9.436936936936937e-06, + "loss": 0.0547, + "step": 419 + }, + { + "epoch": 0.28416779431664413, + "grad_norm": 0.6512576926687861, + "learning_rate": 9.45945945945946e-06, + "loss": 0.0595, + "step": 420 + }, + { + "epoch": 0.2848443843031123, + "grad_norm": 0.739452317492861, + "learning_rate": 9.481981981981982e-06, + "loss": 0.0633, + "step": 421 + }, + { + "epoch": 0.2855209742895805, + "grad_norm": 0.7547290147847666, + "learning_rate": 9.504504504504505e-06, + "loss": 0.0589, + "step": 422 + }, + { + "epoch": 0.2861975642760487, + "grad_norm": 1.006531849063146, + "learning_rate": 9.527027027027028e-06, + "loss": 0.0705, + "step": 423 + }, + { + "epoch": 0.2868741542625169, + "grad_norm": 1.2347826160840838, + "learning_rate": 9.54954954954955e-06, + "loss": 0.0714, + "step": 424 + }, + { + "epoch": 0.28755074424898514, + "grad_norm": 0.5834190454325607, + "learning_rate": 9.572072072072073e-06, + "loss": 0.0741, + "step": 425 + }, + { + "epoch": 0.2882273342354533, + "grad_norm": 1.0523461913659622, + "learning_rate": 9.594594594594594e-06, + "loss": 0.0713, + "step": 426 + }, + { + "epoch": 0.2889039242219215, + "grad_norm": 0.4741085533713739, + "learning_rate": 9.617117117117117e-06, + "loss": 0.0448, + "step": 427 + }, + { + "epoch": 0.2895805142083897, + "grad_norm": 0.5880007748012538, + "learning_rate": 9.63963963963964e-06, + "loss": 0.0708, + "step": 428 + }, + { + "epoch": 0.2902571041948579, + "grad_norm": 0.6889454460244109, + "learning_rate": 9.662162162162164e-06, + "loss": 0.0518, + "step": 429 + }, + { + "epoch": 0.29093369418132614, + "grad_norm": 0.6959729962192035, + "learning_rate": 9.684684684684685e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.29161028416779433, + "grad_norm": 0.827473475654896, + "learning_rate": 9.707207207207208e-06, + "loss": 0.0663, + "step": 431 + }, + { + "epoch": 0.2922868741542625, + "grad_norm": 0.6763086308781805, + "learning_rate": 9.729729729729732e-06, + "loss": 0.0535, + "step": 432 + }, + { + "epoch": 0.2929634641407307, + "grad_norm": 0.8721703546435019, + "learning_rate": 9.752252252252253e-06, + "loss": 0.0605, + "step": 433 + }, + { + "epoch": 0.2936400541271989, + "grad_norm": 1.0347900345944163, + "learning_rate": 9.774774774774776e-06, + "loss": 0.0435, + "step": 434 + }, + { + "epoch": 0.2943166441136671, + "grad_norm": 0.7533663188355433, + "learning_rate": 9.797297297297298e-06, + "loss": 0.0668, + "step": 435 + }, + { + "epoch": 0.29499323410013534, + "grad_norm": 0.7903165280972534, + "learning_rate": 9.81981981981982e-06, + "loss": 0.0699, + "step": 436 + }, + { + "epoch": 0.2956698240866035, + "grad_norm": 0.6550871582375518, + "learning_rate": 9.842342342342344e-06, + "loss": 0.0638, + "step": 437 + }, + { + "epoch": 0.2963464140730717, + "grad_norm": 0.6040331258438423, + "learning_rate": 9.864864864864865e-06, + "loss": 0.0514, + "step": 438 + }, + { + "epoch": 0.2970230040595399, + "grad_norm": 0.515235499537733, + "learning_rate": 9.887387387387388e-06, + "loss": 0.0517, + "step": 439 + }, + { + "epoch": 0.2976995940460081, + "grad_norm": 0.5780813865787422, + "learning_rate": 9.90990990990991e-06, + "loss": 0.0491, + "step": 440 + }, + { + "epoch": 0.29837618403247634, + "grad_norm": 0.611062653460649, + "learning_rate": 9.932432432432433e-06, + "loss": 0.0514, + "step": 441 + }, + { + "epoch": 0.29905277401894453, + "grad_norm": 0.5573070083642604, + "learning_rate": 9.954954954954956e-06, + "loss": 0.05, + "step": 442 + }, + { + "epoch": 0.2997293640054127, + "grad_norm": 0.7764390254784838, + "learning_rate": 9.97747747747748e-06, + "loss": 0.0573, + "step": 443 + }, + { + "epoch": 0.3004059539918809, + "grad_norm": 0.63941645256914, + "learning_rate": 1e-05, + "loss": 0.0708, + "step": 444 + }, + { + "epoch": 0.3010825439783491, + "grad_norm": 0.8471778599360368, + "learning_rate": 9.999998450134754e-06, + "loss": 0.0528, + "step": 445 + }, + { + "epoch": 0.3017591339648173, + "grad_norm": 0.8450451817270667, + "learning_rate": 9.999993800539971e-06, + "loss": 0.0572, + "step": 446 + }, + { + "epoch": 0.30243572395128554, + "grad_norm": 0.8844146226217167, + "learning_rate": 9.999986051218538e-06, + "loss": 0.0644, + "step": 447 + }, + { + "epoch": 0.30311231393775373, + "grad_norm": 0.5754466363792542, + "learning_rate": 9.999975202175256e-06, + "loss": 0.0527, + "step": 448 + }, + { + "epoch": 0.3037889039242219, + "grad_norm": 0.5959925989880287, + "learning_rate": 9.999961253416853e-06, + "loss": 0.0492, + "step": 449 + }, + { + "epoch": 0.3044654939106901, + "grad_norm": 0.8591321055153385, + "learning_rate": 9.999944204951974e-06, + "loss": 0.075, + "step": 450 + }, + { + "epoch": 0.3051420838971583, + "grad_norm": 0.7071183665853937, + "learning_rate": 9.999924056791192e-06, + "loss": 0.069, + "step": 451 + }, + { + "epoch": 0.30581867388362655, + "grad_norm": 0.7677079769031657, + "learning_rate": 9.999900808946996e-06, + "loss": 0.0768, + "step": 452 + }, + { + "epoch": 0.30649526387009474, + "grad_norm": 0.6931293022348536, + "learning_rate": 9.999874461433796e-06, + "loss": 0.0495, + "step": 453 + }, + { + "epoch": 0.3071718538565629, + "grad_norm": 0.6525562001842464, + "learning_rate": 9.999845014267928e-06, + "loss": 0.0707, + "step": 454 + }, + { + "epoch": 0.3078484438430311, + "grad_norm": 0.6032130470460988, + "learning_rate": 9.99981246746765e-06, + "loss": 0.0512, + "step": 455 + }, + { + "epoch": 0.3085250338294993, + "grad_norm": 0.66181144485413, + "learning_rate": 9.999776821053134e-06, + "loss": 0.0545, + "step": 456 + }, + { + "epoch": 0.3092016238159675, + "grad_norm": 0.6466649751991335, + "learning_rate": 9.999738075046483e-06, + "loss": 0.0628, + "step": 457 + }, + { + "epoch": 0.30987821380243574, + "grad_norm": 0.7485951892575824, + "learning_rate": 9.999696229471716e-06, + "loss": 0.086, + "step": 458 + }, + { + "epoch": 0.31055480378890393, + "grad_norm": 0.856119408797913, + "learning_rate": 9.999651284354774e-06, + "loss": 0.0697, + "step": 459 + }, + { + "epoch": 0.3112313937753721, + "grad_norm": 0.6434548587724919, + "learning_rate": 9.999603239723524e-06, + "loss": 0.0638, + "step": 460 + }, + { + "epoch": 0.3119079837618403, + "grad_norm": 0.5999827338946243, + "learning_rate": 9.999552095607748e-06, + "loss": 0.0447, + "step": 461 + }, + { + "epoch": 0.3125845737483085, + "grad_norm": 1.3006434285410413, + "learning_rate": 9.999497852039152e-06, + "loss": 0.0622, + "step": 462 + }, + { + "epoch": 0.31326116373477675, + "grad_norm": 1.059372351387485, + "learning_rate": 9.999440509051367e-06, + "loss": 0.0779, + "step": 463 + }, + { + "epoch": 0.31393775372124494, + "grad_norm": 0.8847374912809629, + "learning_rate": 9.999380066679943e-06, + "loss": 0.0622, + "step": 464 + }, + { + "epoch": 0.31461434370771313, + "grad_norm": 0.4015954011331638, + "learning_rate": 9.999316524962347e-06, + "loss": 0.0451, + "step": 465 + }, + { + "epoch": 0.3152909336941813, + "grad_norm": 0.5801258642444339, + "learning_rate": 9.999249883937971e-06, + "loss": 0.0407, + "step": 466 + }, + { + "epoch": 0.3159675236806495, + "grad_norm": 0.6643566861194543, + "learning_rate": 9.999180143648136e-06, + "loss": 0.0476, + "step": 467 + }, + { + "epoch": 0.3166441136671177, + "grad_norm": 1.519649764447325, + "learning_rate": 9.999107304136068e-06, + "loss": 0.0967, + "step": 468 + }, + { + "epoch": 0.31732070365358594, + "grad_norm": 0.7379888857729946, + "learning_rate": 9.999031365446932e-06, + "loss": 0.0675, + "step": 469 + }, + { + "epoch": 0.31799729364005414, + "grad_norm": 0.6511620429858112, + "learning_rate": 9.9989523276278e-06, + "loss": 0.0511, + "step": 470 + }, + { + "epoch": 0.3186738836265223, + "grad_norm": 0.7407486316989249, + "learning_rate": 9.998870190727674e-06, + "loss": 0.0468, + "step": 471 + }, + { + "epoch": 0.3193504736129905, + "grad_norm": 0.9120240661496066, + "learning_rate": 9.998784954797474e-06, + "loss": 0.0605, + "step": 472 + }, + { + "epoch": 0.3200270635994587, + "grad_norm": 0.6768945458494859, + "learning_rate": 9.99869661989004e-06, + "loss": 0.0495, + "step": 473 + }, + { + "epoch": 0.32070365358592695, + "grad_norm": 0.7273592444942851, + "learning_rate": 9.998605186060138e-06, + "loss": 0.0523, + "step": 474 + }, + { + "epoch": 0.32138024357239514, + "grad_norm": 0.9893725326416111, + "learning_rate": 9.998510653364449e-06, + "loss": 0.0595, + "step": 475 + }, + { + "epoch": 0.32205683355886333, + "grad_norm": 0.5372243596154718, + "learning_rate": 9.998413021861581e-06, + "loss": 0.0534, + "step": 476 + }, + { + "epoch": 0.3227334235453315, + "grad_norm": 0.6620807382992, + "learning_rate": 9.998312291612056e-06, + "loss": 0.0581, + "step": 477 + }, + { + "epoch": 0.3234100135317997, + "grad_norm": 0.5581779223413607, + "learning_rate": 9.998208462678328e-06, + "loss": 0.0521, + "step": 478 + }, + { + "epoch": 0.32408660351826796, + "grad_norm": 0.6358486018642957, + "learning_rate": 9.998101535124758e-06, + "loss": 0.0568, + "step": 479 + }, + { + "epoch": 0.32476319350473615, + "grad_norm": 0.6098598965509763, + "learning_rate": 9.99799150901764e-06, + "loss": 0.0573, + "step": 480 + }, + { + "epoch": 0.32543978349120434, + "grad_norm": 0.6430211419986778, + "learning_rate": 9.997878384425183e-06, + "loss": 0.0483, + "step": 481 + }, + { + "epoch": 0.3261163734776725, + "grad_norm": 0.9958565461685056, + "learning_rate": 9.997762161417517e-06, + "loss": 0.0561, + "step": 482 + }, + { + "epoch": 0.3267929634641407, + "grad_norm": 0.5285385755194703, + "learning_rate": 9.997642840066696e-06, + "loss": 0.0443, + "step": 483 + }, + { + "epoch": 0.3274695534506089, + "grad_norm": 0.7769394701457077, + "learning_rate": 9.997520420446694e-06, + "loss": 0.0939, + "step": 484 + }, + { + "epoch": 0.32814614343707715, + "grad_norm": 0.6003044728349175, + "learning_rate": 9.9973949026334e-06, + "loss": 0.0491, + "step": 485 + }, + { + "epoch": 0.32882273342354534, + "grad_norm": 0.9504360667818255, + "learning_rate": 9.99726628670463e-06, + "loss": 0.0696, + "step": 486 + }, + { + "epoch": 0.32949932341001353, + "grad_norm": 0.5647015380923237, + "learning_rate": 9.997134572740122e-06, + "loss": 0.0552, + "step": 487 + }, + { + "epoch": 0.3301759133964817, + "grad_norm": 0.6115624905232845, + "learning_rate": 9.996999760821529e-06, + "loss": 0.0472, + "step": 488 + }, + { + "epoch": 0.3308525033829499, + "grad_norm": 0.9453936271559629, + "learning_rate": 9.996861851032426e-06, + "loss": 0.0621, + "step": 489 + }, + { + "epoch": 0.33152909336941816, + "grad_norm": 0.496098418094311, + "learning_rate": 9.996720843458312e-06, + "loss": 0.055, + "step": 490 + }, + { + "epoch": 0.33220568335588635, + "grad_norm": 0.5709611582565106, + "learning_rate": 9.996576738186602e-06, + "loss": 0.0589, + "step": 491 + }, + { + "epoch": 0.33288227334235454, + "grad_norm": 0.6279453336841245, + "learning_rate": 9.996429535306638e-06, + "loss": 0.0506, + "step": 492 + }, + { + "epoch": 0.33355886332882273, + "grad_norm": 0.9607027447963572, + "learning_rate": 9.996279234909672e-06, + "loss": 0.076, + "step": 493 + }, + { + "epoch": 0.3342354533152909, + "grad_norm": 0.7312108895179549, + "learning_rate": 9.996125837088883e-06, + "loss": 0.0565, + "step": 494 + }, + { + "epoch": 0.3349120433017591, + "grad_norm": 0.6090789167815116, + "learning_rate": 9.995969341939373e-06, + "loss": 0.0627, + "step": 495 + }, + { + "epoch": 0.33558863328822736, + "grad_norm": 0.6909338373764786, + "learning_rate": 9.995809749558159e-06, + "loss": 0.0462, + "step": 496 + }, + { + "epoch": 0.33626522327469555, + "grad_norm": 0.7476438356921422, + "learning_rate": 9.995647060044178e-06, + "loss": 0.0566, + "step": 497 + }, + { + "epoch": 0.33694181326116374, + "grad_norm": 0.5753897410171199, + "learning_rate": 9.995481273498291e-06, + "loss": 0.052, + "step": 498 + }, + { + "epoch": 0.3376184032476319, + "grad_norm": 0.9847889262945785, + "learning_rate": 9.995312390023275e-06, + "loss": 0.0742, + "step": 499 + }, + { + "epoch": 0.3382949932341001, + "grad_norm": 0.8251720827079871, + "learning_rate": 9.995140409723831e-06, + "loss": 0.0558, + "step": 500 + }, + { + "epoch": 0.33897158322056836, + "grad_norm": 0.6173311918543511, + "learning_rate": 9.994965332706574e-06, + "loss": 0.0638, + "step": 501 + }, + { + "epoch": 0.33964817320703655, + "grad_norm": 0.5220884176150119, + "learning_rate": 9.994787159080046e-06, + "loss": 0.0815, + "step": 502 + }, + { + "epoch": 0.34032476319350474, + "grad_norm": 0.867115659658843, + "learning_rate": 9.994605888954701e-06, + "loss": 0.0758, + "step": 503 + }, + { + "epoch": 0.34100135317997293, + "grad_norm": 0.6005910959010601, + "learning_rate": 9.99442152244292e-06, + "loss": 0.0688, + "step": 504 + }, + { + "epoch": 0.3416779431664411, + "grad_norm": 0.7257293085903247, + "learning_rate": 9.994234059658998e-06, + "loss": 0.062, + "step": 505 + }, + { + "epoch": 0.3423545331529093, + "grad_norm": 0.42476969791052893, + "learning_rate": 9.994043500719155e-06, + "loss": 0.0559, + "step": 506 + }, + { + "epoch": 0.34303112313937756, + "grad_norm": 0.7131862947849292, + "learning_rate": 9.993849845741525e-06, + "loss": 0.0593, + "step": 507 + }, + { + "epoch": 0.34370771312584575, + "grad_norm": 0.5472938298249831, + "learning_rate": 9.993653094846162e-06, + "loss": 0.0423, + "step": 508 + }, + { + "epoch": 0.34438430311231394, + "grad_norm": 0.5948620000109195, + "learning_rate": 9.993453248155044e-06, + "loss": 0.0497, + "step": 509 + }, + { + "epoch": 0.34506089309878213, + "grad_norm": 1.093733533838249, + "learning_rate": 9.993250305792067e-06, + "loss": 0.0746, + "step": 510 + }, + { + "epoch": 0.3457374830852503, + "grad_norm": 0.5333163745309386, + "learning_rate": 9.993044267883039e-06, + "loss": 0.0423, + "step": 511 + }, + { + "epoch": 0.34641407307171856, + "grad_norm": 0.7020673460133552, + "learning_rate": 9.992835134555694e-06, + "loss": 0.0535, + "step": 512 + }, + { + "epoch": 0.34709066305818675, + "grad_norm": 0.6343550734219309, + "learning_rate": 9.992622905939686e-06, + "loss": 0.0696, + "step": 513 + }, + { + "epoch": 0.34776725304465494, + "grad_norm": 0.6693806970304675, + "learning_rate": 9.992407582166582e-06, + "loss": 0.0642, + "step": 514 + }, + { + "epoch": 0.34844384303112313, + "grad_norm": 0.6158128059524903, + "learning_rate": 9.992189163369873e-06, + "loss": 0.0482, + "step": 515 + }, + { + "epoch": 0.3491204330175913, + "grad_norm": 0.7487588584286908, + "learning_rate": 9.991967649684967e-06, + "loss": 0.0673, + "step": 516 + }, + { + "epoch": 0.3497970230040595, + "grad_norm": 0.6862433264965782, + "learning_rate": 9.99174304124919e-06, + "loss": 0.0567, + "step": 517 + }, + { + "epoch": 0.35047361299052776, + "grad_norm": 0.6580399913848498, + "learning_rate": 9.991515338201787e-06, + "loss": 0.0609, + "step": 518 + }, + { + "epoch": 0.35115020297699595, + "grad_norm": 0.5920159943928762, + "learning_rate": 9.991284540683922e-06, + "loss": 0.0552, + "step": 519 + }, + { + "epoch": 0.35182679296346414, + "grad_norm": 0.7122440429081579, + "learning_rate": 9.991050648838676e-06, + "loss": 0.0858, + "step": 520 + }, + { + "epoch": 0.35250338294993233, + "grad_norm": 0.5295783013499992, + "learning_rate": 9.990813662811052e-06, + "loss": 0.0684, + "step": 521 + }, + { + "epoch": 0.3531799729364005, + "grad_norm": 0.6434007100386325, + "learning_rate": 9.990573582747965e-06, + "loss": 0.0694, + "step": 522 + }, + { + "epoch": 0.35385656292286877, + "grad_norm": 0.5522910666589854, + "learning_rate": 9.990330408798255e-06, + "loss": 0.0563, + "step": 523 + }, + { + "epoch": 0.35453315290933696, + "grad_norm": 0.5355815417338708, + "learning_rate": 9.990084141112674e-06, + "loss": 0.0474, + "step": 524 + }, + { + "epoch": 0.35520974289580515, + "grad_norm": 0.5872240034067444, + "learning_rate": 9.989834779843895e-06, + "loss": 0.067, + "step": 525 + }, + { + "epoch": 0.35588633288227334, + "grad_norm": 0.5044518811534828, + "learning_rate": 9.989582325146511e-06, + "loss": 0.0507, + "step": 526 + }, + { + "epoch": 0.3565629228687415, + "grad_norm": 0.6290030734871784, + "learning_rate": 9.98932677717703e-06, + "loss": 0.0594, + "step": 527 + }, + { + "epoch": 0.3572395128552097, + "grad_norm": 0.6863719680908231, + "learning_rate": 9.989068136093873e-06, + "loss": 0.0493, + "step": 528 + }, + { + "epoch": 0.35791610284167796, + "grad_norm": 0.6009540790788344, + "learning_rate": 9.98880640205739e-06, + "loss": 0.0589, + "step": 529 + }, + { + "epoch": 0.35859269282814615, + "grad_norm": 0.6084327497198343, + "learning_rate": 9.988541575229837e-06, + "loss": 0.0518, + "step": 530 + }, + { + "epoch": 0.35926928281461434, + "grad_norm": 0.5273232471322782, + "learning_rate": 9.988273655775398e-06, + "loss": 0.0462, + "step": 531 + }, + { + "epoch": 0.35994587280108253, + "grad_norm": 0.5680931468656651, + "learning_rate": 9.988002643860162e-06, + "loss": 0.0446, + "step": 532 + }, + { + "epoch": 0.3606224627875507, + "grad_norm": 0.504296532636251, + "learning_rate": 9.987728539652145e-06, + "loss": 0.0492, + "step": 533 + }, + { + "epoch": 0.36129905277401897, + "grad_norm": 0.7850147931450985, + "learning_rate": 9.98745134332128e-06, + "loss": 0.0595, + "step": 534 + }, + { + "epoch": 0.36197564276048716, + "grad_norm": 0.6883218200291387, + "learning_rate": 9.987171055039409e-06, + "loss": 0.0679, + "step": 535 + }, + { + "epoch": 0.36265223274695535, + "grad_norm": 0.6217248635294631, + "learning_rate": 9.986887674980297e-06, + "loss": 0.0744, + "step": 536 + }, + { + "epoch": 0.36332882273342354, + "grad_norm": 0.8984877191976386, + "learning_rate": 9.986601203319623e-06, + "loss": 0.0611, + "step": 537 + }, + { + "epoch": 0.36400541271989173, + "grad_norm": 0.5200286611594177, + "learning_rate": 9.986311640234988e-06, + "loss": 0.047, + "step": 538 + }, + { + "epoch": 0.3646820027063599, + "grad_norm": 0.5393876296161556, + "learning_rate": 9.986018985905901e-06, + "loss": 0.0573, + "step": 539 + }, + { + "epoch": 0.36535859269282817, + "grad_norm": 0.9610999640309457, + "learning_rate": 9.985723240513795e-06, + "loss": 0.0688, + "step": 540 + }, + { + "epoch": 0.36603518267929636, + "grad_norm": 0.6022215074653418, + "learning_rate": 9.985424404242015e-06, + "loss": 0.0706, + "step": 541 + }, + { + "epoch": 0.36671177266576455, + "grad_norm": 0.7613605462105187, + "learning_rate": 9.985122477275824e-06, + "loss": 0.0567, + "step": 542 + }, + { + "epoch": 0.36738836265223274, + "grad_norm": 0.4665944640080223, + "learning_rate": 9.9848174598024e-06, + "loss": 0.0426, + "step": 543 + }, + { + "epoch": 0.3680649526387009, + "grad_norm": 0.43200689111944535, + "learning_rate": 9.984509352010839e-06, + "loss": 0.0427, + "step": 544 + }, + { + "epoch": 0.36874154262516917, + "grad_norm": 0.546995121085685, + "learning_rate": 9.984198154092147e-06, + "loss": 0.0417, + "step": 545 + }, + { + "epoch": 0.36941813261163736, + "grad_norm": 0.5950822901216501, + "learning_rate": 9.983883866239253e-06, + "loss": 0.0454, + "step": 546 + }, + { + "epoch": 0.37009472259810555, + "grad_norm": 0.6285822947918005, + "learning_rate": 9.983566488647e-06, + "loss": 0.0489, + "step": 547 + }, + { + "epoch": 0.37077131258457374, + "grad_norm": 0.7708988381155685, + "learning_rate": 9.98324602151214e-06, + "loss": 0.0546, + "step": 548 + }, + { + "epoch": 0.37144790257104193, + "grad_norm": 0.6385878593932224, + "learning_rate": 9.98292246503335e-06, + "loss": 0.0787, + "step": 549 + }, + { + "epoch": 0.3721244925575101, + "grad_norm": 0.4767906982846207, + "learning_rate": 9.982595819411216e-06, + "loss": 0.0499, + "step": 550 + }, + { + "epoch": 0.37280108254397837, + "grad_norm": 0.5981540167028362, + "learning_rate": 9.98226608484824e-06, + "loss": 0.042, + "step": 551 + }, + { + "epoch": 0.37347767253044656, + "grad_norm": 1.0562171077633178, + "learning_rate": 9.981933261548841e-06, + "loss": 0.0549, + "step": 552 + }, + { + "epoch": 0.37415426251691475, + "grad_norm": 0.5806835877422644, + "learning_rate": 9.981597349719351e-06, + "loss": 0.0574, + "step": 553 + }, + { + "epoch": 0.37483085250338294, + "grad_norm": 0.4004814302679841, + "learning_rate": 9.981258349568018e-06, + "loss": 0.0396, + "step": 554 + }, + { + "epoch": 0.37550744248985113, + "grad_norm": 0.6074327356572229, + "learning_rate": 9.980916261305002e-06, + "loss": 0.0585, + "step": 555 + }, + { + "epoch": 0.3761840324763194, + "grad_norm": 0.6224260203576697, + "learning_rate": 9.980571085142381e-06, + "loss": 0.0612, + "step": 556 + }, + { + "epoch": 0.37686062246278756, + "grad_norm": 0.7966370089812861, + "learning_rate": 9.980222821294143e-06, + "loss": 0.046, + "step": 557 + }, + { + "epoch": 0.37753721244925575, + "grad_norm": 0.4410488171312846, + "learning_rate": 9.979871469976197e-06, + "loss": 0.0481, + "step": 558 + }, + { + "epoch": 0.37821380243572394, + "grad_norm": 0.3848865222745807, + "learning_rate": 9.979517031406357e-06, + "loss": 0.0333, + "step": 559 + }, + { + "epoch": 0.37889039242219213, + "grad_norm": 0.6351468438149818, + "learning_rate": 9.97915950580436e-06, + "loss": 0.0692, + "step": 560 + }, + { + "epoch": 0.3795669824086603, + "grad_norm": 0.4988612014196045, + "learning_rate": 9.97879889339185e-06, + "loss": 0.0564, + "step": 561 + }, + { + "epoch": 0.38024357239512857, + "grad_norm": 0.5769445700376538, + "learning_rate": 9.97843519439239e-06, + "loss": 0.0635, + "step": 562 + }, + { + "epoch": 0.38092016238159676, + "grad_norm": 0.480116832601754, + "learning_rate": 9.978068409031449e-06, + "loss": 0.0478, + "step": 563 + }, + { + "epoch": 0.38159675236806495, + "grad_norm": 0.4701133209134182, + "learning_rate": 9.97769853753642e-06, + "loss": 0.055, + "step": 564 + }, + { + "epoch": 0.38227334235453314, + "grad_norm": 0.5503363763619515, + "learning_rate": 9.977325580136598e-06, + "loss": 0.0467, + "step": 565 + }, + { + "epoch": 0.38294993234100133, + "grad_norm": 0.5989246732859047, + "learning_rate": 9.9769495370632e-06, + "loss": 0.0451, + "step": 566 + }, + { + "epoch": 0.3836265223274696, + "grad_norm": 0.47256811628541845, + "learning_rate": 9.97657040854935e-06, + "loss": 0.0443, + "step": 567 + }, + { + "epoch": 0.38430311231393777, + "grad_norm": 0.8915362833553975, + "learning_rate": 9.976188194830092e-06, + "loss": 0.0586, + "step": 568 + }, + { + "epoch": 0.38497970230040596, + "grad_norm": 0.553155207515097, + "learning_rate": 9.975802896142373e-06, + "loss": 0.0625, + "step": 569 + }, + { + "epoch": 0.38565629228687415, + "grad_norm": 0.5458082769709284, + "learning_rate": 9.975414512725058e-06, + "loss": 0.0499, + "step": 570 + }, + { + "epoch": 0.38633288227334234, + "grad_norm": 0.4556060115243897, + "learning_rate": 9.975023044818925e-06, + "loss": 0.0392, + "step": 571 + }, + { + "epoch": 0.3870094722598105, + "grad_norm": 1.0506396789722947, + "learning_rate": 9.974628492666664e-06, + "loss": 0.0584, + "step": 572 + }, + { + "epoch": 0.3876860622462788, + "grad_norm": 0.5710503312265792, + "learning_rate": 9.974230856512874e-06, + "loss": 0.0527, + "step": 573 + }, + { + "epoch": 0.38836265223274696, + "grad_norm": 0.5477787984347363, + "learning_rate": 9.973830136604068e-06, + "loss": 0.0544, + "step": 574 + }, + { + "epoch": 0.38903924221921515, + "grad_norm": 0.5591499111479803, + "learning_rate": 9.973426333188673e-06, + "loss": 0.0544, + "step": 575 + }, + { + "epoch": 0.38971583220568334, + "grad_norm": 0.6805144445945421, + "learning_rate": 9.973019446517023e-06, + "loss": 0.0743, + "step": 576 + }, + { + "epoch": 0.39039242219215153, + "grad_norm": 0.5357256318419202, + "learning_rate": 9.972609476841368e-06, + "loss": 0.0492, + "step": 577 + }, + { + "epoch": 0.3910690121786198, + "grad_norm": 0.6563328943248335, + "learning_rate": 9.972196424415865e-06, + "loss": 0.0515, + "step": 578 + }, + { + "epoch": 0.39174560216508797, + "grad_norm": 0.6683875299057613, + "learning_rate": 9.971780289496585e-06, + "loss": 0.0596, + "step": 579 + }, + { + "epoch": 0.39242219215155616, + "grad_norm": 0.6203356217668405, + "learning_rate": 9.971361072341509e-06, + "loss": 0.048, + "step": 580 + }, + { + "epoch": 0.39309878213802435, + "grad_norm": 0.7130829657996903, + "learning_rate": 9.97093877321053e-06, + "loss": 0.0625, + "step": 581 + }, + { + "epoch": 0.39377537212449254, + "grad_norm": 0.7118642885536309, + "learning_rate": 9.970513392365449e-06, + "loss": 0.0554, + "step": 582 + }, + { + "epoch": 0.3944519621109608, + "grad_norm": 0.6293851578067391, + "learning_rate": 9.970084930069982e-06, + "loss": 0.0623, + "step": 583 + }, + { + "epoch": 0.395128552097429, + "grad_norm": 0.4917487400725403, + "learning_rate": 9.969653386589749e-06, + "loss": 0.0343, + "step": 584 + }, + { + "epoch": 0.39580514208389717, + "grad_norm": 1.223119525422176, + "learning_rate": 9.969218762192286e-06, + "loss": 0.0651, + "step": 585 + }, + { + "epoch": 0.39648173207036536, + "grad_norm": 0.4287599437591492, + "learning_rate": 9.968781057147036e-06, + "loss": 0.0451, + "step": 586 + }, + { + "epoch": 0.39715832205683355, + "grad_norm": 0.9774368132953997, + "learning_rate": 9.968340271725352e-06, + "loss": 0.0545, + "step": 587 + }, + { + "epoch": 0.39783491204330174, + "grad_norm": 0.8968389920777119, + "learning_rate": 9.967896406200498e-06, + "loss": 0.0545, + "step": 588 + }, + { + "epoch": 0.39851150202977, + "grad_norm": 0.3490632538825462, + "learning_rate": 9.967449460847648e-06, + "loss": 0.0406, + "step": 589 + }, + { + "epoch": 0.39918809201623817, + "grad_norm": 0.7494514761875918, + "learning_rate": 9.966999435943882e-06, + "loss": 0.0462, + "step": 590 + }, + { + "epoch": 0.39986468200270636, + "grad_norm": 1.1060014645599572, + "learning_rate": 9.966546331768192e-06, + "loss": 0.0643, + "step": 591 + }, + { + "epoch": 0.40054127198917455, + "grad_norm": 0.5514912544682866, + "learning_rate": 9.966090148601477e-06, + "loss": 0.0539, + "step": 592 + }, + { + "epoch": 0.40121786197564274, + "grad_norm": 0.7534435027607882, + "learning_rate": 9.965630886726548e-06, + "loss": 0.0511, + "step": 593 + }, + { + "epoch": 0.401894451962111, + "grad_norm": 0.48360363761511427, + "learning_rate": 9.965168546428122e-06, + "loss": 0.047, + "step": 594 + }, + { + "epoch": 0.4025710419485792, + "grad_norm": 0.5494374284358066, + "learning_rate": 9.964703127992822e-06, + "loss": 0.0564, + "step": 595 + }, + { + "epoch": 0.40324763193504737, + "grad_norm": 0.47579309611111265, + "learning_rate": 9.964234631709188e-06, + "loss": 0.0546, + "step": 596 + }, + { + "epoch": 0.40392422192151556, + "grad_norm": 0.4530189543942543, + "learning_rate": 9.963763057867658e-06, + "loss": 0.0415, + "step": 597 + }, + { + "epoch": 0.40460081190798375, + "grad_norm": 0.5883162500958534, + "learning_rate": 9.963288406760584e-06, + "loss": 0.0542, + "step": 598 + }, + { + "epoch": 0.40527740189445194, + "grad_norm": 0.8472842059393131, + "learning_rate": 9.962810678682223e-06, + "loss": 0.0648, + "step": 599 + }, + { + "epoch": 0.4059539918809202, + "grad_norm": 0.476752183272619, + "learning_rate": 9.962329873928743e-06, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.4066305818673884, + "grad_norm": 0.4620893294233114, + "learning_rate": 9.961845992798213e-06, + "loss": 0.043, + "step": 601 + }, + { + "epoch": 0.40730717185385656, + "grad_norm": 0.5102829518810936, + "learning_rate": 9.961359035590619e-06, + "loss": 0.0342, + "step": 602 + }, + { + "epoch": 0.40798376184032475, + "grad_norm": 0.6386683967685269, + "learning_rate": 9.960869002607843e-06, + "loss": 0.0591, + "step": 603 + }, + { + "epoch": 0.40866035182679294, + "grad_norm": 0.47902977709344424, + "learning_rate": 9.960375894153682e-06, + "loss": 0.0616, + "step": 604 + }, + { + "epoch": 0.4093369418132612, + "grad_norm": 0.6046102465325948, + "learning_rate": 9.959879710533835e-06, + "loss": 0.0729, + "step": 605 + }, + { + "epoch": 0.4100135317997294, + "grad_norm": 0.5270699471361276, + "learning_rate": 9.959380452055909e-06, + "loss": 0.0623, + "step": 606 + }, + { + "epoch": 0.41069012178619757, + "grad_norm": 0.621065800085645, + "learning_rate": 9.958878119029419e-06, + "loss": 0.0395, + "step": 607 + }, + { + "epoch": 0.41136671177266576, + "grad_norm": 0.36218114056526846, + "learning_rate": 9.958372711765785e-06, + "loss": 0.0433, + "step": 608 + }, + { + "epoch": 0.41204330175913395, + "grad_norm": 0.5685957527884379, + "learning_rate": 9.95786423057833e-06, + "loss": 0.0625, + "step": 609 + }, + { + "epoch": 0.41271989174560214, + "grad_norm": 0.8964199923627175, + "learning_rate": 9.957352675782283e-06, + "loss": 0.0603, + "step": 610 + }, + { + "epoch": 0.4133964817320704, + "grad_norm": 0.731964784299447, + "learning_rate": 9.956838047694785e-06, + "loss": 0.0715, + "step": 611 + }, + { + "epoch": 0.4140730717185386, + "grad_norm": 0.6211147042778331, + "learning_rate": 9.956320346634877e-06, + "loss": 0.0428, + "step": 612 + }, + { + "epoch": 0.41474966170500677, + "grad_norm": 0.6737382967821821, + "learning_rate": 9.955799572923503e-06, + "loss": 0.0596, + "step": 613 + }, + { + "epoch": 0.41542625169147496, + "grad_norm": 0.5050139257203307, + "learning_rate": 9.955275726883517e-06, + "loss": 0.0416, + "step": 614 + }, + { + "epoch": 0.41610284167794315, + "grad_norm": 0.5149927453539003, + "learning_rate": 9.954748808839675e-06, + "loss": 0.0404, + "step": 615 + }, + { + "epoch": 0.4167794316644114, + "grad_norm": 1.1872072320176672, + "learning_rate": 9.954218819118636e-06, + "loss": 0.0715, + "step": 616 + }, + { + "epoch": 0.4174560216508796, + "grad_norm": 0.6146032125426063, + "learning_rate": 9.953685758048968e-06, + "loss": 0.0516, + "step": 617 + }, + { + "epoch": 0.4181326116373478, + "grad_norm": 0.533819097044187, + "learning_rate": 9.953149625961136e-06, + "loss": 0.0508, + "step": 618 + }, + { + "epoch": 0.41880920162381596, + "grad_norm": 0.6712374248895266, + "learning_rate": 9.952610423187516e-06, + "loss": 0.0624, + "step": 619 + }, + { + "epoch": 0.41948579161028415, + "grad_norm": 0.6291747872540906, + "learning_rate": 9.952068150062386e-06, + "loss": 0.0503, + "step": 620 + }, + { + "epoch": 0.42016238159675234, + "grad_norm": 0.5191015273620253, + "learning_rate": 9.951522806921922e-06, + "loss": 0.0489, + "step": 621 + }, + { + "epoch": 0.4208389715832206, + "grad_norm": 0.4112482372183207, + "learning_rate": 9.95097439410421e-06, + "loss": 0.0519, + "step": 622 + }, + { + "epoch": 0.4215155615696888, + "grad_norm": 0.6218172837637318, + "learning_rate": 9.950422911949238e-06, + "loss": 0.0643, + "step": 623 + }, + { + "epoch": 0.42219215155615697, + "grad_norm": 0.7148700144926305, + "learning_rate": 9.949868360798893e-06, + "loss": 0.063, + "step": 624 + }, + { + "epoch": 0.42286874154262516, + "grad_norm": 0.5722947833285671, + "learning_rate": 9.949310740996964e-06, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.42354533152909335, + "grad_norm": 0.729292484266292, + "learning_rate": 9.94875005288915e-06, + "loss": 0.0485, + "step": 626 + }, + { + "epoch": 0.4242219215155616, + "grad_norm": 0.5468712068033238, + "learning_rate": 9.948186296823048e-06, + "loss": 0.0453, + "step": 627 + }, + { + "epoch": 0.4248985115020298, + "grad_norm": 0.5379532134952075, + "learning_rate": 9.947619473148152e-06, + "loss": 0.0551, + "step": 628 + }, + { + "epoch": 0.425575101488498, + "grad_norm": 0.6838294705325765, + "learning_rate": 9.947049582215862e-06, + "loss": 0.067, + "step": 629 + }, + { + "epoch": 0.42625169147496617, + "grad_norm": 0.44702169819272947, + "learning_rate": 9.946476624379485e-06, + "loss": 0.0577, + "step": 630 + }, + { + "epoch": 0.42692828146143436, + "grad_norm": 0.7107904020354667, + "learning_rate": 9.945900599994219e-06, + "loss": 0.0451, + "step": 631 + }, + { + "epoch": 0.42760487144790255, + "grad_norm": 0.44741982038100586, + "learning_rate": 9.94532150941717e-06, + "loss": 0.036, + "step": 632 + }, + { + "epoch": 0.4282814614343708, + "grad_norm": 0.620847654640831, + "learning_rate": 9.944739353007344e-06, + "loss": 0.0593, + "step": 633 + }, + { + "epoch": 0.428958051420839, + "grad_norm": 0.6466303946835565, + "learning_rate": 9.944154131125643e-06, + "loss": 0.0537, + "step": 634 + }, + { + "epoch": 0.42963464140730717, + "grad_norm": 0.6605384758118511, + "learning_rate": 9.943565844134877e-06, + "loss": 0.0563, + "step": 635 + }, + { + "epoch": 0.43031123139377536, + "grad_norm": 0.6279944033337461, + "learning_rate": 9.942974492399751e-06, + "loss": 0.0517, + "step": 636 + }, + { + "epoch": 0.43098782138024355, + "grad_norm": 0.799163664852329, + "learning_rate": 9.94238007628687e-06, + "loss": 0.065, + "step": 637 + }, + { + "epoch": 0.4316644113667118, + "grad_norm": 0.6831934664432242, + "learning_rate": 9.94178259616474e-06, + "loss": 0.0603, + "step": 638 + }, + { + "epoch": 0.43234100135318, + "grad_norm": 0.44395458651519953, + "learning_rate": 9.941182052403768e-06, + "loss": 0.0456, + "step": 639 + }, + { + "epoch": 0.4330175913396482, + "grad_norm": 0.7882901427214065, + "learning_rate": 9.940578445376259e-06, + "loss": 0.0493, + "step": 640 + }, + { + "epoch": 0.43369418132611637, + "grad_norm": 0.8695595713041687, + "learning_rate": 9.939971775456416e-06, + "loss": 0.0643, + "step": 641 + }, + { + "epoch": 0.43437077131258456, + "grad_norm": 0.8732608626098697, + "learning_rate": 9.93936204302034e-06, + "loss": 0.0739, + "step": 642 + }, + { + "epoch": 0.43504736129905275, + "grad_norm": 0.5239275121018045, + "learning_rate": 9.938749248446033e-06, + "loss": 0.0448, + "step": 643 + }, + { + "epoch": 0.435723951285521, + "grad_norm": 0.7753422232990571, + "learning_rate": 9.938133392113399e-06, + "loss": 0.0635, + "step": 644 + }, + { + "epoch": 0.4364005412719892, + "grad_norm": 0.6273548559703079, + "learning_rate": 9.937514474404229e-06, + "loss": 0.0535, + "step": 645 + }, + { + "epoch": 0.4370771312584574, + "grad_norm": 0.7674442628416059, + "learning_rate": 9.936892495702222e-06, + "loss": 0.0578, + "step": 646 + }, + { + "epoch": 0.43775372124492556, + "grad_norm": 0.4808136940151125, + "learning_rate": 9.936267456392971e-06, + "loss": 0.0523, + "step": 647 + }, + { + "epoch": 0.43843031123139375, + "grad_norm": 0.6077370418485973, + "learning_rate": 9.935639356863966e-06, + "loss": 0.0644, + "step": 648 + }, + { + "epoch": 0.439106901217862, + "grad_norm": 0.8564838789435368, + "learning_rate": 9.935008197504596e-06, + "loss": 0.0526, + "step": 649 + }, + { + "epoch": 0.4397834912043302, + "grad_norm": 0.7480733069773909, + "learning_rate": 9.934373978706147e-06, + "loss": 0.0795, + "step": 650 + }, + { + "epoch": 0.4404600811907984, + "grad_norm": 0.4289437402578333, + "learning_rate": 9.933736700861798e-06, + "loss": 0.041, + "step": 651 + }, + { + "epoch": 0.44113667117726657, + "grad_norm": 0.520375773258376, + "learning_rate": 9.933096364366625e-06, + "loss": 0.0677, + "step": 652 + }, + { + "epoch": 0.44181326116373476, + "grad_norm": 0.5815794825346166, + "learning_rate": 9.932452969617607e-06, + "loss": 0.0495, + "step": 653 + }, + { + "epoch": 0.44248985115020295, + "grad_norm": 0.5821123973883511, + "learning_rate": 9.931806517013612e-06, + "loss": 0.0565, + "step": 654 + }, + { + "epoch": 0.4431664411366712, + "grad_norm": 0.5056099241481415, + "learning_rate": 9.931157006955406e-06, + "loss": 0.0471, + "step": 655 + }, + { + "epoch": 0.4438430311231394, + "grad_norm": 0.6414689039348711, + "learning_rate": 9.93050443984565e-06, + "loss": 0.0543, + "step": 656 + }, + { + "epoch": 0.4445196211096076, + "grad_norm": 0.5480756679690673, + "learning_rate": 9.929848816088898e-06, + "loss": 0.0634, + "step": 657 + }, + { + "epoch": 0.44519621109607577, + "grad_norm": 0.7174926108633242, + "learning_rate": 9.929190136091604e-06, + "loss": 0.0616, + "step": 658 + }, + { + "epoch": 0.44587280108254396, + "grad_norm": 0.5686244495481496, + "learning_rate": 9.928528400262116e-06, + "loss": 0.0453, + "step": 659 + }, + { + "epoch": 0.4465493910690122, + "grad_norm": 0.4914095498933844, + "learning_rate": 9.92786360901067e-06, + "loss": 0.0563, + "step": 660 + }, + { + "epoch": 0.4472259810554804, + "grad_norm": 0.6090759602899615, + "learning_rate": 9.927195762749405e-06, + "loss": 0.0517, + "step": 661 + }, + { + "epoch": 0.4479025710419486, + "grad_norm": 0.6161988737908256, + "learning_rate": 9.926524861892346e-06, + "loss": 0.0547, + "step": 662 + }, + { + "epoch": 0.4485791610284168, + "grad_norm": 0.3292183584682544, + "learning_rate": 9.925850906855419e-06, + "loss": 0.0425, + "step": 663 + }, + { + "epoch": 0.44925575101488496, + "grad_norm": 0.42276768491687367, + "learning_rate": 9.925173898056436e-06, + "loss": 0.0405, + "step": 664 + }, + { + "epoch": 0.44993234100135315, + "grad_norm": 0.8380544193813583, + "learning_rate": 9.924493835915108e-06, + "loss": 0.0568, + "step": 665 + }, + { + "epoch": 0.4506089309878214, + "grad_norm": 0.6999964194444989, + "learning_rate": 9.923810720853038e-06, + "loss": 0.0605, + "step": 666 + }, + { + "epoch": 0.4512855209742896, + "grad_norm": 0.4837652325939674, + "learning_rate": 9.923124553293718e-06, + "loss": 0.0444, + "step": 667 + }, + { + "epoch": 0.4519621109607578, + "grad_norm": 0.6216585201930606, + "learning_rate": 9.922435333662537e-06, + "loss": 0.0585, + "step": 668 + }, + { + "epoch": 0.45263870094722597, + "grad_norm": 0.5996384717459816, + "learning_rate": 9.921743062386773e-06, + "loss": 0.0561, + "step": 669 + }, + { + "epoch": 0.45331529093369416, + "grad_norm": 0.5326204102007049, + "learning_rate": 9.921047739895596e-06, + "loss": 0.042, + "step": 670 + }, + { + "epoch": 0.4539918809201624, + "grad_norm": 0.49562577862563084, + "learning_rate": 9.92034936662007e-06, + "loss": 0.0507, + "step": 671 + }, + { + "epoch": 0.4546684709066306, + "grad_norm": 0.730328018389376, + "learning_rate": 9.91964794299315e-06, + "loss": 0.0536, + "step": 672 + }, + { + "epoch": 0.4553450608930988, + "grad_norm": 0.38455552302941354, + "learning_rate": 9.918943469449676e-06, + "loss": 0.049, + "step": 673 + }, + { + "epoch": 0.456021650879567, + "grad_norm": 0.5401055015676005, + "learning_rate": 9.918235946426389e-06, + "loss": 0.0457, + "step": 674 + }, + { + "epoch": 0.45669824086603517, + "grad_norm": 0.6353008301105469, + "learning_rate": 9.917525374361913e-06, + "loss": 0.0518, + "step": 675 + }, + { + "epoch": 0.45737483085250336, + "grad_norm": 0.5267061996841854, + "learning_rate": 9.916811753696764e-06, + "loss": 0.0469, + "step": 676 + }, + { + "epoch": 0.4580514208389716, + "grad_norm": 0.6084663569530226, + "learning_rate": 9.916095084873348e-06, + "loss": 0.0534, + "step": 677 + }, + { + "epoch": 0.4587280108254398, + "grad_norm": 0.8048621834043865, + "learning_rate": 9.915375368335962e-06, + "loss": 0.0579, + "step": 678 + }, + { + "epoch": 0.459404600811908, + "grad_norm": 0.5935252094474919, + "learning_rate": 9.91465260453079e-06, + "loss": 0.0497, + "step": 679 + }, + { + "epoch": 0.46008119079837617, + "grad_norm": 0.6484070730343198, + "learning_rate": 9.913926793905909e-06, + "loss": 0.0501, + "step": 680 + }, + { + "epoch": 0.46075778078484436, + "grad_norm": 0.7664190173161035, + "learning_rate": 9.91319793691128e-06, + "loss": 0.0518, + "step": 681 + }, + { + "epoch": 0.4614343707713126, + "grad_norm": 0.5004208463200237, + "learning_rate": 9.912466033998758e-06, + "loss": 0.0527, + "step": 682 + }, + { + "epoch": 0.4621109607577808, + "grad_norm": 0.746120463933263, + "learning_rate": 9.91173108562208e-06, + "loss": 0.0716, + "step": 683 + }, + { + "epoch": 0.462787550744249, + "grad_norm": 0.7118564064495075, + "learning_rate": 9.910993092236878e-06, + "loss": 0.0683, + "step": 684 + }, + { + "epoch": 0.4634641407307172, + "grad_norm": 0.3665810805358901, + "learning_rate": 9.910252054300664e-06, + "loss": 0.0379, + "step": 685 + }, + { + "epoch": 0.46414073071718537, + "grad_norm": 0.6464876277176455, + "learning_rate": 9.909507972272845e-06, + "loss": 0.0708, + "step": 686 + }, + { + "epoch": 0.4648173207036536, + "grad_norm": 0.9433879986761604, + "learning_rate": 9.90876084661471e-06, + "loss": 0.0714, + "step": 687 + }, + { + "epoch": 0.4654939106901218, + "grad_norm": 0.5715423921258398, + "learning_rate": 9.908010677789437e-06, + "loss": 0.0439, + "step": 688 + }, + { + "epoch": 0.46617050067659, + "grad_norm": 0.4660682565688531, + "learning_rate": 9.90725746626209e-06, + "loss": 0.0498, + "step": 689 + }, + { + "epoch": 0.4668470906630582, + "grad_norm": 0.46524053689266187, + "learning_rate": 9.90650121249962e-06, + "loss": 0.0543, + "step": 690 + }, + { + "epoch": 0.4675236806495264, + "grad_norm": 0.46365328983571014, + "learning_rate": 9.905741916970863e-06, + "loss": 0.0416, + "step": 691 + }, + { + "epoch": 0.46820027063599456, + "grad_norm": 0.36787289617199853, + "learning_rate": 9.904979580146544e-06, + "loss": 0.0562, + "step": 692 + }, + { + "epoch": 0.4688768606224628, + "grad_norm": 0.6294200557803371, + "learning_rate": 9.904214202499266e-06, + "loss": 0.0525, + "step": 693 + }, + { + "epoch": 0.469553450608931, + "grad_norm": 0.5515365908364331, + "learning_rate": 9.903445784503525e-06, + "loss": 0.0448, + "step": 694 + }, + { + "epoch": 0.4702300405953992, + "grad_norm": 0.4889466923384764, + "learning_rate": 9.902674326635698e-06, + "loss": 0.0516, + "step": 695 + }, + { + "epoch": 0.4709066305818674, + "grad_norm": 0.4265080784712957, + "learning_rate": 9.901899829374048e-06, + "loss": 0.0542, + "step": 696 + }, + { + "epoch": 0.47158322056833557, + "grad_norm": 0.7795121815798032, + "learning_rate": 9.90112229319872e-06, + "loss": 0.0524, + "step": 697 + }, + { + "epoch": 0.4722598105548038, + "grad_norm": 0.6477501865570612, + "learning_rate": 9.900341718591746e-06, + "loss": 0.0668, + "step": 698 + }, + { + "epoch": 0.472936400541272, + "grad_norm": 0.6093290599919889, + "learning_rate": 9.899558106037039e-06, + "loss": 0.0478, + "step": 699 + }, + { + "epoch": 0.4736129905277402, + "grad_norm": 0.6437924066156706, + "learning_rate": 9.898771456020397e-06, + "loss": 0.0454, + "step": 700 + }, + { + "epoch": 0.4742895805142084, + "grad_norm": 0.41372214037147886, + "learning_rate": 9.897981769029504e-06, + "loss": 0.0554, + "step": 701 + }, + { + "epoch": 0.4749661705006766, + "grad_norm": 0.406804966119598, + "learning_rate": 9.897189045553917e-06, + "loss": 0.0381, + "step": 702 + }, + { + "epoch": 0.47564276048714477, + "grad_norm": 0.785719969880346, + "learning_rate": 9.896393286085085e-06, + "loss": 0.0608, + "step": 703 + }, + { + "epoch": 0.476319350473613, + "grad_norm": 0.6757989837633964, + "learning_rate": 9.895594491116336e-06, + "loss": 0.0637, + "step": 704 + }, + { + "epoch": 0.4769959404600812, + "grad_norm": 0.54434553685572, + "learning_rate": 9.89479266114288e-06, + "loss": 0.0517, + "step": 705 + }, + { + "epoch": 0.4776725304465494, + "grad_norm": 0.556258617178189, + "learning_rate": 9.893987796661809e-06, + "loss": 0.0464, + "step": 706 + }, + { + "epoch": 0.4783491204330176, + "grad_norm": 0.5443569208148896, + "learning_rate": 9.893179898172095e-06, + "loss": 0.0478, + "step": 707 + }, + { + "epoch": 0.4790257104194858, + "grad_norm": 0.3782871653886809, + "learning_rate": 9.89236896617459e-06, + "loss": 0.0344, + "step": 708 + }, + { + "epoch": 0.479702300405954, + "grad_norm": 0.5601099762544572, + "learning_rate": 9.891555001172032e-06, + "loss": 0.0494, + "step": 709 + }, + { + "epoch": 0.4803788903924222, + "grad_norm": 0.5597549361352787, + "learning_rate": 9.890738003669029e-06, + "loss": 0.0485, + "step": 710 + }, + { + "epoch": 0.4810554803788904, + "grad_norm": 0.5294133072336844, + "learning_rate": 9.88991797417208e-06, + "loss": 0.0536, + "step": 711 + }, + { + "epoch": 0.4817320703653586, + "grad_norm": 0.9265159910409836, + "learning_rate": 9.889094913189561e-06, + "loss": 0.0788, + "step": 712 + }, + { + "epoch": 0.4824086603518268, + "grad_norm": 0.43730031477379844, + "learning_rate": 9.888268821231721e-06, + "loss": 0.0507, + "step": 713 + }, + { + "epoch": 0.48308525033829497, + "grad_norm": 0.5567250065444755, + "learning_rate": 9.887439698810694e-06, + "loss": 0.0699, + "step": 714 + }, + { + "epoch": 0.4837618403247632, + "grad_norm": 0.4096922412310759, + "learning_rate": 9.886607546440492e-06, + "loss": 0.0434, + "step": 715 + }, + { + "epoch": 0.4844384303112314, + "grad_norm": 0.4636339848328735, + "learning_rate": 9.885772364637002e-06, + "loss": 0.0439, + "step": 716 + }, + { + "epoch": 0.4851150202976996, + "grad_norm": 0.4081535798118912, + "learning_rate": 9.884934153917998e-06, + "loss": 0.0374, + "step": 717 + }, + { + "epoch": 0.4857916102841678, + "grad_norm": 0.36006897850211084, + "learning_rate": 9.884092914803119e-06, + "loss": 0.0438, + "step": 718 + }, + { + "epoch": 0.486468200270636, + "grad_norm": 0.6180958830932193, + "learning_rate": 9.88324864781389e-06, + "loss": 0.0463, + "step": 719 + }, + { + "epoch": 0.4871447902571042, + "grad_norm": 0.5632508848694299, + "learning_rate": 9.882401353473711e-06, + "loss": 0.0483, + "step": 720 + }, + { + "epoch": 0.4878213802435724, + "grad_norm": 0.5793828620228911, + "learning_rate": 9.881551032307859e-06, + "loss": 0.0652, + "step": 721 + }, + { + "epoch": 0.4884979702300406, + "grad_norm": 0.448107907002612, + "learning_rate": 9.880697684843487e-06, + "loss": 0.0479, + "step": 722 + }, + { + "epoch": 0.4891745602165088, + "grad_norm": 0.4391741741594667, + "learning_rate": 9.879841311609625e-06, + "loss": 0.0531, + "step": 723 + }, + { + "epoch": 0.489851150202977, + "grad_norm": 0.5181556645111783, + "learning_rate": 9.878981913137178e-06, + "loss": 0.0602, + "step": 724 + }, + { + "epoch": 0.49052774018944517, + "grad_norm": 0.5384630165805743, + "learning_rate": 9.878119489958929e-06, + "loss": 0.0688, + "step": 725 + }, + { + "epoch": 0.4912043301759134, + "grad_norm": 0.5378313548014063, + "learning_rate": 9.877254042609529e-06, + "loss": 0.0439, + "step": 726 + }, + { + "epoch": 0.4918809201623816, + "grad_norm": 0.3892701405664777, + "learning_rate": 9.87638557162551e-06, + "loss": 0.0398, + "step": 727 + }, + { + "epoch": 0.4925575101488498, + "grad_norm": 0.4259128758039286, + "learning_rate": 9.875514077545282e-06, + "loss": 0.0417, + "step": 728 + }, + { + "epoch": 0.493234100135318, + "grad_norm": 0.47040481180464283, + "learning_rate": 9.874639560909118e-06, + "loss": 0.0394, + "step": 729 + }, + { + "epoch": 0.4939106901217862, + "grad_norm": 0.36087360236407345, + "learning_rate": 9.873762022259177e-06, + "loss": 0.0287, + "step": 730 + }, + { + "epoch": 0.4945872801082544, + "grad_norm": 0.70693194587442, + "learning_rate": 9.87288146213948e-06, + "loss": 0.0597, + "step": 731 + }, + { + "epoch": 0.4952638700947226, + "grad_norm": 0.39665457047176367, + "learning_rate": 9.87199788109593e-06, + "loss": 0.0387, + "step": 732 + }, + { + "epoch": 0.4959404600811908, + "grad_norm": 0.48451934239788397, + "learning_rate": 9.8711112796763e-06, + "loss": 0.0505, + "step": 733 + }, + { + "epoch": 0.496617050067659, + "grad_norm": 0.5368848334988352, + "learning_rate": 9.870221658430233e-06, + "loss": 0.0467, + "step": 734 + }, + { + "epoch": 0.4972936400541272, + "grad_norm": 0.5674198214012114, + "learning_rate": 9.869329017909248e-06, + "loss": 0.042, + "step": 735 + }, + { + "epoch": 0.4979702300405954, + "grad_norm": 0.5721578103687843, + "learning_rate": 9.868433358666734e-06, + "loss": 0.0636, + "step": 736 + }, + { + "epoch": 0.4986468200270636, + "grad_norm": 0.9677416335813325, + "learning_rate": 9.86753468125795e-06, + "loss": 0.0587, + "step": 737 + }, + { + "epoch": 0.4993234100135318, + "grad_norm": 0.40180822511425857, + "learning_rate": 9.86663298624003e-06, + "loss": 0.0481, + "step": 738 + }, + { + "epoch": 0.5, + "grad_norm": 0.361135441012983, + "learning_rate": 9.865728274171972e-06, + "loss": 0.0406, + "step": 739 + }, + { + "epoch": 0.5006765899864682, + "grad_norm": 0.8479454545639886, + "learning_rate": 9.864820545614656e-06, + "loss": 0.0598, + "step": 740 + }, + { + "epoch": 0.5013531799729364, + "grad_norm": 0.5106368822232472, + "learning_rate": 9.863909801130816e-06, + "loss": 0.053, + "step": 741 + }, + { + "epoch": 0.5020297699594046, + "grad_norm": 0.4756226065532691, + "learning_rate": 9.862996041285071e-06, + "loss": 0.0483, + "step": 742 + }, + { + "epoch": 0.5027063599458728, + "grad_norm": 0.6219060655877254, + "learning_rate": 9.862079266643899e-06, + "loss": 0.0502, + "step": 743 + }, + { + "epoch": 0.503382949932341, + "grad_norm": 0.4845490822586034, + "learning_rate": 9.861159477775653e-06, + "loss": 0.0488, + "step": 744 + }, + { + "epoch": 0.5040595399188093, + "grad_norm": 0.6812021084222101, + "learning_rate": 9.860236675250553e-06, + "loss": 0.0554, + "step": 745 + }, + { + "epoch": 0.5047361299052774, + "grad_norm": 0.5590569632794635, + "learning_rate": 9.859310859640685e-06, + "loss": 0.0491, + "step": 746 + }, + { + "epoch": 0.5054127198917456, + "grad_norm": 0.46212555175343245, + "learning_rate": 9.858382031520005e-06, + "loss": 0.0539, + "step": 747 + }, + { + "epoch": 0.5060893098782138, + "grad_norm": 0.40800380331762076, + "learning_rate": 9.857450191464337e-06, + "loss": 0.0466, + "step": 748 + }, + { + "epoch": 0.506765899864682, + "grad_norm": 0.6091738784482054, + "learning_rate": 9.856515340051374e-06, + "loss": 0.05, + "step": 749 + }, + { + "epoch": 0.5074424898511503, + "grad_norm": 0.7055256422888375, + "learning_rate": 9.855577477860669e-06, + "loss": 0.0434, + "step": 750 + }, + { + "epoch": 0.5081190798376184, + "grad_norm": 0.9539674471910841, + "learning_rate": 9.854636605473647e-06, + "loss": 0.0528, + "step": 751 + }, + { + "epoch": 0.5087956698240866, + "grad_norm": 0.47633172733382284, + "learning_rate": 9.8536927234736e-06, + "loss": 0.0497, + "step": 752 + }, + { + "epoch": 0.5094722598105548, + "grad_norm": 0.7532584333033749, + "learning_rate": 9.852745832445684e-06, + "loss": 0.0501, + "step": 753 + }, + { + "epoch": 0.510148849797023, + "grad_norm": 0.5304042345746277, + "learning_rate": 9.851795932976919e-06, + "loss": 0.046, + "step": 754 + }, + { + "epoch": 0.5108254397834912, + "grad_norm": 0.44485606906848746, + "learning_rate": 9.850843025656194e-06, + "loss": 0.0549, + "step": 755 + }, + { + "epoch": 0.5115020297699594, + "grad_norm": 0.5465848222940929, + "learning_rate": 9.849887111074256e-06, + "loss": 0.0547, + "step": 756 + }, + { + "epoch": 0.5121786197564276, + "grad_norm": 0.5418310103747346, + "learning_rate": 9.848928189823724e-06, + "loss": 0.0441, + "step": 757 + }, + { + "epoch": 0.5128552097428958, + "grad_norm": 0.7031623884866932, + "learning_rate": 9.847966262499073e-06, + "loss": 0.0692, + "step": 758 + }, + { + "epoch": 0.513531799729364, + "grad_norm": 0.45568690818784974, + "learning_rate": 9.847001329696653e-06, + "loss": 0.0473, + "step": 759 + }, + { + "epoch": 0.5142083897158322, + "grad_norm": 0.6068664057424306, + "learning_rate": 9.846033392014665e-06, + "loss": 0.0539, + "step": 760 + }, + { + "epoch": 0.5148849797023004, + "grad_norm": 0.4017234045118912, + "learning_rate": 9.84506245005318e-06, + "loss": 0.04, + "step": 761 + }, + { + "epoch": 0.5155615696887687, + "grad_norm": 0.4686458728542905, + "learning_rate": 9.84408850441413e-06, + "loss": 0.0478, + "step": 762 + }, + { + "epoch": 0.5162381596752368, + "grad_norm": 0.45832931873341604, + "learning_rate": 9.843111555701307e-06, + "loss": 0.0367, + "step": 763 + }, + { + "epoch": 0.516914749661705, + "grad_norm": 0.5584097887482565, + "learning_rate": 9.84213160452037e-06, + "loss": 0.0497, + "step": 764 + }, + { + "epoch": 0.5175913396481732, + "grad_norm": 0.4431597984514597, + "learning_rate": 9.841148651478833e-06, + "loss": 0.0513, + "step": 765 + }, + { + "epoch": 0.5182679296346414, + "grad_norm": 0.27012020329095626, + "learning_rate": 9.840162697186075e-06, + "loss": 0.0349, + "step": 766 + }, + { + "epoch": 0.5189445196211097, + "grad_norm": 0.45363845667223135, + "learning_rate": 9.839173742253334e-06, + "loss": 0.041, + "step": 767 + }, + { + "epoch": 0.5196211096075778, + "grad_norm": 0.50469702558176, + "learning_rate": 9.838181787293707e-06, + "loss": 0.0582, + "step": 768 + }, + { + "epoch": 0.520297699594046, + "grad_norm": 0.4578431401614713, + "learning_rate": 9.837186832922157e-06, + "loss": 0.0476, + "step": 769 + }, + { + "epoch": 0.5209742895805142, + "grad_norm": 0.6613480257508604, + "learning_rate": 9.8361888797555e-06, + "loss": 0.0527, + "step": 770 + }, + { + "epoch": 0.5216508795669824, + "grad_norm": 0.7676793309014901, + "learning_rate": 9.835187928412412e-06, + "loss": 0.0514, + "step": 771 + }, + { + "epoch": 0.5223274695534507, + "grad_norm": 0.512508020266984, + "learning_rate": 9.834183979513427e-06, + "loss": 0.0591, + "step": 772 + }, + { + "epoch": 0.5230040595399188, + "grad_norm": 0.51084222634459, + "learning_rate": 9.833177033680945e-06, + "loss": 0.0423, + "step": 773 + }, + { + "epoch": 0.523680649526387, + "grad_norm": 0.6413724330584915, + "learning_rate": 9.832167091539215e-06, + "loss": 0.0418, + "step": 774 + }, + { + "epoch": 0.5243572395128552, + "grad_norm": 0.7460443994099318, + "learning_rate": 9.831154153714344e-06, + "loss": 0.0541, + "step": 775 + }, + { + "epoch": 0.5250338294993234, + "grad_norm": 0.6734291747835667, + "learning_rate": 9.830138220834305e-06, + "loss": 0.0513, + "step": 776 + }, + { + "epoch": 0.5257104194857916, + "grad_norm": 0.5738298507102398, + "learning_rate": 9.829119293528916e-06, + "loss": 0.0403, + "step": 777 + }, + { + "epoch": 0.5263870094722598, + "grad_norm": 0.43068434135473127, + "learning_rate": 9.82809737242986e-06, + "loss": 0.0485, + "step": 778 + }, + { + "epoch": 0.527063599458728, + "grad_norm": 0.8668327646717695, + "learning_rate": 9.827072458170673e-06, + "loss": 0.0727, + "step": 779 + }, + { + "epoch": 0.5277401894451962, + "grad_norm": 0.42508437715024205, + "learning_rate": 9.826044551386743e-06, + "loss": 0.0352, + "step": 780 + }, + { + "epoch": 0.5284167794316644, + "grad_norm": 0.5472949883047366, + "learning_rate": 9.825013652715323e-06, + "loss": 0.0395, + "step": 781 + }, + { + "epoch": 0.5290933694181326, + "grad_norm": 0.8154528193183785, + "learning_rate": 9.82397976279551e-06, + "loss": 0.0446, + "step": 782 + }, + { + "epoch": 0.5297699594046008, + "grad_norm": 0.5338044803942171, + "learning_rate": 9.822942882268261e-06, + "loss": 0.0408, + "step": 783 + }, + { + "epoch": 0.530446549391069, + "grad_norm": 0.5827835289743865, + "learning_rate": 9.821903011776385e-06, + "loss": 0.0441, + "step": 784 + }, + { + "epoch": 0.5311231393775372, + "grad_norm": 0.8365956151736382, + "learning_rate": 9.820860151964548e-06, + "loss": 0.0488, + "step": 785 + }, + { + "epoch": 0.5317997293640054, + "grad_norm": 0.5749035683313815, + "learning_rate": 9.819814303479268e-06, + "loss": 0.0553, + "step": 786 + }, + { + "epoch": 0.5324763193504736, + "grad_norm": 0.5212962411059993, + "learning_rate": 9.818765466968909e-06, + "loss": 0.0406, + "step": 787 + }, + { + "epoch": 0.5331529093369418, + "grad_norm": 0.7427578140330747, + "learning_rate": 9.8177136430837e-06, + "loss": 0.0538, + "step": 788 + }, + { + "epoch": 0.5338294993234101, + "grad_norm": 0.5977082799800159, + "learning_rate": 9.816658832475709e-06, + "loss": 0.0601, + "step": 789 + }, + { + "epoch": 0.5345060893098782, + "grad_norm": 1.0394597388074733, + "learning_rate": 9.815601035798866e-06, + "loss": 0.0779, + "step": 790 + }, + { + "epoch": 0.5351826792963464, + "grad_norm": 0.8065819456719235, + "learning_rate": 9.814540253708945e-06, + "loss": 0.0533, + "step": 791 + }, + { + "epoch": 0.5358592692828146, + "grad_norm": 0.6025659459066489, + "learning_rate": 9.813476486863575e-06, + "loss": 0.0441, + "step": 792 + }, + { + "epoch": 0.5365358592692828, + "grad_norm": 1.017224752493771, + "learning_rate": 9.812409735922236e-06, + "loss": 0.0668, + "step": 793 + }, + { + "epoch": 0.5372124492557511, + "grad_norm": 0.4502355249080863, + "learning_rate": 9.811340001546252e-06, + "loss": 0.0407, + "step": 794 + }, + { + "epoch": 0.5378890392422192, + "grad_norm": 0.5251804211501145, + "learning_rate": 9.810267284398805e-06, + "loss": 0.0654, + "step": 795 + }, + { + "epoch": 0.5385656292286874, + "grad_norm": 1.433502261944195, + "learning_rate": 9.80919158514492e-06, + "loss": 0.0832, + "step": 796 + }, + { + "epoch": 0.5392422192151556, + "grad_norm": 0.48935290039798657, + "learning_rate": 9.80811290445147e-06, + "loss": 0.0549, + "step": 797 + }, + { + "epoch": 0.5399188092016238, + "grad_norm": 0.9396984260311254, + "learning_rate": 9.807031242987182e-06, + "loss": 0.0498, + "step": 798 + }, + { + "epoch": 0.540595399188092, + "grad_norm": 0.6802177887060812, + "learning_rate": 9.805946601422628e-06, + "loss": 0.0795, + "step": 799 + }, + { + "epoch": 0.5412719891745602, + "grad_norm": 0.5269895591335515, + "learning_rate": 9.804858980430225e-06, + "loss": 0.0784, + "step": 800 + }, + { + "epoch": 0.5419485791610285, + "grad_norm": 0.5091073912024127, + "learning_rate": 9.803768380684242e-06, + "loss": 0.0743, + "step": 801 + }, + { + "epoch": 0.5426251691474966, + "grad_norm": 0.44128852573354854, + "learning_rate": 9.80267480286079e-06, + "loss": 0.0514, + "step": 802 + }, + { + "epoch": 0.5433017591339648, + "grad_norm": 0.7140554103607668, + "learning_rate": 9.801578247637828e-06, + "loss": 0.053, + "step": 803 + }, + { + "epoch": 0.543978349120433, + "grad_norm": 0.5944336586055404, + "learning_rate": 9.800478715695165e-06, + "loss": 0.0443, + "step": 804 + }, + { + "epoch": 0.5446549391069012, + "grad_norm": 0.4363773630726969, + "learning_rate": 9.799376207714446e-06, + "loss": 0.0438, + "step": 805 + }, + { + "epoch": 0.5453315290933695, + "grad_norm": 0.4663745215765508, + "learning_rate": 9.79827072437917e-06, + "loss": 0.0373, + "step": 806 + }, + { + "epoch": 0.5460081190798376, + "grad_norm": 0.872105611260014, + "learning_rate": 9.797162266374677e-06, + "loss": 0.0502, + "step": 807 + }, + { + "epoch": 0.5466847090663058, + "grad_norm": 0.3824510105734749, + "learning_rate": 9.79605083438815e-06, + "loss": 0.0381, + "step": 808 + }, + { + "epoch": 0.547361299052774, + "grad_norm": 0.5380356410057527, + "learning_rate": 9.794936429108617e-06, + "loss": 0.0465, + "step": 809 + }, + { + "epoch": 0.5480378890392422, + "grad_norm": 0.8585674677229587, + "learning_rate": 9.79381905122695e-06, + "loss": 0.0632, + "step": 810 + }, + { + "epoch": 0.5487144790257105, + "grad_norm": 0.4813256618801878, + "learning_rate": 9.792698701435863e-06, + "loss": 0.0392, + "step": 811 + }, + { + "epoch": 0.5493910690121786, + "grad_norm": 0.5944200029999508, + "learning_rate": 9.791575380429911e-06, + "loss": 0.0556, + "step": 812 + }, + { + "epoch": 0.5500676589986468, + "grad_norm": 0.5427250539898076, + "learning_rate": 9.790449088905496e-06, + "loss": 0.0415, + "step": 813 + }, + { + "epoch": 0.550744248985115, + "grad_norm": 0.606873245670459, + "learning_rate": 9.789319827560854e-06, + "loss": 0.0618, + "step": 814 + }, + { + "epoch": 0.5514208389715832, + "grad_norm": 0.4616053148817582, + "learning_rate": 9.78818759709607e-06, + "loss": 0.0347, + "step": 815 + }, + { + "epoch": 0.5520974289580515, + "grad_norm": 0.6603760242553907, + "learning_rate": 9.787052398213062e-06, + "loss": 0.049, + "step": 816 + }, + { + "epoch": 0.5527740189445196, + "grad_norm": 0.5037802895061197, + "learning_rate": 9.785914231615595e-06, + "loss": 0.0547, + "step": 817 + }, + { + "epoch": 0.5534506089309879, + "grad_norm": 0.4696939062638266, + "learning_rate": 9.784773098009269e-06, + "loss": 0.045, + "step": 818 + }, + { + "epoch": 0.554127198917456, + "grad_norm": 0.5578286839070521, + "learning_rate": 9.783628998101525e-06, + "loss": 0.0509, + "step": 819 + }, + { + "epoch": 0.5548037889039242, + "grad_norm": 0.41249886508663836, + "learning_rate": 9.782481932601643e-06, + "loss": 0.0504, + "step": 820 + }, + { + "epoch": 0.5554803788903924, + "grad_norm": 0.6001426424741404, + "learning_rate": 9.781331902220748e-06, + "loss": 0.0408, + "step": 821 + }, + { + "epoch": 0.5561569688768606, + "grad_norm": 0.8508634044858778, + "learning_rate": 9.780178907671788e-06, + "loss": 0.0555, + "step": 822 + }, + { + "epoch": 0.5568335588633289, + "grad_norm": 0.4411759060670162, + "learning_rate": 9.779022949669565e-06, + "loss": 0.0354, + "step": 823 + }, + { + "epoch": 0.557510148849797, + "grad_norm": 0.6172703878800332, + "learning_rate": 9.777864028930705e-06, + "loss": 0.0508, + "step": 824 + }, + { + "epoch": 0.5581867388362652, + "grad_norm": 0.7608138913298033, + "learning_rate": 9.776702146173678e-06, + "loss": 0.0708, + "step": 825 + }, + { + "epoch": 0.5588633288227334, + "grad_norm": 0.623352152631987, + "learning_rate": 9.775537302118791e-06, + "loss": 0.0859, + "step": 826 + }, + { + "epoch": 0.5595399188092016, + "grad_norm": 0.407783288556159, + "learning_rate": 9.77436949748818e-06, + "loss": 0.043, + "step": 827 + }, + { + "epoch": 0.5602165087956699, + "grad_norm": 0.46535537137874305, + "learning_rate": 9.773198733005827e-06, + "loss": 0.0497, + "step": 828 + }, + { + "epoch": 0.560893098782138, + "grad_norm": 0.45065579390232596, + "learning_rate": 9.772025009397538e-06, + "loss": 0.0527, + "step": 829 + }, + { + "epoch": 0.5615696887686062, + "grad_norm": 0.46231610036598364, + "learning_rate": 9.770848327390961e-06, + "loss": 0.0517, + "step": 830 + }, + { + "epoch": 0.5622462787550744, + "grad_norm": 1.048157851355278, + "learning_rate": 9.769668687715572e-06, + "loss": 0.0843, + "step": 831 + }, + { + "epoch": 0.5629228687415426, + "grad_norm": 0.43874424106315724, + "learning_rate": 9.76848609110269e-06, + "loss": 0.0492, + "step": 832 + }, + { + "epoch": 0.5635994587280109, + "grad_norm": 0.46382084948494207, + "learning_rate": 9.767300538285454e-06, + "loss": 0.049, + "step": 833 + }, + { + "epoch": 0.564276048714479, + "grad_norm": 0.472999564690522, + "learning_rate": 9.766112029998847e-06, + "loss": 0.0453, + "step": 834 + }, + { + "epoch": 0.5649526387009473, + "grad_norm": 0.7025502042943563, + "learning_rate": 9.76492056697968e-06, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.5656292286874154, + "grad_norm": 0.49652552955740603, + "learning_rate": 9.763726149966596e-06, + "loss": 0.074, + "step": 836 + }, + { + "epoch": 0.5663058186738836, + "grad_norm": 0.569183317519104, + "learning_rate": 9.762528779700067e-06, + "loss": 0.0383, + "step": 837 + }, + { + "epoch": 0.5669824086603519, + "grad_norm": 0.4867420729747149, + "learning_rate": 9.7613284569224e-06, + "loss": 0.0461, + "step": 838 + }, + { + "epoch": 0.56765899864682, + "grad_norm": 0.6620579961253371, + "learning_rate": 9.760125182377732e-06, + "loss": 0.0344, + "step": 839 + }, + { + "epoch": 0.5683355886332883, + "grad_norm": 0.5999869012264301, + "learning_rate": 9.758918956812024e-06, + "loss": 0.0482, + "step": 840 + }, + { + "epoch": 0.5690121786197564, + "grad_norm": 0.5816447336458729, + "learning_rate": 9.757709780973074e-06, + "loss": 0.0797, + "step": 841 + }, + { + "epoch": 0.5696887686062246, + "grad_norm": 0.5392546672467222, + "learning_rate": 9.756497655610503e-06, + "loss": 0.0482, + "step": 842 + }, + { + "epoch": 0.5703653585926928, + "grad_norm": 0.38395903568969386, + "learning_rate": 9.755282581475769e-06, + "loss": 0.0428, + "step": 843 + }, + { + "epoch": 0.571041948579161, + "grad_norm": 0.350845540377357, + "learning_rate": 9.754064559322147e-06, + "loss": 0.048, + "step": 844 + }, + { + "epoch": 0.5717185385656293, + "grad_norm": 0.426006391790128, + "learning_rate": 9.752843589904746e-06, + "loss": 0.0378, + "step": 845 + }, + { + "epoch": 0.5723951285520974, + "grad_norm": 0.5506082651480366, + "learning_rate": 9.751619673980503e-06, + "loss": 0.0578, + "step": 846 + }, + { + "epoch": 0.5730717185385656, + "grad_norm": 0.5607556280880193, + "learning_rate": 9.75039281230818e-06, + "loss": 0.0582, + "step": 847 + }, + { + "epoch": 0.5737483085250338, + "grad_norm": 0.4361753389211241, + "learning_rate": 9.749163005648362e-06, + "loss": 0.0516, + "step": 848 + }, + { + "epoch": 0.574424898511502, + "grad_norm": 0.311642821351553, + "learning_rate": 9.747930254763467e-06, + "loss": 0.0456, + "step": 849 + }, + { + "epoch": 0.5751014884979703, + "grad_norm": 0.4218841410849098, + "learning_rate": 9.746694560417731e-06, + "loss": 0.0493, + "step": 850 + }, + { + "epoch": 0.5757780784844384, + "grad_norm": 0.49112066955428146, + "learning_rate": 9.745455923377218e-06, + "loss": 0.0508, + "step": 851 + }, + { + "epoch": 0.5764546684709067, + "grad_norm": 0.8830925732758667, + "learning_rate": 9.74421434440982e-06, + "loss": 0.1176, + "step": 852 + }, + { + "epoch": 0.5771312584573748, + "grad_norm": 0.41140583556266247, + "learning_rate": 9.742969824285244e-06, + "loss": 0.0507, + "step": 853 + }, + { + "epoch": 0.577807848443843, + "grad_norm": 0.6883009934084277, + "learning_rate": 9.741722363775029e-06, + "loss": 0.0518, + "step": 854 + }, + { + "epoch": 0.5784844384303113, + "grad_norm": 0.4537184728374772, + "learning_rate": 9.74047196365253e-06, + "loss": 0.0385, + "step": 855 + }, + { + "epoch": 0.5791610284167794, + "grad_norm": 0.5762945675549866, + "learning_rate": 9.73921862469293e-06, + "loss": 0.0505, + "step": 856 + }, + { + "epoch": 0.5798376184032477, + "grad_norm": 0.37253773754660935, + "learning_rate": 9.737962347673232e-06, + "loss": 0.0417, + "step": 857 + }, + { + "epoch": 0.5805142083897158, + "grad_norm": 0.9010191810911216, + "learning_rate": 9.736703133372259e-06, + "loss": 0.0417, + "step": 858 + }, + { + "epoch": 0.581190798376184, + "grad_norm": 0.3508686515086484, + "learning_rate": 9.735440982570656e-06, + "loss": 0.0394, + "step": 859 + }, + { + "epoch": 0.5818673883626523, + "grad_norm": 0.41766463733115045, + "learning_rate": 9.734175896050889e-06, + "loss": 0.0378, + "step": 860 + }, + { + "epoch": 0.5825439783491204, + "grad_norm": 0.558033890428094, + "learning_rate": 9.732907874597241e-06, + "loss": 0.0494, + "step": 861 + }, + { + "epoch": 0.5832205683355887, + "grad_norm": 0.6661422303593342, + "learning_rate": 9.731636918995821e-06, + "loss": 0.0464, + "step": 862 + }, + { + "epoch": 0.5838971583220568, + "grad_norm": 0.4894819614699645, + "learning_rate": 9.730363030034551e-06, + "loss": 0.044, + "step": 863 + }, + { + "epoch": 0.584573748308525, + "grad_norm": 0.6611212441695493, + "learning_rate": 9.729086208503174e-06, + "loss": 0.0694, + "step": 864 + }, + { + "epoch": 0.5852503382949933, + "grad_norm": 0.35893723110631065, + "learning_rate": 9.72780645519325e-06, + "loss": 0.041, + "step": 865 + }, + { + "epoch": 0.5859269282814614, + "grad_norm": 0.5450738557142877, + "learning_rate": 9.726523770898157e-06, + "loss": 0.0399, + "step": 866 + }, + { + "epoch": 0.5866035182679297, + "grad_norm": 0.8876228563217334, + "learning_rate": 9.725238156413089e-06, + "loss": 0.0448, + "step": 867 + }, + { + "epoch": 0.5872801082543978, + "grad_norm": 0.6286748245367716, + "learning_rate": 9.72394961253506e-06, + "loss": 0.0426, + "step": 868 + }, + { + "epoch": 0.587956698240866, + "grad_norm": 0.547776339613492, + "learning_rate": 9.722658140062898e-06, + "loss": 0.0412, + "step": 869 + }, + { + "epoch": 0.5886332882273342, + "grad_norm": 0.4480973214312458, + "learning_rate": 9.721363739797243e-06, + "loss": 0.0435, + "step": 870 + }, + { + "epoch": 0.5893098782138024, + "grad_norm": 0.5784135916573524, + "learning_rate": 9.720066412540554e-06, + "loss": 0.0425, + "step": 871 + }, + { + "epoch": 0.5899864682002707, + "grad_norm": 0.4819523267521132, + "learning_rate": 9.718766159097109e-06, + "loss": 0.0495, + "step": 872 + }, + { + "epoch": 0.5906630581867388, + "grad_norm": 0.5462720785705018, + "learning_rate": 9.717462980272989e-06, + "loss": 0.0479, + "step": 873 + }, + { + "epoch": 0.591339648173207, + "grad_norm": 0.5753746704368786, + "learning_rate": 9.716156876876096e-06, + "loss": 0.0531, + "step": 874 + }, + { + "epoch": 0.5920162381596752, + "grad_norm": 0.40828434914038086, + "learning_rate": 9.714847849716149e-06, + "loss": 0.0477, + "step": 875 + }, + { + "epoch": 0.5926928281461434, + "grad_norm": 0.48445222346652345, + "learning_rate": 9.713535899604667e-06, + "loss": 0.0425, + "step": 876 + }, + { + "epoch": 0.5933694181326117, + "grad_norm": 0.6683247339589322, + "learning_rate": 9.71222102735499e-06, + "loss": 0.0386, + "step": 877 + }, + { + "epoch": 0.5940460081190798, + "grad_norm": 0.42909292375202934, + "learning_rate": 9.710903233782273e-06, + "loss": 0.046, + "step": 878 + }, + { + "epoch": 0.5947225981055481, + "grad_norm": 0.7303995269588103, + "learning_rate": 9.70958251970347e-06, + "loss": 0.0508, + "step": 879 + }, + { + "epoch": 0.5953991880920162, + "grad_norm": 0.6128204947378759, + "learning_rate": 9.708258885937359e-06, + "loss": 0.0461, + "step": 880 + }, + { + "epoch": 0.5960757780784844, + "grad_norm": 0.5653008366891901, + "learning_rate": 9.706932333304518e-06, + "loss": 0.0344, + "step": 881 + }, + { + "epoch": 0.5967523680649527, + "grad_norm": 0.5798904476841784, + "learning_rate": 9.705602862627335e-06, + "loss": 0.0394, + "step": 882 + }, + { + "epoch": 0.5974289580514208, + "grad_norm": 0.4386755015091282, + "learning_rate": 9.704270474730018e-06, + "loss": 0.0384, + "step": 883 + }, + { + "epoch": 0.5981055480378891, + "grad_norm": 0.3270568512910185, + "learning_rate": 9.70293517043857e-06, + "loss": 0.0349, + "step": 884 + }, + { + "epoch": 0.5987821380243572, + "grad_norm": 0.5416745768346694, + "learning_rate": 9.701596950580807e-06, + "loss": 0.0555, + "step": 885 + }, + { + "epoch": 0.5994587280108254, + "grad_norm": 0.542606637133406, + "learning_rate": 9.700255815986357e-06, + "loss": 0.0458, + "step": 886 + }, + { + "epoch": 0.6001353179972937, + "grad_norm": 0.40012553877358126, + "learning_rate": 9.69891176748665e-06, + "loss": 0.0312, + "step": 887 + }, + { + "epoch": 0.6008119079837618, + "grad_norm": 0.5943951286176301, + "learning_rate": 9.697564805914922e-06, + "loss": 0.053, + "step": 888 + }, + { + "epoch": 0.6014884979702301, + "grad_norm": 0.36145698367203055, + "learning_rate": 9.696214932106218e-06, + "loss": 0.0377, + "step": 889 + }, + { + "epoch": 0.6021650879566982, + "grad_norm": 0.3645486931235761, + "learning_rate": 9.694862146897385e-06, + "loss": 0.0368, + "step": 890 + }, + { + "epoch": 0.6028416779431665, + "grad_norm": 0.443536129736776, + "learning_rate": 9.693506451127082e-06, + "loss": 0.0354, + "step": 891 + }, + { + "epoch": 0.6035182679296346, + "grad_norm": 0.4546288869488424, + "learning_rate": 9.692147845635761e-06, + "loss": 0.0387, + "step": 892 + }, + { + "epoch": 0.6041948579161028, + "grad_norm": 0.5615972553394489, + "learning_rate": 9.690786331265687e-06, + "loss": 0.0595, + "step": 893 + }, + { + "epoch": 0.6048714479025711, + "grad_norm": 0.3955607243296082, + "learning_rate": 9.689421908860928e-06, + "loss": 0.0356, + "step": 894 + }, + { + "epoch": 0.6055480378890392, + "grad_norm": 0.3432982231897223, + "learning_rate": 9.688054579267347e-06, + "loss": 0.0377, + "step": 895 + }, + { + "epoch": 0.6062246278755075, + "grad_norm": 0.4640843848808698, + "learning_rate": 9.68668434333262e-06, + "loss": 0.0469, + "step": 896 + }, + { + "epoch": 0.6069012178619756, + "grad_norm": 0.5060279217404642, + "learning_rate": 9.685311201906216e-06, + "loss": 0.0505, + "step": 897 + }, + { + "epoch": 0.6075778078484438, + "grad_norm": 0.4382657736866923, + "learning_rate": 9.683935155839408e-06, + "loss": 0.0439, + "step": 898 + }, + { + "epoch": 0.6082543978349121, + "grad_norm": 0.761356984884096, + "learning_rate": 9.682556205985274e-06, + "loss": 0.0418, + "step": 899 + }, + { + "epoch": 0.6089309878213802, + "grad_norm": 0.36611036135917924, + "learning_rate": 9.681174353198687e-06, + "loss": 0.0449, + "step": 900 + }, + { + "epoch": 0.6096075778078485, + "grad_norm": 0.7103148905704162, + "learning_rate": 9.67978959833632e-06, + "loss": 0.0518, + "step": 901 + }, + { + "epoch": 0.6102841677943166, + "grad_norm": 0.42861527819236733, + "learning_rate": 9.678401942256648e-06, + "loss": 0.0435, + "step": 902 + }, + { + "epoch": 0.6109607577807848, + "grad_norm": 0.41156509646753364, + "learning_rate": 9.67701138581994e-06, + "loss": 0.0349, + "step": 903 + }, + { + "epoch": 0.6116373477672531, + "grad_norm": 0.42902342479749483, + "learning_rate": 9.675617929888271e-06, + "loss": 0.0446, + "step": 904 + }, + { + "epoch": 0.6123139377537212, + "grad_norm": 0.3475314164339951, + "learning_rate": 9.674221575325503e-06, + "loss": 0.0428, + "step": 905 + }, + { + "epoch": 0.6129905277401895, + "grad_norm": 0.47446237263781843, + "learning_rate": 9.672822322997305e-06, + "loss": 0.0435, + "step": 906 + }, + { + "epoch": 0.6136671177266576, + "grad_norm": 0.6151965921405438, + "learning_rate": 9.671420173771135e-06, + "loss": 0.0493, + "step": 907 + }, + { + "epoch": 0.6143437077131259, + "grad_norm": 0.38357859953913104, + "learning_rate": 9.670015128516253e-06, + "loss": 0.0359, + "step": 908 + }, + { + "epoch": 0.6150202976995941, + "grad_norm": 0.507570983315249, + "learning_rate": 9.668607188103708e-06, + "loss": 0.0515, + "step": 909 + }, + { + "epoch": 0.6156968876860622, + "grad_norm": 0.3057624219178951, + "learning_rate": 9.667196353406352e-06, + "loss": 0.034, + "step": 910 + }, + { + "epoch": 0.6163734776725305, + "grad_norm": 0.6047675298642423, + "learning_rate": 9.665782625298821e-06, + "loss": 0.0483, + "step": 911 + }, + { + "epoch": 0.6170500676589986, + "grad_norm": 0.429809230360215, + "learning_rate": 9.664366004657553e-06, + "loss": 0.0479, + "step": 912 + }, + { + "epoch": 0.6177266576454669, + "grad_norm": 0.33850749588389023, + "learning_rate": 9.662946492360777e-06, + "loss": 0.0372, + "step": 913 + }, + { + "epoch": 0.618403247631935, + "grad_norm": 0.4383026714391823, + "learning_rate": 9.66152408928851e-06, + "loss": 0.0464, + "step": 914 + }, + { + "epoch": 0.6190798376184032, + "grad_norm": 0.3389190885921122, + "learning_rate": 9.66009879632257e-06, + "loss": 0.0387, + "step": 915 + }, + { + "epoch": 0.6197564276048715, + "grad_norm": 0.5288823141822944, + "learning_rate": 9.65867061434656e-06, + "loss": 0.0434, + "step": 916 + }, + { + "epoch": 0.6204330175913396, + "grad_norm": 0.46263646372460493, + "learning_rate": 9.657239544245877e-06, + "loss": 0.0365, + "step": 917 + }, + { + "epoch": 0.6211096075778079, + "grad_norm": 0.3710608847499482, + "learning_rate": 9.655805586907705e-06, + "loss": 0.045, + "step": 918 + }, + { + "epoch": 0.621786197564276, + "grad_norm": 0.4579363079680455, + "learning_rate": 9.654368743221022e-06, + "loss": 0.0374, + "step": 919 + }, + { + "epoch": 0.6224627875507442, + "grad_norm": 0.44294044719332765, + "learning_rate": 9.652929014076593e-06, + "loss": 0.0474, + "step": 920 + }, + { + "epoch": 0.6231393775372125, + "grad_norm": 0.34199808247768276, + "learning_rate": 9.651486400366972e-06, + "loss": 0.0322, + "step": 921 + }, + { + "epoch": 0.6238159675236806, + "grad_norm": 0.48767698423175115, + "learning_rate": 9.650040902986504e-06, + "loss": 0.0485, + "step": 922 + }, + { + "epoch": 0.6244925575101489, + "grad_norm": 0.48358834244712573, + "learning_rate": 9.648592522831316e-06, + "loss": 0.0424, + "step": 923 + }, + { + "epoch": 0.625169147496617, + "grad_norm": 0.5172813697337071, + "learning_rate": 9.64714126079933e-06, + "loss": 0.0591, + "step": 924 + }, + { + "epoch": 0.6258457374830853, + "grad_norm": 0.38027197375093347, + "learning_rate": 9.645687117790246e-06, + "loss": 0.0392, + "step": 925 + }, + { + "epoch": 0.6265223274695535, + "grad_norm": 0.326766981389363, + "learning_rate": 9.644230094705555e-06, + "loss": 0.0417, + "step": 926 + }, + { + "epoch": 0.6271989174560216, + "grad_norm": 0.37975622243619345, + "learning_rate": 9.642770192448537e-06, + "loss": 0.0355, + "step": 927 + }, + { + "epoch": 0.6278755074424899, + "grad_norm": 0.40318220005365135, + "learning_rate": 9.641307411924246e-06, + "loss": 0.0455, + "step": 928 + }, + { + "epoch": 0.628552097428958, + "grad_norm": 0.5167687457949933, + "learning_rate": 9.639841754039534e-06, + "loss": 0.0407, + "step": 929 + }, + { + "epoch": 0.6292286874154263, + "grad_norm": 0.4885812957255087, + "learning_rate": 9.638373219703023e-06, + "loss": 0.0561, + "step": 930 + }, + { + "epoch": 0.6299052774018945, + "grad_norm": 0.48013557815758745, + "learning_rate": 9.63690180982513e-06, + "loss": 0.0442, + "step": 931 + }, + { + "epoch": 0.6305818673883626, + "grad_norm": 0.4199012352919531, + "learning_rate": 9.635427525318048e-06, + "loss": 0.0327, + "step": 932 + }, + { + "epoch": 0.6312584573748309, + "grad_norm": 0.34720279637819, + "learning_rate": 9.633950367095758e-06, + "loss": 0.0322, + "step": 933 + }, + { + "epoch": 0.631935047361299, + "grad_norm": 0.451015570434906, + "learning_rate": 9.632470336074009e-06, + "loss": 0.0486, + "step": 934 + }, + { + "epoch": 0.6326116373477673, + "grad_norm": 0.42015740109231275, + "learning_rate": 9.63098743317035e-06, + "loss": 0.035, + "step": 935 + }, + { + "epoch": 0.6332882273342354, + "grad_norm": 0.6411163507899782, + "learning_rate": 9.629501659304096e-06, + "loss": 0.0553, + "step": 936 + }, + { + "epoch": 0.6339648173207036, + "grad_norm": 0.3538309776081996, + "learning_rate": 9.628013015396347e-06, + "loss": 0.0337, + "step": 937 + }, + { + "epoch": 0.6346414073071719, + "grad_norm": 0.5236845273971029, + "learning_rate": 9.626521502369984e-06, + "loss": 0.056, + "step": 938 + }, + { + "epoch": 0.63531799729364, + "grad_norm": 0.41809020137958025, + "learning_rate": 9.625027121149665e-06, + "loss": 0.0476, + "step": 939 + }, + { + "epoch": 0.6359945872801083, + "grad_norm": 0.5060195307147896, + "learning_rate": 9.623529872661821e-06, + "loss": 0.0418, + "step": 940 + }, + { + "epoch": 0.6366711772665764, + "grad_norm": 0.44072874382524213, + "learning_rate": 9.62202975783467e-06, + "loss": 0.0441, + "step": 941 + }, + { + "epoch": 0.6373477672530447, + "grad_norm": 0.43222278345904935, + "learning_rate": 9.620526777598202e-06, + "loss": 0.0554, + "step": 942 + }, + { + "epoch": 0.6380243572395129, + "grad_norm": 0.582429766652759, + "learning_rate": 9.619020932884182e-06, + "loss": 0.0803, + "step": 943 + }, + { + "epoch": 0.638700947225981, + "grad_norm": 0.3994284470354665, + "learning_rate": 9.617512224626153e-06, + "loss": 0.0355, + "step": 944 + }, + { + "epoch": 0.6393775372124493, + "grad_norm": 0.3791513274555608, + "learning_rate": 9.616000653759435e-06, + "loss": 0.0321, + "step": 945 + }, + { + "epoch": 0.6400541271989174, + "grad_norm": 0.4528994898732974, + "learning_rate": 9.614486221221115e-06, + "loss": 0.0396, + "step": 946 + }, + { + "epoch": 0.6407307171853857, + "grad_norm": 0.4837314205327873, + "learning_rate": 9.612968927950066e-06, + "loss": 0.0436, + "step": 947 + }, + { + "epoch": 0.6414073071718539, + "grad_norm": 0.5347640164457272, + "learning_rate": 9.611448774886925e-06, + "loss": 0.0513, + "step": 948 + }, + { + "epoch": 0.642083897158322, + "grad_norm": 0.5042218258862791, + "learning_rate": 9.609925762974103e-06, + "loss": 0.0475, + "step": 949 + }, + { + "epoch": 0.6427604871447903, + "grad_norm": 0.4264989507780004, + "learning_rate": 9.60839989315579e-06, + "loss": 0.0357, + "step": 950 + }, + { + "epoch": 0.6434370771312584, + "grad_norm": 0.5392493535467197, + "learning_rate": 9.606871166377939e-06, + "loss": 0.052, + "step": 951 + }, + { + "epoch": 0.6441136671177267, + "grad_norm": 0.3908082376550831, + "learning_rate": 9.60533958358828e-06, + "loss": 0.0426, + "step": 952 + }, + { + "epoch": 0.6447902571041949, + "grad_norm": 0.7627853739424385, + "learning_rate": 9.603805145736311e-06, + "loss": 0.0558, + "step": 953 + }, + { + "epoch": 0.645466847090663, + "grad_norm": 0.5672781531325081, + "learning_rate": 9.602267853773301e-06, + "loss": 0.0504, + "step": 954 + }, + { + "epoch": 0.6461434370771313, + "grad_norm": 0.45194462663889134, + "learning_rate": 9.60072770865229e-06, + "loss": 0.0428, + "step": 955 + }, + { + "epoch": 0.6468200270635994, + "grad_norm": 0.5404875667227371, + "learning_rate": 9.599184711328082e-06, + "loss": 0.0321, + "step": 956 + }, + { + "epoch": 0.6474966170500677, + "grad_norm": 0.7264194143274479, + "learning_rate": 9.597638862757255e-06, + "loss": 0.0576, + "step": 957 + }, + { + "epoch": 0.6481732070365359, + "grad_norm": 0.43277733284387404, + "learning_rate": 9.596090163898148e-06, + "loss": 0.0427, + "step": 958 + }, + { + "epoch": 0.648849797023004, + "grad_norm": 0.43668803468184053, + "learning_rate": 9.594538615710875e-06, + "loss": 0.0408, + "step": 959 + }, + { + "epoch": 0.6495263870094723, + "grad_norm": 0.6373441415174744, + "learning_rate": 9.59298421915731e-06, + "loss": 0.0493, + "step": 960 + }, + { + "epoch": 0.6502029769959404, + "grad_norm": 0.5039542204065095, + "learning_rate": 9.591426975201093e-06, + "loss": 0.0505, + "step": 961 + }, + { + "epoch": 0.6508795669824087, + "grad_norm": 0.5401177598769642, + "learning_rate": 9.589866884807637e-06, + "loss": 0.0604, + "step": 962 + }, + { + "epoch": 0.6515561569688768, + "grad_norm": 0.45949039645671613, + "learning_rate": 9.588303948944109e-06, + "loss": 0.0442, + "step": 963 + }, + { + "epoch": 0.652232746955345, + "grad_norm": 0.48521941125554213, + "learning_rate": 9.586738168579446e-06, + "loss": 0.036, + "step": 964 + }, + { + "epoch": 0.6529093369418133, + "grad_norm": 0.4598983065811199, + "learning_rate": 9.58516954468435e-06, + "loss": 0.0433, + "step": 965 + }, + { + "epoch": 0.6535859269282814, + "grad_norm": 0.3718353137593582, + "learning_rate": 9.58359807823128e-06, + "loss": 0.0465, + "step": 966 + }, + { + "epoch": 0.6542625169147497, + "grad_norm": 0.3543182839474053, + "learning_rate": 9.582023770194462e-06, + "loss": 0.036, + "step": 967 + }, + { + "epoch": 0.6549391069012178, + "grad_norm": 0.5577019363935712, + "learning_rate": 9.580446621549883e-06, + "loss": 0.0414, + "step": 968 + }, + { + "epoch": 0.6556156968876861, + "grad_norm": 0.40349930365154446, + "learning_rate": 9.578866633275289e-06, + "loss": 0.0508, + "step": 969 + }, + { + "epoch": 0.6562922868741543, + "grad_norm": 0.49495689312928554, + "learning_rate": 9.577283806350186e-06, + "loss": 0.0379, + "step": 970 + }, + { + "epoch": 0.6569688768606224, + "grad_norm": 0.5897893067553056, + "learning_rate": 9.575698141755844e-06, + "loss": 0.0598, + "step": 971 + }, + { + "epoch": 0.6576454668470907, + "grad_norm": 0.40963069354089227, + "learning_rate": 9.57410964047529e-06, + "loss": 0.0406, + "step": 972 + }, + { + "epoch": 0.6583220568335588, + "grad_norm": 0.33723745271737793, + "learning_rate": 9.572518303493305e-06, + "loss": 0.0345, + "step": 973 + }, + { + "epoch": 0.6589986468200271, + "grad_norm": 0.5830671994151339, + "learning_rate": 9.570924131796437e-06, + "loss": 0.051, + "step": 974 + }, + { + "epoch": 0.6596752368064953, + "grad_norm": 0.563938532943089, + "learning_rate": 9.569327126372985e-06, + "loss": 0.0465, + "step": 975 + }, + { + "epoch": 0.6603518267929634, + "grad_norm": 0.4110509351619063, + "learning_rate": 9.567727288213005e-06, + "loss": 0.0508, + "step": 976 + }, + { + "epoch": 0.6610284167794317, + "grad_norm": 0.5101530319858956, + "learning_rate": 9.566124618308312e-06, + "loss": 0.0469, + "step": 977 + }, + { + "epoch": 0.6617050067658998, + "grad_norm": 0.5446245789300718, + "learning_rate": 9.564519117652473e-06, + "loss": 0.0475, + "step": 978 + }, + { + "epoch": 0.6623815967523681, + "grad_norm": 0.38532615768990897, + "learning_rate": 9.562910787240814e-06, + "loss": 0.0371, + "step": 979 + }, + { + "epoch": 0.6630581867388363, + "grad_norm": 0.5452581963795065, + "learning_rate": 9.56129962807041e-06, + "loss": 0.0463, + "step": 980 + }, + { + "epoch": 0.6637347767253045, + "grad_norm": 0.493897519037664, + "learning_rate": 9.559685641140098e-06, + "loss": 0.046, + "step": 981 + }, + { + "epoch": 0.6644113667117727, + "grad_norm": 0.49120411649237344, + "learning_rate": 9.55806882745046e-06, + "loss": 0.038, + "step": 982 + }, + { + "epoch": 0.6650879566982408, + "grad_norm": 0.37594809722236106, + "learning_rate": 9.556449188003831e-06, + "loss": 0.0332, + "step": 983 + }, + { + "epoch": 0.6657645466847091, + "grad_norm": 0.5689483506911673, + "learning_rate": 9.554826723804304e-06, + "loss": 0.0428, + "step": 984 + }, + { + "epoch": 0.6664411366711772, + "grad_norm": 0.47459352359813967, + "learning_rate": 9.553201435857718e-06, + "loss": 0.048, + "step": 985 + }, + { + "epoch": 0.6671177266576455, + "grad_norm": 0.4175841295594717, + "learning_rate": 9.551573325171662e-06, + "loss": 0.0433, + "step": 986 + }, + { + "epoch": 0.6677943166441137, + "grad_norm": 0.4703368986441674, + "learning_rate": 9.54994239275548e-06, + "loss": 0.045, + "step": 987 + }, + { + "epoch": 0.6684709066305818, + "grad_norm": 0.5174768810420528, + "learning_rate": 9.54830863962026e-06, + "loss": 0.0477, + "step": 988 + }, + { + "epoch": 0.6691474966170501, + "grad_norm": 0.45031078391738333, + "learning_rate": 9.546672066778842e-06, + "loss": 0.0453, + "step": 989 + }, + { + "epoch": 0.6698240866035182, + "grad_norm": 0.4661069434329015, + "learning_rate": 9.545032675245814e-06, + "loss": 0.0361, + "step": 990 + }, + { + "epoch": 0.6705006765899865, + "grad_norm": 0.6263373512558729, + "learning_rate": 9.543390466037507e-06, + "loss": 0.0482, + "step": 991 + }, + { + "epoch": 0.6711772665764547, + "grad_norm": 0.6198243720105686, + "learning_rate": 9.541745440172006e-06, + "loss": 0.0463, + "step": 992 + }, + { + "epoch": 0.6718538565629228, + "grad_norm": 0.41016128196243706, + "learning_rate": 9.540097598669135e-06, + "loss": 0.0417, + "step": 993 + }, + { + "epoch": 0.6725304465493911, + "grad_norm": 0.39800918436000615, + "learning_rate": 9.538446942550468e-06, + "loss": 0.0415, + "step": 994 + }, + { + "epoch": 0.6732070365358592, + "grad_norm": 0.5543222813132193, + "learning_rate": 9.536793472839325e-06, + "loss": 0.0401, + "step": 995 + }, + { + "epoch": 0.6738836265223275, + "grad_norm": 0.5940809739304163, + "learning_rate": 9.535137190560765e-06, + "loss": 0.0442, + "step": 996 + }, + { + "epoch": 0.6745602165087957, + "grad_norm": 0.6673629427097577, + "learning_rate": 9.533478096741597e-06, + "loss": 0.0496, + "step": 997 + }, + { + "epoch": 0.6752368064952639, + "grad_norm": 0.6404194200639447, + "learning_rate": 9.531816192410366e-06, + "loss": 0.05, + "step": 998 + }, + { + "epoch": 0.6759133964817321, + "grad_norm": 0.40213167814173456, + "learning_rate": 9.530151478597366e-06, + "loss": 0.0303, + "step": 999 + }, + { + "epoch": 0.6765899864682002, + "grad_norm": 0.8361044332179497, + "learning_rate": 9.528483956334628e-06, + "loss": 0.0402, + "step": 1000 + }, + { + "epoch": 0.6772665764546685, + "grad_norm": 0.6371984225608404, + "learning_rate": 9.526813626655929e-06, + "loss": 0.0487, + "step": 1001 + }, + { + "epoch": 0.6779431664411367, + "grad_norm": 0.3529968456644681, + "learning_rate": 9.525140490596778e-06, + "loss": 0.0457, + "step": 1002 + }, + { + "epoch": 0.6786197564276049, + "grad_norm": 0.6890677686012265, + "learning_rate": 9.523464549194434e-06, + "loss": 0.0434, + "step": 1003 + }, + { + "epoch": 0.6792963464140731, + "grad_norm": 0.4976579938373486, + "learning_rate": 9.521785803487888e-06, + "loss": 0.0364, + "step": 1004 + }, + { + "epoch": 0.6799729364005412, + "grad_norm": 0.32376422130205423, + "learning_rate": 9.520104254517873e-06, + "loss": 0.037, + "step": 1005 + }, + { + "epoch": 0.6806495263870095, + "grad_norm": 0.6068602119994349, + "learning_rate": 9.518419903326859e-06, + "loss": 0.0474, + "step": 1006 + }, + { + "epoch": 0.6813261163734776, + "grad_norm": 0.5220480755210147, + "learning_rate": 9.51673275095905e-06, + "loss": 0.0484, + "step": 1007 + }, + { + "epoch": 0.6820027063599459, + "grad_norm": 0.4482989240924032, + "learning_rate": 9.515042798460393e-06, + "loss": 0.0369, + "step": 1008 + }, + { + "epoch": 0.6826792963464141, + "grad_norm": 0.5301825487940978, + "learning_rate": 9.513350046878565e-06, + "loss": 0.0502, + "step": 1009 + }, + { + "epoch": 0.6833558863328822, + "grad_norm": 0.5303576531468821, + "learning_rate": 9.511654497262984e-06, + "loss": 0.0642, + "step": 1010 + }, + { + "epoch": 0.6840324763193505, + "grad_norm": 0.5349687200733244, + "learning_rate": 9.509956150664796e-06, + "loss": 0.0522, + "step": 1011 + }, + { + "epoch": 0.6847090663058186, + "grad_norm": 0.6886510654722846, + "learning_rate": 9.508255008136885e-06, + "loss": 0.047, + "step": 1012 + }, + { + "epoch": 0.6853856562922869, + "grad_norm": 0.6089451596108506, + "learning_rate": 9.506551070733869e-06, + "loss": 0.0422, + "step": 1013 + }, + { + "epoch": 0.6860622462787551, + "grad_norm": 0.62732429197919, + "learning_rate": 9.504844339512096e-06, + "loss": 0.0563, + "step": 1014 + }, + { + "epoch": 0.6867388362652233, + "grad_norm": 0.5271058132726802, + "learning_rate": 9.50313481552965e-06, + "loss": 0.0419, + "step": 1015 + }, + { + "epoch": 0.6874154262516915, + "grad_norm": 0.40746987014719194, + "learning_rate": 9.501422499846338e-06, + "loss": 0.0374, + "step": 1016 + }, + { + "epoch": 0.6880920162381596, + "grad_norm": 0.4185099148099006, + "learning_rate": 9.49970739352371e-06, + "loss": 0.0446, + "step": 1017 + }, + { + "epoch": 0.6887686062246279, + "grad_norm": 0.5619760070106918, + "learning_rate": 9.497989497625036e-06, + "loss": 0.0398, + "step": 1018 + }, + { + "epoch": 0.6894451962110961, + "grad_norm": 0.3937333405315312, + "learning_rate": 9.49626881321532e-06, + "loss": 0.0361, + "step": 1019 + }, + { + "epoch": 0.6901217861975643, + "grad_norm": 0.4291750955145607, + "learning_rate": 9.494545341361291e-06, + "loss": 0.0392, + "step": 1020 + }, + { + "epoch": 0.6907983761840325, + "grad_norm": 0.8025857719308748, + "learning_rate": 9.492819083131412e-06, + "loss": 0.059, + "step": 1021 + }, + { + "epoch": 0.6914749661705006, + "grad_norm": 0.3449247526874084, + "learning_rate": 9.491090039595869e-06, + "loss": 0.0386, + "step": 1022 + }, + { + "epoch": 0.6921515561569689, + "grad_norm": 0.40244666843718724, + "learning_rate": 9.489358211826577e-06, + "loss": 0.0324, + "step": 1023 + }, + { + "epoch": 0.6928281461434371, + "grad_norm": 0.8645169898857172, + "learning_rate": 9.487623600897172e-06, + "loss": 0.07, + "step": 1024 + }, + { + "epoch": 0.6935047361299053, + "grad_norm": 1.13253333163851, + "learning_rate": 9.485886207883022e-06, + "loss": 0.0512, + "step": 1025 + }, + { + "epoch": 0.6941813261163735, + "grad_norm": 0.5641277435023277, + "learning_rate": 9.484146033861216e-06, + "loss": 0.0437, + "step": 1026 + }, + { + "epoch": 0.6948579161028416, + "grad_norm": 0.8351590629206309, + "learning_rate": 9.482403079910571e-06, + "loss": 0.0488, + "step": 1027 + }, + { + "epoch": 0.6955345060893099, + "grad_norm": 0.5072789648438356, + "learning_rate": 9.480657347111621e-06, + "loss": 0.0532, + "step": 1028 + }, + { + "epoch": 0.696211096075778, + "grad_norm": 0.4708085692179905, + "learning_rate": 9.478908836546629e-06, + "loss": 0.0337, + "step": 1029 + }, + { + "epoch": 0.6968876860622463, + "grad_norm": 0.623392456528769, + "learning_rate": 9.477157549299574e-06, + "loss": 0.0431, + "step": 1030 + }, + { + "epoch": 0.6975642760487145, + "grad_norm": 0.486412673751419, + "learning_rate": 9.475403486456162e-06, + "loss": 0.0493, + "step": 1031 + }, + { + "epoch": 0.6982408660351827, + "grad_norm": 0.3619781041039289, + "learning_rate": 9.473646649103819e-06, + "loss": 0.0359, + "step": 1032 + }, + { + "epoch": 0.6989174560216509, + "grad_norm": 0.5055199497504502, + "learning_rate": 9.471887038331686e-06, + "loss": 0.0417, + "step": 1033 + }, + { + "epoch": 0.699594046008119, + "grad_norm": 0.731027212723068, + "learning_rate": 9.470124655230627e-06, + "loss": 0.0397, + "step": 1034 + }, + { + "epoch": 0.7002706359945873, + "grad_norm": 0.6703689044818381, + "learning_rate": 9.468359500893227e-06, + "loss": 0.0392, + "step": 1035 + }, + { + "epoch": 0.7009472259810555, + "grad_norm": 0.5332302039854108, + "learning_rate": 9.466591576413785e-06, + "loss": 0.0621, + "step": 1036 + }, + { + "epoch": 0.7016238159675237, + "grad_norm": 0.4020929727770793, + "learning_rate": 9.464820882888319e-06, + "loss": 0.0473, + "step": 1037 + }, + { + "epoch": 0.7023004059539919, + "grad_norm": 0.5995014402035128, + "learning_rate": 9.463047421414564e-06, + "loss": 0.049, + "step": 1038 + }, + { + "epoch": 0.70297699594046, + "grad_norm": 0.41047290020663224, + "learning_rate": 9.461271193091971e-06, + "loss": 0.0384, + "step": 1039 + }, + { + "epoch": 0.7036535859269283, + "grad_norm": 0.4631576567759274, + "learning_rate": 9.459492199021705e-06, + "loss": 0.0462, + "step": 1040 + }, + { + "epoch": 0.7043301759133965, + "grad_norm": 0.5471261313831118, + "learning_rate": 9.457710440306645e-06, + "loss": 0.0485, + "step": 1041 + }, + { + "epoch": 0.7050067658998647, + "grad_norm": 0.7540552681273482, + "learning_rate": 9.455925918051388e-06, + "loss": 0.0608, + "step": 1042 + }, + { + "epoch": 0.7056833558863329, + "grad_norm": 0.4657514871110121, + "learning_rate": 9.454138633362241e-06, + "loss": 0.0456, + "step": 1043 + }, + { + "epoch": 0.706359945872801, + "grad_norm": 0.5912269204702294, + "learning_rate": 9.452348587347224e-06, + "loss": 0.0342, + "step": 1044 + }, + { + "epoch": 0.7070365358592693, + "grad_norm": 0.5858407115011176, + "learning_rate": 9.450555781116068e-06, + "loss": 0.0387, + "step": 1045 + }, + { + "epoch": 0.7077131258457375, + "grad_norm": 0.47893427829398627, + "learning_rate": 9.448760215780218e-06, + "loss": 0.0512, + "step": 1046 + }, + { + "epoch": 0.7083897158322057, + "grad_norm": 0.6581060020226865, + "learning_rate": 9.446961892452824e-06, + "loss": 0.053, + "step": 1047 + }, + { + "epoch": 0.7090663058186739, + "grad_norm": 0.6447452214945267, + "learning_rate": 9.445160812248754e-06, + "loss": 0.0525, + "step": 1048 + }, + { + "epoch": 0.709742895805142, + "grad_norm": 0.3340549267704265, + "learning_rate": 9.44335697628458e-06, + "loss": 0.0326, + "step": 1049 + }, + { + "epoch": 0.7104194857916103, + "grad_norm": 0.5470335567157704, + "learning_rate": 9.44155038567858e-06, + "loss": 0.0499, + "step": 1050 + }, + { + "epoch": 0.7110960757780784, + "grad_norm": 0.3966932054039324, + "learning_rate": 9.439741041550745e-06, + "loss": 0.032, + "step": 1051 + }, + { + "epoch": 0.7117726657645467, + "grad_norm": 0.44056447508322627, + "learning_rate": 9.437928945022772e-06, + "loss": 0.0412, + "step": 1052 + }, + { + "epoch": 0.7124492557510149, + "grad_norm": 0.3869827577066024, + "learning_rate": 9.43611409721806e-06, + "loss": 0.0431, + "step": 1053 + }, + { + "epoch": 0.713125845737483, + "grad_norm": 0.3980613763277345, + "learning_rate": 9.434296499261719e-06, + "loss": 0.048, + "step": 1054 + }, + { + "epoch": 0.7138024357239513, + "grad_norm": 0.7811011021579496, + "learning_rate": 9.432476152280562e-06, + "loss": 0.0514, + "step": 1055 + }, + { + "epoch": 0.7144790257104194, + "grad_norm": 0.37486212234087307, + "learning_rate": 9.430653057403105e-06, + "loss": 0.0389, + "step": 1056 + }, + { + "epoch": 0.7151556156968877, + "grad_norm": 0.4317908795764949, + "learning_rate": 9.428827215759569e-06, + "loss": 0.0439, + "step": 1057 + }, + { + "epoch": 0.7158322056833559, + "grad_norm": 0.3836328577522653, + "learning_rate": 9.426998628481876e-06, + "loss": 0.038, + "step": 1058 + }, + { + "epoch": 0.7165087956698241, + "grad_norm": 0.47017815847762695, + "learning_rate": 9.425167296703655e-06, + "loss": 0.0392, + "step": 1059 + }, + { + "epoch": 0.7171853856562923, + "grad_norm": 0.43806505261570117, + "learning_rate": 9.42333322156023e-06, + "loss": 0.051, + "step": 1060 + }, + { + "epoch": 0.7178619756427604, + "grad_norm": 0.5105023036196973, + "learning_rate": 9.42149640418863e-06, + "loss": 0.0426, + "step": 1061 + }, + { + "epoch": 0.7185385656292287, + "grad_norm": 0.4775796931237403, + "learning_rate": 9.419656845727582e-06, + "loss": 0.041, + "step": 1062 + }, + { + "epoch": 0.7192151556156969, + "grad_norm": 0.46565718426011904, + "learning_rate": 9.417814547317513e-06, + "loss": 0.0415, + "step": 1063 + }, + { + "epoch": 0.7198917456021651, + "grad_norm": 0.3718909598609268, + "learning_rate": 9.415969510100549e-06, + "loss": 0.0306, + "step": 1064 + }, + { + "epoch": 0.7205683355886333, + "grad_norm": 0.4842662981078468, + "learning_rate": 9.414121735220513e-06, + "loss": 0.0445, + "step": 1065 + }, + { + "epoch": 0.7212449255751014, + "grad_norm": 0.47420840949599175, + "learning_rate": 9.412271223822929e-06, + "loss": 0.0486, + "step": 1066 + }, + { + "epoch": 0.7219215155615697, + "grad_norm": 0.48895220057621547, + "learning_rate": 9.41041797705501e-06, + "loss": 0.0393, + "step": 1067 + }, + { + "epoch": 0.7225981055480379, + "grad_norm": 0.4875400791808188, + "learning_rate": 9.408561996065672e-06, + "loss": 0.0519, + "step": 1068 + }, + { + "epoch": 0.7232746955345061, + "grad_norm": 0.6144523783870842, + "learning_rate": 9.406703282005523e-06, + "loss": 0.0442, + "step": 1069 + }, + { + "epoch": 0.7239512855209743, + "grad_norm": 0.4754043228344461, + "learning_rate": 9.404841836026863e-06, + "loss": 0.0595, + "step": 1070 + }, + { + "epoch": 0.7246278755074425, + "grad_norm": 0.47642322745592036, + "learning_rate": 9.40297765928369e-06, + "loss": 0.0501, + "step": 1071 + }, + { + "epoch": 0.7253044654939107, + "grad_norm": 0.32558905339326694, + "learning_rate": 9.401110752931694e-06, + "loss": 0.0339, + "step": 1072 + }, + { + "epoch": 0.725981055480379, + "grad_norm": 0.45351688092518283, + "learning_rate": 9.399241118128255e-06, + "loss": 0.039, + "step": 1073 + }, + { + "epoch": 0.7266576454668471, + "grad_norm": 0.2500361418411447, + "learning_rate": 9.397368756032445e-06, + "loss": 0.0274, + "step": 1074 + }, + { + "epoch": 0.7273342354533153, + "grad_norm": 0.41487142952439504, + "learning_rate": 9.395493667805032e-06, + "loss": 0.0448, + "step": 1075 + }, + { + "epoch": 0.7280108254397835, + "grad_norm": 0.3163011214155268, + "learning_rate": 9.393615854608461e-06, + "loss": 0.0309, + "step": 1076 + }, + { + "epoch": 0.7286874154262517, + "grad_norm": 0.4334865050484572, + "learning_rate": 9.391735317606885e-06, + "loss": 0.0419, + "step": 1077 + }, + { + "epoch": 0.7293640054127198, + "grad_norm": 0.4769385480050723, + "learning_rate": 9.389852057966129e-06, + "loss": 0.0506, + "step": 1078 + }, + { + "epoch": 0.7300405953991881, + "grad_norm": 0.35996837272589777, + "learning_rate": 9.387966076853714e-06, + "loss": 0.039, + "step": 1079 + }, + { + "epoch": 0.7307171853856563, + "grad_norm": 0.3256486674181158, + "learning_rate": 9.386077375438848e-06, + "loss": 0.0336, + "step": 1080 + }, + { + "epoch": 0.7313937753721245, + "grad_norm": 0.6222509033563155, + "learning_rate": 9.384185954892423e-06, + "loss": 0.0522, + "step": 1081 + }, + { + "epoch": 0.7320703653585927, + "grad_norm": 0.4082078879056643, + "learning_rate": 9.382291816387018e-06, + "loss": 0.0375, + "step": 1082 + }, + { + "epoch": 0.7327469553450608, + "grad_norm": 0.33543824603862954, + "learning_rate": 9.380394961096895e-06, + "loss": 0.0352, + "step": 1083 + }, + { + "epoch": 0.7334235453315291, + "grad_norm": 0.5060874797939505, + "learning_rate": 9.378495390198005e-06, + "loss": 0.0379, + "step": 1084 + }, + { + "epoch": 0.7341001353179973, + "grad_norm": 0.41743487319063965, + "learning_rate": 9.376593104867976e-06, + "loss": 0.0426, + "step": 1085 + }, + { + "epoch": 0.7347767253044655, + "grad_norm": 0.5184910031839326, + "learning_rate": 9.374688106286127e-06, + "loss": 0.0489, + "step": 1086 + }, + { + "epoch": 0.7354533152909337, + "grad_norm": 0.4084470647844144, + "learning_rate": 9.372780395633451e-06, + "loss": 0.0486, + "step": 1087 + }, + { + "epoch": 0.7361299052774019, + "grad_norm": 0.47999994192707224, + "learning_rate": 9.370869974092628e-06, + "loss": 0.0439, + "step": 1088 + }, + { + "epoch": 0.7368064952638701, + "grad_norm": 0.3813360371555213, + "learning_rate": 9.368956842848014e-06, + "loss": 0.0309, + "step": 1089 + }, + { + "epoch": 0.7374830852503383, + "grad_norm": 0.4503744066776666, + "learning_rate": 9.36704100308565e-06, + "loss": 0.0473, + "step": 1090 + }, + { + "epoch": 0.7381596752368065, + "grad_norm": 0.35717092121021204, + "learning_rate": 9.36512245599325e-06, + "loss": 0.0412, + "step": 1091 + }, + { + "epoch": 0.7388362652232747, + "grad_norm": 0.5764849679099431, + "learning_rate": 9.363201202760212e-06, + "loss": 0.0474, + "step": 1092 + }, + { + "epoch": 0.7395128552097429, + "grad_norm": 0.2875285228362852, + "learning_rate": 9.36127724457761e-06, + "loss": 0.0284, + "step": 1093 + }, + { + "epoch": 0.7401894451962111, + "grad_norm": 0.43016349621660316, + "learning_rate": 9.359350582638193e-06, + "loss": 0.0434, + "step": 1094 + }, + { + "epoch": 0.7408660351826793, + "grad_norm": 0.45620787154940123, + "learning_rate": 9.357421218136387e-06, + "loss": 0.0533, + "step": 1095 + }, + { + "epoch": 0.7415426251691475, + "grad_norm": 0.8166639455518038, + "learning_rate": 9.355489152268296e-06, + "loss": 0.0948, + "step": 1096 + }, + { + "epoch": 0.7422192151556157, + "grad_norm": 0.3144810238083724, + "learning_rate": 9.353554386231697e-06, + "loss": 0.0408, + "step": 1097 + }, + { + "epoch": 0.7428958051420839, + "grad_norm": 0.36361057224729754, + "learning_rate": 9.351616921226036e-06, + "loss": 0.0501, + "step": 1098 + }, + { + "epoch": 0.7435723951285521, + "grad_norm": 0.5326846996900816, + "learning_rate": 9.349676758452441e-06, + "loss": 0.0462, + "step": 1099 + }, + { + "epoch": 0.7442489851150202, + "grad_norm": 0.29471165072913635, + "learning_rate": 9.347733899113709e-06, + "loss": 0.0307, + "step": 1100 + }, + { + "epoch": 0.7449255751014885, + "grad_norm": 0.6901682512249298, + "learning_rate": 9.345788344414306e-06, + "loss": 0.0606, + "step": 1101 + }, + { + "epoch": 0.7456021650879567, + "grad_norm": 0.367005328692303, + "learning_rate": 9.343840095560373e-06, + "loss": 0.0393, + "step": 1102 + }, + { + "epoch": 0.7462787550744249, + "grad_norm": 0.5773480553617552, + "learning_rate": 9.341889153759715e-06, + "loss": 0.0455, + "step": 1103 + }, + { + "epoch": 0.7469553450608931, + "grad_norm": 0.5822139256764285, + "learning_rate": 9.339935520221816e-06, + "loss": 0.0546, + "step": 1104 + }, + { + "epoch": 0.7476319350473613, + "grad_norm": 0.4303231690692278, + "learning_rate": 9.33797919615782e-06, + "loss": 0.0321, + "step": 1105 + }, + { + "epoch": 0.7483085250338295, + "grad_norm": 0.5164958197463455, + "learning_rate": 9.336020182780545e-06, + "loss": 0.0371, + "step": 1106 + }, + { + "epoch": 0.7489851150202977, + "grad_norm": 0.5840950520153051, + "learning_rate": 9.33405848130447e-06, + "loss": 0.0363, + "step": 1107 + }, + { + "epoch": 0.7496617050067659, + "grad_norm": 0.8580538083904401, + "learning_rate": 9.332094092945749e-06, + "loss": 0.0593, + "step": 1108 + }, + { + "epoch": 0.7503382949932341, + "grad_norm": 0.45669196915044574, + "learning_rate": 9.330127018922195e-06, + "loss": 0.0419, + "step": 1109 + }, + { + "epoch": 0.7510148849797023, + "grad_norm": 0.6311970925803472, + "learning_rate": 9.328157260453286e-06, + "loss": 0.0431, + "step": 1110 + }, + { + "epoch": 0.7516914749661705, + "grad_norm": 0.5072034301545328, + "learning_rate": 9.326184818760167e-06, + "loss": 0.047, + "step": 1111 + }, + { + "epoch": 0.7523680649526387, + "grad_norm": 0.4469160704046604, + "learning_rate": 9.324209695065644e-06, + "loss": 0.0451, + "step": 1112 + }, + { + "epoch": 0.7530446549391069, + "grad_norm": 0.4704715800688946, + "learning_rate": 9.322231890594193e-06, + "loss": 0.0419, + "step": 1113 + }, + { + "epoch": 0.7537212449255751, + "grad_norm": 0.5600133859764858, + "learning_rate": 9.32025140657194e-06, + "loss": 0.0505, + "step": 1114 + }, + { + "epoch": 0.7543978349120433, + "grad_norm": 0.4038967147652209, + "learning_rate": 9.318268244226681e-06, + "loss": 0.0479, + "step": 1115 + }, + { + "epoch": 0.7550744248985115, + "grad_norm": 0.47394683668025994, + "learning_rate": 9.31628240478787e-06, + "loss": 0.0427, + "step": 1116 + }, + { + "epoch": 0.7557510148849798, + "grad_norm": 0.4363282100835522, + "learning_rate": 9.31429388948662e-06, + "loss": 0.0409, + "step": 1117 + }, + { + "epoch": 0.7564276048714479, + "grad_norm": 0.6631345517105665, + "learning_rate": 9.312302699555701e-06, + "loss": 0.0526, + "step": 1118 + }, + { + "epoch": 0.7571041948579161, + "grad_norm": 0.4251997861259301, + "learning_rate": 9.310308836229548e-06, + "loss": 0.0458, + "step": 1119 + }, + { + "epoch": 0.7577807848443843, + "grad_norm": 0.4375017741111722, + "learning_rate": 9.308312300744247e-06, + "loss": 0.0388, + "step": 1120 + }, + { + "epoch": 0.7584573748308525, + "grad_norm": 0.5247235800165496, + "learning_rate": 9.306313094337539e-06, + "loss": 0.0443, + "step": 1121 + }, + { + "epoch": 0.7591339648173207, + "grad_norm": 0.584204461473707, + "learning_rate": 9.304311218248828e-06, + "loss": 0.0555, + "step": 1122 + }, + { + "epoch": 0.7598105548037889, + "grad_norm": 0.34492025134679155, + "learning_rate": 9.30230667371917e-06, + "loss": 0.0353, + "step": 1123 + }, + { + "epoch": 0.7604871447902571, + "grad_norm": 0.39682716944671537, + "learning_rate": 9.30029946199127e-06, + "loss": 0.042, + "step": 1124 + }, + { + "epoch": 0.7611637347767253, + "grad_norm": 0.32624179514132623, + "learning_rate": 9.298289584309496e-06, + "loss": 0.0352, + "step": 1125 + }, + { + "epoch": 0.7618403247631935, + "grad_norm": 0.6122427027969475, + "learning_rate": 9.29627704191986e-06, + "loss": 0.0427, + "step": 1126 + }, + { + "epoch": 0.7625169147496617, + "grad_norm": 0.43218148320586536, + "learning_rate": 9.294261836070034e-06, + "loss": 0.0468, + "step": 1127 + }, + { + "epoch": 0.7631935047361299, + "grad_norm": 0.43508867127579565, + "learning_rate": 9.292243968009332e-06, + "loss": 0.0397, + "step": 1128 + }, + { + "epoch": 0.7638700947225981, + "grad_norm": 0.3605762409954845, + "learning_rate": 9.290223438988726e-06, + "loss": 0.0387, + "step": 1129 + }, + { + "epoch": 0.7645466847090663, + "grad_norm": 0.44807011054996954, + "learning_rate": 9.288200250260836e-06, + "loss": 0.0456, + "step": 1130 + }, + { + "epoch": 0.7652232746955345, + "grad_norm": 0.3486712359694568, + "learning_rate": 9.286174403079928e-06, + "loss": 0.0436, + "step": 1131 + }, + { + "epoch": 0.7658998646820027, + "grad_norm": 0.39675791530677357, + "learning_rate": 9.284145898701921e-06, + "loss": 0.0368, + "step": 1132 + }, + { + "epoch": 0.7665764546684709, + "grad_norm": 0.5133506378319832, + "learning_rate": 9.282114738384375e-06, + "loss": 0.0491, + "step": 1133 + }, + { + "epoch": 0.7672530446549392, + "grad_norm": 0.5956950097378654, + "learning_rate": 9.280080923386501e-06, + "loss": 0.0587, + "step": 1134 + }, + { + "epoch": 0.7679296346414073, + "grad_norm": 0.3113555182626179, + "learning_rate": 9.278044454969157e-06, + "loss": 0.0327, + "step": 1135 + }, + { + "epoch": 0.7686062246278755, + "grad_norm": 0.42301234160024176, + "learning_rate": 9.27600533439484e-06, + "loss": 0.0417, + "step": 1136 + }, + { + "epoch": 0.7692828146143437, + "grad_norm": 0.41994481653635907, + "learning_rate": 9.273963562927695e-06, + "loss": 0.0337, + "step": 1137 + }, + { + "epoch": 0.7699594046008119, + "grad_norm": 0.4158291408145277, + "learning_rate": 9.271919141833514e-06, + "loss": 0.0359, + "step": 1138 + }, + { + "epoch": 0.7706359945872802, + "grad_norm": 0.5260878684426418, + "learning_rate": 9.269872072379725e-06, + "loss": 0.055, + "step": 1139 + }, + { + "epoch": 0.7713125845737483, + "grad_norm": 0.6764803636872951, + "learning_rate": 9.267822355835402e-06, + "loss": 0.0582, + "step": 1140 + }, + { + "epoch": 0.7719891745602165, + "grad_norm": 0.5363223867689032, + "learning_rate": 9.265769993471258e-06, + "loss": 0.0382, + "step": 1141 + }, + { + "epoch": 0.7726657645466847, + "grad_norm": 0.4661861760992129, + "learning_rate": 9.263714986559647e-06, + "loss": 0.0367, + "step": 1142 + }, + { + "epoch": 0.7733423545331529, + "grad_norm": 0.3728301812931657, + "learning_rate": 9.261657336374561e-06, + "loss": 0.0542, + "step": 1143 + }, + { + "epoch": 0.774018944519621, + "grad_norm": 0.7420033962125824, + "learning_rate": 9.259597044191635e-06, + "loss": 0.0556, + "step": 1144 + }, + { + "epoch": 0.7746955345060893, + "grad_norm": 0.6254152152879247, + "learning_rate": 9.25753411128814e-06, + "loss": 0.0526, + "step": 1145 + }, + { + "epoch": 0.7753721244925575, + "grad_norm": 0.32286689466872553, + "learning_rate": 9.25546853894298e-06, + "loss": 0.0363, + "step": 1146 + }, + { + "epoch": 0.7760487144790257, + "grad_norm": 0.758545351167433, + "learning_rate": 9.253400328436699e-06, + "loss": 0.0566, + "step": 1147 + }, + { + "epoch": 0.7767253044654939, + "grad_norm": 0.5172544507113069, + "learning_rate": 9.251329481051476e-06, + "loss": 0.0418, + "step": 1148 + }, + { + "epoch": 0.7774018944519621, + "grad_norm": 0.4944990830258745, + "learning_rate": 9.249255998071127e-06, + "loss": 0.0479, + "step": 1149 + }, + { + "epoch": 0.7780784844384303, + "grad_norm": 0.4680289187890781, + "learning_rate": 9.247179880781099e-06, + "loss": 0.0434, + "step": 1150 + }, + { + "epoch": 0.7787550744248986, + "grad_norm": 0.37933941688250344, + "learning_rate": 9.24510113046847e-06, + "loss": 0.0381, + "step": 1151 + }, + { + "epoch": 0.7794316644113667, + "grad_norm": 0.5580678117796845, + "learning_rate": 9.243019748421956e-06, + "loss": 0.0411, + "step": 1152 + }, + { + "epoch": 0.7801082543978349, + "grad_norm": 0.3792863592199863, + "learning_rate": 9.2409357359319e-06, + "loss": 0.0387, + "step": 1153 + }, + { + "epoch": 0.7807848443843031, + "grad_norm": 0.49151909894050444, + "learning_rate": 9.238849094290279e-06, + "loss": 0.0475, + "step": 1154 + }, + { + "epoch": 0.7814614343707713, + "grad_norm": 0.4150200341713875, + "learning_rate": 9.236759824790698e-06, + "loss": 0.0374, + "step": 1155 + }, + { + "epoch": 0.7821380243572396, + "grad_norm": 0.3328406752636634, + "learning_rate": 9.234667928728392e-06, + "loss": 0.0321, + "step": 1156 + }, + { + "epoch": 0.7828146143437077, + "grad_norm": 0.9741272621124953, + "learning_rate": 9.23257340740022e-06, + "loss": 0.062, + "step": 1157 + }, + { + "epoch": 0.7834912043301759, + "grad_norm": 0.5466469275195904, + "learning_rate": 9.230476262104678e-06, + "loss": 0.0561, + "step": 1158 + }, + { + "epoch": 0.7841677943166441, + "grad_norm": 0.38619283655988507, + "learning_rate": 9.22837649414188e-06, + "loss": 0.0339, + "step": 1159 + }, + { + "epoch": 0.7848443843031123, + "grad_norm": 0.44722381263042815, + "learning_rate": 9.226274104813567e-06, + "loss": 0.0449, + "step": 1160 + }, + { + "epoch": 0.7855209742895806, + "grad_norm": 0.3507827327034681, + "learning_rate": 9.22416909542311e-06, + "loss": 0.0373, + "step": 1161 + }, + { + "epoch": 0.7861975642760487, + "grad_norm": 0.4603881135888454, + "learning_rate": 9.222061467275503e-06, + "loss": 0.0572, + "step": 1162 + }, + { + "epoch": 0.786874154262517, + "grad_norm": 0.4085902778519741, + "learning_rate": 9.219951221677356e-06, + "loss": 0.0377, + "step": 1163 + }, + { + "epoch": 0.7875507442489851, + "grad_norm": 0.46066320087189744, + "learning_rate": 9.217838359936914e-06, + "loss": 0.0388, + "step": 1164 + }, + { + "epoch": 0.7882273342354533, + "grad_norm": 0.37053625701059334, + "learning_rate": 9.215722883364033e-06, + "loss": 0.0395, + "step": 1165 + }, + { + "epoch": 0.7889039242219216, + "grad_norm": 0.38791731905276344, + "learning_rate": 9.213604793270196e-06, + "loss": 0.0522, + "step": 1166 + }, + { + "epoch": 0.7895805142083897, + "grad_norm": 0.486347306914698, + "learning_rate": 9.211484090968505e-06, + "loss": 0.0383, + "step": 1167 + }, + { + "epoch": 0.790257104194858, + "grad_norm": 0.5309241482023681, + "learning_rate": 9.20936077777368e-06, + "loss": 0.0604, + "step": 1168 + }, + { + "epoch": 0.7909336941813261, + "grad_norm": 0.3936513120655959, + "learning_rate": 9.207234855002062e-06, + "loss": 0.0439, + "step": 1169 + }, + { + "epoch": 0.7916102841677943, + "grad_norm": 0.3905053801028906, + "learning_rate": 9.205106323971607e-06, + "loss": 0.039, + "step": 1170 + }, + { + "epoch": 0.7922868741542625, + "grad_norm": 0.43833901476768505, + "learning_rate": 9.202975186001892e-06, + "loss": 0.0482, + "step": 1171 + }, + { + "epoch": 0.7929634641407307, + "grad_norm": 0.7516556302472751, + "learning_rate": 9.200841442414106e-06, + "loss": 0.056, + "step": 1172 + }, + { + "epoch": 0.793640054127199, + "grad_norm": 0.3819234864029351, + "learning_rate": 9.198705094531053e-06, + "loss": 0.0439, + "step": 1173 + }, + { + "epoch": 0.7943166441136671, + "grad_norm": 0.6120124177063407, + "learning_rate": 9.196566143677157e-06, + "loss": 0.059, + "step": 1174 + }, + { + "epoch": 0.7949932341001353, + "grad_norm": 0.4910293294970947, + "learning_rate": 9.19442459117845e-06, + "loss": 0.0456, + "step": 1175 + }, + { + "epoch": 0.7956698240866035, + "grad_norm": 0.5328215559905269, + "learning_rate": 9.192280438362581e-06, + "loss": 0.0611, + "step": 1176 + }, + { + "epoch": 0.7963464140730717, + "grad_norm": 0.35797326030579124, + "learning_rate": 9.190133686558809e-06, + "loss": 0.0384, + "step": 1177 + }, + { + "epoch": 0.79702300405954, + "grad_norm": 0.3188990333573404, + "learning_rate": 9.187984337098002e-06, + "loss": 0.0322, + "step": 1178 + }, + { + "epoch": 0.7976995940460081, + "grad_norm": 0.3794249733690347, + "learning_rate": 9.185832391312644e-06, + "loss": 0.0398, + "step": 1179 + }, + { + "epoch": 0.7983761840324763, + "grad_norm": 0.3822287588253073, + "learning_rate": 9.183677850536823e-06, + "loss": 0.0392, + "step": 1180 + }, + { + "epoch": 0.7990527740189445, + "grad_norm": 0.35000444773849926, + "learning_rate": 9.181520716106238e-06, + "loss": 0.0328, + "step": 1181 + }, + { + "epoch": 0.7997293640054127, + "grad_norm": 0.3199665504575698, + "learning_rate": 9.179360989358199e-06, + "loss": 0.033, + "step": 1182 + }, + { + "epoch": 0.800405953991881, + "grad_norm": 0.40110028777129825, + "learning_rate": 9.177198671631616e-06, + "loss": 0.0507, + "step": 1183 + }, + { + "epoch": 0.8010825439783491, + "grad_norm": 0.3605079309777011, + "learning_rate": 9.175033764267013e-06, + "loss": 0.0388, + "step": 1184 + }, + { + "epoch": 0.8017591339648173, + "grad_norm": 0.48857922904966433, + "learning_rate": 9.172866268606514e-06, + "loss": 0.0471, + "step": 1185 + }, + { + "epoch": 0.8024357239512855, + "grad_norm": 0.3356889780868183, + "learning_rate": 9.17069618599385e-06, + "loss": 0.0374, + "step": 1186 + }, + { + "epoch": 0.8031123139377537, + "grad_norm": 0.5677033185722352, + "learning_rate": 9.168523517774356e-06, + "loss": 0.0399, + "step": 1187 + }, + { + "epoch": 0.803788903924222, + "grad_norm": 0.44601535886712806, + "learning_rate": 9.166348265294968e-06, + "loss": 0.0317, + "step": 1188 + }, + { + "epoch": 0.8044654939106901, + "grad_norm": 0.28908970148597557, + "learning_rate": 9.164170429904224e-06, + "loss": 0.0253, + "step": 1189 + }, + { + "epoch": 0.8051420838971584, + "grad_norm": 0.37936933561101305, + "learning_rate": 9.16199001295227e-06, + "loss": 0.0333, + "step": 1190 + }, + { + "epoch": 0.8058186738836265, + "grad_norm": 0.5378799769789874, + "learning_rate": 9.15980701579084e-06, + "loss": 0.0463, + "step": 1191 + }, + { + "epoch": 0.8064952638700947, + "grad_norm": 0.3664821531951507, + "learning_rate": 9.157621439773278e-06, + "loss": 0.0416, + "step": 1192 + }, + { + "epoch": 0.8071718538565629, + "grad_norm": 0.5992640475325052, + "learning_rate": 9.155433286254524e-06, + "loss": 0.0465, + "step": 1193 + }, + { + "epoch": 0.8078484438430311, + "grad_norm": 0.44246404429673586, + "learning_rate": 9.153242556591115e-06, + "loss": 0.0476, + "step": 1194 + }, + { + "epoch": 0.8085250338294994, + "grad_norm": 0.49868840269074993, + "learning_rate": 9.151049252141185e-06, + "loss": 0.0666, + "step": 1195 + }, + { + "epoch": 0.8092016238159675, + "grad_norm": 0.4199962889037978, + "learning_rate": 9.148853374264463e-06, + "loss": 0.0412, + "step": 1196 + }, + { + "epoch": 0.8098782138024357, + "grad_norm": 0.6412453213760052, + "learning_rate": 9.146654924322277e-06, + "loss": 0.038, + "step": 1197 + }, + { + "epoch": 0.8105548037889039, + "grad_norm": 0.364908387543322, + "learning_rate": 9.144453903677546e-06, + "loss": 0.0571, + "step": 1198 + }, + { + "epoch": 0.8112313937753721, + "grad_norm": 0.44181588815410566, + "learning_rate": 9.142250313694785e-06, + "loss": 0.0338, + "step": 1199 + }, + { + "epoch": 0.8119079837618404, + "grad_norm": 0.2652282884689182, + "learning_rate": 9.140044155740102e-06, + "loss": 0.0301, + "step": 1200 + }, + { + "epoch": 0.8125845737483085, + "grad_norm": 0.5075050392163715, + "learning_rate": 9.137835431181192e-06, + "loss": 0.0498, + "step": 1201 + }, + { + "epoch": 0.8132611637347767, + "grad_norm": 0.45580087964926747, + "learning_rate": 9.13562414138735e-06, + "loss": 0.0322, + "step": 1202 + }, + { + "epoch": 0.8139377537212449, + "grad_norm": 0.586251118417022, + "learning_rate": 9.133410287729454e-06, + "loss": 0.0544, + "step": 1203 + }, + { + "epoch": 0.8146143437077131, + "grad_norm": 0.47252601204366695, + "learning_rate": 9.131193871579975e-06, + "loss": 0.0441, + "step": 1204 + }, + { + "epoch": 0.8152909336941814, + "grad_norm": 1.0376002865967784, + "learning_rate": 9.12897489431297e-06, + "loss": 0.064, + "step": 1205 + }, + { + "epoch": 0.8159675236806495, + "grad_norm": 0.362957751058306, + "learning_rate": 9.126753357304088e-06, + "loss": 0.0384, + "step": 1206 + }, + { + "epoch": 0.8166441136671178, + "grad_norm": 0.3195252023339085, + "learning_rate": 9.12452926193056e-06, + "loss": 0.0393, + "step": 1207 + }, + { + "epoch": 0.8173207036535859, + "grad_norm": 0.3753364178537597, + "learning_rate": 9.122302609571204e-06, + "loss": 0.0272, + "step": 1208 + }, + { + "epoch": 0.8179972936400541, + "grad_norm": 0.3709876630054977, + "learning_rate": 9.120073401606427e-06, + "loss": 0.0375, + "step": 1209 + }, + { + "epoch": 0.8186738836265224, + "grad_norm": 0.3918475527915412, + "learning_rate": 9.117841639418218e-06, + "loss": 0.0373, + "step": 1210 + }, + { + "epoch": 0.8193504736129905, + "grad_norm": 0.7699736605312884, + "learning_rate": 9.115607324390146e-06, + "loss": 0.0571, + "step": 1211 + }, + { + "epoch": 0.8200270635994588, + "grad_norm": 0.7354881317235208, + "learning_rate": 9.11337045790737e-06, + "loss": 0.0431, + "step": 1212 + }, + { + "epoch": 0.8207036535859269, + "grad_norm": 0.4607374635589529, + "learning_rate": 9.111131041356624e-06, + "loss": 0.0374, + "step": 1213 + }, + { + "epoch": 0.8213802435723951, + "grad_norm": 0.4527689501272563, + "learning_rate": 9.108889076126226e-06, + "loss": 0.0368, + "step": 1214 + }, + { + "epoch": 0.8220568335588633, + "grad_norm": 0.9913737981840453, + "learning_rate": 9.106644563606076e-06, + "loss": 0.0406, + "step": 1215 + }, + { + "epoch": 0.8227334235453315, + "grad_norm": 0.5240453942563007, + "learning_rate": 9.104397505187645e-06, + "loss": 0.0513, + "step": 1216 + }, + { + "epoch": 0.8234100135317998, + "grad_norm": 0.6220759777033223, + "learning_rate": 9.102147902263994e-06, + "loss": 0.0452, + "step": 1217 + }, + { + "epoch": 0.8240866035182679, + "grad_norm": 0.3221077414204484, + "learning_rate": 9.099895756229754e-06, + "loss": 0.0371, + "step": 1218 + }, + { + "epoch": 0.8247631935047361, + "grad_norm": 0.6362524400354698, + "learning_rate": 9.097641068481133e-06, + "loss": 0.0587, + "step": 1219 + }, + { + "epoch": 0.8254397834912043, + "grad_norm": 0.4386780555869206, + "learning_rate": 9.095383840415915e-06, + "loss": 0.0475, + "step": 1220 + }, + { + "epoch": 0.8261163734776725, + "grad_norm": 0.49091159815368024, + "learning_rate": 9.093124073433464e-06, + "loss": 0.0306, + "step": 1221 + }, + { + "epoch": 0.8267929634641408, + "grad_norm": 0.37397811856107593, + "learning_rate": 9.090861768934708e-06, + "loss": 0.0368, + "step": 1222 + }, + { + "epoch": 0.8274695534506089, + "grad_norm": 0.4195135915086271, + "learning_rate": 9.088596928322158e-06, + "loss": 0.0326, + "step": 1223 + }, + { + "epoch": 0.8281461434370772, + "grad_norm": 0.44335281138776866, + "learning_rate": 9.08632955299989e-06, + "loss": 0.0313, + "step": 1224 + }, + { + "epoch": 0.8288227334235453, + "grad_norm": 0.45103277842907036, + "learning_rate": 9.084059644373558e-06, + "loss": 0.0421, + "step": 1225 + }, + { + "epoch": 0.8294993234100135, + "grad_norm": 0.2989663277016761, + "learning_rate": 9.08178720385038e-06, + "loss": 0.0313, + "step": 1226 + }, + { + "epoch": 0.8301759133964818, + "grad_norm": 0.3421564851969261, + "learning_rate": 9.07951223283915e-06, + "loss": 0.0338, + "step": 1227 + }, + { + "epoch": 0.8308525033829499, + "grad_norm": 0.6700826368288783, + "learning_rate": 9.077234732750223e-06, + "loss": 0.0539, + "step": 1228 + }, + { + "epoch": 0.8315290933694182, + "grad_norm": 0.42400569896634155, + "learning_rate": 9.074954704995532e-06, + "loss": 0.0429, + "step": 1229 + }, + { + "epoch": 0.8322056833558863, + "grad_norm": 0.43217022714275255, + "learning_rate": 9.072672150988563e-06, + "loss": 0.0531, + "step": 1230 + }, + { + "epoch": 0.8328822733423545, + "grad_norm": 0.33547200954628376, + "learning_rate": 9.070387072144386e-06, + "loss": 0.0427, + "step": 1231 + }, + { + "epoch": 0.8335588633288228, + "grad_norm": 0.42985392965408503, + "learning_rate": 9.06809946987962e-06, + "loss": 0.0489, + "step": 1232 + }, + { + "epoch": 0.8342354533152909, + "grad_norm": 0.3897977124645228, + "learning_rate": 9.065809345612458e-06, + "loss": 0.0383, + "step": 1233 + }, + { + "epoch": 0.8349120433017592, + "grad_norm": 0.5109827851236731, + "learning_rate": 9.06351670076265e-06, + "loss": 0.0545, + "step": 1234 + }, + { + "epoch": 0.8355886332882273, + "grad_norm": 0.4915371656148965, + "learning_rate": 9.061221536751517e-06, + "loss": 0.0466, + "step": 1235 + }, + { + "epoch": 0.8362652232746955, + "grad_norm": 0.35946300440613643, + "learning_rate": 9.058923855001935e-06, + "loss": 0.0428, + "step": 1236 + }, + { + "epoch": 0.8369418132611637, + "grad_norm": 0.332874472838773, + "learning_rate": 9.056623656938344e-06, + "loss": 0.0447, + "step": 1237 + }, + { + "epoch": 0.8376184032476319, + "grad_norm": 0.49876938075197785, + "learning_rate": 9.05432094398674e-06, + "loss": 0.0407, + "step": 1238 + }, + { + "epoch": 0.8382949932341002, + "grad_norm": 0.3299883868490397, + "learning_rate": 9.052015717574683e-06, + "loss": 0.0385, + "step": 1239 + }, + { + "epoch": 0.8389715832205683, + "grad_norm": 0.4304477923193939, + "learning_rate": 9.049707979131288e-06, + "loss": 0.051, + "step": 1240 + }, + { + "epoch": 0.8396481732070366, + "grad_norm": 0.4018257282137082, + "learning_rate": 9.04739773008723e-06, + "loss": 0.0328, + "step": 1241 + }, + { + "epoch": 0.8403247631935047, + "grad_norm": 0.5139290067371771, + "learning_rate": 9.045084971874738e-06, + "loss": 0.0545, + "step": 1242 + }, + { + "epoch": 0.8410013531799729, + "grad_norm": 0.3030113463432619, + "learning_rate": 9.042769705927597e-06, + "loss": 0.0278, + "step": 1243 + }, + { + "epoch": 0.8416779431664412, + "grad_norm": 0.9482400717391877, + "learning_rate": 9.040451933681148e-06, + "loss": 0.0589, + "step": 1244 + }, + { + "epoch": 0.8423545331529093, + "grad_norm": 0.4483159760178228, + "learning_rate": 9.038131656572284e-06, + "loss": 0.0419, + "step": 1245 + }, + { + "epoch": 0.8430311231393776, + "grad_norm": 0.40578292251140413, + "learning_rate": 9.035808876039451e-06, + "loss": 0.0436, + "step": 1246 + }, + { + "epoch": 0.8437077131258457, + "grad_norm": 0.3694519035719774, + "learning_rate": 9.033483593522652e-06, + "loss": 0.0384, + "step": 1247 + }, + { + "epoch": 0.8443843031123139, + "grad_norm": 0.4777745163070217, + "learning_rate": 9.03115581046343e-06, + "loss": 0.039, + "step": 1248 + }, + { + "epoch": 0.8450608930987822, + "grad_norm": 0.4213671177327557, + "learning_rate": 9.028825528304892e-06, + "loss": 0.0467, + "step": 1249 + }, + { + "epoch": 0.8457374830852503, + "grad_norm": 0.5393334161933487, + "learning_rate": 9.026492748491683e-06, + "loss": 0.042, + "step": 1250 + }, + { + "epoch": 0.8464140730717186, + "grad_norm": 0.4617021484576158, + "learning_rate": 9.02415747247e-06, + "loss": 0.0479, + "step": 1251 + }, + { + "epoch": 0.8470906630581867, + "grad_norm": 0.44235254226389215, + "learning_rate": 9.02181970168759e-06, + "loss": 0.044, + "step": 1252 + }, + { + "epoch": 0.847767253044655, + "grad_norm": 0.6653049698515878, + "learning_rate": 9.019479437593748e-06, + "loss": 0.0456, + "step": 1253 + }, + { + "epoch": 0.8484438430311232, + "grad_norm": 0.4178732931904288, + "learning_rate": 9.017136681639307e-06, + "loss": 0.0347, + "step": 1254 + }, + { + "epoch": 0.8491204330175913, + "grad_norm": 0.3861007999326273, + "learning_rate": 9.014791435276651e-06, + "loss": 0.04, + "step": 1255 + }, + { + "epoch": 0.8497970230040596, + "grad_norm": 0.40929829348577457, + "learning_rate": 9.012443699959706e-06, + "loss": 0.0394, + "step": 1256 + }, + { + "epoch": 0.8504736129905277, + "grad_norm": 0.811155314670333, + "learning_rate": 9.010093477143942e-06, + "loss": 0.0666, + "step": 1257 + }, + { + "epoch": 0.851150202976996, + "grad_norm": 0.41445007834888836, + "learning_rate": 9.007740768286369e-06, + "loss": 0.0444, + "step": 1258 + }, + { + "epoch": 0.8518267929634641, + "grad_norm": 0.3234418608942231, + "learning_rate": 9.005385574845543e-06, + "loss": 0.0373, + "step": 1259 + }, + { + "epoch": 0.8525033829499323, + "grad_norm": 0.6188106431766026, + "learning_rate": 9.003027898281551e-06, + "loss": 0.0427, + "step": 1260 + }, + { + "epoch": 0.8531799729364006, + "grad_norm": 0.6533740575115408, + "learning_rate": 9.000667740056033e-06, + "loss": 0.0534, + "step": 1261 + }, + { + "epoch": 0.8538565629228687, + "grad_norm": 0.4189806216269285, + "learning_rate": 8.998305101632155e-06, + "loss": 0.0507, + "step": 1262 + }, + { + "epoch": 0.854533152909337, + "grad_norm": 0.27350968064232517, + "learning_rate": 8.995939984474624e-06, + "loss": 0.0287, + "step": 1263 + }, + { + "epoch": 0.8552097428958051, + "grad_norm": 0.42523468953669913, + "learning_rate": 8.99357239004969e-06, + "loss": 0.0402, + "step": 1264 + }, + { + "epoch": 0.8558863328822733, + "grad_norm": 0.5369799132951332, + "learning_rate": 8.991202319825131e-06, + "loss": 0.0549, + "step": 1265 + }, + { + "epoch": 0.8565629228687416, + "grad_norm": 0.42772143241216515, + "learning_rate": 8.988829775270265e-06, + "loss": 0.0382, + "step": 1266 + }, + { + "epoch": 0.8572395128552097, + "grad_norm": 0.2987472351107439, + "learning_rate": 8.986454757855938e-06, + "loss": 0.0349, + "step": 1267 + }, + { + "epoch": 0.857916102841678, + "grad_norm": 0.3961898182156086, + "learning_rate": 8.984077269054535e-06, + "loss": 0.0381, + "step": 1268 + }, + { + "epoch": 0.8585926928281461, + "grad_norm": 0.34654081148722105, + "learning_rate": 8.981697310339972e-06, + "loss": 0.0411, + "step": 1269 + }, + { + "epoch": 0.8592692828146143, + "grad_norm": 0.41463106570300323, + "learning_rate": 8.979314883187694e-06, + "loss": 0.0365, + "step": 1270 + }, + { + "epoch": 0.8599458728010826, + "grad_norm": 0.3310488399216415, + "learning_rate": 8.976929989074677e-06, + "loss": 0.0324, + "step": 1271 + }, + { + "epoch": 0.8606224627875507, + "grad_norm": 0.5261414363189244, + "learning_rate": 8.974542629479426e-06, + "loss": 0.0421, + "step": 1272 + }, + { + "epoch": 0.861299052774019, + "grad_norm": 0.4342571413552553, + "learning_rate": 8.972152805881978e-06, + "loss": 0.0382, + "step": 1273 + }, + { + "epoch": 0.8619756427604871, + "grad_norm": 0.4835208110976876, + "learning_rate": 8.969760519763891e-06, + "loss": 0.0487, + "step": 1274 + }, + { + "epoch": 0.8626522327469553, + "grad_norm": 0.4274045983340695, + "learning_rate": 8.967365772608258e-06, + "loss": 0.0488, + "step": 1275 + }, + { + "epoch": 0.8633288227334236, + "grad_norm": 0.3572140106942213, + "learning_rate": 8.96496856589969e-06, + "loss": 0.0357, + "step": 1276 + }, + { + "epoch": 0.8640054127198917, + "grad_norm": 0.3389369988565005, + "learning_rate": 8.962568901124326e-06, + "loss": 0.0379, + "step": 1277 + }, + { + "epoch": 0.86468200270636, + "grad_norm": 0.4252598709924321, + "learning_rate": 8.96016677976983e-06, + "loss": 0.0552, + "step": 1278 + }, + { + "epoch": 0.8653585926928281, + "grad_norm": 0.4274585234906251, + "learning_rate": 8.957762203325389e-06, + "loss": 0.0608, + "step": 1279 + }, + { + "epoch": 0.8660351826792964, + "grad_norm": 0.413955169996957, + "learning_rate": 8.955355173281709e-06, + "loss": 0.0391, + "step": 1280 + }, + { + "epoch": 0.8667117726657646, + "grad_norm": 0.3632041768440433, + "learning_rate": 8.952945691131016e-06, + "loss": 0.0322, + "step": 1281 + }, + { + "epoch": 0.8673883626522327, + "grad_norm": 0.4207387415918705, + "learning_rate": 8.950533758367063e-06, + "loss": 0.0385, + "step": 1282 + }, + { + "epoch": 0.868064952638701, + "grad_norm": 0.3097737464929893, + "learning_rate": 8.948119376485119e-06, + "loss": 0.0366, + "step": 1283 + }, + { + "epoch": 0.8687415426251691, + "grad_norm": 0.5002017558211296, + "learning_rate": 8.94570254698197e-06, + "loss": 0.0361, + "step": 1284 + }, + { + "epoch": 0.8694181326116374, + "grad_norm": 0.41760825960748166, + "learning_rate": 8.943283271355915e-06, + "loss": 0.0513, + "step": 1285 + }, + { + "epoch": 0.8700947225981055, + "grad_norm": 0.3735950997932303, + "learning_rate": 8.940861551106784e-06, + "loss": 0.0348, + "step": 1286 + }, + { + "epoch": 0.8707713125845737, + "grad_norm": 0.49506876431359564, + "learning_rate": 8.938437387735903e-06, + "loss": 0.0543, + "step": 1287 + }, + { + "epoch": 0.871447902571042, + "grad_norm": 0.39553789847793236, + "learning_rate": 8.93601078274613e-06, + "loss": 0.0358, + "step": 1288 + }, + { + "epoch": 0.8721244925575101, + "grad_norm": 0.28824367101976933, + "learning_rate": 8.933581737641824e-06, + "loss": 0.0344, + "step": 1289 + }, + { + "epoch": 0.8728010825439784, + "grad_norm": 0.29995877856501596, + "learning_rate": 8.931150253928866e-06, + "loss": 0.0324, + "step": 1290 + }, + { + "epoch": 0.8734776725304465, + "grad_norm": 0.34421727276338043, + "learning_rate": 8.928716333114643e-06, + "loss": 0.0424, + "step": 1291 + }, + { + "epoch": 0.8741542625169147, + "grad_norm": 0.42456032358958895, + "learning_rate": 8.926279976708056e-06, + "loss": 0.0397, + "step": 1292 + }, + { + "epoch": 0.874830852503383, + "grad_norm": 0.4052196496131863, + "learning_rate": 8.923841186219512e-06, + "loss": 0.04, + "step": 1293 + }, + { + "epoch": 0.8755074424898511, + "grad_norm": 0.51106078651104, + "learning_rate": 8.921399963160934e-06, + "loss": 0.0383, + "step": 1294 + }, + { + "epoch": 0.8761840324763194, + "grad_norm": 1.0246571675765568, + "learning_rate": 8.918956309045743e-06, + "loss": 0.0419, + "step": 1295 + }, + { + "epoch": 0.8768606224627875, + "grad_norm": 0.489503464874003, + "learning_rate": 8.916510225388878e-06, + "loss": 0.0553, + "step": 1296 + }, + { + "epoch": 0.8775372124492558, + "grad_norm": 0.5868667513212799, + "learning_rate": 8.914061713706776e-06, + "loss": 0.0375, + "step": 1297 + }, + { + "epoch": 0.878213802435724, + "grad_norm": 0.568524914043775, + "learning_rate": 8.911610775517383e-06, + "loss": 0.0316, + "step": 1298 + }, + { + "epoch": 0.8788903924221921, + "grad_norm": 0.4303128515684782, + "learning_rate": 8.90915741234015e-06, + "loss": 0.0457, + "step": 1299 + }, + { + "epoch": 0.8795669824086604, + "grad_norm": 0.4989966037455737, + "learning_rate": 8.906701625696028e-06, + "loss": 0.0441, + "step": 1300 + }, + { + "epoch": 0.8802435723951285, + "grad_norm": 0.3963366881165422, + "learning_rate": 8.904243417107473e-06, + "loss": 0.044, + "step": 1301 + }, + { + "epoch": 0.8809201623815968, + "grad_norm": 1.1829084709010367, + "learning_rate": 8.901782788098442e-06, + "loss": 0.0571, + "step": 1302 + }, + { + "epoch": 0.881596752368065, + "grad_norm": 0.347485401772923, + "learning_rate": 8.899319740194391e-06, + "loss": 0.0337, + "step": 1303 + }, + { + "epoch": 0.8822733423545331, + "grad_norm": 0.31723947459588236, + "learning_rate": 8.89685427492228e-06, + "loss": 0.0306, + "step": 1304 + }, + { + "epoch": 0.8829499323410014, + "grad_norm": 0.43758053153646603, + "learning_rate": 8.894386393810563e-06, + "loss": 0.0394, + "step": 1305 + }, + { + "epoch": 0.8836265223274695, + "grad_norm": 0.5122954040268756, + "learning_rate": 8.891916098389193e-06, + "loss": 0.0492, + "step": 1306 + }, + { + "epoch": 0.8843031123139378, + "grad_norm": 0.4291499611534161, + "learning_rate": 8.889443390189618e-06, + "loss": 0.0501, + "step": 1307 + }, + { + "epoch": 0.8849797023004059, + "grad_norm": 0.43676737301055296, + "learning_rate": 8.886968270744789e-06, + "loss": 0.044, + "step": 1308 + }, + { + "epoch": 0.8856562922868741, + "grad_norm": 0.3464180119501573, + "learning_rate": 8.88449074158914e-06, + "loss": 0.045, + "step": 1309 + }, + { + "epoch": 0.8863328822733424, + "grad_norm": 0.3493272905534359, + "learning_rate": 8.882010804258612e-06, + "loss": 0.0344, + "step": 1310 + }, + { + "epoch": 0.8870094722598105, + "grad_norm": 0.3295603557387321, + "learning_rate": 8.879528460290628e-06, + "loss": 0.0373, + "step": 1311 + }, + { + "epoch": 0.8876860622462788, + "grad_norm": 0.5660882260290939, + "learning_rate": 8.877043711224109e-06, + "loss": 0.0467, + "step": 1312 + }, + { + "epoch": 0.8883626522327469, + "grad_norm": 0.4073247970162156, + "learning_rate": 8.874556558599465e-06, + "loss": 0.0425, + "step": 1313 + }, + { + "epoch": 0.8890392422192152, + "grad_norm": 0.5009482962458793, + "learning_rate": 8.872067003958597e-06, + "loss": 0.0585, + "step": 1314 + }, + { + "epoch": 0.8897158322056834, + "grad_norm": 0.48128534011963076, + "learning_rate": 8.869575048844896e-06, + "loss": 0.04, + "step": 1315 + }, + { + "epoch": 0.8903924221921515, + "grad_norm": 0.2963676138282622, + "learning_rate": 8.867080694803238e-06, + "loss": 0.032, + "step": 1316 + }, + { + "epoch": 0.8910690121786198, + "grad_norm": 0.42528060953519137, + "learning_rate": 8.864583943379987e-06, + "loss": 0.0262, + "step": 1317 + }, + { + "epoch": 0.8917456021650879, + "grad_norm": 0.3732517425386884, + "learning_rate": 8.862084796122998e-06, + "loss": 0.0362, + "step": 1318 + }, + { + "epoch": 0.8924221921515562, + "grad_norm": 0.49779303193927454, + "learning_rate": 8.859583254581604e-06, + "loss": 0.0365, + "step": 1319 + }, + { + "epoch": 0.8930987821380244, + "grad_norm": 0.33431602169000396, + "learning_rate": 8.85707932030663e-06, + "loss": 0.0437, + "step": 1320 + }, + { + "epoch": 0.8937753721244925, + "grad_norm": 0.3337056934295125, + "learning_rate": 8.854572994850376e-06, + "loss": 0.0361, + "step": 1321 + }, + { + "epoch": 0.8944519621109608, + "grad_norm": 0.44781149353065697, + "learning_rate": 8.85206427976663e-06, + "loss": 0.0433, + "step": 1322 + }, + { + "epoch": 0.8951285520974289, + "grad_norm": 0.3760732383639481, + "learning_rate": 8.849553176610661e-06, + "loss": 0.0439, + "step": 1323 + }, + { + "epoch": 0.8958051420838972, + "grad_norm": 0.41732728601814806, + "learning_rate": 8.847039686939218e-06, + "loss": 0.0402, + "step": 1324 + }, + { + "epoch": 0.8964817320703654, + "grad_norm": 0.40984679968932614, + "learning_rate": 8.844523812310527e-06, + "loss": 0.0328, + "step": 1325 + }, + { + "epoch": 0.8971583220568335, + "grad_norm": 0.3972413846697271, + "learning_rate": 8.842005554284296e-06, + "loss": 0.0306, + "step": 1326 + }, + { + "epoch": 0.8978349120433018, + "grad_norm": 0.3877047464990477, + "learning_rate": 8.83948491442171e-06, + "loss": 0.0363, + "step": 1327 + }, + { + "epoch": 0.8985115020297699, + "grad_norm": 0.3983312816608221, + "learning_rate": 8.836961894285428e-06, + "loss": 0.0527, + "step": 1328 + }, + { + "epoch": 0.8991880920162382, + "grad_norm": 0.44535858494343633, + "learning_rate": 8.834436495439588e-06, + "loss": 0.0411, + "step": 1329 + }, + { + "epoch": 0.8998646820027063, + "grad_norm": 0.4155006794569815, + "learning_rate": 8.8319087194498e-06, + "loss": 0.035, + "step": 1330 + }, + { + "epoch": 0.9005412719891746, + "grad_norm": 0.5184428865538863, + "learning_rate": 8.829378567883152e-06, + "loss": 0.0524, + "step": 1331 + }, + { + "epoch": 0.9012178619756428, + "grad_norm": 0.4257897513356357, + "learning_rate": 8.826846042308195e-06, + "loss": 0.0419, + "step": 1332 + }, + { + "epoch": 0.9018944519621109, + "grad_norm": 0.3381178914295803, + "learning_rate": 8.824311144294966e-06, + "loss": 0.0374, + "step": 1333 + }, + { + "epoch": 0.9025710419485792, + "grad_norm": 0.4276666299311541, + "learning_rate": 8.82177387541496e-06, + "loss": 0.0385, + "step": 1334 + }, + { + "epoch": 0.9032476319350473, + "grad_norm": 0.40618159906521484, + "learning_rate": 8.819234237241148e-06, + "loss": 0.0368, + "step": 1335 + }, + { + "epoch": 0.9039242219215156, + "grad_norm": 0.46333943890891277, + "learning_rate": 8.816692231347972e-06, + "loss": 0.0431, + "step": 1336 + }, + { + "epoch": 0.9046008119079838, + "grad_norm": 0.35018955285789055, + "learning_rate": 8.814147859311333e-06, + "loss": 0.0292, + "step": 1337 + }, + { + "epoch": 0.9052774018944519, + "grad_norm": 0.4055635239042611, + "learning_rate": 8.81160112270861e-06, + "loss": 0.0416, + "step": 1338 + }, + { + "epoch": 0.9059539918809202, + "grad_norm": 0.43599525265023237, + "learning_rate": 8.809052023118638e-06, + "loss": 0.0485, + "step": 1339 + }, + { + "epoch": 0.9066305818673883, + "grad_norm": 0.3633448138753023, + "learning_rate": 8.806500562121724e-06, + "loss": 0.0354, + "step": 1340 + }, + { + "epoch": 0.9073071718538566, + "grad_norm": 0.4194681453206341, + "learning_rate": 8.803946741299635e-06, + "loss": 0.0348, + "step": 1341 + }, + { + "epoch": 0.9079837618403248, + "grad_norm": 0.3484340555072282, + "learning_rate": 8.801390562235603e-06, + "loss": 0.0412, + "step": 1342 + }, + { + "epoch": 0.908660351826793, + "grad_norm": 0.39180485947097266, + "learning_rate": 8.79883202651432e-06, + "loss": 0.0344, + "step": 1343 + }, + { + "epoch": 0.9093369418132612, + "grad_norm": 0.5878957640500705, + "learning_rate": 8.796271135721944e-06, + "loss": 0.0438, + "step": 1344 + }, + { + "epoch": 0.9100135317997293, + "grad_norm": 0.3585537157318733, + "learning_rate": 8.793707891446086e-06, + "loss": 0.0465, + "step": 1345 + }, + { + "epoch": 0.9106901217861976, + "grad_norm": 0.42682788574677444, + "learning_rate": 8.791142295275819e-06, + "loss": 0.0419, + "step": 1346 + }, + { + "epoch": 0.9113667117726658, + "grad_norm": 0.43525000643942857, + "learning_rate": 8.788574348801676e-06, + "loss": 0.0439, + "step": 1347 + }, + { + "epoch": 0.912043301759134, + "grad_norm": 0.4271708108608584, + "learning_rate": 8.786004053615642e-06, + "loss": 0.0408, + "step": 1348 + }, + { + "epoch": 0.9127198917456022, + "grad_norm": 0.3982985169696188, + "learning_rate": 8.783431411311165e-06, + "loss": 0.0406, + "step": 1349 + }, + { + "epoch": 0.9133964817320703, + "grad_norm": 0.4420284914173309, + "learning_rate": 8.780856423483145e-06, + "loss": 0.049, + "step": 1350 + }, + { + "epoch": 0.9140730717185386, + "grad_norm": 0.5133876517309015, + "learning_rate": 8.778279091727933e-06, + "loss": 0.0376, + "step": 1351 + }, + { + "epoch": 0.9147496617050067, + "grad_norm": 0.4517642045654338, + "learning_rate": 8.775699417643337e-06, + "loss": 0.036, + "step": 1352 + }, + { + "epoch": 0.915426251691475, + "grad_norm": 0.4187033930337199, + "learning_rate": 8.773117402828618e-06, + "loss": 0.0387, + "step": 1353 + }, + { + "epoch": 0.9161028416779432, + "grad_norm": 0.5689038088698848, + "learning_rate": 8.770533048884483e-06, + "loss": 0.0457, + "step": 1354 + }, + { + "epoch": 0.9167794316644113, + "grad_norm": 0.5477824374413572, + "learning_rate": 8.767946357413091e-06, + "loss": 0.0508, + "step": 1355 + }, + { + "epoch": 0.9174560216508796, + "grad_norm": 0.432617373887483, + "learning_rate": 8.765357330018056e-06, + "loss": 0.0512, + "step": 1356 + }, + { + "epoch": 0.9181326116373477, + "grad_norm": 0.39216575430497996, + "learning_rate": 8.76276596830443e-06, + "loss": 0.0393, + "step": 1357 + }, + { + "epoch": 0.918809201623816, + "grad_norm": 0.505404286144978, + "learning_rate": 8.760172273878723e-06, + "loss": 0.0432, + "step": 1358 + }, + { + "epoch": 0.9194857916102842, + "grad_norm": 0.41370772948770634, + "learning_rate": 8.757576248348883e-06, + "loss": 0.0458, + "step": 1359 + }, + { + "epoch": 0.9201623815967523, + "grad_norm": 0.3533451333059337, + "learning_rate": 8.754977893324305e-06, + "loss": 0.0371, + "step": 1360 + }, + { + "epoch": 0.9208389715832206, + "grad_norm": 0.33252775450950467, + "learning_rate": 8.75237721041583e-06, + "loss": 0.037, + "step": 1361 + }, + { + "epoch": 0.9215155615696887, + "grad_norm": 0.30753827187988675, + "learning_rate": 8.74977420123574e-06, + "loss": 0.043, + "step": 1362 + }, + { + "epoch": 0.922192151556157, + "grad_norm": 0.31934034280234236, + "learning_rate": 8.747168867397765e-06, + "loss": 0.0295, + "step": 1363 + }, + { + "epoch": 0.9228687415426252, + "grad_norm": 0.4469463716086972, + "learning_rate": 8.744561210517067e-06, + "loss": 0.0394, + "step": 1364 + }, + { + "epoch": 0.9235453315290933, + "grad_norm": 0.5110721075186322, + "learning_rate": 8.741951232210254e-06, + "loss": 0.0748, + "step": 1365 + }, + { + "epoch": 0.9242219215155616, + "grad_norm": 0.3587084701697703, + "learning_rate": 8.73933893409537e-06, + "loss": 0.0356, + "step": 1366 + }, + { + "epoch": 0.9248985115020297, + "grad_norm": 0.6121188033381618, + "learning_rate": 8.736724317791903e-06, + "loss": 0.0447, + "step": 1367 + }, + { + "epoch": 0.925575101488498, + "grad_norm": 0.6167435253465511, + "learning_rate": 8.734107384920771e-06, + "loss": 0.0342, + "step": 1368 + }, + { + "epoch": 0.9262516914749662, + "grad_norm": 0.7836362184370608, + "learning_rate": 8.731488137104332e-06, + "loss": 0.0426, + "step": 1369 + }, + { + "epoch": 0.9269282814614344, + "grad_norm": 0.3434747402556253, + "learning_rate": 8.728866575966379e-06, + "loss": 0.0324, + "step": 1370 + }, + { + "epoch": 0.9276048714479026, + "grad_norm": 0.34921447130412037, + "learning_rate": 8.726242703132139e-06, + "loss": 0.0336, + "step": 1371 + }, + { + "epoch": 0.9282814614343707, + "grad_norm": 0.34504458772487184, + "learning_rate": 8.72361652022827e-06, + "loss": 0.0327, + "step": 1372 + }, + { + "epoch": 0.928958051420839, + "grad_norm": 0.4475201290757406, + "learning_rate": 8.720988028882867e-06, + "loss": 0.0436, + "step": 1373 + }, + { + "epoch": 0.9296346414073072, + "grad_norm": 0.5271961046361782, + "learning_rate": 8.71835723072545e-06, + "loss": 0.0508, + "step": 1374 + }, + { + "epoch": 0.9303112313937754, + "grad_norm": 0.5633526363693144, + "learning_rate": 8.715724127386971e-06, + "loss": 0.0458, + "step": 1375 + }, + { + "epoch": 0.9309878213802436, + "grad_norm": 0.36374114879754027, + "learning_rate": 8.713088720499817e-06, + "loss": 0.0315, + "step": 1376 + }, + { + "epoch": 0.9316644113667117, + "grad_norm": 0.415289551630065, + "learning_rate": 8.710451011697794e-06, + "loss": 0.0417, + "step": 1377 + }, + { + "epoch": 0.93234100135318, + "grad_norm": 0.5290341699989562, + "learning_rate": 8.70781100261614e-06, + "loss": 0.0459, + "step": 1378 + }, + { + "epoch": 0.9330175913396481, + "grad_norm": 0.3854241345094511, + "learning_rate": 8.705168694891522e-06, + "loss": 0.0434, + "step": 1379 + }, + { + "epoch": 0.9336941813261164, + "grad_norm": 0.31857051670522657, + "learning_rate": 8.702524090162023e-06, + "loss": 0.0328, + "step": 1380 + }, + { + "epoch": 0.9343707713125846, + "grad_norm": 0.45310218723053053, + "learning_rate": 8.699877190067158e-06, + "loss": 0.0409, + "step": 1381 + }, + { + "epoch": 0.9350473612990527, + "grad_norm": 0.40780180266545213, + "learning_rate": 8.697227996247861e-06, + "loss": 0.0387, + "step": 1382 + }, + { + "epoch": 0.935723951285521, + "grad_norm": 0.42366079031226367, + "learning_rate": 8.694576510346493e-06, + "loss": 0.0464, + "step": 1383 + }, + { + "epoch": 0.9364005412719891, + "grad_norm": 0.5556293597587709, + "learning_rate": 8.691922734006828e-06, + "loss": 0.0467, + "step": 1384 + }, + { + "epoch": 0.9370771312584574, + "grad_norm": 0.3072980297939123, + "learning_rate": 8.689266668874067e-06, + "loss": 0.0344, + "step": 1385 + }, + { + "epoch": 0.9377537212449256, + "grad_norm": 0.4031713885243129, + "learning_rate": 8.686608316594826e-06, + "loss": 0.0537, + "step": 1386 + }, + { + "epoch": 0.9384303112313938, + "grad_norm": 0.2959236664300782, + "learning_rate": 8.683947678817139e-06, + "loss": 0.0281, + "step": 1387 + }, + { + "epoch": 0.939106901217862, + "grad_norm": 0.37637415121870216, + "learning_rate": 8.681284757190462e-06, + "loss": 0.0428, + "step": 1388 + }, + { + "epoch": 0.9397834912043301, + "grad_norm": 0.33953434939379035, + "learning_rate": 8.67861955336566e-06, + "loss": 0.0357, + "step": 1389 + }, + { + "epoch": 0.9404600811907984, + "grad_norm": 0.36545007825329084, + "learning_rate": 8.675952068995014e-06, + "loss": 0.0466, + "step": 1390 + }, + { + "epoch": 0.9411366711772666, + "grad_norm": 0.363065405780556, + "learning_rate": 8.673282305732225e-06, + "loss": 0.0378, + "step": 1391 + }, + { + "epoch": 0.9418132611637348, + "grad_norm": 0.47084181189146035, + "learning_rate": 8.670610265232398e-06, + "loss": 0.0438, + "step": 1392 + }, + { + "epoch": 0.942489851150203, + "grad_norm": 0.6011275614491401, + "learning_rate": 8.667935949152057e-06, + "loss": 0.0425, + "step": 1393 + }, + { + "epoch": 0.9431664411366711, + "grad_norm": 0.25204300564609355, + "learning_rate": 8.665259359149132e-06, + "loss": 0.0291, + "step": 1394 + }, + { + "epoch": 0.9438430311231394, + "grad_norm": 0.33203799073671036, + "learning_rate": 8.662580496882967e-06, + "loss": 0.0365, + "step": 1395 + }, + { + "epoch": 0.9445196211096076, + "grad_norm": 0.5095084735188393, + "learning_rate": 8.659899364014309e-06, + "loss": 0.0539, + "step": 1396 + }, + { + "epoch": 0.9451962110960758, + "grad_norm": 0.400212300905144, + "learning_rate": 8.657215962205318e-06, + "loss": 0.0459, + "step": 1397 + }, + { + "epoch": 0.945872801082544, + "grad_norm": 0.47524099633985223, + "learning_rate": 8.654530293119558e-06, + "loss": 0.0426, + "step": 1398 + }, + { + "epoch": 0.9465493910690121, + "grad_norm": 0.4513709827543037, + "learning_rate": 8.651842358421999e-06, + "loss": 0.0384, + "step": 1399 + }, + { + "epoch": 0.9472259810554804, + "grad_norm": 0.42380788435904354, + "learning_rate": 8.649152159779015e-06, + "loss": 0.0534, + "step": 1400 + }, + { + "epoch": 0.9479025710419485, + "grad_norm": 0.4746189652862116, + "learning_rate": 8.646459698858386e-06, + "loss": 0.0496, + "step": 1401 + }, + { + "epoch": 0.9485791610284168, + "grad_norm": 0.33647820772373627, + "learning_rate": 8.64376497732929e-06, + "loss": 0.0359, + "step": 1402 + }, + { + "epoch": 0.949255751014885, + "grad_norm": 0.3826856128010313, + "learning_rate": 8.64106799686231e-06, + "loss": 0.0379, + "step": 1403 + }, + { + "epoch": 0.9499323410013532, + "grad_norm": 0.43063802215566566, + "learning_rate": 8.638368759129433e-06, + "loss": 0.0383, + "step": 1404 + }, + { + "epoch": 0.9506089309878214, + "grad_norm": 0.4379848528368647, + "learning_rate": 8.635667265804034e-06, + "loss": 0.0402, + "step": 1405 + }, + { + "epoch": 0.9512855209742895, + "grad_norm": 0.365501588914398, + "learning_rate": 8.632963518560894e-06, + "loss": 0.0302, + "step": 1406 + }, + { + "epoch": 0.9519621109607578, + "grad_norm": 0.5678982451258465, + "learning_rate": 8.630257519076196e-06, + "loss": 0.0514, + "step": 1407 + }, + { + "epoch": 0.952638700947226, + "grad_norm": 0.42652816898918783, + "learning_rate": 8.627549269027509e-06, + "loss": 0.0335, + "step": 1408 + }, + { + "epoch": 0.9533152909336942, + "grad_norm": 0.4278765027867784, + "learning_rate": 8.624838770093805e-06, + "loss": 0.052, + "step": 1409 + }, + { + "epoch": 0.9539918809201624, + "grad_norm": 0.35663373279217603, + "learning_rate": 8.622126023955446e-06, + "loss": 0.0352, + "step": 1410 + }, + { + "epoch": 0.9546684709066305, + "grad_norm": 0.2603426735691204, + "learning_rate": 8.619411032294187e-06, + "loss": 0.0252, + "step": 1411 + }, + { + "epoch": 0.9553450608930988, + "grad_norm": 0.4082089492484779, + "learning_rate": 8.616693796793178e-06, + "loss": 0.0371, + "step": 1412 + }, + { + "epoch": 0.956021650879567, + "grad_norm": 0.4961975359572775, + "learning_rate": 8.613974319136959e-06, + "loss": 0.0471, + "step": 1413 + }, + { + "epoch": 0.9566982408660352, + "grad_norm": 0.3054302991764964, + "learning_rate": 8.611252601011457e-06, + "loss": 0.0301, + "step": 1414 + }, + { + "epoch": 0.9573748308525034, + "grad_norm": 0.30909263378726926, + "learning_rate": 8.608528644103994e-06, + "loss": 0.0426, + "step": 1415 + }, + { + "epoch": 0.9580514208389715, + "grad_norm": 0.36823686981212433, + "learning_rate": 8.605802450103276e-06, + "loss": 0.0318, + "step": 1416 + }, + { + "epoch": 0.9587280108254398, + "grad_norm": 0.4674893579677942, + "learning_rate": 8.603074020699393e-06, + "loss": 0.0393, + "step": 1417 + }, + { + "epoch": 0.959404600811908, + "grad_norm": 0.8831206562571963, + "learning_rate": 8.600343357583826e-06, + "loss": 0.0625, + "step": 1418 + }, + { + "epoch": 0.9600811907983762, + "grad_norm": 0.4818670093680137, + "learning_rate": 8.597610462449441e-06, + "loss": 0.0409, + "step": 1419 + }, + { + "epoch": 0.9607577807848444, + "grad_norm": 0.4343179237952745, + "learning_rate": 8.594875336990482e-06, + "loss": 0.046, + "step": 1420 + }, + { + "epoch": 0.9614343707713126, + "grad_norm": 0.45361580834281207, + "learning_rate": 8.592137982902585e-06, + "loss": 0.0376, + "step": 1421 + }, + { + "epoch": 0.9621109607577808, + "grad_norm": 0.3861893952181174, + "learning_rate": 8.589398401882755e-06, + "loss": 0.0357, + "step": 1422 + }, + { + "epoch": 0.9627875507442489, + "grad_norm": 0.49705716116923543, + "learning_rate": 8.586656595629387e-06, + "loss": 0.0442, + "step": 1423 + }, + { + "epoch": 0.9634641407307172, + "grad_norm": 2.0948926324352524, + "learning_rate": 8.583912565842258e-06, + "loss": 0.0437, + "step": 1424 + }, + { + "epoch": 0.9641407307171854, + "grad_norm": 0.7543514241780067, + "learning_rate": 8.581166314222512e-06, + "loss": 0.04, + "step": 1425 + }, + { + "epoch": 0.9648173207036536, + "grad_norm": 0.6287871594271817, + "learning_rate": 8.57841784247268e-06, + "loss": 0.044, + "step": 1426 + }, + { + "epoch": 0.9654939106901218, + "grad_norm": 0.2973673246487005, + "learning_rate": 8.575667152296666e-06, + "loss": 0.0315, + "step": 1427 + }, + { + "epoch": 0.9661705006765899, + "grad_norm": 0.4947375813489154, + "learning_rate": 8.572914245399748e-06, + "loss": 0.0413, + "step": 1428 + }, + { + "epoch": 0.9668470906630582, + "grad_norm": 0.6129574364831808, + "learning_rate": 8.570159123488584e-06, + "loss": 0.0424, + "step": 1429 + }, + { + "epoch": 0.9675236806495264, + "grad_norm": 0.4962127020607894, + "learning_rate": 8.567401788271195e-06, + "loss": 0.037, + "step": 1430 + }, + { + "epoch": 0.9682002706359946, + "grad_norm": 0.7511633360613846, + "learning_rate": 8.564642241456986e-06, + "loss": 0.0455, + "step": 1431 + }, + { + "epoch": 0.9688768606224628, + "grad_norm": 0.3728095573745696, + "learning_rate": 8.561880484756726e-06, + "loss": 0.0386, + "step": 1432 + }, + { + "epoch": 0.969553450608931, + "grad_norm": 0.7617079823802402, + "learning_rate": 8.559116519882551e-06, + "loss": 0.0445, + "step": 1433 + }, + { + "epoch": 0.9702300405953992, + "grad_norm": 0.5868883031623126, + "learning_rate": 8.556350348547978e-06, + "loss": 0.0329, + "step": 1434 + }, + { + "epoch": 0.9709066305818674, + "grad_norm": 0.37583133626818915, + "learning_rate": 8.553581972467875e-06, + "loss": 0.0332, + "step": 1435 + }, + { + "epoch": 0.9715832205683356, + "grad_norm": 0.46307090629435255, + "learning_rate": 8.550811393358494e-06, + "loss": 0.0369, + "step": 1436 + }, + { + "epoch": 0.9722598105548038, + "grad_norm": 0.8679344247761973, + "learning_rate": 8.54803861293744e-06, + "loss": 0.0583, + "step": 1437 + }, + { + "epoch": 0.972936400541272, + "grad_norm": 0.5324883304926782, + "learning_rate": 8.545263632923687e-06, + "loss": 0.0316, + "step": 1438 + }, + { + "epoch": 0.9736129905277402, + "grad_norm": 0.3736040104215009, + "learning_rate": 8.542486455037578e-06, + "loss": 0.0304, + "step": 1439 + }, + { + "epoch": 0.9742895805142084, + "grad_norm": 0.5557634980946791, + "learning_rate": 8.539707081000808e-06, + "loss": 0.0353, + "step": 1440 + }, + { + "epoch": 0.9749661705006766, + "grad_norm": 0.8313826424684858, + "learning_rate": 8.536925512536441e-06, + "loss": 0.0404, + "step": 1441 + }, + { + "epoch": 0.9756427604871448, + "grad_norm": 0.4738732342928901, + "learning_rate": 8.534141751368901e-06, + "loss": 0.0455, + "step": 1442 + }, + { + "epoch": 0.976319350473613, + "grad_norm": 0.4252280747402319, + "learning_rate": 8.531355799223968e-06, + "loss": 0.0376, + "step": 1443 + }, + { + "epoch": 0.9769959404600812, + "grad_norm": 0.5163268090039261, + "learning_rate": 8.528567657828785e-06, + "loss": 0.0359, + "step": 1444 + }, + { + "epoch": 0.9776725304465493, + "grad_norm": 0.6482992708520467, + "learning_rate": 8.525777328911846e-06, + "loss": 0.0402, + "step": 1445 + }, + { + "epoch": 0.9783491204330176, + "grad_norm": 0.327154709322352, + "learning_rate": 8.522984814203006e-06, + "loss": 0.0264, + "step": 1446 + }, + { + "epoch": 0.9790257104194858, + "grad_norm": 0.34617077354022013, + "learning_rate": 8.520190115433473e-06, + "loss": 0.0351, + "step": 1447 + }, + { + "epoch": 0.979702300405954, + "grad_norm": 1.0728151682978733, + "learning_rate": 8.517393234335812e-06, + "loss": 0.0602, + "step": 1448 + }, + { + "epoch": 0.9803788903924222, + "grad_norm": 0.47186916437704224, + "learning_rate": 8.514594172643934e-06, + "loss": 0.0535, + "step": 1449 + }, + { + "epoch": 0.9810554803788903, + "grad_norm": 0.5570069753863448, + "learning_rate": 8.51179293209311e-06, + "loss": 0.0474, + "step": 1450 + }, + { + "epoch": 0.9817320703653586, + "grad_norm": 0.414451585522381, + "learning_rate": 8.508989514419959e-06, + "loss": 0.0327, + "step": 1451 + }, + { + "epoch": 0.9824086603518268, + "grad_norm": 0.3941767751245504, + "learning_rate": 8.506183921362443e-06, + "loss": 0.0268, + "step": 1452 + }, + { + "epoch": 0.983085250338295, + "grad_norm": 0.5328637980848144, + "learning_rate": 8.503376154659886e-06, + "loss": 0.0512, + "step": 1453 + }, + { + "epoch": 0.9837618403247632, + "grad_norm": 0.46205090639806845, + "learning_rate": 8.500566216052948e-06, + "loss": 0.0473, + "step": 1454 + }, + { + "epoch": 0.9844384303112313, + "grad_norm": 0.4458660847402536, + "learning_rate": 8.497754107283637e-06, + "loss": 0.0379, + "step": 1455 + }, + { + "epoch": 0.9851150202976996, + "grad_norm": 0.37550898093481744, + "learning_rate": 8.494939830095315e-06, + "loss": 0.0484, + "step": 1456 + }, + { + "epoch": 0.9857916102841678, + "grad_norm": 0.5418968458578318, + "learning_rate": 8.492123386232678e-06, + "loss": 0.0479, + "step": 1457 + }, + { + "epoch": 0.986468200270636, + "grad_norm": 0.6605495668597899, + "learning_rate": 8.489304777441772e-06, + "loss": 0.0337, + "step": 1458 + }, + { + "epoch": 0.9871447902571042, + "grad_norm": 0.37757453503117505, + "learning_rate": 8.486484005469977e-06, + "loss": 0.0441, + "step": 1459 + }, + { + "epoch": 0.9878213802435724, + "grad_norm": 1.6101027995387267, + "learning_rate": 8.483661072066027e-06, + "loss": 0.0536, + "step": 1460 + }, + { + "epoch": 0.9884979702300406, + "grad_norm": 0.37741568372068746, + "learning_rate": 8.480835978979983e-06, + "loss": 0.0332, + "step": 1461 + }, + { + "epoch": 0.9891745602165088, + "grad_norm": 0.360903326245528, + "learning_rate": 8.478008727963253e-06, + "loss": 0.0378, + "step": 1462 + }, + { + "epoch": 0.989851150202977, + "grad_norm": 0.4449941405624222, + "learning_rate": 8.475179320768581e-06, + "loss": 0.0376, + "step": 1463 + }, + { + "epoch": 0.9905277401894452, + "grad_norm": 0.3399711778243278, + "learning_rate": 8.472347759150044e-06, + "loss": 0.0275, + "step": 1464 + }, + { + "epoch": 0.9912043301759134, + "grad_norm": 0.6851638740987143, + "learning_rate": 8.46951404486306e-06, + "loss": 0.0419, + "step": 1465 + }, + { + "epoch": 0.9918809201623816, + "grad_norm": 0.45020867674883097, + "learning_rate": 8.466678179664378e-06, + "loss": 0.0528, + "step": 1466 + }, + { + "epoch": 0.9925575101488497, + "grad_norm": 0.5147653396813089, + "learning_rate": 8.463840165312083e-06, + "loss": 0.0682, + "step": 1467 + }, + { + "epoch": 0.993234100135318, + "grad_norm": 0.35734575635179644, + "learning_rate": 8.461000003565588e-06, + "loss": 0.0447, + "step": 1468 + }, + { + "epoch": 0.9939106901217862, + "grad_norm": 1.1181458457099287, + "learning_rate": 8.458157696185643e-06, + "loss": 0.0566, + "step": 1469 + }, + { + "epoch": 0.9945872801082544, + "grad_norm": 0.3770843257846701, + "learning_rate": 8.455313244934324e-06, + "loss": 0.0412, + "step": 1470 + }, + { + "epoch": 0.9952638700947226, + "grad_norm": 0.3277284833415035, + "learning_rate": 8.452466651575039e-06, + "loss": 0.0319, + "step": 1471 + }, + { + "epoch": 0.9959404600811907, + "grad_norm": 0.4301378623400825, + "learning_rate": 8.44961791787252e-06, + "loss": 0.0455, + "step": 1472 + }, + { + "epoch": 0.996617050067659, + "grad_norm": 0.5153539805412881, + "learning_rate": 8.446767045592829e-06, + "loss": 0.0502, + "step": 1473 + }, + { + "epoch": 0.9972936400541272, + "grad_norm": 0.4532534249478539, + "learning_rate": 8.443914036503356e-06, + "loss": 0.0451, + "step": 1474 + }, + { + "epoch": 0.9979702300405954, + "grad_norm": 0.4550178125850516, + "learning_rate": 8.44105889237281e-06, + "loss": 0.0492, + "step": 1475 + }, + { + "epoch": 0.9986468200270636, + "grad_norm": 0.4079338583231294, + "learning_rate": 8.438201614971227e-06, + "loss": 0.0435, + "step": 1476 + }, + { + "epoch": 0.9993234100135318, + "grad_norm": 0.37749640154370667, + "learning_rate": 8.435342206069965e-06, + "loss": 0.0374, + "step": 1477 + }, + { + "epoch": 1.0, + "grad_norm": 0.5003925380914696, + "learning_rate": 8.432480667441703e-06, + "loss": 0.0446, + "step": 1478 + }, + { + "epoch": 1.0, + "eval_loss": 0.043284833431243896, + "eval_runtime": 236.1821, + "eval_samples_per_second": 42.15, + "eval_steps_per_second": 1.321, + "step": 1478 + }, + { + "epoch": 1.0006765899864682, + "grad_norm": 0.33522796023289275, + "learning_rate": 8.429617000860441e-06, + "loss": 0.0324, + "step": 1479 + }, + { + "epoch": 1.0013531799729365, + "grad_norm": 0.33595347771660705, + "learning_rate": 8.4267512081015e-06, + "loss": 0.0315, + "step": 1480 + }, + { + "epoch": 1.0020297699594045, + "grad_norm": 0.3800678530622861, + "learning_rate": 8.423883290941514e-06, + "loss": 0.0462, + "step": 1481 + }, + { + "epoch": 1.0027063599458728, + "grad_norm": 0.9187984515660299, + "learning_rate": 8.421013251158437e-06, + "loss": 0.0704, + "step": 1482 + }, + { + "epoch": 1.003382949932341, + "grad_norm": 0.34154557986940803, + "learning_rate": 8.418141090531543e-06, + "loss": 0.0369, + "step": 1483 + }, + { + "epoch": 1.0040595399188093, + "grad_norm": 0.4655237200659949, + "learning_rate": 8.415266810841412e-06, + "loss": 0.043, + "step": 1484 + }, + { + "epoch": 1.0047361299052775, + "grad_norm": 0.2820164912654239, + "learning_rate": 8.412390413869944e-06, + "loss": 0.0246, + "step": 1485 + }, + { + "epoch": 1.0054127198917455, + "grad_norm": 0.5598470452671843, + "learning_rate": 8.409511901400351e-06, + "loss": 0.0464, + "step": 1486 + }, + { + "epoch": 1.0060893098782138, + "grad_norm": 0.4965875342631596, + "learning_rate": 8.406631275217156e-06, + "loss": 0.0373, + "step": 1487 + }, + { + "epoch": 1.006765899864682, + "grad_norm": 0.37281908124115976, + "learning_rate": 8.40374853710619e-06, + "loss": 0.0325, + "step": 1488 + }, + { + "epoch": 1.0074424898511503, + "grad_norm": 0.35947338122712696, + "learning_rate": 8.400863688854598e-06, + "loss": 0.03, + "step": 1489 + }, + { + "epoch": 1.0081190798376185, + "grad_norm": 0.8250910442927936, + "learning_rate": 8.397976732250827e-06, + "loss": 0.0353, + "step": 1490 + }, + { + "epoch": 1.0087956698240865, + "grad_norm": 1.089833068210979, + "learning_rate": 8.395087669084638e-06, + "loss": 0.0387, + "step": 1491 + }, + { + "epoch": 1.0094722598105548, + "grad_norm": 0.4130176092842279, + "learning_rate": 8.392196501147092e-06, + "loss": 0.0341, + "step": 1492 + }, + { + "epoch": 1.010148849797023, + "grad_norm": 0.7927218466842345, + "learning_rate": 8.389303230230556e-06, + "loss": 0.0465, + "step": 1493 + }, + { + "epoch": 1.0108254397834913, + "grad_norm": 0.3443406951629904, + "learning_rate": 8.386407858128707e-06, + "loss": 0.0269, + "step": 1494 + }, + { + "epoch": 1.0115020297699595, + "grad_norm": 0.5931951717062812, + "learning_rate": 8.383510386636516e-06, + "loss": 0.0409, + "step": 1495 + }, + { + "epoch": 1.0121786197564275, + "grad_norm": 0.48089521261360063, + "learning_rate": 8.380610817550256e-06, + "loss": 0.0305, + "step": 1496 + }, + { + "epoch": 1.0128552097428958, + "grad_norm": 0.3327952861061012, + "learning_rate": 8.377709152667513e-06, + "loss": 0.0362, + "step": 1497 + }, + { + "epoch": 1.013531799729364, + "grad_norm": 0.34476344023970174, + "learning_rate": 8.374805393787154e-06, + "loss": 0.0317, + "step": 1498 + }, + { + "epoch": 1.0142083897158323, + "grad_norm": 0.66524767617579, + "learning_rate": 8.371899542709355e-06, + "loss": 0.0602, + "step": 1499 + }, + { + "epoch": 1.0148849797023005, + "grad_norm": 0.5081402036360071, + "learning_rate": 8.36899160123559e-06, + "loss": 0.0485, + "step": 1500 + }, + { + "epoch": 1.0155615696887685, + "grad_norm": 0.674647947598683, + "learning_rate": 8.366081571168625e-06, + "loss": 0.0318, + "step": 1501 + }, + { + "epoch": 1.0162381596752368, + "grad_norm": 0.30597063315759776, + "learning_rate": 8.363169454312518e-06, + "loss": 0.0312, + "step": 1502 + }, + { + "epoch": 1.016914749661705, + "grad_norm": 0.44430106872899877, + "learning_rate": 8.36025525247263e-06, + "loss": 0.0362, + "step": 1503 + }, + { + "epoch": 1.0175913396481733, + "grad_norm": 0.3661569722846717, + "learning_rate": 8.357338967455605e-06, + "loss": 0.0287, + "step": 1504 + }, + { + "epoch": 1.0182679296346413, + "grad_norm": 0.6498882765499547, + "learning_rate": 8.354420601069384e-06, + "loss": 0.0322, + "step": 1505 + }, + { + "epoch": 1.0189445196211095, + "grad_norm": 0.5930942256287522, + "learning_rate": 8.3515001551232e-06, + "loss": 0.0339, + "step": 1506 + }, + { + "epoch": 1.0196211096075778, + "grad_norm": 0.3884818716396439, + "learning_rate": 8.348577631427565e-06, + "loss": 0.032, + "step": 1507 + }, + { + "epoch": 1.020297699594046, + "grad_norm": 0.511039217554216, + "learning_rate": 8.345653031794292e-06, + "loss": 0.0348, + "step": 1508 + }, + { + "epoch": 1.0209742895805143, + "grad_norm": 0.46397974067848646, + "learning_rate": 8.342726358036473e-06, + "loss": 0.0411, + "step": 1509 + }, + { + "epoch": 1.0216508795669823, + "grad_norm": 0.5205087161344565, + "learning_rate": 8.339797611968488e-06, + "loss": 0.0484, + "step": 1510 + }, + { + "epoch": 1.0223274695534506, + "grad_norm": 0.3811280149269098, + "learning_rate": 8.336866795406003e-06, + "loss": 0.0355, + "step": 1511 + }, + { + "epoch": 1.0230040595399188, + "grad_norm": 0.3328731343602381, + "learning_rate": 8.333933910165964e-06, + "loss": 0.0316, + "step": 1512 + }, + { + "epoch": 1.023680649526387, + "grad_norm": 0.4671503400685048, + "learning_rate": 8.3309989580666e-06, + "loss": 0.0338, + "step": 1513 + }, + { + "epoch": 1.0243572395128553, + "grad_norm": 0.4439344594654805, + "learning_rate": 8.32806194092743e-06, + "loss": 0.0339, + "step": 1514 + }, + { + "epoch": 1.0250338294993233, + "grad_norm": 0.3248928138089397, + "learning_rate": 8.325122860569241e-06, + "loss": 0.0306, + "step": 1515 + }, + { + "epoch": 1.0257104194857916, + "grad_norm": 0.3066152336742344, + "learning_rate": 8.322181718814107e-06, + "loss": 0.0338, + "step": 1516 + }, + { + "epoch": 1.0263870094722598, + "grad_norm": 0.39788533595101166, + "learning_rate": 8.319238517485376e-06, + "loss": 0.0318, + "step": 1517 + }, + { + "epoch": 1.027063599458728, + "grad_norm": 0.37846270133688437, + "learning_rate": 8.316293258407673e-06, + "loss": 0.0323, + "step": 1518 + }, + { + "epoch": 1.0277401894451963, + "grad_norm": 0.33545972275343455, + "learning_rate": 8.313345943406903e-06, + "loss": 0.031, + "step": 1519 + }, + { + "epoch": 1.0284167794316643, + "grad_norm": 0.34039697813991265, + "learning_rate": 8.310396574310239e-06, + "loss": 0.0396, + "step": 1520 + }, + { + "epoch": 1.0290933694181326, + "grad_norm": 0.42115212717125267, + "learning_rate": 8.307445152946133e-06, + "loss": 0.0372, + "step": 1521 + }, + { + "epoch": 1.0297699594046008, + "grad_norm": 0.39351326497939293, + "learning_rate": 8.304491681144306e-06, + "loss": 0.0423, + "step": 1522 + }, + { + "epoch": 1.030446549391069, + "grad_norm": 0.38777271173894157, + "learning_rate": 8.301536160735752e-06, + "loss": 0.0303, + "step": 1523 + }, + { + "epoch": 1.0311231393775373, + "grad_norm": 0.5419920395095353, + "learning_rate": 8.298578593552737e-06, + "loss": 0.0339, + "step": 1524 + }, + { + "epoch": 1.0317997293640053, + "grad_norm": 0.34527420183614954, + "learning_rate": 8.295618981428788e-06, + "loss": 0.0223, + "step": 1525 + }, + { + "epoch": 1.0324763193504736, + "grad_norm": 0.34245985834211895, + "learning_rate": 8.292657326198707e-06, + "loss": 0.0323, + "step": 1526 + }, + { + "epoch": 1.0331529093369418, + "grad_norm": 0.5092072347718094, + "learning_rate": 8.289693629698564e-06, + "loss": 0.0387, + "step": 1527 + }, + { + "epoch": 1.03382949932341, + "grad_norm": 0.4761678063172832, + "learning_rate": 8.286727893765687e-06, + "loss": 0.0387, + "step": 1528 + }, + { + "epoch": 1.0345060893098783, + "grad_norm": 0.8543000263590717, + "learning_rate": 8.283760120238672e-06, + "loss": 0.0475, + "step": 1529 + }, + { + "epoch": 1.0351826792963463, + "grad_norm": 0.3218258332800442, + "learning_rate": 8.280790310957382e-06, + "loss": 0.033, + "step": 1530 + }, + { + "epoch": 1.0358592692828146, + "grad_norm": 0.254089712043321, + "learning_rate": 8.277818467762937e-06, + "loss": 0.0245, + "step": 1531 + }, + { + "epoch": 1.0365358592692828, + "grad_norm": 0.4275886427356734, + "learning_rate": 8.27484459249772e-06, + "loss": 0.0439, + "step": 1532 + }, + { + "epoch": 1.037212449255751, + "grad_norm": 0.5234314087999413, + "learning_rate": 8.271868687005371e-06, + "loss": 0.0453, + "step": 1533 + }, + { + "epoch": 1.0378890392422193, + "grad_norm": 0.3271970526964452, + "learning_rate": 8.268890753130794e-06, + "loss": 0.0296, + "step": 1534 + }, + { + "epoch": 1.0385656292286873, + "grad_norm": 0.6884229044748504, + "learning_rate": 8.265910792720147e-06, + "loss": 0.05, + "step": 1535 + }, + { + "epoch": 1.0392422192151556, + "grad_norm": 0.4463909883398464, + "learning_rate": 8.262928807620843e-06, + "loss": 0.0399, + "step": 1536 + }, + { + "epoch": 1.0399188092016238, + "grad_norm": 0.39159432382126147, + "learning_rate": 8.259944799681555e-06, + "loss": 0.0323, + "step": 1537 + }, + { + "epoch": 1.040595399188092, + "grad_norm": 0.4942700529498418, + "learning_rate": 8.256958770752203e-06, + "loss": 0.0499, + "step": 1538 + }, + { + "epoch": 1.0412719891745603, + "grad_norm": 0.2945986814302107, + "learning_rate": 8.253970722683968e-06, + "loss": 0.0261, + "step": 1539 + }, + { + "epoch": 1.0419485791610283, + "grad_norm": 0.3335683511759464, + "learning_rate": 8.250980657329278e-06, + "loss": 0.0319, + "step": 1540 + }, + { + "epoch": 1.0426251691474966, + "grad_norm": 0.31197255403397706, + "learning_rate": 8.24798857654181e-06, + "loss": 0.0321, + "step": 1541 + }, + { + "epoch": 1.0433017591339648, + "grad_norm": 0.3053910422157987, + "learning_rate": 8.244994482176495e-06, + "loss": 0.025, + "step": 1542 + }, + { + "epoch": 1.043978349120433, + "grad_norm": 0.5682066498563606, + "learning_rate": 8.241998376089508e-06, + "loss": 0.0357, + "step": 1543 + }, + { + "epoch": 1.044654939106901, + "grad_norm": 0.5442165459359114, + "learning_rate": 8.239000260138277e-06, + "loss": 0.0391, + "step": 1544 + }, + { + "epoch": 1.0453315290933693, + "grad_norm": 0.41921869529424377, + "learning_rate": 8.236000136181468e-06, + "loss": 0.0433, + "step": 1545 + }, + { + "epoch": 1.0460081190798376, + "grad_norm": 0.4051124974993968, + "learning_rate": 8.232998006078998e-06, + "loss": 0.0318, + "step": 1546 + }, + { + "epoch": 1.0466847090663058, + "grad_norm": 0.510469955338187, + "learning_rate": 8.229993871692028e-06, + "loss": 0.0378, + "step": 1547 + }, + { + "epoch": 1.047361299052774, + "grad_norm": 0.45677758549316627, + "learning_rate": 8.226987734882956e-06, + "loss": 0.0445, + "step": 1548 + }, + { + "epoch": 1.048037889039242, + "grad_norm": 0.4883730554355881, + "learning_rate": 8.223979597515425e-06, + "loss": 0.037, + "step": 1549 + }, + { + "epoch": 1.0487144790257104, + "grad_norm": 0.40831327457707306, + "learning_rate": 8.220969461454322e-06, + "loss": 0.0438, + "step": 1550 + }, + { + "epoch": 1.0493910690121786, + "grad_norm": 0.4668162520799239, + "learning_rate": 8.217957328565765e-06, + "loss": 0.032, + "step": 1551 + }, + { + "epoch": 1.0500676589986468, + "grad_norm": 0.377538200410174, + "learning_rate": 8.214943200717114e-06, + "loss": 0.0406, + "step": 1552 + }, + { + "epoch": 1.050744248985115, + "grad_norm": 0.5971134143439779, + "learning_rate": 8.211927079776969e-06, + "loss": 0.0303, + "step": 1553 + }, + { + "epoch": 1.0514208389715831, + "grad_norm": 0.33996140132456804, + "learning_rate": 8.208908967615159e-06, + "loss": 0.0265, + "step": 1554 + }, + { + "epoch": 1.0520974289580514, + "grad_norm": 0.4139458638311267, + "learning_rate": 8.205888866102753e-06, + "loss": 0.0429, + "step": 1555 + }, + { + "epoch": 1.0527740189445196, + "grad_norm": 0.373063790038525, + "learning_rate": 8.202866777112049e-06, + "loss": 0.0404, + "step": 1556 + }, + { + "epoch": 1.0534506089309879, + "grad_norm": 0.4080877429876385, + "learning_rate": 8.199842702516584e-06, + "loss": 0.0446, + "step": 1557 + }, + { + "epoch": 1.054127198917456, + "grad_norm": 0.333078946434557, + "learning_rate": 8.196816644191116e-06, + "loss": 0.0349, + "step": 1558 + }, + { + "epoch": 1.0548037889039241, + "grad_norm": 0.5599428068564104, + "learning_rate": 8.193788604011639e-06, + "loss": 0.045, + "step": 1559 + }, + { + "epoch": 1.0554803788903924, + "grad_norm": 0.38792237910080307, + "learning_rate": 8.190758583855379e-06, + "loss": 0.0345, + "step": 1560 + }, + { + "epoch": 1.0561569688768606, + "grad_norm": 0.3910418436826441, + "learning_rate": 8.187726585600779e-06, + "loss": 0.0399, + "step": 1561 + }, + { + "epoch": 1.0568335588633289, + "grad_norm": 0.38310512968348226, + "learning_rate": 8.18469261112752e-06, + "loss": 0.032, + "step": 1562 + }, + { + "epoch": 1.057510148849797, + "grad_norm": 0.30445322632194627, + "learning_rate": 8.181656662316498e-06, + "loss": 0.0319, + "step": 1563 + }, + { + "epoch": 1.0581867388362651, + "grad_norm": 0.3300628513317476, + "learning_rate": 8.178618741049841e-06, + "loss": 0.024, + "step": 1564 + }, + { + "epoch": 1.0588633288227334, + "grad_norm": 0.3856795494235353, + "learning_rate": 8.175578849210894e-06, + "loss": 0.0448, + "step": 1565 + }, + { + "epoch": 1.0595399188092016, + "grad_norm": 0.5723942162671996, + "learning_rate": 8.172536988684227e-06, + "loss": 0.046, + "step": 1566 + }, + { + "epoch": 1.0602165087956699, + "grad_norm": 0.31413445599872286, + "learning_rate": 8.169493161355632e-06, + "loss": 0.0374, + "step": 1567 + }, + { + "epoch": 1.060893098782138, + "grad_norm": 0.5475407183176644, + "learning_rate": 8.166447369112115e-06, + "loss": 0.0489, + "step": 1568 + }, + { + "epoch": 1.0615696887686061, + "grad_norm": 0.5244671095066844, + "learning_rate": 8.163399613841903e-06, + "loss": 0.0495, + "step": 1569 + }, + { + "epoch": 1.0622462787550744, + "grad_norm": 0.3758287293296234, + "learning_rate": 8.160349897434441e-06, + "loss": 0.0411, + "step": 1570 + }, + { + "epoch": 1.0629228687415426, + "grad_norm": 0.7654749597629602, + "learning_rate": 8.157298221780388e-06, + "loss": 0.0366, + "step": 1571 + }, + { + "epoch": 1.0635994587280109, + "grad_norm": 0.9044094727671382, + "learning_rate": 8.15424458877162e-06, + "loss": 0.0348, + "step": 1572 + }, + { + "epoch": 1.0642760487144791, + "grad_norm": 0.37991636890979896, + "learning_rate": 8.151189000301223e-06, + "loss": 0.0353, + "step": 1573 + }, + { + "epoch": 1.0649526387009471, + "grad_norm": 0.4254130519529926, + "learning_rate": 8.148131458263499e-06, + "loss": 0.0344, + "step": 1574 + }, + { + "epoch": 1.0656292286874154, + "grad_norm": 0.3696155354094528, + "learning_rate": 8.145071964553956e-06, + "loss": 0.0301, + "step": 1575 + }, + { + "epoch": 1.0663058186738836, + "grad_norm": 0.43828698159398116, + "learning_rate": 8.142010521069319e-06, + "loss": 0.0296, + "step": 1576 + }, + { + "epoch": 1.0669824086603519, + "grad_norm": 0.3898802338702638, + "learning_rate": 8.138947129707517e-06, + "loss": 0.0464, + "step": 1577 + }, + { + "epoch": 1.0676589986468201, + "grad_norm": 0.3677185173412872, + "learning_rate": 8.135881792367686e-06, + "loss": 0.0312, + "step": 1578 + }, + { + "epoch": 1.0683355886332881, + "grad_norm": 0.3618411532164138, + "learning_rate": 8.132814510950172e-06, + "loss": 0.0371, + "step": 1579 + }, + { + "epoch": 1.0690121786197564, + "grad_norm": 0.32566881071724985, + "learning_rate": 8.129745287356521e-06, + "loss": 0.0293, + "step": 1580 + }, + { + "epoch": 1.0696887686062246, + "grad_norm": 0.4687197012890747, + "learning_rate": 8.12667412348949e-06, + "loss": 0.037, + "step": 1581 + }, + { + "epoch": 1.0703653585926929, + "grad_norm": 0.31449392470467563, + "learning_rate": 8.12360102125303e-06, + "loss": 0.0321, + "step": 1582 + }, + { + "epoch": 1.0710419485791611, + "grad_norm": 0.3527388825096024, + "learning_rate": 8.120525982552304e-06, + "loss": 0.0346, + "step": 1583 + }, + { + "epoch": 1.0717185385656292, + "grad_norm": 0.5887727003319773, + "learning_rate": 8.117449009293668e-06, + "loss": 0.0457, + "step": 1584 + }, + { + "epoch": 1.0723951285520974, + "grad_norm": 0.26688692588913676, + "learning_rate": 8.11437010338468e-06, + "loss": 0.0219, + "step": 1585 + }, + { + "epoch": 1.0730717185385656, + "grad_norm": 0.26821326851919136, + "learning_rate": 8.111289266734095e-06, + "loss": 0.0234, + "step": 1586 + }, + { + "epoch": 1.073748308525034, + "grad_norm": 0.2671973703500071, + "learning_rate": 8.108206501251868e-06, + "loss": 0.0257, + "step": 1587 + }, + { + "epoch": 1.0744248985115021, + "grad_norm": 0.3298742756049819, + "learning_rate": 8.105121808849143e-06, + "loss": 0.026, + "step": 1588 + }, + { + "epoch": 1.0751014884979702, + "grad_norm": 0.4953624489261912, + "learning_rate": 8.102035191438268e-06, + "loss": 0.0387, + "step": 1589 + }, + { + "epoch": 1.0757780784844384, + "grad_norm": 0.4502736852528534, + "learning_rate": 8.098946650932776e-06, + "loss": 0.0504, + "step": 1590 + }, + { + "epoch": 1.0764546684709067, + "grad_norm": 0.5280120427931959, + "learning_rate": 8.095856189247396e-06, + "loss": 0.0483, + "step": 1591 + }, + { + "epoch": 1.077131258457375, + "grad_norm": 0.3892483852796154, + "learning_rate": 8.092763808298048e-06, + "loss": 0.0407, + "step": 1592 + }, + { + "epoch": 1.0778078484438431, + "grad_norm": 0.3164332934881196, + "learning_rate": 8.089669510001843e-06, + "loss": 0.0286, + "step": 1593 + }, + { + "epoch": 1.0784844384303112, + "grad_norm": 0.375134934327732, + "learning_rate": 8.086573296277078e-06, + "loss": 0.0361, + "step": 1594 + }, + { + "epoch": 1.0791610284167794, + "grad_norm": 0.3232728457786153, + "learning_rate": 8.083475169043237e-06, + "loss": 0.0326, + "step": 1595 + }, + { + "epoch": 1.0798376184032477, + "grad_norm": 0.49497911034836545, + "learning_rate": 8.080375130220995e-06, + "loss": 0.041, + "step": 1596 + }, + { + "epoch": 1.080514208389716, + "grad_norm": 0.3878288769567584, + "learning_rate": 8.077273181732207e-06, + "loss": 0.0279, + "step": 1597 + }, + { + "epoch": 1.0811907983761841, + "grad_norm": 0.41041669159238475, + "learning_rate": 8.074169325499915e-06, + "loss": 0.0397, + "step": 1598 + }, + { + "epoch": 1.0818673883626522, + "grad_norm": 0.4103534818542798, + "learning_rate": 8.071063563448341e-06, + "loss": 0.031, + "step": 1599 + }, + { + "epoch": 1.0825439783491204, + "grad_norm": 0.4131078990705404, + "learning_rate": 8.06795589750289e-06, + "loss": 0.0439, + "step": 1600 + }, + { + "epoch": 1.0832205683355887, + "grad_norm": 0.41139813037009376, + "learning_rate": 8.06484632959015e-06, + "loss": 0.0339, + "step": 1601 + }, + { + "epoch": 1.083897158322057, + "grad_norm": 0.35903652512166706, + "learning_rate": 8.061734861637883e-06, + "loss": 0.0302, + "step": 1602 + }, + { + "epoch": 1.084573748308525, + "grad_norm": 0.4265701546683336, + "learning_rate": 8.058621495575032e-06, + "loss": 0.0381, + "step": 1603 + }, + { + "epoch": 1.0852503382949932, + "grad_norm": 0.4721907691857595, + "learning_rate": 8.055506233331718e-06, + "loss": 0.0394, + "step": 1604 + }, + { + "epoch": 1.0859269282814614, + "grad_norm": 0.485689393088364, + "learning_rate": 8.052389076839233e-06, + "loss": 0.0361, + "step": 1605 + }, + { + "epoch": 1.0866035182679297, + "grad_norm": 0.39125175857039185, + "learning_rate": 8.049270028030045e-06, + "loss": 0.0304, + "step": 1606 + }, + { + "epoch": 1.087280108254398, + "grad_norm": 0.565839044842079, + "learning_rate": 8.046149088837803e-06, + "loss": 0.031, + "step": 1607 + }, + { + "epoch": 1.087956698240866, + "grad_norm": 0.3441864604049982, + "learning_rate": 8.043026261197312e-06, + "loss": 0.031, + "step": 1608 + }, + { + "epoch": 1.0886332882273342, + "grad_norm": 0.2861275412838357, + "learning_rate": 8.039901547044564e-06, + "loss": 0.0319, + "step": 1609 + }, + { + "epoch": 1.0893098782138024, + "grad_norm": 0.39751366001270527, + "learning_rate": 8.03677494831671e-06, + "loss": 0.025, + "step": 1610 + }, + { + "epoch": 1.0899864682002707, + "grad_norm": 0.6844399185434005, + "learning_rate": 8.033646466952072e-06, + "loss": 0.0505, + "step": 1611 + }, + { + "epoch": 1.090663058186739, + "grad_norm": 0.40688187672698983, + "learning_rate": 8.03051610489014e-06, + "loss": 0.036, + "step": 1612 + }, + { + "epoch": 1.091339648173207, + "grad_norm": 0.5056451013640939, + "learning_rate": 8.027383864071573e-06, + "loss": 0.0302, + "step": 1613 + }, + { + "epoch": 1.0920162381596752, + "grad_norm": 0.3301636612204815, + "learning_rate": 8.024249746438189e-06, + "loss": 0.0271, + "step": 1614 + }, + { + "epoch": 1.0926928281461434, + "grad_norm": 0.42877224360312527, + "learning_rate": 8.021113753932972e-06, + "loss": 0.0337, + "step": 1615 + }, + { + "epoch": 1.0933694181326117, + "grad_norm": 0.43008720041470855, + "learning_rate": 8.017975888500067e-06, + "loss": 0.0329, + "step": 1616 + }, + { + "epoch": 1.09404600811908, + "grad_norm": 0.5083512064425184, + "learning_rate": 8.014836152084784e-06, + "loss": 0.0434, + "step": 1617 + }, + { + "epoch": 1.094722598105548, + "grad_norm": 0.7713722017793138, + "learning_rate": 8.01169454663359e-06, + "loss": 0.0637, + "step": 1618 + }, + { + "epoch": 1.0953991880920162, + "grad_norm": 0.3990509266010287, + "learning_rate": 8.008551074094108e-06, + "loss": 0.0313, + "step": 1619 + }, + { + "epoch": 1.0960757780784844, + "grad_norm": 0.35489169548609006, + "learning_rate": 8.005405736415127e-06, + "loss": 0.0293, + "step": 1620 + }, + { + "epoch": 1.0967523680649527, + "grad_norm": 0.37090764662473097, + "learning_rate": 8.00225853554658e-06, + "loss": 0.0312, + "step": 1621 + }, + { + "epoch": 1.097428958051421, + "grad_norm": 0.4883875153457756, + "learning_rate": 7.99910947343957e-06, + "loss": 0.05, + "step": 1622 + }, + { + "epoch": 1.098105548037889, + "grad_norm": 0.43816736897448766, + "learning_rate": 7.995958552046338e-06, + "loss": 0.0328, + "step": 1623 + }, + { + "epoch": 1.0987821380243572, + "grad_norm": 0.5871725257052695, + "learning_rate": 7.99280577332029e-06, + "loss": 0.0493, + "step": 1624 + }, + { + "epoch": 1.0994587280108254, + "grad_norm": 0.41783825761682186, + "learning_rate": 7.989651139215979e-06, + "loss": 0.0297, + "step": 1625 + }, + { + "epoch": 1.1001353179972937, + "grad_norm": 0.566107827599266, + "learning_rate": 7.986494651689104e-06, + "loss": 0.0397, + "step": 1626 + }, + { + "epoch": 1.100811907983762, + "grad_norm": 0.4219969821926648, + "learning_rate": 7.983336312696521e-06, + "loss": 0.0325, + "step": 1627 + }, + { + "epoch": 1.10148849797023, + "grad_norm": 0.45797002548525556, + "learning_rate": 7.980176124196231e-06, + "loss": 0.0359, + "step": 1628 + }, + { + "epoch": 1.1021650879566982, + "grad_norm": 0.4725202273779479, + "learning_rate": 7.977014088147375e-06, + "loss": 0.0288, + "step": 1629 + }, + { + "epoch": 1.1028416779431665, + "grad_norm": 0.6259522913144866, + "learning_rate": 7.973850206510251e-06, + "loss": 0.0336, + "step": 1630 + }, + { + "epoch": 1.1035182679296347, + "grad_norm": 0.4909351056917921, + "learning_rate": 7.970684481246291e-06, + "loss": 0.0313, + "step": 1631 + }, + { + "epoch": 1.104194857916103, + "grad_norm": 0.4852188933981559, + "learning_rate": 7.967516914318075e-06, + "loss": 0.0346, + "step": 1632 + }, + { + "epoch": 1.104871447902571, + "grad_norm": 0.7070531892888307, + "learning_rate": 7.964347507689325e-06, + "loss": 0.0357, + "step": 1633 + }, + { + "epoch": 1.1055480378890392, + "grad_norm": 0.3683086646902459, + "learning_rate": 7.961176263324902e-06, + "loss": 0.0362, + "step": 1634 + }, + { + "epoch": 1.1062246278755075, + "grad_norm": 0.4844979931695627, + "learning_rate": 7.958003183190804e-06, + "loss": 0.0482, + "step": 1635 + }, + { + "epoch": 1.1069012178619757, + "grad_norm": 0.4638708289005973, + "learning_rate": 7.954828269254173e-06, + "loss": 0.0407, + "step": 1636 + }, + { + "epoch": 1.1075778078484437, + "grad_norm": 0.3266540988896249, + "learning_rate": 7.951651523483283e-06, + "loss": 0.0312, + "step": 1637 + }, + { + "epoch": 1.108254397834912, + "grad_norm": 0.2880477755406068, + "learning_rate": 7.948472947847546e-06, + "loss": 0.0263, + "step": 1638 + }, + { + "epoch": 1.1089309878213802, + "grad_norm": 0.49976069322425787, + "learning_rate": 7.945292544317505e-06, + "loss": 0.0453, + "step": 1639 + }, + { + "epoch": 1.1096075778078485, + "grad_norm": 0.9157470463047522, + "learning_rate": 7.942110314864842e-06, + "loss": 0.0413, + "step": 1640 + }, + { + "epoch": 1.1102841677943167, + "grad_norm": 0.39513786066966183, + "learning_rate": 7.938926261462366e-06, + "loss": 0.0377, + "step": 1641 + }, + { + "epoch": 1.1109607577807847, + "grad_norm": 0.35044777038520347, + "learning_rate": 7.93574038608402e-06, + "loss": 0.0315, + "step": 1642 + }, + { + "epoch": 1.111637347767253, + "grad_norm": 0.4056375593937972, + "learning_rate": 7.932552690704871e-06, + "loss": 0.0369, + "step": 1643 + }, + { + "epoch": 1.1123139377537212, + "grad_norm": 0.4688694087789003, + "learning_rate": 7.929363177301124e-06, + "loss": 0.0458, + "step": 1644 + }, + { + "epoch": 1.1129905277401895, + "grad_norm": 0.49214825386525457, + "learning_rate": 7.926171847850101e-06, + "loss": 0.0461, + "step": 1645 + }, + { + "epoch": 1.1136671177266577, + "grad_norm": 0.4603831493068334, + "learning_rate": 7.922978704330257e-06, + "loss": 0.0312, + "step": 1646 + }, + { + "epoch": 1.1143437077131257, + "grad_norm": 0.35540355813278274, + "learning_rate": 7.919783748721169e-06, + "loss": 0.0358, + "step": 1647 + }, + { + "epoch": 1.115020297699594, + "grad_norm": 0.3086143597181157, + "learning_rate": 7.916586983003534e-06, + "loss": 0.0297, + "step": 1648 + }, + { + "epoch": 1.1156968876860622, + "grad_norm": 0.3404204101476662, + "learning_rate": 7.913388409159175e-06, + "loss": 0.0358, + "step": 1649 + }, + { + "epoch": 1.1163734776725305, + "grad_norm": 0.4997484911135465, + "learning_rate": 7.910188029171039e-06, + "loss": 0.0326, + "step": 1650 + }, + { + "epoch": 1.1170500676589987, + "grad_norm": 0.2605099264047454, + "learning_rate": 7.906985845023187e-06, + "loss": 0.0242, + "step": 1651 + }, + { + "epoch": 1.1177266576454667, + "grad_norm": 0.34049687122226624, + "learning_rate": 7.903781858700799e-06, + "loss": 0.0279, + "step": 1652 + }, + { + "epoch": 1.118403247631935, + "grad_norm": 0.40720284283588737, + "learning_rate": 7.900576072190177e-06, + "loss": 0.0398, + "step": 1653 + }, + { + "epoch": 1.1190798376184032, + "grad_norm": 0.43246417364287354, + "learning_rate": 7.897368487478733e-06, + "loss": 0.04, + "step": 1654 + }, + { + "epoch": 1.1197564276048715, + "grad_norm": 0.46293228867610187, + "learning_rate": 7.894159106554997e-06, + "loss": 0.0322, + "step": 1655 + }, + { + "epoch": 1.1204330175913397, + "grad_norm": 0.3614772846544025, + "learning_rate": 7.890947931408614e-06, + "loss": 0.0333, + "step": 1656 + }, + { + "epoch": 1.1211096075778078, + "grad_norm": 0.4177570410280789, + "learning_rate": 7.887734964030337e-06, + "loss": 0.0293, + "step": 1657 + }, + { + "epoch": 1.121786197564276, + "grad_norm": 0.5524351239758096, + "learning_rate": 7.884520206412036e-06, + "loss": 0.0409, + "step": 1658 + }, + { + "epoch": 1.1224627875507442, + "grad_norm": 0.5420011967543492, + "learning_rate": 7.881303660546684e-06, + "loss": 0.0572, + "step": 1659 + }, + { + "epoch": 1.1231393775372125, + "grad_norm": 0.28848419254404023, + "learning_rate": 7.87808532842837e-06, + "loss": 0.0278, + "step": 1660 + }, + { + "epoch": 1.1238159675236807, + "grad_norm": 0.8523139376772814, + "learning_rate": 7.87486521205228e-06, + "loss": 0.0373, + "step": 1661 + }, + { + "epoch": 1.1244925575101488, + "grad_norm": 0.3168921113962619, + "learning_rate": 7.871643313414718e-06, + "loss": 0.0246, + "step": 1662 + }, + { + "epoch": 1.125169147496617, + "grad_norm": 0.5133589993063726, + "learning_rate": 7.868419634513087e-06, + "loss": 0.0363, + "step": 1663 + }, + { + "epoch": 1.1258457374830853, + "grad_norm": 0.3674668379082116, + "learning_rate": 7.865194177345894e-06, + "loss": 0.0319, + "step": 1664 + }, + { + "epoch": 1.1265223274695535, + "grad_norm": 0.6093969710809076, + "learning_rate": 7.861966943912746e-06, + "loss": 0.0383, + "step": 1665 + }, + { + "epoch": 1.1271989174560217, + "grad_norm": 0.31434767917615763, + "learning_rate": 7.858737936214355e-06, + "loss": 0.0229, + "step": 1666 + }, + { + "epoch": 1.1278755074424898, + "grad_norm": 0.49117740766595575, + "learning_rate": 7.855507156252536e-06, + "loss": 0.0374, + "step": 1667 + }, + { + "epoch": 1.128552097428958, + "grad_norm": 0.4135179430121241, + "learning_rate": 7.852274606030191e-06, + "loss": 0.0348, + "step": 1668 + }, + { + "epoch": 1.1292286874154263, + "grad_norm": 0.3331697162708948, + "learning_rate": 7.849040287551331e-06, + "loss": 0.0292, + "step": 1669 + }, + { + "epoch": 1.1299052774018945, + "grad_norm": 0.37454027897025555, + "learning_rate": 7.84580420282106e-06, + "loss": 0.0326, + "step": 1670 + }, + { + "epoch": 1.1305818673883627, + "grad_norm": 0.42709392587927086, + "learning_rate": 7.842566353845575e-06, + "loss": 0.0428, + "step": 1671 + }, + { + "epoch": 1.1312584573748308, + "grad_norm": 0.4695476700239723, + "learning_rate": 7.839326742632168e-06, + "loss": 0.0401, + "step": 1672 + }, + { + "epoch": 1.131935047361299, + "grad_norm": 0.7265900096506797, + "learning_rate": 7.836085371189221e-06, + "loss": 0.0336, + "step": 1673 + }, + { + "epoch": 1.1326116373477673, + "grad_norm": 0.29255653236545787, + "learning_rate": 7.832842241526212e-06, + "loss": 0.0367, + "step": 1674 + }, + { + "epoch": 1.1332882273342355, + "grad_norm": 0.3024686470353999, + "learning_rate": 7.829597355653707e-06, + "loss": 0.0242, + "step": 1675 + }, + { + "epoch": 1.1339648173207038, + "grad_norm": 0.33658204307282275, + "learning_rate": 7.82635071558336e-06, + "loss": 0.0307, + "step": 1676 + }, + { + "epoch": 1.1346414073071718, + "grad_norm": 0.4023445895726629, + "learning_rate": 7.82310232332791e-06, + "loss": 0.0405, + "step": 1677 + }, + { + "epoch": 1.13531799729364, + "grad_norm": 0.26451858971973113, + "learning_rate": 7.81985218090119e-06, + "loss": 0.0262, + "step": 1678 + }, + { + "epoch": 1.1359945872801083, + "grad_norm": 0.3952379143134822, + "learning_rate": 7.81660029031811e-06, + "loss": 0.0271, + "step": 1679 + }, + { + "epoch": 1.1366711772665765, + "grad_norm": 0.36027556723477727, + "learning_rate": 7.813346653594667e-06, + "loss": 0.0263, + "step": 1680 + }, + { + "epoch": 1.1373477672530448, + "grad_norm": 0.4853700627381573, + "learning_rate": 7.810091272747943e-06, + "loss": 0.0315, + "step": 1681 + }, + { + "epoch": 1.1380243572395128, + "grad_norm": 0.4164582623763797, + "learning_rate": 7.806834149796094e-06, + "loss": 0.0345, + "step": 1682 + }, + { + "epoch": 1.138700947225981, + "grad_norm": 0.3382194588460656, + "learning_rate": 7.803575286758365e-06, + "loss": 0.0278, + "step": 1683 + }, + { + "epoch": 1.1393775372124493, + "grad_norm": 0.48303131698605367, + "learning_rate": 7.800314685655072e-06, + "loss": 0.0455, + "step": 1684 + }, + { + "epoch": 1.1400541271989175, + "grad_norm": 0.4337702265887451, + "learning_rate": 7.797052348507614e-06, + "loss": 0.0376, + "step": 1685 + }, + { + "epoch": 1.1407307171853858, + "grad_norm": 0.5900178022531811, + "learning_rate": 7.793788277338464e-06, + "loss": 0.0552, + "step": 1686 + }, + { + "epoch": 1.1414073071718538, + "grad_norm": 0.37966231199451633, + "learning_rate": 7.790522474171171e-06, + "loss": 0.0316, + "step": 1687 + }, + { + "epoch": 1.142083897158322, + "grad_norm": 0.33225280887417386, + "learning_rate": 7.787254941030353e-06, + "loss": 0.0297, + "step": 1688 + }, + { + "epoch": 1.1427604871447903, + "grad_norm": 0.4131074510358068, + "learning_rate": 7.78398567994171e-06, + "loss": 0.0294, + "step": 1689 + }, + { + "epoch": 1.1434370771312585, + "grad_norm": 0.6372941134981561, + "learning_rate": 7.780714692932002e-06, + "loss": 0.0356, + "step": 1690 + }, + { + "epoch": 1.1441136671177268, + "grad_norm": 0.33031146965199987, + "learning_rate": 7.777441982029072e-06, + "loss": 0.0356, + "step": 1691 + }, + { + "epoch": 1.1447902571041948, + "grad_norm": 0.3332906140704906, + "learning_rate": 7.774167549261817e-06, + "loss": 0.0254, + "step": 1692 + }, + { + "epoch": 1.145466847090663, + "grad_norm": 0.39298673791634586, + "learning_rate": 7.770891396660212e-06, + "loss": 0.0369, + "step": 1693 + }, + { + "epoch": 1.1461434370771313, + "grad_norm": 0.4165825025098529, + "learning_rate": 7.767613526255296e-06, + "loss": 0.0282, + "step": 1694 + }, + { + "epoch": 1.1468200270635995, + "grad_norm": 0.34886050427002396, + "learning_rate": 7.764333940079169e-06, + "loss": 0.0318, + "step": 1695 + }, + { + "epoch": 1.1474966170500678, + "grad_norm": 0.5327647557270032, + "learning_rate": 7.761052640165e-06, + "loss": 0.03, + "step": 1696 + }, + { + "epoch": 1.1481732070365358, + "grad_norm": 0.4615044713226057, + "learning_rate": 7.757769628547018e-06, + "loss": 0.0434, + "step": 1697 + }, + { + "epoch": 1.148849797023004, + "grad_norm": 0.4304149838702361, + "learning_rate": 7.754484907260513e-06, + "loss": 0.0422, + "step": 1698 + }, + { + "epoch": 1.1495263870094723, + "grad_norm": 0.37820064932543607, + "learning_rate": 7.751198478341836e-06, + "loss": 0.0317, + "step": 1699 + }, + { + "epoch": 1.1502029769959405, + "grad_norm": 0.4371365288523147, + "learning_rate": 7.747910343828391e-06, + "loss": 0.0435, + "step": 1700 + }, + { + "epoch": 1.1508795669824086, + "grad_norm": 0.35221916919671453, + "learning_rate": 7.744620505758652e-06, + "loss": 0.0392, + "step": 1701 + }, + { + "epoch": 1.1515561569688768, + "grad_norm": 0.32769852974318486, + "learning_rate": 7.741328966172134e-06, + "loss": 0.0283, + "step": 1702 + }, + { + "epoch": 1.152232746955345, + "grad_norm": 0.2944078529104389, + "learning_rate": 7.738035727109418e-06, + "loss": 0.0309, + "step": 1703 + }, + { + "epoch": 1.1529093369418133, + "grad_norm": 0.43745468977194324, + "learning_rate": 7.734740790612137e-06, + "loss": 0.0376, + "step": 1704 + }, + { + "epoch": 1.1535859269282815, + "grad_norm": 0.301414702108444, + "learning_rate": 7.731444158722967e-06, + "loss": 0.0255, + "step": 1705 + }, + { + "epoch": 1.1542625169147496, + "grad_norm": 0.3095088765629941, + "learning_rate": 7.728145833485647e-06, + "loss": 0.0252, + "step": 1706 + }, + { + "epoch": 1.1549391069012178, + "grad_norm": 0.34503916445165483, + "learning_rate": 7.724845816944962e-06, + "loss": 0.0347, + "step": 1707 + }, + { + "epoch": 1.155615696887686, + "grad_norm": 0.3577350655249529, + "learning_rate": 7.72154411114674e-06, + "loss": 0.0346, + "step": 1708 + }, + { + "epoch": 1.1562922868741543, + "grad_norm": 0.40226713175297785, + "learning_rate": 7.718240718137863e-06, + "loss": 0.037, + "step": 1709 + }, + { + "epoch": 1.1569688768606226, + "grad_norm": 0.5002419556023001, + "learning_rate": 7.714935639966257e-06, + "loss": 0.0373, + "step": 1710 + }, + { + "epoch": 1.1576454668470906, + "grad_norm": 0.3026300902484468, + "learning_rate": 7.711628878680892e-06, + "loss": 0.04, + "step": 1711 + }, + { + "epoch": 1.1583220568335588, + "grad_norm": 0.3416151011693065, + "learning_rate": 7.708320436331782e-06, + "loss": 0.0266, + "step": 1712 + }, + { + "epoch": 1.158998646820027, + "grad_norm": 0.36286593113461185, + "learning_rate": 7.705010314969983e-06, + "loss": 0.0284, + "step": 1713 + }, + { + "epoch": 1.1596752368064953, + "grad_norm": 0.35775642200205643, + "learning_rate": 7.70169851664759e-06, + "loss": 0.0375, + "step": 1714 + }, + { + "epoch": 1.1603518267929636, + "grad_norm": 0.4725787571443051, + "learning_rate": 7.698385043417741e-06, + "loss": 0.048, + "step": 1715 + }, + { + "epoch": 1.1610284167794316, + "grad_norm": 0.29847538616807273, + "learning_rate": 7.695069897334613e-06, + "loss": 0.0251, + "step": 1716 + }, + { + "epoch": 1.1617050067658998, + "grad_norm": 0.41082923259581106, + "learning_rate": 7.691753080453413e-06, + "loss": 0.0399, + "step": 1717 + }, + { + "epoch": 1.162381596752368, + "grad_norm": 0.5190388420490879, + "learning_rate": 7.688434594830392e-06, + "loss": 0.0343, + "step": 1718 + }, + { + "epoch": 1.1630581867388363, + "grad_norm": 0.28683473141450755, + "learning_rate": 7.685114442522831e-06, + "loss": 0.0283, + "step": 1719 + }, + { + "epoch": 1.1637347767253043, + "grad_norm": 0.5867114519178951, + "learning_rate": 7.681792625589046e-06, + "loss": 0.0465, + "step": 1720 + }, + { + "epoch": 1.1644113667117726, + "grad_norm": 0.28189626443705196, + "learning_rate": 7.678469146088385e-06, + "loss": 0.0377, + "step": 1721 + }, + { + "epoch": 1.1650879566982408, + "grad_norm": 0.4257169140629805, + "learning_rate": 7.675144006081225e-06, + "loss": 0.0313, + "step": 1722 + }, + { + "epoch": 1.165764546684709, + "grad_norm": 0.26975661652548166, + "learning_rate": 7.671817207628973e-06, + "loss": 0.0218, + "step": 1723 + }, + { + "epoch": 1.1664411366711773, + "grad_norm": 0.29291059896048743, + "learning_rate": 7.668488752794067e-06, + "loss": 0.0262, + "step": 1724 + }, + { + "epoch": 1.1671177266576453, + "grad_norm": 0.2923749383286343, + "learning_rate": 7.66515864363997e-06, + "loss": 0.0325, + "step": 1725 + }, + { + "epoch": 1.1677943166441136, + "grad_norm": 0.42348482931588116, + "learning_rate": 7.661826882231165e-06, + "loss": 0.0472, + "step": 1726 + }, + { + "epoch": 1.1684709066305818, + "grad_norm": 0.4901156885176605, + "learning_rate": 7.658493470633173e-06, + "loss": 0.0406, + "step": 1727 + }, + { + "epoch": 1.16914749661705, + "grad_norm": 0.3180984357336836, + "learning_rate": 7.65515841091252e-06, + "loss": 0.0305, + "step": 1728 + }, + { + "epoch": 1.1698240866035183, + "grad_norm": 0.6441721154116925, + "learning_rate": 7.651821705136771e-06, + "loss": 0.0526, + "step": 1729 + }, + { + "epoch": 1.1705006765899864, + "grad_norm": 0.26853044046386193, + "learning_rate": 7.648483355374496e-06, + "loss": 0.0277, + "step": 1730 + }, + { + "epoch": 1.1711772665764546, + "grad_norm": 0.3139187607695551, + "learning_rate": 7.645143363695302e-06, + "loss": 0.0284, + "step": 1731 + }, + { + "epoch": 1.1718538565629228, + "grad_norm": 0.5718022912501142, + "learning_rate": 7.641801732169796e-06, + "loss": 0.0465, + "step": 1732 + }, + { + "epoch": 1.172530446549391, + "grad_norm": 0.5447100324678501, + "learning_rate": 7.63845846286961e-06, + "loss": 0.033, + "step": 1733 + }, + { + "epoch": 1.1732070365358593, + "grad_norm": 0.41472159633268796, + "learning_rate": 7.635113557867395e-06, + "loss": 0.0333, + "step": 1734 + }, + { + "epoch": 1.1738836265223274, + "grad_norm": 0.49728259053599116, + "learning_rate": 7.63176701923681e-06, + "loss": 0.0409, + "step": 1735 + }, + { + "epoch": 1.1745602165087956, + "grad_norm": 0.6398719692509215, + "learning_rate": 7.628418849052523e-06, + "loss": 0.0394, + "step": 1736 + }, + { + "epoch": 1.1752368064952639, + "grad_norm": 0.36361617017018366, + "learning_rate": 7.625069049390228e-06, + "loss": 0.0375, + "step": 1737 + }, + { + "epoch": 1.175913396481732, + "grad_norm": 0.5502760150541254, + "learning_rate": 7.621717622326617e-06, + "loss": 0.0369, + "step": 1738 + }, + { + "epoch": 1.1765899864682003, + "grad_norm": 0.5090515892709198, + "learning_rate": 7.61836456993939e-06, + "loss": 0.049, + "step": 1739 + }, + { + "epoch": 1.1772665764546684, + "grad_norm": 0.44776416479226083, + "learning_rate": 7.615009894307263e-06, + "loss": 0.0339, + "step": 1740 + }, + { + "epoch": 1.1779431664411366, + "grad_norm": 0.3492141392783174, + "learning_rate": 7.611653597509954e-06, + "loss": 0.0247, + "step": 1741 + }, + { + "epoch": 1.1786197564276049, + "grad_norm": 0.41289570839557044, + "learning_rate": 7.608295681628185e-06, + "loss": 0.0314, + "step": 1742 + }, + { + "epoch": 1.179296346414073, + "grad_norm": 0.3852313302007077, + "learning_rate": 7.604936148743682e-06, + "loss": 0.0328, + "step": 1743 + }, + { + "epoch": 1.1799729364005414, + "grad_norm": 0.36440277176471003, + "learning_rate": 7.6015750009391776e-06, + "loss": 0.0357, + "step": 1744 + }, + { + "epoch": 1.1806495263870094, + "grad_norm": 0.33358825995538066, + "learning_rate": 7.5982122402983986e-06, + "loss": 0.0305, + "step": 1745 + }, + { + "epoch": 1.1813261163734776, + "grad_norm": 0.33472694610144493, + "learning_rate": 7.594847868906076e-06, + "loss": 0.0375, + "step": 1746 + }, + { + "epoch": 1.1820027063599459, + "grad_norm": 0.33714868177143426, + "learning_rate": 7.5914818888479406e-06, + "loss": 0.0272, + "step": 1747 + }, + { + "epoch": 1.182679296346414, + "grad_norm": 0.46217272974324325, + "learning_rate": 7.588114302210719e-06, + "loss": 0.0413, + "step": 1748 + }, + { + "epoch": 1.1833558863328824, + "grad_norm": 0.39229166482071326, + "learning_rate": 7.584745111082128e-06, + "loss": 0.0378, + "step": 1749 + }, + { + "epoch": 1.1840324763193504, + "grad_norm": 0.40934798728743826, + "learning_rate": 7.5813743175508914e-06, + "loss": 0.039, + "step": 1750 + }, + { + "epoch": 1.1847090663058186, + "grad_norm": 0.36785624519295507, + "learning_rate": 7.578001923706715e-06, + "loss": 0.0278, + "step": 1751 + }, + { + "epoch": 1.1853856562922869, + "grad_norm": 0.33291977893035213, + "learning_rate": 7.574627931640304e-06, + "loss": 0.0307, + "step": 1752 + }, + { + "epoch": 1.1860622462787551, + "grad_norm": 0.3110764512490773, + "learning_rate": 7.571252343443349e-06, + "loss": 0.0284, + "step": 1753 + }, + { + "epoch": 1.1867388362652234, + "grad_norm": 0.32015619790102334, + "learning_rate": 7.5678751612085344e-06, + "loss": 0.0309, + "step": 1754 + }, + { + "epoch": 1.1874154262516914, + "grad_norm": 0.44750536856238265, + "learning_rate": 7.564496387029532e-06, + "loss": 0.0346, + "step": 1755 + }, + { + "epoch": 1.1880920162381596, + "grad_norm": 0.32415843516157805, + "learning_rate": 7.5611160230009975e-06, + "loss": 0.0298, + "step": 1756 + }, + { + "epoch": 1.1887686062246279, + "grad_norm": 0.4935743931579038, + "learning_rate": 7.557734071218575e-06, + "loss": 0.0397, + "step": 1757 + }, + { + "epoch": 1.1894451962110961, + "grad_norm": 0.39357605337326074, + "learning_rate": 7.5543505337788934e-06, + "loss": 0.0418, + "step": 1758 + }, + { + "epoch": 1.1901217861975644, + "grad_norm": 0.370324508135328, + "learning_rate": 7.550965412779563e-06, + "loss": 0.0377, + "step": 1759 + }, + { + "epoch": 1.1907983761840324, + "grad_norm": 0.3817327709787257, + "learning_rate": 7.547578710319174e-06, + "loss": 0.0377, + "step": 1760 + }, + { + "epoch": 1.1914749661705006, + "grad_norm": 0.4073392230960148, + "learning_rate": 7.544190428497304e-06, + "loss": 0.03, + "step": 1761 + }, + { + "epoch": 1.1921515561569689, + "grad_norm": 0.332922118773184, + "learning_rate": 7.540800569414501e-06, + "loss": 0.0333, + "step": 1762 + }, + { + "epoch": 1.1928281461434371, + "grad_norm": 0.3137655265944553, + "learning_rate": 7.537409135172298e-06, + "loss": 0.0285, + "step": 1763 + }, + { + "epoch": 1.1935047361299054, + "grad_norm": 0.3998108253196923, + "learning_rate": 7.5340161278732e-06, + "loss": 0.0312, + "step": 1764 + }, + { + "epoch": 1.1941813261163734, + "grad_norm": 0.4620898570120324, + "learning_rate": 7.530621549620689e-06, + "loss": 0.0366, + "step": 1765 + }, + { + "epoch": 1.1948579161028416, + "grad_norm": 0.3314723928152739, + "learning_rate": 7.527225402519218e-06, + "loss": 0.0348, + "step": 1766 + }, + { + "epoch": 1.19553450608931, + "grad_norm": 0.3256254347500369, + "learning_rate": 7.52382768867422e-06, + "loss": 0.0291, + "step": 1767 + }, + { + "epoch": 1.1962110960757781, + "grad_norm": 0.4716243068193622, + "learning_rate": 7.52042841019209e-06, + "loss": 0.0391, + "step": 1768 + }, + { + "epoch": 1.1968876860622464, + "grad_norm": 0.505941339169538, + "learning_rate": 7.5170275691802e-06, + "loss": 0.0468, + "step": 1769 + }, + { + "epoch": 1.1975642760487144, + "grad_norm": 0.2440627906387663, + "learning_rate": 7.5136251677468856e-06, + "loss": 0.0265, + "step": 1770 + }, + { + "epoch": 1.1982408660351827, + "grad_norm": 0.3862800024403366, + "learning_rate": 7.510221208001457e-06, + "loss": 0.0311, + "step": 1771 + }, + { + "epoch": 1.198917456021651, + "grad_norm": 0.4494524757815659, + "learning_rate": 7.50681569205418e-06, + "loss": 0.041, + "step": 1772 + }, + { + "epoch": 1.1995940460081191, + "grad_norm": 0.30573292592723056, + "learning_rate": 7.5034086220162945e-06, + "loss": 0.029, + "step": 1773 + }, + { + "epoch": 1.2002706359945874, + "grad_norm": 0.35724743501500195, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0436, + "step": 1774 + }, + { + "epoch": 1.2009472259810554, + "grad_norm": 0.28470371882579004, + "learning_rate": 7.496589828118458e-06, + "loss": 0.0268, + "step": 1775 + }, + { + "epoch": 1.2016238159675237, + "grad_norm": 0.5802818204184877, + "learning_rate": 7.4931781084857915e-06, + "loss": 0.0393, + "step": 1776 + }, + { + "epoch": 1.202300405953992, + "grad_norm": 0.37036387455969116, + "learning_rate": 7.489764843217082e-06, + "loss": 0.0275, + "step": 1777 + }, + { + "epoch": 1.2029769959404601, + "grad_norm": 0.4040523667928298, + "learning_rate": 7.4863500344283715e-06, + "loss": 0.0443, + "step": 1778 + }, + { + "epoch": 1.2036535859269284, + "grad_norm": 0.33092259192120466, + "learning_rate": 7.482933684236654e-06, + "loss": 0.0277, + "step": 1779 + }, + { + "epoch": 1.2043301759133964, + "grad_norm": 0.31198159640990164, + "learning_rate": 7.4795157947598864e-06, + "loss": 0.0267, + "step": 1780 + }, + { + "epoch": 1.2050067658998647, + "grad_norm": 0.25570069748428265, + "learning_rate": 7.476096368116974e-06, + "loss": 0.0294, + "step": 1781 + }, + { + "epoch": 1.205683355886333, + "grad_norm": 0.6124980425015061, + "learning_rate": 7.4726754064277775e-06, + "loss": 0.0434, + "step": 1782 + }, + { + "epoch": 1.2063599458728012, + "grad_norm": 0.3730907914223387, + "learning_rate": 7.469252911813107e-06, + "loss": 0.0282, + "step": 1783 + }, + { + "epoch": 1.2070365358592694, + "grad_norm": 0.45770328786927994, + "learning_rate": 7.465828886394729e-06, + "loss": 0.0365, + "step": 1784 + }, + { + "epoch": 1.2077131258457374, + "grad_norm": 0.32839720355086915, + "learning_rate": 7.462403332295351e-06, + "loss": 0.0297, + "step": 1785 + }, + { + "epoch": 1.2083897158322057, + "grad_norm": 0.3069509341440066, + "learning_rate": 7.458976251638632e-06, + "loss": 0.0213, + "step": 1786 + }, + { + "epoch": 1.209066305818674, + "grad_norm": 0.3121843224806762, + "learning_rate": 7.455547646549179e-06, + "loss": 0.0264, + "step": 1787 + }, + { + "epoch": 1.2097428958051422, + "grad_norm": 0.4398713007216972, + "learning_rate": 7.452117519152542e-06, + "loss": 0.0561, + "step": 1788 + }, + { + "epoch": 1.2104194857916104, + "grad_norm": 0.3437192919415782, + "learning_rate": 7.448685871575213e-06, + "loss": 0.032, + "step": 1789 + }, + { + "epoch": 1.2110960757780784, + "grad_norm": 0.3043326496305167, + "learning_rate": 7.445252705944632e-06, + "loss": 0.0261, + "step": 1790 + }, + { + "epoch": 1.2117726657645467, + "grad_norm": 0.442295644348202, + "learning_rate": 7.441818024389173e-06, + "loss": 0.0304, + "step": 1791 + }, + { + "epoch": 1.212449255751015, + "grad_norm": 0.4861660836252403, + "learning_rate": 7.438381829038157e-06, + "loss": 0.0419, + "step": 1792 + }, + { + "epoch": 1.2131258457374832, + "grad_norm": 0.31993392324287556, + "learning_rate": 7.434944122021837e-06, + "loss": 0.0304, + "step": 1793 + }, + { + "epoch": 1.2138024357239512, + "grad_norm": 0.41406701587352657, + "learning_rate": 7.431504905471407e-06, + "loss": 0.0266, + "step": 1794 + }, + { + "epoch": 1.2144790257104194, + "grad_norm": 0.3633991929204166, + "learning_rate": 7.428064181518997e-06, + "loss": 0.0357, + "step": 1795 + }, + { + "epoch": 1.2151556156968877, + "grad_norm": 0.4237964217740215, + "learning_rate": 7.424621952297668e-06, + "loss": 0.034, + "step": 1796 + }, + { + "epoch": 1.215832205683356, + "grad_norm": 0.38030150791077555, + "learning_rate": 7.4211782199414204e-06, + "loss": 0.0414, + "step": 1797 + }, + { + "epoch": 1.2165087956698242, + "grad_norm": 0.2702929059974458, + "learning_rate": 7.417732986585179e-06, + "loss": 0.0256, + "step": 1798 + }, + { + "epoch": 1.2171853856562922, + "grad_norm": 0.3970911186679923, + "learning_rate": 7.414286254364804e-06, + "loss": 0.0231, + "step": 1799 + }, + { + "epoch": 1.2178619756427604, + "grad_norm": 0.5690876899384154, + "learning_rate": 7.410838025417083e-06, + "loss": 0.0328, + "step": 1800 + }, + { + "epoch": 1.2185385656292287, + "grad_norm": 0.5606083143485081, + "learning_rate": 7.407388301879735e-06, + "loss": 0.0499, + "step": 1801 + }, + { + "epoch": 1.219215155615697, + "grad_norm": 0.2974123739096487, + "learning_rate": 7.403937085891397e-06, + "loss": 0.0379, + "step": 1802 + }, + { + "epoch": 1.2198917456021652, + "grad_norm": 0.4254067773408788, + "learning_rate": 7.400484379591644e-06, + "loss": 0.0384, + "step": 1803 + }, + { + "epoch": 1.2205683355886332, + "grad_norm": 0.419035886897327, + "learning_rate": 7.397030185120962e-06, + "loss": 0.0358, + "step": 1804 + }, + { + "epoch": 1.2212449255751014, + "grad_norm": 0.3306811361233171, + "learning_rate": 7.393574504620767e-06, + "loss": 0.0279, + "step": 1805 + }, + { + "epoch": 1.2219215155615697, + "grad_norm": 0.4373800575024946, + "learning_rate": 7.390117340233396e-06, + "loss": 0.0309, + "step": 1806 + }, + { + "epoch": 1.222598105548038, + "grad_norm": 0.3412603697202751, + "learning_rate": 7.386658694102103e-06, + "loss": 0.0334, + "step": 1807 + }, + { + "epoch": 1.2232746955345062, + "grad_norm": 0.36595538328644733, + "learning_rate": 7.383198568371064e-06, + "loss": 0.0372, + "step": 1808 + }, + { + "epoch": 1.2239512855209742, + "grad_norm": 0.4175259270789885, + "learning_rate": 7.379736965185369e-06, + "loss": 0.034, + "step": 1809 + }, + { + "epoch": 1.2246278755074425, + "grad_norm": 0.4358483411091658, + "learning_rate": 7.376273886691024e-06, + "loss": 0.0433, + "step": 1810 + }, + { + "epoch": 1.2253044654939107, + "grad_norm": 0.47099895438816536, + "learning_rate": 7.372809335034955e-06, + "loss": 0.0343, + "step": 1811 + }, + { + "epoch": 1.225981055480379, + "grad_norm": 0.39883289123379007, + "learning_rate": 7.369343312364994e-06, + "loss": 0.0355, + "step": 1812 + }, + { + "epoch": 1.226657645466847, + "grad_norm": 0.2570217171563449, + "learning_rate": 7.365875820829889e-06, + "loss": 0.0274, + "step": 1813 + }, + { + "epoch": 1.2273342354533152, + "grad_norm": 0.36785410865452167, + "learning_rate": 7.362406862579299e-06, + "loss": 0.0488, + "step": 1814 + }, + { + "epoch": 1.2280108254397835, + "grad_norm": 0.5117751581032768, + "learning_rate": 7.358936439763789e-06, + "loss": 0.0479, + "step": 1815 + }, + { + "epoch": 1.2286874154262517, + "grad_norm": 0.25366821073798473, + "learning_rate": 7.355464554534837e-06, + "loss": 0.025, + "step": 1816 + }, + { + "epoch": 1.22936400541272, + "grad_norm": 0.4914107184195335, + "learning_rate": 7.351991209044822e-06, + "loss": 0.0345, + "step": 1817 + }, + { + "epoch": 1.230040595399188, + "grad_norm": 0.32382934645796957, + "learning_rate": 7.348516405447031e-06, + "loss": 0.0275, + "step": 1818 + }, + { + "epoch": 1.2307171853856562, + "grad_norm": 0.3904025573575658, + "learning_rate": 7.345040145895656e-06, + "loss": 0.0351, + "step": 1819 + }, + { + "epoch": 1.2313937753721245, + "grad_norm": 0.3828009858030226, + "learning_rate": 7.341562432545793e-06, + "loss": 0.0432, + "step": 1820 + }, + { + "epoch": 1.2320703653585927, + "grad_norm": 0.3545668565357232, + "learning_rate": 7.338083267553433e-06, + "loss": 0.0339, + "step": 1821 + }, + { + "epoch": 1.232746955345061, + "grad_norm": 0.37698907015804395, + "learning_rate": 7.334602653075471e-06, + "loss": 0.0304, + "step": 1822 + }, + { + "epoch": 1.233423545331529, + "grad_norm": 0.5494252593816147, + "learning_rate": 7.331120591269701e-06, + "loss": 0.0419, + "step": 1823 + }, + { + "epoch": 1.2341001353179972, + "grad_norm": 0.3371825382799375, + "learning_rate": 7.327637084294818e-06, + "loss": 0.0311, + "step": 1824 + }, + { + "epoch": 1.2347767253044655, + "grad_norm": 0.32525737487534057, + "learning_rate": 7.324152134310401e-06, + "loss": 0.0272, + "step": 1825 + }, + { + "epoch": 1.2354533152909337, + "grad_norm": 0.30485570847197085, + "learning_rate": 7.3206657434769354e-06, + "loss": 0.0296, + "step": 1826 + }, + { + "epoch": 1.236129905277402, + "grad_norm": 0.5896781547259747, + "learning_rate": 7.317177913955795e-06, + "loss": 0.0452, + "step": 1827 + }, + { + "epoch": 1.23680649526387, + "grad_norm": 0.24082812103365153, + "learning_rate": 7.313688647909245e-06, + "loss": 0.0191, + "step": 1828 + }, + { + "epoch": 1.2374830852503382, + "grad_norm": 0.2739898053952858, + "learning_rate": 7.310197947500446e-06, + "loss": 0.0228, + "step": 1829 + }, + { + "epoch": 1.2381596752368065, + "grad_norm": 0.3626964636222225, + "learning_rate": 7.30670581489344e-06, + "loss": 0.0308, + "step": 1830 + }, + { + "epoch": 1.2388362652232747, + "grad_norm": 0.4300869336134017, + "learning_rate": 7.303212252253163e-06, + "loss": 0.0421, + "step": 1831 + }, + { + "epoch": 1.239512855209743, + "grad_norm": 0.3578639679889823, + "learning_rate": 7.2997172617454335e-06, + "loss": 0.032, + "step": 1832 + }, + { + "epoch": 1.240189445196211, + "grad_norm": 0.30010753600803947, + "learning_rate": 7.29622084553696e-06, + "loss": 0.0285, + "step": 1833 + }, + { + "epoch": 1.2408660351826792, + "grad_norm": 0.30841554347311356, + "learning_rate": 7.29272300579533e-06, + "loss": 0.0351, + "step": 1834 + }, + { + "epoch": 1.2415426251691475, + "grad_norm": 0.5914157627033085, + "learning_rate": 7.289223744689018e-06, + "loss": 0.0313, + "step": 1835 + }, + { + "epoch": 1.2422192151556157, + "grad_norm": 0.6591605263758334, + "learning_rate": 7.285723064387373e-06, + "loss": 0.0533, + "step": 1836 + }, + { + "epoch": 1.242895805142084, + "grad_norm": 0.39489023938145035, + "learning_rate": 7.282220967060634e-06, + "loss": 0.0378, + "step": 1837 + }, + { + "epoch": 1.243572395128552, + "grad_norm": 0.3579266660032314, + "learning_rate": 7.278717454879907e-06, + "loss": 0.0349, + "step": 1838 + }, + { + "epoch": 1.2442489851150202, + "grad_norm": 1.5192761124350649, + "learning_rate": 7.2752125300171835e-06, + "loss": 0.0453, + "step": 1839 + }, + { + "epoch": 1.2449255751014885, + "grad_norm": 0.46901360526469527, + "learning_rate": 7.271706194645327e-06, + "loss": 0.0318, + "step": 1840 + }, + { + "epoch": 1.2456021650879567, + "grad_norm": 0.2941993859492877, + "learning_rate": 7.26819845093808e-06, + "loss": 0.0272, + "step": 1841 + }, + { + "epoch": 1.246278755074425, + "grad_norm": 0.5656120002648428, + "learning_rate": 7.264689301070048e-06, + "loss": 0.0318, + "step": 1842 + }, + { + "epoch": 1.246955345060893, + "grad_norm": 0.34297030662363087, + "learning_rate": 7.2611787472167194e-06, + "loss": 0.0297, + "step": 1843 + }, + { + "epoch": 1.2476319350473613, + "grad_norm": 0.3427656632543216, + "learning_rate": 7.257666791554448e-06, + "loss": 0.0316, + "step": 1844 + }, + { + "epoch": 1.2483085250338295, + "grad_norm": 0.3986940128566045, + "learning_rate": 7.254153436260456e-06, + "loss": 0.048, + "step": 1845 + }, + { + "epoch": 1.2489851150202977, + "grad_norm": 0.4102796662454975, + "learning_rate": 7.250638683512833e-06, + "loss": 0.0465, + "step": 1846 + }, + { + "epoch": 1.249661705006766, + "grad_norm": 0.36303880433424535, + "learning_rate": 7.247122535490539e-06, + "loss": 0.0407, + "step": 1847 + }, + { + "epoch": 1.250338294993234, + "grad_norm": 0.4227726410120517, + "learning_rate": 7.2436049943733955e-06, + "loss": 0.0484, + "step": 1848 + }, + { + "epoch": 1.2510148849797023, + "grad_norm": 0.35731347162707455, + "learning_rate": 7.240086062342087e-06, + "loss": 0.0378, + "step": 1849 + }, + { + "epoch": 1.2516914749661705, + "grad_norm": 0.3898581094476785, + "learning_rate": 7.236565741578163e-06, + "loss": 0.0376, + "step": 1850 + }, + { + "epoch": 1.2523680649526387, + "grad_norm": 0.31193670850306715, + "learning_rate": 7.233044034264034e-06, + "loss": 0.0351, + "step": 1851 + }, + { + "epoch": 1.253044654939107, + "grad_norm": 0.37208656401812507, + "learning_rate": 7.229520942582965e-06, + "loss": 0.0314, + "step": 1852 + }, + { + "epoch": 1.253721244925575, + "grad_norm": 0.2917320550304776, + "learning_rate": 7.2259964687190855e-06, + "loss": 0.0312, + "step": 1853 + }, + { + "epoch": 1.2543978349120433, + "grad_norm": 0.3768590619705817, + "learning_rate": 7.22247061485738e-06, + "loss": 0.0407, + "step": 1854 + }, + { + "epoch": 1.2550744248985115, + "grad_norm": 0.2695118904034237, + "learning_rate": 7.218943383183684e-06, + "loss": 0.0236, + "step": 1855 + }, + { + "epoch": 1.2557510148849798, + "grad_norm": 0.28625741408281885, + "learning_rate": 7.215414775884695e-06, + "loss": 0.0265, + "step": 1856 + }, + { + "epoch": 1.256427604871448, + "grad_norm": 0.2960892263088646, + "learning_rate": 7.211884795147958e-06, + "loss": 0.0265, + "step": 1857 + }, + { + "epoch": 1.257104194857916, + "grad_norm": 0.33672543886910594, + "learning_rate": 7.208353443161871e-06, + "loss": 0.0417, + "step": 1858 + }, + { + "epoch": 1.2577807848443843, + "grad_norm": 0.3705340418236436, + "learning_rate": 7.204820722115681e-06, + "loss": 0.0295, + "step": 1859 + }, + { + "epoch": 1.2584573748308525, + "grad_norm": 0.39785029994746385, + "learning_rate": 7.201286634199484e-06, + "loss": 0.0417, + "step": 1860 + }, + { + "epoch": 1.2591339648173208, + "grad_norm": 0.22744172659411263, + "learning_rate": 7.197751181604228e-06, + "loss": 0.0241, + "step": 1861 + }, + { + "epoch": 1.259810554803789, + "grad_norm": 0.31737065279219406, + "learning_rate": 7.194214366521699e-06, + "loss": 0.036, + "step": 1862 + }, + { + "epoch": 1.260487144790257, + "grad_norm": 0.3558706744710046, + "learning_rate": 7.190676191144532e-06, + "loss": 0.0323, + "step": 1863 + }, + { + "epoch": 1.2611637347767253, + "grad_norm": 0.5041509694465267, + "learning_rate": 7.187136657666208e-06, + "loss": 0.0455, + "step": 1864 + }, + { + "epoch": 1.2618403247631935, + "grad_norm": 0.7918153565133799, + "learning_rate": 7.183595768281044e-06, + "loss": 0.0369, + "step": 1865 + }, + { + "epoch": 1.2625169147496618, + "grad_norm": 0.3576372518860047, + "learning_rate": 7.180053525184202e-06, + "loss": 0.0318, + "step": 1866 + }, + { + "epoch": 1.26319350473613, + "grad_norm": 0.3545731617593706, + "learning_rate": 7.176509930571682e-06, + "loss": 0.04, + "step": 1867 + }, + { + "epoch": 1.263870094722598, + "grad_norm": 0.24907884616057627, + "learning_rate": 7.172964986640319e-06, + "loss": 0.0258, + "step": 1868 + }, + { + "epoch": 1.2645466847090663, + "grad_norm": 0.3990611988547069, + "learning_rate": 7.169418695587791e-06, + "loss": 0.0387, + "step": 1869 + }, + { + "epoch": 1.2652232746955345, + "grad_norm": 0.8533603526557988, + "learning_rate": 7.165871059612604e-06, + "loss": 0.0322, + "step": 1870 + }, + { + "epoch": 1.2658998646820028, + "grad_norm": 0.4201196052175664, + "learning_rate": 7.162322080914106e-06, + "loss": 0.0404, + "step": 1871 + }, + { + "epoch": 1.266576454668471, + "grad_norm": 0.38394986726536406, + "learning_rate": 7.158771761692464e-06, + "loss": 0.0486, + "step": 1872 + }, + { + "epoch": 1.267253044654939, + "grad_norm": 0.2778453641283127, + "learning_rate": 7.155220104148694e-06, + "loss": 0.0243, + "step": 1873 + }, + { + "epoch": 1.2679296346414073, + "grad_norm": 0.347755938494088, + "learning_rate": 7.151667110484626e-06, + "loss": 0.0249, + "step": 1874 + }, + { + "epoch": 1.2686062246278755, + "grad_norm": 0.321177144443505, + "learning_rate": 7.148112782902927e-06, + "loss": 0.0312, + "step": 1875 + }, + { + "epoch": 1.2692828146143438, + "grad_norm": 0.3345751051084106, + "learning_rate": 7.144557123607087e-06, + "loss": 0.0327, + "step": 1876 + }, + { + "epoch": 1.269959404600812, + "grad_norm": 0.3646963498599837, + "learning_rate": 7.141000134801426e-06, + "loss": 0.0337, + "step": 1877 + }, + { + "epoch": 1.27063599458728, + "grad_norm": 0.25366441604914525, + "learning_rate": 7.137441818691081e-06, + "loss": 0.0219, + "step": 1878 + }, + { + "epoch": 1.2713125845737483, + "grad_norm": 0.29505248364640535, + "learning_rate": 7.133882177482019e-06, + "loss": 0.0285, + "step": 1879 + }, + { + "epoch": 1.2719891745602165, + "grad_norm": 0.41020582940222533, + "learning_rate": 7.130321213381025e-06, + "loss": 0.0363, + "step": 1880 + }, + { + "epoch": 1.2726657645466848, + "grad_norm": 0.32971167969871995, + "learning_rate": 7.1267589285957075e-06, + "loss": 0.0324, + "step": 1881 + }, + { + "epoch": 1.273342354533153, + "grad_norm": 0.3866276317569233, + "learning_rate": 7.123195325334486e-06, + "loss": 0.0336, + "step": 1882 + }, + { + "epoch": 1.274018944519621, + "grad_norm": 0.4475787946434262, + "learning_rate": 7.119630405806607e-06, + "loss": 0.0504, + "step": 1883 + }, + { + "epoch": 1.2746955345060893, + "grad_norm": 0.28936968902941973, + "learning_rate": 7.1160641722221255e-06, + "loss": 0.0224, + "step": 1884 + }, + { + "epoch": 1.2753721244925575, + "grad_norm": 0.4049126240542631, + "learning_rate": 7.112496626791915e-06, + "loss": 0.0418, + "step": 1885 + }, + { + "epoch": 1.2760487144790258, + "grad_norm": 0.3552857013720632, + "learning_rate": 7.108927771727661e-06, + "loss": 0.0334, + "step": 1886 + }, + { + "epoch": 1.276725304465494, + "grad_norm": 0.35734244118327135, + "learning_rate": 7.105357609241863e-06, + "loss": 0.0318, + "step": 1887 + }, + { + "epoch": 1.277401894451962, + "grad_norm": 0.4029530974663675, + "learning_rate": 7.101786141547829e-06, + "loss": 0.034, + "step": 1888 + }, + { + "epoch": 1.2780784844384303, + "grad_norm": 0.24957046377592598, + "learning_rate": 7.098213370859673e-06, + "loss": 0.0273, + "step": 1889 + }, + { + "epoch": 1.2787550744248986, + "grad_norm": 0.4552743843761523, + "learning_rate": 7.094639299392324e-06, + "loss": 0.0492, + "step": 1890 + }, + { + "epoch": 1.2794316644113666, + "grad_norm": 0.4478278647204221, + "learning_rate": 7.0910639293615125e-06, + "loss": 0.0446, + "step": 1891 + }, + { + "epoch": 1.280108254397835, + "grad_norm": 0.3632687438028061, + "learning_rate": 7.087487262983776e-06, + "loss": 0.0319, + "step": 1892 + }, + { + "epoch": 1.280784844384303, + "grad_norm": 0.34356582039614136, + "learning_rate": 7.083909302476453e-06, + "loss": 0.0314, + "step": 1893 + }, + { + "epoch": 1.2814614343707713, + "grad_norm": 0.47234299537785407, + "learning_rate": 7.080330050057687e-06, + "loss": 0.0377, + "step": 1894 + }, + { + "epoch": 1.2821380243572396, + "grad_norm": 0.46778752210844915, + "learning_rate": 7.076749507946422e-06, + "loss": 0.0483, + "step": 1895 + }, + { + "epoch": 1.2828146143437076, + "grad_norm": 0.25889490958952643, + "learning_rate": 7.0731676783624015e-06, + "loss": 0.0287, + "step": 1896 + }, + { + "epoch": 1.283491204330176, + "grad_norm": 0.39166620194980056, + "learning_rate": 7.069584563526166e-06, + "loss": 0.0416, + "step": 1897 + }, + { + "epoch": 1.284167794316644, + "grad_norm": 0.686299029669835, + "learning_rate": 7.066000165659054e-06, + "loss": 0.0354, + "step": 1898 + }, + { + "epoch": 1.2848443843031123, + "grad_norm": 0.4802383448666566, + "learning_rate": 7.062414486983197e-06, + "loss": 0.0581, + "step": 1899 + }, + { + "epoch": 1.2855209742895806, + "grad_norm": 0.31311888043709935, + "learning_rate": 7.058827529721526e-06, + "loss": 0.0333, + "step": 1900 + }, + { + "epoch": 1.2861975642760486, + "grad_norm": 0.4387072159234433, + "learning_rate": 7.055239296097758e-06, + "loss": 0.0357, + "step": 1901 + }, + { + "epoch": 1.2868741542625168, + "grad_norm": 0.5647885966002203, + "learning_rate": 7.051649788336405e-06, + "loss": 0.0331, + "step": 1902 + }, + { + "epoch": 1.287550744248985, + "grad_norm": 0.39020468286056914, + "learning_rate": 7.048059008662772e-06, + "loss": 0.0304, + "step": 1903 + }, + { + "epoch": 1.2882273342354533, + "grad_norm": 0.4040841233603088, + "learning_rate": 7.044466959302945e-06, + "loss": 0.0292, + "step": 1904 + }, + { + "epoch": 1.2889039242219216, + "grad_norm": 0.4370296663488622, + "learning_rate": 7.040873642483801e-06, + "loss": 0.0284, + "step": 1905 + }, + { + "epoch": 1.2895805142083896, + "grad_norm": 0.5615939264222903, + "learning_rate": 7.037279060433004e-06, + "loss": 0.0363, + "step": 1906 + }, + { + "epoch": 1.2902571041948578, + "grad_norm": 0.41362958692463603, + "learning_rate": 7.033683215379002e-06, + "loss": 0.03, + "step": 1907 + }, + { + "epoch": 1.290933694181326, + "grad_norm": 0.4240644099885667, + "learning_rate": 7.030086109551023e-06, + "loss": 0.0379, + "step": 1908 + }, + { + "epoch": 1.2916102841677943, + "grad_norm": 0.3927369609998387, + "learning_rate": 7.02648774517908e-06, + "loss": 0.0325, + "step": 1909 + }, + { + "epoch": 1.2922868741542626, + "grad_norm": 0.5016041617659053, + "learning_rate": 7.022888124493964e-06, + "loss": 0.0349, + "step": 1910 + }, + { + "epoch": 1.2929634641407306, + "grad_norm": 0.4350517164606389, + "learning_rate": 7.019287249727248e-06, + "loss": 0.0296, + "step": 1911 + }, + { + "epoch": 1.2936400541271988, + "grad_norm": 0.36137303117135344, + "learning_rate": 7.015685123111276e-06, + "loss": 0.0366, + "step": 1912 + }, + { + "epoch": 1.294316644113667, + "grad_norm": 0.4479583549009364, + "learning_rate": 7.012081746879178e-06, + "loss": 0.0388, + "step": 1913 + }, + { + "epoch": 1.2949932341001353, + "grad_norm": 0.5723533232917354, + "learning_rate": 7.008477123264849e-06, + "loss": 0.0383, + "step": 1914 + }, + { + "epoch": 1.2956698240866036, + "grad_norm": 0.35527692033264874, + "learning_rate": 7.004871254502962e-06, + "loss": 0.0299, + "step": 1915 + }, + { + "epoch": 1.2963464140730716, + "grad_norm": 0.30625989024013195, + "learning_rate": 7.001264142828961e-06, + "loss": 0.0276, + "step": 1916 + }, + { + "epoch": 1.2970230040595399, + "grad_norm": 0.4021769539521218, + "learning_rate": 6.997655790479062e-06, + "loss": 0.0296, + "step": 1917 + }, + { + "epoch": 1.297699594046008, + "grad_norm": 0.46910891089868806, + "learning_rate": 6.9940461996902495e-06, + "loss": 0.0357, + "step": 1918 + }, + { + "epoch": 1.2983761840324763, + "grad_norm": 0.48277128794886304, + "learning_rate": 6.990435372700273e-06, + "loss": 0.0393, + "step": 1919 + }, + { + "epoch": 1.2990527740189446, + "grad_norm": 0.2897119627855233, + "learning_rate": 6.986823311747652e-06, + "loss": 0.0216, + "step": 1920 + }, + { + "epoch": 1.2997293640054126, + "grad_norm": 0.29404834759520837, + "learning_rate": 6.983210019071671e-06, + "loss": 0.0298, + "step": 1921 + }, + { + "epoch": 1.3004059539918809, + "grad_norm": 0.4654456507271897, + "learning_rate": 6.979595496912374e-06, + "loss": 0.0523, + "step": 1922 + }, + { + "epoch": 1.301082543978349, + "grad_norm": 0.3861213528569032, + "learning_rate": 6.97597974751057e-06, + "loss": 0.0424, + "step": 1923 + }, + { + "epoch": 1.3017591339648173, + "grad_norm": 0.3763987234193048, + "learning_rate": 6.972362773107832e-06, + "loss": 0.0274, + "step": 1924 + }, + { + "epoch": 1.3024357239512856, + "grad_norm": 0.4930265693347563, + "learning_rate": 6.968744575946484e-06, + "loss": 0.0454, + "step": 1925 + }, + { + "epoch": 1.3031123139377536, + "grad_norm": 0.7085680617892084, + "learning_rate": 6.965125158269619e-06, + "loss": 0.0452, + "step": 1926 + }, + { + "epoch": 1.3037889039242219, + "grad_norm": 0.29466271097816943, + "learning_rate": 6.961504522321077e-06, + "loss": 0.0334, + "step": 1927 + }, + { + "epoch": 1.30446549391069, + "grad_norm": 0.3254460745028027, + "learning_rate": 6.957882670345458e-06, + "loss": 0.0253, + "step": 1928 + }, + { + "epoch": 1.3051420838971584, + "grad_norm": 0.35444113858214193, + "learning_rate": 6.954259604588114e-06, + "loss": 0.0323, + "step": 1929 + }, + { + "epoch": 1.3058186738836266, + "grad_norm": 0.52127584285171, + "learning_rate": 6.950635327295154e-06, + "loss": 0.049, + "step": 1930 + }, + { + "epoch": 1.3064952638700946, + "grad_norm": 0.3550872489898909, + "learning_rate": 6.94700984071343e-06, + "loss": 0.0331, + "step": 1931 + }, + { + "epoch": 1.3071718538565629, + "grad_norm": 0.42030445015020074, + "learning_rate": 6.943383147090552e-06, + "loss": 0.045, + "step": 1932 + }, + { + "epoch": 1.3078484438430311, + "grad_norm": 0.29625798135552134, + "learning_rate": 6.939755248674872e-06, + "loss": 0.0268, + "step": 1933 + }, + { + "epoch": 1.3085250338294994, + "grad_norm": 0.49240760765291725, + "learning_rate": 6.936126147715494e-06, + "loss": 0.0335, + "step": 1934 + }, + { + "epoch": 1.3092016238159676, + "grad_norm": 0.42465284862854297, + "learning_rate": 6.932495846462262e-06, + "loss": 0.0256, + "step": 1935 + }, + { + "epoch": 1.3098782138024356, + "grad_norm": 0.3805865277222013, + "learning_rate": 6.928864347165769e-06, + "loss": 0.0353, + "step": 1936 + }, + { + "epoch": 1.3105548037889039, + "grad_norm": 0.5019950449960542, + "learning_rate": 6.925231652077349e-06, + "loss": 0.0559, + "step": 1937 + }, + { + "epoch": 1.3112313937753721, + "grad_norm": 0.3886221758964247, + "learning_rate": 6.921597763449075e-06, + "loss": 0.0381, + "step": 1938 + }, + { + "epoch": 1.3119079837618404, + "grad_norm": 0.5536427584541227, + "learning_rate": 6.917962683533765e-06, + "loss": 0.0472, + "step": 1939 + }, + { + "epoch": 1.3125845737483086, + "grad_norm": 0.48745194443929857, + "learning_rate": 6.914326414584971e-06, + "loss": 0.0417, + "step": 1940 + }, + { + "epoch": 1.3132611637347766, + "grad_norm": 0.32193328227593826, + "learning_rate": 6.9106889588569845e-06, + "loss": 0.0285, + "step": 1941 + }, + { + "epoch": 1.3139377537212449, + "grad_norm": 0.49476158649818397, + "learning_rate": 6.907050318604831e-06, + "loss": 0.0465, + "step": 1942 + }, + { + "epoch": 1.3146143437077131, + "grad_norm": 0.39431421931011645, + "learning_rate": 6.903410496084272e-06, + "loss": 0.0294, + "step": 1943 + }, + { + "epoch": 1.3152909336941814, + "grad_norm": 0.47405875743912096, + "learning_rate": 6.8997694935518e-06, + "loss": 0.042, + "step": 1944 + }, + { + "epoch": 1.3159675236806496, + "grad_norm": 0.29733195878313373, + "learning_rate": 6.896127313264643e-06, + "loss": 0.0259, + "step": 1945 + }, + { + "epoch": 1.3166441136671176, + "grad_norm": 0.39496217525237887, + "learning_rate": 6.892483957480754e-06, + "loss": 0.0434, + "step": 1946 + }, + { + "epoch": 1.317320703653586, + "grad_norm": 0.34049160066288403, + "learning_rate": 6.888839428458819e-06, + "loss": 0.0455, + "step": 1947 + }, + { + "epoch": 1.3179972936400541, + "grad_norm": 0.48592835205029217, + "learning_rate": 6.885193728458247e-06, + "loss": 0.037, + "step": 1948 + }, + { + "epoch": 1.3186738836265224, + "grad_norm": 0.3316655074997355, + "learning_rate": 6.8815468597391785e-06, + "loss": 0.0339, + "step": 1949 + }, + { + "epoch": 1.3193504736129906, + "grad_norm": 0.5105973850235418, + "learning_rate": 6.877898824562472e-06, + "loss": 0.0426, + "step": 1950 + }, + { + "epoch": 1.3200270635994586, + "grad_norm": 0.4015952845350009, + "learning_rate": 6.8742496251897185e-06, + "loss": 0.0258, + "step": 1951 + }, + { + "epoch": 1.320703653585927, + "grad_norm": 0.3082620160475625, + "learning_rate": 6.8705992638832185e-06, + "loss": 0.0289, + "step": 1952 + }, + { + "epoch": 1.3213802435723951, + "grad_norm": 0.3552939542081474, + "learning_rate": 6.8669477429060026e-06, + "loss": 0.0311, + "step": 1953 + }, + { + "epoch": 1.3220568335588634, + "grad_norm": 0.3160118419064857, + "learning_rate": 6.863295064521816e-06, + "loss": 0.0273, + "step": 1954 + }, + { + "epoch": 1.3227334235453316, + "grad_norm": 0.3641908306233171, + "learning_rate": 6.859641230995123e-06, + "loss": 0.0302, + "step": 1955 + }, + { + "epoch": 1.3234100135317997, + "grad_norm": 0.29663698696202934, + "learning_rate": 6.855986244591104e-06, + "loss": 0.0343, + "step": 1956 + }, + { + "epoch": 1.324086603518268, + "grad_norm": 0.5164054662110036, + "learning_rate": 6.852330107575653e-06, + "loss": 0.0502, + "step": 1957 + }, + { + "epoch": 1.3247631935047361, + "grad_norm": 0.37482013806126385, + "learning_rate": 6.848672822215378e-06, + "loss": 0.0498, + "step": 1958 + }, + { + "epoch": 1.3254397834912044, + "grad_norm": 0.3877114629635358, + "learning_rate": 6.845014390777595e-06, + "loss": 0.0369, + "step": 1959 + }, + { + "epoch": 1.3261163734776726, + "grad_norm": 0.4727065765763033, + "learning_rate": 6.841354815530341e-06, + "loss": 0.0416, + "step": 1960 + }, + { + "epoch": 1.3267929634641407, + "grad_norm": 0.2551714333929746, + "learning_rate": 6.8376940987423526e-06, + "loss": 0.031, + "step": 1961 + }, + { + "epoch": 1.327469553450609, + "grad_norm": 0.3885309030982085, + "learning_rate": 6.834032242683075e-06, + "loss": 0.0362, + "step": 1962 + }, + { + "epoch": 1.3281461434370772, + "grad_norm": 0.43539867609887284, + "learning_rate": 6.830369249622663e-06, + "loss": 0.032, + "step": 1963 + }, + { + "epoch": 1.3288227334235454, + "grad_norm": 0.3410568789390074, + "learning_rate": 6.8267051218319766e-06, + "loss": 0.0404, + "step": 1964 + }, + { + "epoch": 1.3294993234100136, + "grad_norm": 0.2712607415709858, + "learning_rate": 6.823039861582574e-06, + "loss": 0.0333, + "step": 1965 + }, + { + "epoch": 1.3301759133964817, + "grad_norm": 0.32396143704614094, + "learning_rate": 6.819373471146722e-06, + "loss": 0.0307, + "step": 1966 + }, + { + "epoch": 1.33085250338295, + "grad_norm": 0.35684203155714, + "learning_rate": 6.815705952797383e-06, + "loss": 0.0318, + "step": 1967 + }, + { + "epoch": 1.3315290933694182, + "grad_norm": 0.4010634033800326, + "learning_rate": 6.8120373088082215e-06, + "loss": 0.0384, + "step": 1968 + }, + { + "epoch": 1.3322056833558864, + "grad_norm": 0.3089504714330527, + "learning_rate": 6.808367541453599e-06, + "loss": 0.0298, + "step": 1969 + }, + { + "epoch": 1.3328822733423547, + "grad_norm": 0.40216323878568305, + "learning_rate": 6.804696653008574e-06, + "loss": 0.0354, + "step": 1970 + }, + { + "epoch": 1.3335588633288227, + "grad_norm": 0.3192824884643534, + "learning_rate": 6.801024645748899e-06, + "loss": 0.0338, + "step": 1971 + }, + { + "epoch": 1.334235453315291, + "grad_norm": 0.35068944988584044, + "learning_rate": 6.797351521951021e-06, + "loss": 0.0342, + "step": 1972 + }, + { + "epoch": 1.3349120433017592, + "grad_norm": 0.39623052144026705, + "learning_rate": 6.793677283892077e-06, + "loss": 0.027, + "step": 1973 + }, + { + "epoch": 1.3355886332882274, + "grad_norm": 0.390208636935748, + "learning_rate": 6.7900019338499005e-06, + "loss": 0.0321, + "step": 1974 + }, + { + "epoch": 1.3362652232746957, + "grad_norm": 0.3745006285107433, + "learning_rate": 6.786325474103006e-06, + "loss": 0.0333, + "step": 1975 + }, + { + "epoch": 1.3369418132611637, + "grad_norm": 0.5320986808788182, + "learning_rate": 6.782647906930602e-06, + "loss": 0.0457, + "step": 1976 + }, + { + "epoch": 1.337618403247632, + "grad_norm": 0.37218918371951465, + "learning_rate": 6.778969234612583e-06, + "loss": 0.0304, + "step": 1977 + }, + { + "epoch": 1.3382949932341002, + "grad_norm": 0.41581495398695617, + "learning_rate": 6.775289459429526e-06, + "loss": 0.0331, + "step": 1978 + }, + { + "epoch": 1.3389715832205684, + "grad_norm": 0.3083089395931107, + "learning_rate": 6.771608583662694e-06, + "loss": 0.0305, + "step": 1979 + }, + { + "epoch": 1.3396481732070367, + "grad_norm": 0.4549625165103553, + "learning_rate": 6.767926609594032e-06, + "loss": 0.0352, + "step": 1980 + }, + { + "epoch": 1.3403247631935047, + "grad_norm": 0.29487283752090165, + "learning_rate": 6.764243539506166e-06, + "loss": 0.0272, + "step": 1981 + }, + { + "epoch": 1.341001353179973, + "grad_norm": 0.3915630691845097, + "learning_rate": 6.760559375682398e-06, + "loss": 0.029, + "step": 1982 + }, + { + "epoch": 1.3416779431664412, + "grad_norm": 0.2985063865689262, + "learning_rate": 6.7568741204067145e-06, + "loss": 0.0254, + "step": 1983 + }, + { + "epoch": 1.3423545331529092, + "grad_norm": 0.37609563254971934, + "learning_rate": 6.753187775963773e-06, + "loss": 0.0271, + "step": 1984 + }, + { + "epoch": 1.3430311231393777, + "grad_norm": 0.2960225373939122, + "learning_rate": 6.749500344638908e-06, + "loss": 0.0241, + "step": 1985 + }, + { + "epoch": 1.3437077131258457, + "grad_norm": 0.46484952982762223, + "learning_rate": 6.74581182871813e-06, + "loss": 0.0312, + "step": 1986 + }, + { + "epoch": 1.344384303112314, + "grad_norm": 0.4643978417506475, + "learning_rate": 6.7421222304881194e-06, + "loss": 0.0415, + "step": 1987 + }, + { + "epoch": 1.3450608930987822, + "grad_norm": 0.3629192973826768, + "learning_rate": 6.738431552236228e-06, + "loss": 0.0315, + "step": 1988 + }, + { + "epoch": 1.3457374830852502, + "grad_norm": 0.34050995548259366, + "learning_rate": 6.734739796250477e-06, + "loss": 0.0291, + "step": 1989 + }, + { + "epoch": 1.3464140730717187, + "grad_norm": 0.374090343086368, + "learning_rate": 6.731046964819555e-06, + "loss": 0.0331, + "step": 1990 + }, + { + "epoch": 1.3470906630581867, + "grad_norm": 0.36890224068615757, + "learning_rate": 6.727353060232822e-06, + "loss": 0.0255, + "step": 1991 + }, + { + "epoch": 1.347767253044655, + "grad_norm": 0.31052678606574163, + "learning_rate": 6.723658084780297e-06, + "loss": 0.0289, + "step": 1992 + }, + { + "epoch": 1.3484438430311232, + "grad_norm": 0.29195805083667253, + "learning_rate": 6.719962040752665e-06, + "loss": 0.0305, + "step": 1993 + }, + { + "epoch": 1.3491204330175912, + "grad_norm": 0.346054583503512, + "learning_rate": 6.716264930441279e-06, + "loss": 0.0395, + "step": 1994 + }, + { + "epoch": 1.3497970230040595, + "grad_norm": 0.5532799972686093, + "learning_rate": 6.712566756138142e-06, + "loss": 0.0361, + "step": 1995 + }, + { + "epoch": 1.3504736129905277, + "grad_norm": 0.3070130836221577, + "learning_rate": 6.708867520135924e-06, + "loss": 0.0387, + "step": 1996 + }, + { + "epoch": 1.351150202976996, + "grad_norm": 0.36365030320565084, + "learning_rate": 6.705167224727956e-06, + "loss": 0.0345, + "step": 1997 + }, + { + "epoch": 1.3518267929634642, + "grad_norm": 0.2936265965360264, + "learning_rate": 6.701465872208216e-06, + "loss": 0.0273, + "step": 1998 + }, + { + "epoch": 1.3525033829499322, + "grad_norm": 0.26117973313412873, + "learning_rate": 6.697763464871346e-06, + "loss": 0.0236, + "step": 1999 + }, + { + "epoch": 1.3531799729364005, + "grad_norm": 0.30964815143078134, + "learning_rate": 6.694060005012642e-06, + "loss": 0.0263, + "step": 2000 + }, + { + "epoch": 1.3538565629228687, + "grad_norm": 0.4914254563625275, + "learning_rate": 6.690355494928043e-06, + "loss": 0.0297, + "step": 2001 + }, + { + "epoch": 1.354533152909337, + "grad_norm": 0.522963248563417, + "learning_rate": 6.686649936914151e-06, + "loss": 0.0375, + "step": 2002 + }, + { + "epoch": 1.3552097428958052, + "grad_norm": 0.3153906403863871, + "learning_rate": 6.682943333268208e-06, + "loss": 0.0322, + "step": 2003 + }, + { + "epoch": 1.3558863328822732, + "grad_norm": 0.3064065236158712, + "learning_rate": 6.6792356862881144e-06, + "loss": 0.0274, + "step": 2004 + }, + { + "epoch": 1.3565629228687415, + "grad_norm": 0.5655353933523952, + "learning_rate": 6.675526998272405e-06, + "loss": 0.0363, + "step": 2005 + }, + { + "epoch": 1.3572395128552097, + "grad_norm": 0.5824549510550345, + "learning_rate": 6.671817271520269e-06, + "loss": 0.0323, + "step": 2006 + }, + { + "epoch": 1.357916102841678, + "grad_norm": 0.2908098025143807, + "learning_rate": 6.668106508331539e-06, + "loss": 0.0281, + "step": 2007 + }, + { + "epoch": 1.3585926928281462, + "grad_norm": 0.3886613946837747, + "learning_rate": 6.664394711006684e-06, + "loss": 0.0593, + "step": 2008 + }, + { + "epoch": 1.3592692828146142, + "grad_norm": 0.2602980440199633, + "learning_rate": 6.660681881846822e-06, + "loss": 0.0237, + "step": 2009 + }, + { + "epoch": 1.3599458728010825, + "grad_norm": 0.2962619776666657, + "learning_rate": 6.656968023153706e-06, + "loss": 0.0293, + "step": 2010 + }, + { + "epoch": 1.3606224627875507, + "grad_norm": 0.4715239486702429, + "learning_rate": 6.653253137229727e-06, + "loss": 0.0242, + "step": 2011 + }, + { + "epoch": 1.361299052774019, + "grad_norm": 0.30883648305118566, + "learning_rate": 6.6495372263779145e-06, + "loss": 0.0279, + "step": 2012 + }, + { + "epoch": 1.3619756427604872, + "grad_norm": 0.4829350071057054, + "learning_rate": 6.6458202929019345e-06, + "loss": 0.0383, + "step": 2013 + }, + { + "epoch": 1.3626522327469552, + "grad_norm": 0.27575244400290594, + "learning_rate": 6.6421023391060845e-06, + "loss": 0.0231, + "step": 2014 + }, + { + "epoch": 1.3633288227334235, + "grad_norm": 0.43115525602974536, + "learning_rate": 6.6383833672952945e-06, + "loss": 0.0316, + "step": 2015 + }, + { + "epoch": 1.3640054127198917, + "grad_norm": 0.45118528428735527, + "learning_rate": 6.634663379775126e-06, + "loss": 0.0473, + "step": 2016 + }, + { + "epoch": 1.36468200270636, + "grad_norm": 0.4366983129632561, + "learning_rate": 6.630942378851774e-06, + "loss": 0.0335, + "step": 2017 + }, + { + "epoch": 1.3653585926928282, + "grad_norm": 0.4462867555229021, + "learning_rate": 6.627220366832056e-06, + "loss": 0.0391, + "step": 2018 + }, + { + "epoch": 1.3660351826792962, + "grad_norm": 0.32642262322940635, + "learning_rate": 6.6234973460234184e-06, + "loss": 0.0266, + "step": 2019 + }, + { + "epoch": 1.3667117726657645, + "grad_norm": 0.22280107789149386, + "learning_rate": 6.619773318733934e-06, + "loss": 0.0221, + "step": 2020 + }, + { + "epoch": 1.3673883626522327, + "grad_norm": 0.35837691241255937, + "learning_rate": 6.616048287272301e-06, + "loss": 0.0358, + "step": 2021 + }, + { + "epoch": 1.368064952638701, + "grad_norm": 0.33777792151625974, + "learning_rate": 6.612322253947836e-06, + "loss": 0.032, + "step": 2022 + }, + { + "epoch": 1.3687415426251692, + "grad_norm": 0.7163729402437692, + "learning_rate": 6.608595221070478e-06, + "loss": 0.0338, + "step": 2023 + }, + { + "epoch": 1.3694181326116373, + "grad_norm": 0.319789911005006, + "learning_rate": 6.60486719095079e-06, + "loss": 0.0283, + "step": 2024 + }, + { + "epoch": 1.3700947225981055, + "grad_norm": 0.27226424546217287, + "learning_rate": 6.601138165899945e-06, + "loss": 0.0227, + "step": 2025 + }, + { + "epoch": 1.3707713125845737, + "grad_norm": 0.3273309937541878, + "learning_rate": 6.597408148229742e-06, + "loss": 0.0314, + "step": 2026 + }, + { + "epoch": 1.371447902571042, + "grad_norm": 0.27041595036966815, + "learning_rate": 6.5936771402525875e-06, + "loss": 0.0328, + "step": 2027 + }, + { + "epoch": 1.3721244925575102, + "grad_norm": 0.32856085638934746, + "learning_rate": 6.589945144281508e-06, + "loss": 0.0359, + "step": 2028 + }, + { + "epoch": 1.3728010825439783, + "grad_norm": 0.5082650016533564, + "learning_rate": 6.586212162630137e-06, + "loss": 0.0429, + "step": 2029 + }, + { + "epoch": 1.3734776725304465, + "grad_norm": 0.36097284492020404, + "learning_rate": 6.582478197612725e-06, + "loss": 0.0237, + "step": 2030 + }, + { + "epoch": 1.3741542625169147, + "grad_norm": 0.28282032291287545, + "learning_rate": 6.578743251544128e-06, + "loss": 0.0256, + "step": 2031 + }, + { + "epoch": 1.374830852503383, + "grad_norm": 0.26420694962027813, + "learning_rate": 6.57500732673981e-06, + "loss": 0.0274, + "step": 2032 + }, + { + "epoch": 1.3755074424898512, + "grad_norm": 0.4219218950231299, + "learning_rate": 6.571270425515843e-06, + "loss": 0.0294, + "step": 2033 + }, + { + "epoch": 1.3761840324763193, + "grad_norm": 0.3313638757920285, + "learning_rate": 6.567532550188908e-06, + "loss": 0.0324, + "step": 2034 + }, + { + "epoch": 1.3768606224627875, + "grad_norm": 0.39554296467903294, + "learning_rate": 6.56379370307628e-06, + "loss": 0.0365, + "step": 2035 + }, + { + "epoch": 1.3775372124492558, + "grad_norm": 0.541519278217376, + "learning_rate": 6.560053886495847e-06, + "loss": 0.0553, + "step": 2036 + }, + { + "epoch": 1.378213802435724, + "grad_norm": 0.36617668515633667, + "learning_rate": 6.556313102766094e-06, + "loss": 0.0387, + "step": 2037 + }, + { + "epoch": 1.3788903924221922, + "grad_norm": 0.42547372186926263, + "learning_rate": 6.552571354206104e-06, + "loss": 0.0276, + "step": 2038 + }, + { + "epoch": 1.3795669824086603, + "grad_norm": 0.38643235510601825, + "learning_rate": 6.548828643135559e-06, + "loss": 0.0338, + "step": 2039 + }, + { + "epoch": 1.3802435723951285, + "grad_norm": 0.42679781985591464, + "learning_rate": 6.545084971874738e-06, + "loss": 0.0334, + "step": 2040 + }, + { + "epoch": 1.3809201623815968, + "grad_norm": 0.4811872195591035, + "learning_rate": 6.541340342744517e-06, + "loss": 0.0314, + "step": 2041 + }, + { + "epoch": 1.381596752368065, + "grad_norm": 0.3881378711290422, + "learning_rate": 6.537594758066362e-06, + "loss": 0.0289, + "step": 2042 + }, + { + "epoch": 1.3822733423545333, + "grad_norm": 0.363316686679428, + "learning_rate": 6.533848220162336e-06, + "loss": 0.0329, + "step": 2043 + }, + { + "epoch": 1.3829499323410013, + "grad_norm": 0.5194872511315176, + "learning_rate": 6.530100731355089e-06, + "loss": 0.0364, + "step": 2044 + }, + { + "epoch": 1.3836265223274695, + "grad_norm": 0.3509987046782082, + "learning_rate": 6.5263522939678626e-06, + "loss": 0.0367, + "step": 2045 + }, + { + "epoch": 1.3843031123139378, + "grad_norm": 0.8794986059479333, + "learning_rate": 6.5226029103244846e-06, + "loss": 0.0277, + "step": 2046 + }, + { + "epoch": 1.384979702300406, + "grad_norm": 0.37079855582402726, + "learning_rate": 6.518852582749373e-06, + "loss": 0.026, + "step": 2047 + }, + { + "epoch": 1.3856562922868743, + "grad_norm": 0.38243280438514937, + "learning_rate": 6.515101313567529e-06, + "loss": 0.0342, + "step": 2048 + }, + { + "epoch": 1.3863328822733423, + "grad_norm": 0.3566315120700252, + "learning_rate": 6.511349105104534e-06, + "loss": 0.039, + "step": 2049 + }, + { + "epoch": 1.3870094722598105, + "grad_norm": 0.2737242000700114, + "learning_rate": 6.507595959686558e-06, + "loss": 0.0212, + "step": 2050 + }, + { + "epoch": 1.3876860622462788, + "grad_norm": 0.401899097675513, + "learning_rate": 6.503841879640349e-06, + "loss": 0.0315, + "step": 2051 + }, + { + "epoch": 1.388362652232747, + "grad_norm": 0.42994403962960515, + "learning_rate": 6.500086867293231e-06, + "loss": 0.0458, + "step": 2052 + }, + { + "epoch": 1.3890392422192153, + "grad_norm": 0.3298658103107547, + "learning_rate": 6.496330924973112e-06, + "loss": 0.0257, + "step": 2053 + }, + { + "epoch": 1.3897158322056833, + "grad_norm": 0.457177812307383, + "learning_rate": 6.492574055008474e-06, + "loss": 0.03, + "step": 2054 + }, + { + "epoch": 1.3903924221921515, + "grad_norm": 0.5646728212522865, + "learning_rate": 6.488816259728372e-06, + "loss": 0.0453, + "step": 2055 + }, + { + "epoch": 1.3910690121786198, + "grad_norm": 0.3077669655382729, + "learning_rate": 6.4850575414624385e-06, + "loss": 0.0352, + "step": 2056 + }, + { + "epoch": 1.391745602165088, + "grad_norm": 0.2585666570782783, + "learning_rate": 6.481297902540875e-06, + "loss": 0.0275, + "step": 2057 + }, + { + "epoch": 1.3924221921515563, + "grad_norm": 0.32267053665643863, + "learning_rate": 6.477537345294455e-06, + "loss": 0.0344, + "step": 2058 + }, + { + "epoch": 1.3930987821380243, + "grad_norm": 0.35264057294196477, + "learning_rate": 6.473775872054522e-06, + "loss": 0.0313, + "step": 2059 + }, + { + "epoch": 1.3937753721244925, + "grad_norm": 0.2807470775739839, + "learning_rate": 6.4700134851529864e-06, + "loss": 0.0264, + "step": 2060 + }, + { + "epoch": 1.3944519621109608, + "grad_norm": 0.4160332993314981, + "learning_rate": 6.466250186922325e-06, + "loss": 0.039, + "step": 2061 + }, + { + "epoch": 1.395128552097429, + "grad_norm": 0.29364956421954663, + "learning_rate": 6.46248597969558e-06, + "loss": 0.0276, + "step": 2062 + }, + { + "epoch": 1.3958051420838973, + "grad_norm": 0.35315452768437977, + "learning_rate": 6.458720865806356e-06, + "loss": 0.0298, + "step": 2063 + }, + { + "epoch": 1.3964817320703653, + "grad_norm": 0.35059909971098374, + "learning_rate": 6.454954847588824e-06, + "loss": 0.0346, + "step": 2064 + }, + { + "epoch": 1.3971583220568335, + "grad_norm": 0.3451610708419374, + "learning_rate": 6.4511879273777065e-06, + "loss": 0.0292, + "step": 2065 + }, + { + "epoch": 1.3978349120433018, + "grad_norm": 0.3260737404156102, + "learning_rate": 6.447420107508297e-06, + "loss": 0.0361, + "step": 2066 + }, + { + "epoch": 1.39851150202977, + "grad_norm": 0.26665138272941735, + "learning_rate": 6.443651390316438e-06, + "loss": 0.0191, + "step": 2067 + }, + { + "epoch": 1.3991880920162383, + "grad_norm": 0.3316380161766256, + "learning_rate": 6.439881778138531e-06, + "loss": 0.0286, + "step": 2068 + }, + { + "epoch": 1.3998646820027063, + "grad_norm": 0.2555171129823219, + "learning_rate": 6.436111273311533e-06, + "loss": 0.0239, + "step": 2069 + }, + { + "epoch": 1.4005412719891746, + "grad_norm": 0.31877515349966057, + "learning_rate": 6.4323398781729525e-06, + "loss": 0.0388, + "step": 2070 + }, + { + "epoch": 1.4012178619756428, + "grad_norm": 0.34526680355294986, + "learning_rate": 6.428567595060853e-06, + "loss": 0.0347, + "step": 2071 + }, + { + "epoch": 1.401894451962111, + "grad_norm": 0.3541673705549516, + "learning_rate": 6.424794426313845e-06, + "loss": 0.0297, + "step": 2072 + }, + { + "epoch": 1.4025710419485793, + "grad_norm": 0.3846339376565212, + "learning_rate": 6.42102037427109e-06, + "loss": 0.0366, + "step": 2073 + }, + { + "epoch": 1.4032476319350473, + "grad_norm": 0.40517755933658284, + "learning_rate": 6.417245441272299e-06, + "loss": 0.0401, + "step": 2074 + }, + { + "epoch": 1.4039242219215156, + "grad_norm": 0.34974420366334524, + "learning_rate": 6.413469629657724e-06, + "loss": 0.0304, + "step": 2075 + }, + { + "epoch": 1.4046008119079838, + "grad_norm": 0.31033898464225507, + "learning_rate": 6.409692941768166e-06, + "loss": 0.0261, + "step": 2076 + }, + { + "epoch": 1.4052774018944518, + "grad_norm": 0.3105272439758102, + "learning_rate": 6.405915379944967e-06, + "loss": 0.0369, + "step": 2077 + }, + { + "epoch": 1.4059539918809203, + "grad_norm": 0.30416573331952884, + "learning_rate": 6.402136946530014e-06, + "loss": 0.0249, + "step": 2078 + }, + { + "epoch": 1.4066305818673883, + "grad_norm": 0.3677384878897852, + "learning_rate": 6.398357643865731e-06, + "loss": 0.0413, + "step": 2079 + }, + { + "epoch": 1.4073071718538566, + "grad_norm": 0.26852596319192984, + "learning_rate": 6.394577474295081e-06, + "loss": 0.0231, + "step": 2080 + }, + { + "epoch": 1.4079837618403248, + "grad_norm": 0.2714719235055002, + "learning_rate": 6.390796440161566e-06, + "loss": 0.024, + "step": 2081 + }, + { + "epoch": 1.4086603518267928, + "grad_norm": 0.4053568590348655, + "learning_rate": 6.387014543809224e-06, + "loss": 0.0529, + "step": 2082 + }, + { + "epoch": 1.4093369418132613, + "grad_norm": 0.38264869853034933, + "learning_rate": 6.383231787582625e-06, + "loss": 0.043, + "step": 2083 + }, + { + "epoch": 1.4100135317997293, + "grad_norm": 0.3023966476066303, + "learning_rate": 6.3794481738268765e-06, + "loss": 0.0254, + "step": 2084 + }, + { + "epoch": 1.4106901217861976, + "grad_norm": 0.25204219489059054, + "learning_rate": 6.375663704887614e-06, + "loss": 0.0247, + "step": 2085 + }, + { + "epoch": 1.4113667117726658, + "grad_norm": 0.3603784899646957, + "learning_rate": 6.371878383111002e-06, + "loss": 0.0282, + "step": 2086 + }, + { + "epoch": 1.4120433017591338, + "grad_norm": 0.3448311011389403, + "learning_rate": 6.368092210843739e-06, + "loss": 0.026, + "step": 2087 + }, + { + "epoch": 1.412719891745602, + "grad_norm": 0.2691654436255992, + "learning_rate": 6.364305190433049e-06, + "loss": 0.0217, + "step": 2088 + }, + { + "epoch": 1.4133964817320703, + "grad_norm": 0.36562016184734747, + "learning_rate": 6.360517324226676e-06, + "loss": 0.0413, + "step": 2089 + }, + { + "epoch": 1.4140730717185386, + "grad_norm": 0.49998657397031354, + "learning_rate": 6.3567286145728944e-06, + "loss": 0.0475, + "step": 2090 + }, + { + "epoch": 1.4147496617050068, + "grad_norm": 0.35948438061063, + "learning_rate": 6.3529390638205036e-06, + "loss": 0.0338, + "step": 2091 + }, + { + "epoch": 1.4154262516914748, + "grad_norm": 0.30024840156854604, + "learning_rate": 6.349148674318816e-06, + "loss": 0.0223, + "step": 2092 + }, + { + "epoch": 1.416102841677943, + "grad_norm": 0.37323546109523026, + "learning_rate": 6.34535744841767e-06, + "loss": 0.0348, + "step": 2093 + }, + { + "epoch": 1.4167794316644113, + "grad_norm": 0.3138761193803162, + "learning_rate": 6.341565388467425e-06, + "loss": 0.0212, + "step": 2094 + }, + { + "epoch": 1.4174560216508796, + "grad_norm": 0.42562570612532213, + "learning_rate": 6.3377724968189494e-06, + "loss": 0.0427, + "step": 2095 + }, + { + "epoch": 1.4181326116373478, + "grad_norm": 0.2975809896826504, + "learning_rate": 6.3339787758236316e-06, + "loss": 0.0319, + "step": 2096 + }, + { + "epoch": 1.4188092016238159, + "grad_norm": 0.20055310290567982, + "learning_rate": 6.330184227833376e-06, + "loss": 0.0192, + "step": 2097 + }, + { + "epoch": 1.419485791610284, + "grad_norm": 0.2903888548762818, + "learning_rate": 6.326388855200598e-06, + "loss": 0.0332, + "step": 2098 + }, + { + "epoch": 1.4201623815967523, + "grad_norm": 0.3237604981373609, + "learning_rate": 6.322592660278223e-06, + "loss": 0.0211, + "step": 2099 + }, + { + "epoch": 1.4208389715832206, + "grad_norm": 0.4563084635253882, + "learning_rate": 6.3187956454196885e-06, + "loss": 0.0293, + "step": 2100 + }, + { + "epoch": 1.4215155615696888, + "grad_norm": 0.3565677324507497, + "learning_rate": 6.314997812978938e-06, + "loss": 0.0471, + "step": 2101 + }, + { + "epoch": 1.4221921515561569, + "grad_norm": 0.37222445082887984, + "learning_rate": 6.311199165310422e-06, + "loss": 0.0299, + "step": 2102 + }, + { + "epoch": 1.422868741542625, + "grad_norm": 0.3603043808313665, + "learning_rate": 6.3073997047691e-06, + "loss": 0.0345, + "step": 2103 + }, + { + "epoch": 1.4235453315290933, + "grad_norm": 0.28793353670094385, + "learning_rate": 6.30359943371043e-06, + "loss": 0.03, + "step": 2104 + }, + { + "epoch": 1.4242219215155616, + "grad_norm": 0.31778201674399575, + "learning_rate": 6.299798354490376e-06, + "loss": 0.0326, + "step": 2105 + }, + { + "epoch": 1.4248985115020298, + "grad_norm": 0.39267194353182716, + "learning_rate": 6.295996469465404e-06, + "loss": 0.0441, + "step": 2106 + }, + { + "epoch": 1.4255751014884979, + "grad_norm": 0.2555114962569351, + "learning_rate": 6.292193780992475e-06, + "loss": 0.0209, + "step": 2107 + }, + { + "epoch": 1.426251691474966, + "grad_norm": 0.36004345414357736, + "learning_rate": 6.288390291429054e-06, + "loss": 0.033, + "step": 2108 + }, + { + "epoch": 1.4269282814614344, + "grad_norm": 0.5615797430841993, + "learning_rate": 6.284586003133096e-06, + "loss": 0.0382, + "step": 2109 + }, + { + "epoch": 1.4276048714479026, + "grad_norm": 0.29624884789625017, + "learning_rate": 6.280780918463057e-06, + "loss": 0.0338, + "step": 2110 + }, + { + "epoch": 1.4282814614343708, + "grad_norm": 0.3837618125131463, + "learning_rate": 6.276975039777885e-06, + "loss": 0.0342, + "step": 2111 + }, + { + "epoch": 1.4289580514208389, + "grad_norm": 0.27034435250466143, + "learning_rate": 6.2731683694370185e-06, + "loss": 0.0295, + "step": 2112 + }, + { + "epoch": 1.4296346414073071, + "grad_norm": 0.7620266575277731, + "learning_rate": 6.269360909800386e-06, + "loss": 0.0314, + "step": 2113 + }, + { + "epoch": 1.4303112313937754, + "grad_norm": 0.3480021200860554, + "learning_rate": 6.265552663228411e-06, + "loss": 0.035, + "step": 2114 + }, + { + "epoch": 1.4309878213802436, + "grad_norm": 0.2758089710725665, + "learning_rate": 6.261743632081998e-06, + "loss": 0.0248, + "step": 2115 + }, + { + "epoch": 1.4316644113667119, + "grad_norm": 0.3379064505582351, + "learning_rate": 6.257933818722544e-06, + "loss": 0.0265, + "step": 2116 + }, + { + "epoch": 1.4323410013531799, + "grad_norm": 0.38029596555033884, + "learning_rate": 6.254123225511924e-06, + "loss": 0.0488, + "step": 2117 + }, + { + "epoch": 1.4330175913396481, + "grad_norm": 0.27409117177720943, + "learning_rate": 6.250311854812504e-06, + "loss": 0.0248, + "step": 2118 + }, + { + "epoch": 1.4336941813261164, + "grad_norm": 0.36908567109447665, + "learning_rate": 6.246499708987127e-06, + "loss": 0.0395, + "step": 2119 + }, + { + "epoch": 1.4343707713125846, + "grad_norm": 0.48804597539011046, + "learning_rate": 6.242686790399117e-06, + "loss": 0.0522, + "step": 2120 + }, + { + "epoch": 1.4350473612990529, + "grad_norm": 0.2535007906418984, + "learning_rate": 6.238873101412282e-06, + "loss": 0.0308, + "step": 2121 + }, + { + "epoch": 1.4357239512855209, + "grad_norm": 0.41145685151365685, + "learning_rate": 6.2350586443908965e-06, + "loss": 0.0352, + "step": 2122 + }, + { + "epoch": 1.4364005412719891, + "grad_norm": 0.2782286467786743, + "learning_rate": 6.231243421699725e-06, + "loss": 0.0381, + "step": 2123 + }, + { + "epoch": 1.4370771312584574, + "grad_norm": 0.46114348105248215, + "learning_rate": 6.227427435703997e-06, + "loss": 0.036, + "step": 2124 + }, + { + "epoch": 1.4377537212449256, + "grad_norm": 0.4419496506779566, + "learning_rate": 6.223610688769418e-06, + "loss": 0.024, + "step": 2125 + }, + { + "epoch": 1.4384303112313939, + "grad_norm": 0.3390263762200865, + "learning_rate": 6.219793183262165e-06, + "loss": 0.0419, + "step": 2126 + }, + { + "epoch": 1.439106901217862, + "grad_norm": 0.5057159702902083, + "learning_rate": 6.215974921548888e-06, + "loss": 0.0553, + "step": 2127 + }, + { + "epoch": 1.4397834912043301, + "grad_norm": 0.38168261802940295, + "learning_rate": 6.2121559059966995e-06, + "loss": 0.0383, + "step": 2128 + }, + { + "epoch": 1.4404600811907984, + "grad_norm": 0.35613068846279383, + "learning_rate": 6.2083361389731874e-06, + "loss": 0.0369, + "step": 2129 + }, + { + "epoch": 1.4411366711772666, + "grad_norm": 0.3524748625297267, + "learning_rate": 6.204515622846399e-06, + "loss": 0.0284, + "step": 2130 + }, + { + "epoch": 1.4418132611637349, + "grad_norm": 0.6151239085092787, + "learning_rate": 6.200694359984849e-06, + "loss": 0.0307, + "step": 2131 + }, + { + "epoch": 1.442489851150203, + "grad_norm": 0.2957639819925075, + "learning_rate": 6.1968723527575155e-06, + "loss": 0.0251, + "step": 2132 + }, + { + "epoch": 1.4431664411366711, + "grad_norm": 0.44334858465795796, + "learning_rate": 6.193049603533835e-06, + "loss": 0.0273, + "step": 2133 + }, + { + "epoch": 1.4438430311231394, + "grad_norm": 0.3829246618926111, + "learning_rate": 6.189226114683708e-06, + "loss": 0.0354, + "step": 2134 + }, + { + "epoch": 1.4445196211096076, + "grad_norm": 0.319004469884991, + "learning_rate": 6.185401888577488e-06, + "loss": 0.0342, + "step": 2135 + }, + { + "epoch": 1.4451962110960759, + "grad_norm": 1.0098018314065516, + "learning_rate": 6.181576927585993e-06, + "loss": 0.039, + "step": 2136 + }, + { + "epoch": 1.445872801082544, + "grad_norm": 0.23140407916564779, + "learning_rate": 6.177751234080491e-06, + "loss": 0.0206, + "step": 2137 + }, + { + "epoch": 1.4465493910690121, + "grad_norm": 0.5468696369070107, + "learning_rate": 6.173924810432705e-06, + "loss": 0.0379, + "step": 2138 + }, + { + "epoch": 1.4472259810554804, + "grad_norm": 0.5154559749265718, + "learning_rate": 6.170097659014812e-06, + "loss": 0.039, + "step": 2139 + }, + { + "epoch": 1.4479025710419486, + "grad_norm": 0.37189366775822497, + "learning_rate": 6.166269782199441e-06, + "loss": 0.0309, + "step": 2140 + }, + { + "epoch": 1.4485791610284169, + "grad_norm": 0.5175418728381058, + "learning_rate": 6.162441182359667e-06, + "loss": 0.0463, + "step": 2141 + }, + { + "epoch": 1.449255751014885, + "grad_norm": 0.3518796223354458, + "learning_rate": 6.158611861869018e-06, + "loss": 0.0323, + "step": 2142 + }, + { + "epoch": 1.4499323410013532, + "grad_norm": 0.8374256432474917, + "learning_rate": 6.154781823101463e-06, + "loss": 0.0361, + "step": 2143 + }, + { + "epoch": 1.4506089309878214, + "grad_norm": 0.5178072670670487, + "learning_rate": 6.150951068431424e-06, + "loss": 0.0419, + "step": 2144 + }, + { + "epoch": 1.4512855209742896, + "grad_norm": 0.6220397251423426, + "learning_rate": 6.147119600233758e-06, + "loss": 0.0393, + "step": 2145 + }, + { + "epoch": 1.451962110960758, + "grad_norm": 0.37835908831438864, + "learning_rate": 6.143287420883772e-06, + "loss": 0.0285, + "step": 2146 + }, + { + "epoch": 1.452638700947226, + "grad_norm": 0.5095509553729675, + "learning_rate": 6.1394545327572086e-06, + "loss": 0.0348, + "step": 2147 + }, + { + "epoch": 1.4533152909336942, + "grad_norm": 0.29047671694508453, + "learning_rate": 6.135620938230254e-06, + "loss": 0.0303, + "step": 2148 + }, + { + "epoch": 1.4539918809201624, + "grad_norm": 0.4705487897755824, + "learning_rate": 6.131786639679527e-06, + "loss": 0.0358, + "step": 2149 + }, + { + "epoch": 1.4546684709066307, + "grad_norm": 0.37836045179522076, + "learning_rate": 6.127951639482088e-06, + "loss": 0.0319, + "step": 2150 + }, + { + "epoch": 1.455345060893099, + "grad_norm": 0.304184930680853, + "learning_rate": 6.1241159400154306e-06, + "loss": 0.0276, + "step": 2151 + }, + { + "epoch": 1.456021650879567, + "grad_norm": 0.325903651893824, + "learning_rate": 6.12027954365748e-06, + "loss": 0.0338, + "step": 2152 + }, + { + "epoch": 1.4566982408660352, + "grad_norm": 0.29062088304713973, + "learning_rate": 6.116442452786599e-06, + "loss": 0.029, + "step": 2153 + }, + { + "epoch": 1.4573748308525034, + "grad_norm": 0.30661073480979945, + "learning_rate": 6.112604669781572e-06, + "loss": 0.0314, + "step": 2154 + }, + { + "epoch": 1.4580514208389717, + "grad_norm": 0.4290895580278508, + "learning_rate": 6.108766197021623e-06, + "loss": 0.0363, + "step": 2155 + }, + { + "epoch": 1.45872801082544, + "grad_norm": 0.3229603064211765, + "learning_rate": 6.104927036886392e-06, + "loss": 0.0204, + "step": 2156 + }, + { + "epoch": 1.459404600811908, + "grad_norm": 0.30348680465543104, + "learning_rate": 6.101087191755958e-06, + "loss": 0.0349, + "step": 2157 + }, + { + "epoch": 1.4600811907983762, + "grad_norm": 0.29203304265229446, + "learning_rate": 6.097246664010813e-06, + "loss": 0.0321, + "step": 2158 + }, + { + "epoch": 1.4607577807848444, + "grad_norm": 0.3626742231462104, + "learning_rate": 6.09340545603188e-06, + "loss": 0.0221, + "step": 2159 + }, + { + "epoch": 1.4614343707713127, + "grad_norm": 0.4282709049330542, + "learning_rate": 6.0895635702004985e-06, + "loss": 0.0315, + "step": 2160 + }, + { + "epoch": 1.462110960757781, + "grad_norm": 0.37997009810871746, + "learning_rate": 6.085721008898434e-06, + "loss": 0.0412, + "step": 2161 + }, + { + "epoch": 1.462787550744249, + "grad_norm": 0.4162757999240797, + "learning_rate": 6.081877774507864e-06, + "loss": 0.0423, + "step": 2162 + }, + { + "epoch": 1.4634641407307172, + "grad_norm": 0.36280176386712804, + "learning_rate": 6.078033869411389e-06, + "loss": 0.0302, + "step": 2163 + }, + { + "epoch": 1.4641407307171854, + "grad_norm": 0.48194851027098257, + "learning_rate": 6.0741892959920205e-06, + "loss": 0.03, + "step": 2164 + }, + { + "epoch": 1.4648173207036537, + "grad_norm": 0.4484108166297937, + "learning_rate": 6.070344056633189e-06, + "loss": 0.0329, + "step": 2165 + }, + { + "epoch": 1.465493910690122, + "grad_norm": 0.3294514538164327, + "learning_rate": 6.066498153718735e-06, + "loss": 0.0318, + "step": 2166 + }, + { + "epoch": 1.46617050067659, + "grad_norm": 0.8495940935008968, + "learning_rate": 6.062651589632911e-06, + "loss": 0.0368, + "step": 2167 + }, + { + "epoch": 1.4668470906630582, + "grad_norm": 0.35158915451929507, + "learning_rate": 6.05880436676038e-06, + "loss": 0.0307, + "step": 2168 + }, + { + "epoch": 1.4675236806495264, + "grad_norm": 0.45524338738755626, + "learning_rate": 6.054956487486212e-06, + "loss": 0.0297, + "step": 2169 + }, + { + "epoch": 1.4682002706359945, + "grad_norm": 0.45007889929618056, + "learning_rate": 6.0511079541958825e-06, + "loss": 0.0383, + "step": 2170 + }, + { + "epoch": 1.468876860622463, + "grad_norm": 0.283783063897314, + "learning_rate": 6.04725876927528e-06, + "loss": 0.0274, + "step": 2171 + }, + { + "epoch": 1.469553450608931, + "grad_norm": 0.5068185378853123, + "learning_rate": 6.043408935110688e-06, + "loss": 0.0347, + "step": 2172 + }, + { + "epoch": 1.4702300405953992, + "grad_norm": 0.40950528843006856, + "learning_rate": 6.039558454088796e-06, + "loss": 0.0433, + "step": 2173 + }, + { + "epoch": 1.4709066305818674, + "grad_norm": 0.4398326416941755, + "learning_rate": 6.035707328596698e-06, + "loss": 0.025, + "step": 2174 + }, + { + "epoch": 1.4715832205683355, + "grad_norm": 0.3655865643045538, + "learning_rate": 6.0318555610218796e-06, + "loss": 0.0339, + "step": 2175 + }, + { + "epoch": 1.472259810554804, + "grad_norm": 0.8971567567375772, + "learning_rate": 6.0280031537522335e-06, + "loss": 0.0338, + "step": 2176 + }, + { + "epoch": 1.472936400541272, + "grad_norm": 0.36910870295649967, + "learning_rate": 6.02415010917604e-06, + "loss": 0.0314, + "step": 2177 + }, + { + "epoch": 1.4736129905277402, + "grad_norm": 0.507486121609872, + "learning_rate": 6.020296429681985e-06, + "loss": 0.0512, + "step": 2178 + }, + { + "epoch": 1.4742895805142084, + "grad_norm": 0.43022508829325473, + "learning_rate": 6.016442117659135e-06, + "loss": 0.0306, + "step": 2179 + }, + { + "epoch": 1.4749661705006765, + "grad_norm": 0.6069253084374097, + "learning_rate": 6.0125871754969614e-06, + "loss": 0.0681, + "step": 2180 + }, + { + "epoch": 1.4756427604871447, + "grad_norm": 0.29274371865920534, + "learning_rate": 6.0087316055853175e-06, + "loss": 0.0252, + "step": 2181 + }, + { + "epoch": 1.476319350473613, + "grad_norm": 0.32838593532178906, + "learning_rate": 6.00487541031445e-06, + "loss": 0.0307, + "step": 2182 + }, + { + "epoch": 1.4769959404600812, + "grad_norm": 0.3881233925938184, + "learning_rate": 6.001018592074991e-06, + "loss": 0.0442, + "step": 2183 + }, + { + "epoch": 1.4776725304465494, + "grad_norm": 0.4491432196267391, + "learning_rate": 5.997161153257963e-06, + "loss": 0.0357, + "step": 2184 + }, + { + "epoch": 1.4783491204330175, + "grad_norm": 0.745493908416148, + "learning_rate": 5.9933030962547656e-06, + "loss": 0.0393, + "step": 2185 + }, + { + "epoch": 1.4790257104194857, + "grad_norm": 0.4476644226503889, + "learning_rate": 5.989444423457189e-06, + "loss": 0.03, + "step": 2186 + }, + { + "epoch": 1.479702300405954, + "grad_norm": 0.385099145514051, + "learning_rate": 5.985585137257401e-06, + "loss": 0.0379, + "step": 2187 + }, + { + "epoch": 1.4803788903924222, + "grad_norm": 0.31941391994184515, + "learning_rate": 5.981725240047954e-06, + "loss": 0.0325, + "step": 2188 + }, + { + "epoch": 1.4810554803788905, + "grad_norm": 0.5583607111370749, + "learning_rate": 5.977864734221773e-06, + "loss": 0.0375, + "step": 2189 + }, + { + "epoch": 1.4817320703653585, + "grad_norm": 0.38885996053755606, + "learning_rate": 5.974003622172167e-06, + "loss": 0.0304, + "step": 2190 + }, + { + "epoch": 1.4824086603518267, + "grad_norm": 0.3172314371438738, + "learning_rate": 5.9701419062928125e-06, + "loss": 0.0298, + "step": 2191 + }, + { + "epoch": 1.483085250338295, + "grad_norm": 0.37107791908123133, + "learning_rate": 5.9662795889777666e-06, + "loss": 0.0252, + "step": 2192 + }, + { + "epoch": 1.4837618403247632, + "grad_norm": 0.5656949660516135, + "learning_rate": 5.962416672621461e-06, + "loss": 0.0395, + "step": 2193 + }, + { + "epoch": 1.4844384303112315, + "grad_norm": 0.2715393500302299, + "learning_rate": 5.958553159618693e-06, + "loss": 0.0247, + "step": 2194 + }, + { + "epoch": 1.4851150202976995, + "grad_norm": 0.27915366615382764, + "learning_rate": 5.954689052364633e-06, + "loss": 0.0174, + "step": 2195 + }, + { + "epoch": 1.4857916102841677, + "grad_norm": 0.30230696083916986, + "learning_rate": 5.950824353254818e-06, + "loss": 0.0268, + "step": 2196 + }, + { + "epoch": 1.486468200270636, + "grad_norm": 0.7262061409576978, + "learning_rate": 5.946959064685156e-06, + "loss": 0.0335, + "step": 2197 + }, + { + "epoch": 1.4871447902571042, + "grad_norm": 0.37117066478352984, + "learning_rate": 5.943093189051916e-06, + "loss": 0.0268, + "step": 2198 + }, + { + "epoch": 1.4878213802435725, + "grad_norm": 0.30444721089664023, + "learning_rate": 5.939226728751733e-06, + "loss": 0.0294, + "step": 2199 + }, + { + "epoch": 1.4884979702300405, + "grad_norm": 0.3720986974214217, + "learning_rate": 5.9353596861816e-06, + "loss": 0.0312, + "step": 2200 + }, + { + "epoch": 1.4891745602165087, + "grad_norm": 0.2934579870646905, + "learning_rate": 5.931492063738882e-06, + "loss": 0.0305, + "step": 2201 + }, + { + "epoch": 1.489851150202977, + "grad_norm": 0.2507508948273361, + "learning_rate": 5.92762386382129e-06, + "loss": 0.0285, + "step": 2202 + }, + { + "epoch": 1.4905277401894452, + "grad_norm": 0.3384469594998731, + "learning_rate": 5.9237550888269045e-06, + "loss": 0.0318, + "step": 2203 + }, + { + "epoch": 1.4912043301759135, + "grad_norm": 0.41935236318360014, + "learning_rate": 5.919885741154155e-06, + "loss": 0.0298, + "step": 2204 + }, + { + "epoch": 1.4918809201623815, + "grad_norm": 0.3659203155838567, + "learning_rate": 5.916015823201827e-06, + "loss": 0.0409, + "step": 2205 + }, + { + "epoch": 1.4925575101488497, + "grad_norm": 0.7338320709405541, + "learning_rate": 5.912145337369064e-06, + "loss": 0.0307, + "step": 2206 + }, + { + "epoch": 1.493234100135318, + "grad_norm": 0.3538766905901768, + "learning_rate": 5.908274286055358e-06, + "loss": 0.0282, + "step": 2207 + }, + { + "epoch": 1.4939106901217862, + "grad_norm": 0.46303894457737094, + "learning_rate": 5.904402671660551e-06, + "loss": 0.0325, + "step": 2208 + }, + { + "epoch": 1.4945872801082545, + "grad_norm": 0.333197174440168, + "learning_rate": 5.900530496584834e-06, + "loss": 0.0332, + "step": 2209 + }, + { + "epoch": 1.4952638700947225, + "grad_norm": 0.33949094401429847, + "learning_rate": 5.8966577632287506e-06, + "loss": 0.0323, + "step": 2210 + }, + { + "epoch": 1.4959404600811907, + "grad_norm": 0.3747968067888869, + "learning_rate": 5.892784473993184e-06, + "loss": 0.036, + "step": 2211 + }, + { + "epoch": 1.496617050067659, + "grad_norm": 0.3905306867661988, + "learning_rate": 5.888910631279366e-06, + "loss": 0.0433, + "step": 2212 + }, + { + "epoch": 1.4972936400541272, + "grad_norm": 0.3683225446896326, + "learning_rate": 5.885036237488868e-06, + "loss": 0.037, + "step": 2213 + }, + { + "epoch": 1.4979702300405955, + "grad_norm": 0.5040762838033244, + "learning_rate": 5.88116129502361e-06, + "loss": 0.0356, + "step": 2214 + }, + { + "epoch": 1.4986468200270635, + "grad_norm": 0.3361864175196371, + "learning_rate": 5.8772858062858414e-06, + "loss": 0.03, + "step": 2215 + }, + { + "epoch": 1.4993234100135318, + "grad_norm": 0.32950178770432687, + "learning_rate": 5.873409773678163e-06, + "loss": 0.0348, + "step": 2216 + }, + { + "epoch": 1.5, + "grad_norm": 0.2545920551466513, + "learning_rate": 5.869533199603498e-06, + "loss": 0.0283, + "step": 2217 + }, + { + "epoch": 1.5006765899864682, + "grad_norm": 0.38917650015289157, + "learning_rate": 5.8656560864651225e-06, + "loss": 0.0277, + "step": 2218 + }, + { + "epoch": 1.5013531799729365, + "grad_norm": 0.34582128523983924, + "learning_rate": 5.861778436666631e-06, + "loss": 0.0319, + "step": 2219 + }, + { + "epoch": 1.5020297699594045, + "grad_norm": 0.3980767468611284, + "learning_rate": 5.857900252611959e-06, + "loss": 0.0333, + "step": 2220 + }, + { + "epoch": 1.5027063599458728, + "grad_norm": 0.2666736899074194, + "learning_rate": 5.854021536705373e-06, + "loss": 0.0376, + "step": 2221 + }, + { + "epoch": 1.503382949932341, + "grad_norm": 0.3238074387153149, + "learning_rate": 5.8501422913514665e-06, + "loss": 0.0312, + "step": 2222 + }, + { + "epoch": 1.5040595399188093, + "grad_norm": 0.7465006023551094, + "learning_rate": 5.846262518955163e-06, + "loss": 0.042, + "step": 2223 + }, + { + "epoch": 1.5047361299052775, + "grad_norm": 0.3007257851162775, + "learning_rate": 5.842382221921711e-06, + "loss": 0.0277, + "step": 2224 + }, + { + "epoch": 1.5054127198917455, + "grad_norm": 0.28283027914607606, + "learning_rate": 5.838501402656688e-06, + "loss": 0.0313, + "step": 2225 + }, + { + "epoch": 1.5060893098782138, + "grad_norm": 0.4113052310883682, + "learning_rate": 5.83462006356599e-06, + "loss": 0.0332, + "step": 2226 + }, + { + "epoch": 1.506765899864682, + "grad_norm": 0.3135099299519799, + "learning_rate": 5.830738207055841e-06, + "loss": 0.0293, + "step": 2227 + }, + { + "epoch": 1.5074424898511503, + "grad_norm": 0.3175534859605412, + "learning_rate": 5.8268558355327795e-06, + "loss": 0.0289, + "step": 2228 + }, + { + "epoch": 1.5081190798376185, + "grad_norm": 0.3817557999387755, + "learning_rate": 5.82297295140367e-06, + "loss": 0.0394, + "step": 2229 + }, + { + "epoch": 1.5087956698240865, + "grad_norm": 0.400476420224585, + "learning_rate": 5.819089557075689e-06, + "loss": 0.0285, + "step": 2230 + }, + { + "epoch": 1.5094722598105548, + "grad_norm": 0.28199296079401087, + "learning_rate": 5.815205654956333e-06, + "loss": 0.0224, + "step": 2231 + }, + { + "epoch": 1.510148849797023, + "grad_norm": 0.387485117063692, + "learning_rate": 5.811321247453409e-06, + "loss": 0.0477, + "step": 2232 + }, + { + "epoch": 1.510825439783491, + "grad_norm": 0.31527053910146335, + "learning_rate": 5.807436336975045e-06, + "loss": 0.0319, + "step": 2233 + }, + { + "epoch": 1.5115020297699595, + "grad_norm": 0.3878838111364579, + "learning_rate": 5.803550925929673e-06, + "loss": 0.024, + "step": 2234 + }, + { + "epoch": 1.5121786197564275, + "grad_norm": 0.3884093592351876, + "learning_rate": 5.799665016726039e-06, + "loss": 0.0237, + "step": 2235 + }, + { + "epoch": 1.5128552097428958, + "grad_norm": 0.30494177236123693, + "learning_rate": 5.795778611773197e-06, + "loss": 0.0271, + "step": 2236 + }, + { + "epoch": 1.513531799729364, + "grad_norm": 0.2890206836051052, + "learning_rate": 5.791891713480509e-06, + "loss": 0.0254, + "step": 2237 + }, + { + "epoch": 1.514208389715832, + "grad_norm": 0.41133323803561245, + "learning_rate": 5.788004324257643e-06, + "loss": 0.0355, + "step": 2238 + }, + { + "epoch": 1.5148849797023005, + "grad_norm": 0.44773139766116443, + "learning_rate": 5.784116446514571e-06, + "loss": 0.0405, + "step": 2239 + }, + { + "epoch": 1.5155615696887685, + "grad_norm": 0.35202944055249824, + "learning_rate": 5.780228082661564e-06, + "loss": 0.0283, + "step": 2240 + }, + { + "epoch": 1.5162381596752368, + "grad_norm": 0.3679419256015076, + "learning_rate": 5.776339235109203e-06, + "loss": 0.0304, + "step": 2241 + }, + { + "epoch": 1.516914749661705, + "grad_norm": 0.4012639841648305, + "learning_rate": 5.772449906268362e-06, + "loss": 0.0372, + "step": 2242 + }, + { + "epoch": 1.517591339648173, + "grad_norm": 0.26874391610278875, + "learning_rate": 5.768560098550213e-06, + "loss": 0.0285, + "step": 2243 + }, + { + "epoch": 1.5182679296346415, + "grad_norm": 0.280920653440193, + "learning_rate": 5.764669814366231e-06, + "loss": 0.0303, + "step": 2244 + }, + { + "epoch": 1.5189445196211095, + "grad_norm": 0.43178951630195245, + "learning_rate": 5.760779056128178e-06, + "loss": 0.0364, + "step": 2245 + }, + { + "epoch": 1.5196211096075778, + "grad_norm": 0.3379292088933386, + "learning_rate": 5.756887826248118e-06, + "loss": 0.026, + "step": 2246 + }, + { + "epoch": 1.520297699594046, + "grad_norm": 0.48985792995504546, + "learning_rate": 5.752996127138404e-06, + "loss": 0.0256, + "step": 2247 + }, + { + "epoch": 1.520974289580514, + "grad_norm": 0.42483730153783744, + "learning_rate": 5.749103961211679e-06, + "loss": 0.0392, + "step": 2248 + }, + { + "epoch": 1.5216508795669825, + "grad_norm": 0.3047063888515312, + "learning_rate": 5.745211330880872e-06, + "loss": 0.0303, + "step": 2249 + }, + { + "epoch": 1.5223274695534506, + "grad_norm": 0.23780136080719255, + "learning_rate": 5.74131823855921e-06, + "loss": 0.0199, + "step": 2250 + }, + { + "epoch": 1.5230040595399188, + "grad_norm": 0.23773757627075567, + "learning_rate": 5.737424686660198e-06, + "loss": 0.0264, + "step": 2251 + }, + { + "epoch": 1.523680649526387, + "grad_norm": 0.5277599293734544, + "learning_rate": 5.733530677597627e-06, + "loss": 0.0299, + "step": 2252 + }, + { + "epoch": 1.524357239512855, + "grad_norm": 0.4383727747813701, + "learning_rate": 5.729636213785574e-06, + "loss": 0.0333, + "step": 2253 + }, + { + "epoch": 1.5250338294993235, + "grad_norm": 0.2804393276276087, + "learning_rate": 5.725741297638399e-06, + "loss": 0.0254, + "step": 2254 + }, + { + "epoch": 1.5257104194857916, + "grad_norm": 0.29051578642396086, + "learning_rate": 5.721845931570734e-06, + "loss": 0.0273, + "step": 2255 + }, + { + "epoch": 1.5263870094722598, + "grad_norm": 0.46143304691040654, + "learning_rate": 5.717950117997502e-06, + "loss": 0.0317, + "step": 2256 + }, + { + "epoch": 1.527063599458728, + "grad_norm": 0.30504259152648083, + "learning_rate": 5.714053859333893e-06, + "loss": 0.0242, + "step": 2257 + }, + { + "epoch": 1.527740189445196, + "grad_norm": 0.369149715164369, + "learning_rate": 5.710157157995382e-06, + "loss": 0.0407, + "step": 2258 + }, + { + "epoch": 1.5284167794316645, + "grad_norm": 0.7067227051887994, + "learning_rate": 5.70626001639771e-06, + "loss": 0.0369, + "step": 2259 + }, + { + "epoch": 1.5290933694181326, + "grad_norm": 0.39630297252409374, + "learning_rate": 5.702362436956895e-06, + "loss": 0.0331, + "step": 2260 + }, + { + "epoch": 1.5297699594046008, + "grad_norm": 0.2983715595820596, + "learning_rate": 5.6984644220892295e-06, + "loss": 0.0248, + "step": 2261 + }, + { + "epoch": 1.530446549391069, + "grad_norm": 0.2916272939939936, + "learning_rate": 5.694565974211267e-06, + "loss": 0.0311, + "step": 2262 + }, + { + "epoch": 1.531123139377537, + "grad_norm": 0.28916619163731966, + "learning_rate": 5.69066709573984e-06, + "loss": 0.0278, + "step": 2263 + }, + { + "epoch": 1.5317997293640055, + "grad_norm": 0.5797745890372059, + "learning_rate": 5.686767789092041e-06, + "loss": 0.0354, + "step": 2264 + }, + { + "epoch": 1.5324763193504736, + "grad_norm": 0.41935238131521263, + "learning_rate": 5.6828680566852314e-06, + "loss": 0.0372, + "step": 2265 + }, + { + "epoch": 1.5331529093369418, + "grad_norm": 0.31269369456931645, + "learning_rate": 5.678967900937032e-06, + "loss": 0.0282, + "step": 2266 + }, + { + "epoch": 1.53382949932341, + "grad_norm": 0.3502492713207318, + "learning_rate": 5.675067324265332e-06, + "loss": 0.0299, + "step": 2267 + }, + { + "epoch": 1.534506089309878, + "grad_norm": 0.48059011983953875, + "learning_rate": 5.671166329088278e-06, + "loss": 0.0389, + "step": 2268 + }, + { + "epoch": 1.5351826792963466, + "grad_norm": 0.35812795442247, + "learning_rate": 5.667264917824277e-06, + "loss": 0.0313, + "step": 2269 + }, + { + "epoch": 1.5358592692828146, + "grad_norm": 0.4503601989414651, + "learning_rate": 5.663363092891991e-06, + "loss": 0.0371, + "step": 2270 + }, + { + "epoch": 1.5365358592692828, + "grad_norm": 0.27188587094119093, + "learning_rate": 5.659460856710346e-06, + "loss": 0.0258, + "step": 2271 + }, + { + "epoch": 1.537212449255751, + "grad_norm": 0.46550547520971586, + "learning_rate": 5.655558211698513e-06, + "loss": 0.0383, + "step": 2272 + }, + { + "epoch": 1.537889039242219, + "grad_norm": 0.33906217259787863, + "learning_rate": 5.651655160275925e-06, + "loss": 0.032, + "step": 2273 + }, + { + "epoch": 1.5385656292286876, + "grad_norm": 0.2995042962480075, + "learning_rate": 5.647751704862263e-06, + "loss": 0.026, + "step": 2274 + }, + { + "epoch": 1.5392422192151556, + "grad_norm": 0.2687361674785122, + "learning_rate": 5.643847847877458e-06, + "loss": 0.0226, + "step": 2275 + }, + { + "epoch": 1.5399188092016238, + "grad_norm": 0.3224258661734978, + "learning_rate": 5.639943591741691e-06, + "loss": 0.0295, + "step": 2276 + }, + { + "epoch": 1.540595399188092, + "grad_norm": 0.361300059365036, + "learning_rate": 5.636038938875391e-06, + "loss": 0.0314, + "step": 2277 + }, + { + "epoch": 1.54127198917456, + "grad_norm": 0.3031849303460406, + "learning_rate": 5.632133891699232e-06, + "loss": 0.0224, + "step": 2278 + }, + { + "epoch": 1.5419485791610286, + "grad_norm": 0.3635016257159539, + "learning_rate": 5.628228452634132e-06, + "loss": 0.0269, + "step": 2279 + }, + { + "epoch": 1.5426251691474966, + "grad_norm": 0.3291595141471702, + "learning_rate": 5.624322624101255e-06, + "loss": 0.0309, + "step": 2280 + }, + { + "epoch": 1.5433017591339648, + "grad_norm": 0.4136027896750933, + "learning_rate": 5.620416408522002e-06, + "loss": 0.0311, + "step": 2281 + }, + { + "epoch": 1.543978349120433, + "grad_norm": 0.3654256380068809, + "learning_rate": 5.616509808318017e-06, + "loss": 0.03, + "step": 2282 + }, + { + "epoch": 1.544654939106901, + "grad_norm": 0.4088012751290403, + "learning_rate": 5.612602825911179e-06, + "loss": 0.0239, + "step": 2283 + }, + { + "epoch": 1.5453315290933696, + "grad_norm": 0.3327583838679735, + "learning_rate": 5.608695463723614e-06, + "loss": 0.0332, + "step": 2284 + }, + { + "epoch": 1.5460081190798376, + "grad_norm": 0.28656806895970455, + "learning_rate": 5.604787724177666e-06, + "loss": 0.0252, + "step": 2285 + }, + { + "epoch": 1.5466847090663058, + "grad_norm": 0.3756101794457761, + "learning_rate": 5.600879609695929e-06, + "loss": 0.0224, + "step": 2286 + }, + { + "epoch": 1.547361299052774, + "grad_norm": 0.3372624946666842, + "learning_rate": 5.596971122701221e-06, + "loss": 0.0264, + "step": 2287 + }, + { + "epoch": 1.548037889039242, + "grad_norm": 0.5262564576474407, + "learning_rate": 5.593062265616598e-06, + "loss": 0.0273, + "step": 2288 + }, + { + "epoch": 1.5487144790257106, + "grad_norm": 0.2780470196228257, + "learning_rate": 5.589153040865333e-06, + "loss": 0.0307, + "step": 2289 + }, + { + "epoch": 1.5493910690121786, + "grad_norm": 0.2863435793032491, + "learning_rate": 5.585243450870941e-06, + "loss": 0.0249, + "step": 2290 + }, + { + "epoch": 1.5500676589986468, + "grad_norm": 0.3354611138791834, + "learning_rate": 5.581333498057153e-06, + "loss": 0.0251, + "step": 2291 + }, + { + "epoch": 1.550744248985115, + "grad_norm": 0.352696713815565, + "learning_rate": 5.577423184847932e-06, + "loss": 0.0272, + "step": 2292 + }, + { + "epoch": 1.5514208389715831, + "grad_norm": 0.29367697450554253, + "learning_rate": 5.573512513667459e-06, + "loss": 0.0262, + "step": 2293 + }, + { + "epoch": 1.5520974289580516, + "grad_norm": 0.3406886200568296, + "learning_rate": 5.56960148694014e-06, + "loss": 0.0312, + "step": 2294 + }, + { + "epoch": 1.5527740189445196, + "grad_norm": 0.2841177723734298, + "learning_rate": 5.565690107090603e-06, + "loss": 0.0252, + "step": 2295 + }, + { + "epoch": 1.5534506089309879, + "grad_norm": 0.5547936101971093, + "learning_rate": 5.5617783765436894e-06, + "loss": 0.0377, + "step": 2296 + }, + { + "epoch": 1.554127198917456, + "grad_norm": 0.5201124496523982, + "learning_rate": 5.557866297724462e-06, + "loss": 0.047, + "step": 2297 + }, + { + "epoch": 1.5548037889039241, + "grad_norm": 0.28795915904164315, + "learning_rate": 5.553953873058201e-06, + "loss": 0.0276, + "step": 2298 + }, + { + "epoch": 1.5554803788903924, + "grad_norm": 0.4327150106187772, + "learning_rate": 5.550041104970398e-06, + "loss": 0.0436, + "step": 2299 + }, + { + "epoch": 1.5561569688768606, + "grad_norm": 0.3780508913737479, + "learning_rate": 5.5461279958867556e-06, + "loss": 0.0348, + "step": 2300 + }, + { + "epoch": 1.5568335588633289, + "grad_norm": 0.7123830628748699, + "learning_rate": 5.542214548233195e-06, + "loss": 0.0427, + "step": 2301 + }, + { + "epoch": 1.557510148849797, + "grad_norm": 0.44688756294255394, + "learning_rate": 5.538300764435838e-06, + "loss": 0.0372, + "step": 2302 + }, + { + "epoch": 1.5581867388362651, + "grad_norm": 0.31050222373873143, + "learning_rate": 5.534386646921023e-06, + "loss": 0.028, + "step": 2303 + }, + { + "epoch": 1.5588633288227334, + "grad_norm": 0.2474705453010173, + "learning_rate": 5.530472198115291e-06, + "loss": 0.0224, + "step": 2304 + }, + { + "epoch": 1.5595399188092016, + "grad_norm": 0.4243315473822393, + "learning_rate": 5.52655742044539e-06, + "loss": 0.0434, + "step": 2305 + }, + { + "epoch": 1.5602165087956699, + "grad_norm": 0.3073804022735799, + "learning_rate": 5.522642316338268e-06, + "loss": 0.024, + "step": 2306 + }, + { + "epoch": 1.560893098782138, + "grad_norm": 0.30508481179625546, + "learning_rate": 5.518726888221082e-06, + "loss": 0.0261, + "step": 2307 + }, + { + "epoch": 1.5615696887686061, + "grad_norm": 0.3645415300263851, + "learning_rate": 5.514811138521186e-06, + "loss": 0.0457, + "step": 2308 + }, + { + "epoch": 1.5622462787550744, + "grad_norm": 0.3731880232304017, + "learning_rate": 5.510895069666132e-06, + "loss": 0.0332, + "step": 2309 + }, + { + "epoch": 1.5629228687415426, + "grad_norm": 0.29881040608542214, + "learning_rate": 5.506978684083672e-06, + "loss": 0.0322, + "step": 2310 + }, + { + "epoch": 1.5635994587280109, + "grad_norm": 0.4139647800047502, + "learning_rate": 5.503061984201755e-06, + "loss": 0.0353, + "step": 2311 + }, + { + "epoch": 1.5642760487144791, + "grad_norm": 0.26339450128141606, + "learning_rate": 5.499144972448525e-06, + "loss": 0.0221, + "step": 2312 + }, + { + "epoch": 1.5649526387009471, + "grad_norm": 0.27030552970296745, + "learning_rate": 5.495227651252315e-06, + "loss": 0.0281, + "step": 2313 + }, + { + "epoch": 1.5656292286874154, + "grad_norm": 0.3430571955467693, + "learning_rate": 5.4913100230416536e-06, + "loss": 0.0259, + "step": 2314 + }, + { + "epoch": 1.5663058186738836, + "grad_norm": 0.41751827223862137, + "learning_rate": 5.48739209024526e-06, + "loss": 0.0432, + "step": 2315 + }, + { + "epoch": 1.5669824086603519, + "grad_norm": 0.38928508117837146, + "learning_rate": 5.483473855292043e-06, + "loss": 0.0509, + "step": 2316 + }, + { + "epoch": 1.5676589986468201, + "grad_norm": 0.597502646451781, + "learning_rate": 5.479555320611094e-06, + "loss": 0.0389, + "step": 2317 + }, + { + "epoch": 1.5683355886332881, + "grad_norm": 0.30570336204655224, + "learning_rate": 5.475636488631697e-06, + "loss": 0.0331, + "step": 2318 + }, + { + "epoch": 1.5690121786197564, + "grad_norm": 0.3724477977801581, + "learning_rate": 5.471717361783312e-06, + "loss": 0.0351, + "step": 2319 + }, + { + "epoch": 1.5696887686062246, + "grad_norm": 0.3717105129186735, + "learning_rate": 5.46779794249559e-06, + "loss": 0.0327, + "step": 2320 + }, + { + "epoch": 1.5703653585926927, + "grad_norm": 0.32443787395412504, + "learning_rate": 5.463878233198358e-06, + "loss": 0.0338, + "step": 2321 + }, + { + "epoch": 1.5710419485791611, + "grad_norm": 0.2759446934870076, + "learning_rate": 5.459958236321625e-06, + "loss": 0.0242, + "step": 2322 + }, + { + "epoch": 1.5717185385656292, + "grad_norm": 0.3020330020083215, + "learning_rate": 5.4560379542955766e-06, + "loss": 0.0312, + "step": 2323 + }, + { + "epoch": 1.5723951285520974, + "grad_norm": 0.7976163557677621, + "learning_rate": 5.45211738955058e-06, + "loss": 0.0342, + "step": 2324 + }, + { + "epoch": 1.5730717185385656, + "grad_norm": 0.30519498323082467, + "learning_rate": 5.448196544517168e-06, + "loss": 0.0289, + "step": 2325 + }, + { + "epoch": 1.5737483085250337, + "grad_norm": 0.45976303799466584, + "learning_rate": 5.444275421626058e-06, + "loss": 0.0303, + "step": 2326 + }, + { + "epoch": 1.5744248985115021, + "grad_norm": 0.5640631094334173, + "learning_rate": 5.440354023308134e-06, + "loss": 0.04, + "step": 2327 + }, + { + "epoch": 1.5751014884979702, + "grad_norm": 0.3212546983456934, + "learning_rate": 5.436432351994452e-06, + "loss": 0.0315, + "step": 2328 + }, + { + "epoch": 1.5757780784844384, + "grad_norm": 0.2697860133399002, + "learning_rate": 5.4325104101162345e-06, + "loss": 0.0215, + "step": 2329 + }, + { + "epoch": 1.5764546684709067, + "grad_norm": 0.45636905948917267, + "learning_rate": 5.428588200104875e-06, + "loss": 0.0304, + "step": 2330 + }, + { + "epoch": 1.5771312584573747, + "grad_norm": 0.42084923241510136, + "learning_rate": 5.4246657243919345e-06, + "loss": 0.0319, + "step": 2331 + }, + { + "epoch": 1.5778078484438431, + "grad_norm": 0.3275868438390289, + "learning_rate": 5.420742985409132e-06, + "loss": 0.0333, + "step": 2332 + }, + { + "epoch": 1.5784844384303112, + "grad_norm": 0.356416041888783, + "learning_rate": 5.41681998558836e-06, + "loss": 0.0338, + "step": 2333 + }, + { + "epoch": 1.5791610284167794, + "grad_norm": 0.2923914256852067, + "learning_rate": 5.412896727361663e-06, + "loss": 0.0275, + "step": 2334 + }, + { + "epoch": 1.5798376184032477, + "grad_norm": 0.40928252595061093, + "learning_rate": 5.408973213161251e-06, + "loss": 0.0301, + "step": 2335 + }, + { + "epoch": 1.5805142083897157, + "grad_norm": 0.3685539206759366, + "learning_rate": 5.405049445419488e-06, + "loss": 0.0331, + "step": 2336 + }, + { + "epoch": 1.5811907983761841, + "grad_norm": 0.29125932019013834, + "learning_rate": 5.401125426568904e-06, + "loss": 0.0285, + "step": 2337 + }, + { + "epoch": 1.5818673883626522, + "grad_norm": 0.43011279214871445, + "learning_rate": 5.397201159042176e-06, + "loss": 0.0358, + "step": 2338 + }, + { + "epoch": 1.5825439783491204, + "grad_norm": 0.5382754877623533, + "learning_rate": 5.393276645272139e-06, + "loss": 0.0361, + "step": 2339 + }, + { + "epoch": 1.5832205683355887, + "grad_norm": 0.4224877364186262, + "learning_rate": 5.3893518876917795e-06, + "loss": 0.0382, + "step": 2340 + }, + { + "epoch": 1.5838971583220567, + "grad_norm": 0.5357649876397851, + "learning_rate": 5.385426888734237e-06, + "loss": 0.0273, + "step": 2341 + }, + { + "epoch": 1.5845737483085252, + "grad_norm": 0.3313339071953703, + "learning_rate": 5.381501650832798e-06, + "loss": 0.0285, + "step": 2342 + }, + { + "epoch": 1.5852503382949932, + "grad_norm": 0.37701724090596933, + "learning_rate": 5.377576176420899e-06, + "loss": 0.0298, + "step": 2343 + }, + { + "epoch": 1.5859269282814614, + "grad_norm": 0.38807658159329533, + "learning_rate": 5.373650467932122e-06, + "loss": 0.0301, + "step": 2344 + }, + { + "epoch": 1.5866035182679297, + "grad_norm": 0.2930869365951142, + "learning_rate": 5.3697245278001956e-06, + "loss": 0.0256, + "step": 2345 + }, + { + "epoch": 1.5872801082543977, + "grad_norm": 0.34352526166541836, + "learning_rate": 5.365798358458989e-06, + "loss": 0.031, + "step": 2346 + }, + { + "epoch": 1.5879566982408662, + "grad_norm": 0.4198102347999944, + "learning_rate": 5.361871962342519e-06, + "loss": 0.0354, + "step": 2347 + }, + { + "epoch": 1.5886332882273342, + "grad_norm": 0.4117109193265844, + "learning_rate": 5.357945341884936e-06, + "loss": 0.0444, + "step": 2348 + }, + { + "epoch": 1.5893098782138024, + "grad_norm": 0.42754919195883917, + "learning_rate": 5.354018499520536e-06, + "loss": 0.0195, + "step": 2349 + }, + { + "epoch": 1.5899864682002707, + "grad_norm": 0.29354593613448376, + "learning_rate": 5.350091437683746e-06, + "loss": 0.0308, + "step": 2350 + }, + { + "epoch": 1.5906630581867387, + "grad_norm": 0.2599587562244208, + "learning_rate": 5.346164158809136e-06, + "loss": 0.0209, + "step": 2351 + }, + { + "epoch": 1.5913396481732072, + "grad_norm": 0.5137231231769998, + "learning_rate": 5.342236665331407e-06, + "loss": 0.0447, + "step": 2352 + }, + { + "epoch": 1.5920162381596752, + "grad_norm": 0.28814223157523017, + "learning_rate": 5.338308959685391e-06, + "loss": 0.0293, + "step": 2353 + }, + { + "epoch": 1.5926928281461434, + "grad_norm": 0.3324234006014201, + "learning_rate": 5.334381044306057e-06, + "loss": 0.0275, + "step": 2354 + }, + { + "epoch": 1.5933694181326117, + "grad_norm": 0.8135230429138659, + "learning_rate": 5.3304529216284974e-06, + "loss": 0.039, + "step": 2355 + }, + { + "epoch": 1.5940460081190797, + "grad_norm": 0.24029161254048562, + "learning_rate": 5.32652459408794e-06, + "loss": 0.0219, + "step": 2356 + }, + { + "epoch": 1.5947225981055482, + "grad_norm": 0.39312159225943955, + "learning_rate": 5.322596064119731e-06, + "loss": 0.0319, + "step": 2357 + }, + { + "epoch": 1.5953991880920162, + "grad_norm": 0.31450535898429005, + "learning_rate": 5.318667334159354e-06, + "loss": 0.0277, + "step": 2358 + }, + { + "epoch": 1.5960757780784844, + "grad_norm": 0.28790052265587107, + "learning_rate": 5.314738406642405e-06, + "loss": 0.0255, + "step": 2359 + }, + { + "epoch": 1.5967523680649527, + "grad_norm": 0.3466919896879651, + "learning_rate": 5.310809284004608e-06, + "loss": 0.0257, + "step": 2360 + }, + { + "epoch": 1.5974289580514207, + "grad_norm": 0.30185329987038856, + "learning_rate": 5.306879968681808e-06, + "loss": 0.0242, + "step": 2361 + }, + { + "epoch": 1.5981055480378892, + "grad_norm": 0.476468114203344, + "learning_rate": 5.30295046310997e-06, + "loss": 0.0379, + "step": 2362 + }, + { + "epoch": 1.5987821380243572, + "grad_norm": 0.3266348346067902, + "learning_rate": 5.299020769725172e-06, + "loss": 0.0306, + "step": 2363 + }, + { + "epoch": 1.5994587280108254, + "grad_norm": 0.3510866291031492, + "learning_rate": 5.2950908909636144e-06, + "loss": 0.0301, + "step": 2364 + }, + { + "epoch": 1.6001353179972937, + "grad_norm": 0.29795316071876665, + "learning_rate": 5.2911608292616116e-06, + "loss": 0.0238, + "step": 2365 + }, + { + "epoch": 1.6008119079837617, + "grad_norm": 0.3357903221858778, + "learning_rate": 5.2872305870555874e-06, + "loss": 0.0336, + "step": 2366 + }, + { + "epoch": 1.6014884979702302, + "grad_norm": 0.3222633508321025, + "learning_rate": 5.2833001667820815e-06, + "loss": 0.0305, + "step": 2367 + }, + { + "epoch": 1.6021650879566982, + "grad_norm": 0.3013168859199103, + "learning_rate": 5.279369570877742e-06, + "loss": 0.0245, + "step": 2368 + }, + { + "epoch": 1.6028416779431665, + "grad_norm": 0.3594003014147857, + "learning_rate": 5.275438801779328e-06, + "loss": 0.0423, + "step": 2369 + }, + { + "epoch": 1.6035182679296347, + "grad_norm": 0.4092351587208865, + "learning_rate": 5.271507861923701e-06, + "loss": 0.0498, + "step": 2370 + }, + { + "epoch": 1.6041948579161027, + "grad_norm": 0.33582306878695767, + "learning_rate": 5.267576753747839e-06, + "loss": 0.028, + "step": 2371 + }, + { + "epoch": 1.6048714479025712, + "grad_norm": 0.3557381764205679, + "learning_rate": 5.263645479688807e-06, + "loss": 0.0417, + "step": 2372 + }, + { + "epoch": 1.6055480378890392, + "grad_norm": 0.348355854143184, + "learning_rate": 5.2597140421837915e-06, + "loss": 0.0345, + "step": 2373 + }, + { + "epoch": 1.6062246278755075, + "grad_norm": 0.3523634460129781, + "learning_rate": 5.255782443670068e-06, + "loss": 0.0297, + "step": 2374 + }, + { + "epoch": 1.6069012178619757, + "grad_norm": 0.41331346487260207, + "learning_rate": 5.251850686585015e-06, + "loss": 0.0376, + "step": 2375 + }, + { + "epoch": 1.6075778078484437, + "grad_norm": 0.9209189140563419, + "learning_rate": 5.247918773366112e-06, + "loss": 0.0387, + "step": 2376 + }, + { + "epoch": 1.6082543978349122, + "grad_norm": 0.2853025462118463, + "learning_rate": 5.243986706450933e-06, + "loss": 0.037, + "step": 2377 + }, + { + "epoch": 1.6089309878213802, + "grad_norm": 0.269536875487749, + "learning_rate": 5.240054488277148e-06, + "loss": 0.0252, + "step": 2378 + }, + { + "epoch": 1.6096075778078485, + "grad_norm": 0.43886888149438347, + "learning_rate": 5.2361221212825175e-06, + "loss": 0.0381, + "step": 2379 + }, + { + "epoch": 1.6102841677943167, + "grad_norm": 0.3042639669769169, + "learning_rate": 5.2321896079048994e-06, + "loss": 0.0299, + "step": 2380 + }, + { + "epoch": 1.6109607577807847, + "grad_norm": 0.3857116430689522, + "learning_rate": 5.2282569505822414e-06, + "loss": 0.027, + "step": 2381 + }, + { + "epoch": 1.6116373477672532, + "grad_norm": 0.36262497922029185, + "learning_rate": 5.224324151752575e-06, + "loss": 0.0388, + "step": 2382 + }, + { + "epoch": 1.6123139377537212, + "grad_norm": 0.34505466311143673, + "learning_rate": 5.220391213854028e-06, + "loss": 0.0348, + "step": 2383 + }, + { + "epoch": 1.6129905277401895, + "grad_norm": 0.37328643137737, + "learning_rate": 5.216458139324806e-06, + "loss": 0.0311, + "step": 2384 + }, + { + "epoch": 1.6136671177266577, + "grad_norm": 0.2640567293362263, + "learning_rate": 5.212524930603205e-06, + "loss": 0.0271, + "step": 2385 + }, + { + "epoch": 1.6143437077131257, + "grad_norm": 0.29616927061434556, + "learning_rate": 5.208591590127603e-06, + "loss": 0.0297, + "step": 2386 + }, + { + "epoch": 1.6150202976995942, + "grad_norm": 0.24550333446499212, + "learning_rate": 5.2046581203364585e-06, + "loss": 0.0261, + "step": 2387 + }, + { + "epoch": 1.6156968876860622, + "grad_norm": 0.29369395840996215, + "learning_rate": 5.200724523668311e-06, + "loss": 0.0245, + "step": 2388 + }, + { + "epoch": 1.6163734776725305, + "grad_norm": 0.2683000629854068, + "learning_rate": 5.196790802561776e-06, + "loss": 0.0221, + "step": 2389 + }, + { + "epoch": 1.6170500676589987, + "grad_norm": 0.34670876389850336, + "learning_rate": 5.192856959455552e-06, + "loss": 0.0391, + "step": 2390 + }, + { + "epoch": 1.6177266576454667, + "grad_norm": 0.36926778359752926, + "learning_rate": 5.188922996788409e-06, + "loss": 0.0333, + "step": 2391 + }, + { + "epoch": 1.618403247631935, + "grad_norm": 0.31394306373678366, + "learning_rate": 5.184988916999191e-06, + "loss": 0.0272, + "step": 2392 + }, + { + "epoch": 1.6190798376184032, + "grad_norm": 0.2900385153166722, + "learning_rate": 5.181054722526815e-06, + "loss": 0.0249, + "step": 2393 + }, + { + "epoch": 1.6197564276048715, + "grad_norm": 0.2782432898626616, + "learning_rate": 5.177120415810271e-06, + "loss": 0.0277, + "step": 2394 + }, + { + "epoch": 1.6204330175913397, + "grad_norm": 0.4515117647214088, + "learning_rate": 5.173185999288615e-06, + "loss": 0.0254, + "step": 2395 + }, + { + "epoch": 1.6211096075778078, + "grad_norm": 0.3454856270531984, + "learning_rate": 5.1692514754009744e-06, + "loss": 0.033, + "step": 2396 + }, + { + "epoch": 1.621786197564276, + "grad_norm": 0.2614263919421152, + "learning_rate": 5.165316846586541e-06, + "loss": 0.0238, + "step": 2397 + }, + { + "epoch": 1.6224627875507442, + "grad_norm": 0.34588880208782086, + "learning_rate": 5.161382115284576e-06, + "loss": 0.0263, + "step": 2398 + }, + { + "epoch": 1.6231393775372125, + "grad_norm": 0.3024368957354851, + "learning_rate": 5.1574472839343956e-06, + "loss": 0.0255, + "step": 2399 + }, + { + "epoch": 1.6238159675236807, + "grad_norm": 0.3230674290561768, + "learning_rate": 5.153512354975388e-06, + "loss": 0.0292, + "step": 2400 + }, + { + "epoch": 1.6244925575101488, + "grad_norm": 0.33268775259496547, + "learning_rate": 5.1495773308469935e-06, + "loss": 0.0242, + "step": 2401 + }, + { + "epoch": 1.625169147496617, + "grad_norm": 0.3434921825358834, + "learning_rate": 5.145642213988716e-06, + "loss": 0.0282, + "step": 2402 + }, + { + "epoch": 1.6258457374830853, + "grad_norm": 0.28930577449493566, + "learning_rate": 5.1417070068401165e-06, + "loss": 0.0261, + "step": 2403 + }, + { + "epoch": 1.6265223274695535, + "grad_norm": 0.41573144359694525, + "learning_rate": 5.137771711840811e-06, + "loss": 0.0332, + "step": 2404 + }, + { + "epoch": 1.6271989174560217, + "grad_norm": 0.30668670795523817, + "learning_rate": 5.133836331430469e-06, + "loss": 0.0334, + "step": 2405 + }, + { + "epoch": 1.6278755074424898, + "grad_norm": 0.38110277999573, + "learning_rate": 5.129900868048817e-06, + "loss": 0.0335, + "step": 2406 + }, + { + "epoch": 1.628552097428958, + "grad_norm": 0.3087558173880505, + "learning_rate": 5.1259653241356275e-06, + "loss": 0.0267, + "step": 2407 + }, + { + "epoch": 1.6292286874154263, + "grad_norm": 0.36578731267965897, + "learning_rate": 5.1220297021307275e-06, + "loss": 0.0244, + "step": 2408 + }, + { + "epoch": 1.6299052774018945, + "grad_norm": 0.30876751239844696, + "learning_rate": 5.11809400447399e-06, + "loss": 0.0295, + "step": 2409 + }, + { + "epoch": 1.6305818673883627, + "grad_norm": 0.3979178676891502, + "learning_rate": 5.114158233605334e-06, + "loss": 0.0333, + "step": 2410 + }, + { + "epoch": 1.6312584573748308, + "grad_norm": 0.4297870999131062, + "learning_rate": 5.110222391964728e-06, + "loss": 0.0414, + "step": 2411 + }, + { + "epoch": 1.631935047361299, + "grad_norm": 0.31896124832207173, + "learning_rate": 5.106286481992179e-06, + "loss": 0.0351, + "step": 2412 + }, + { + "epoch": 1.6326116373477673, + "grad_norm": 0.23096169080932097, + "learning_rate": 5.1023505061277405e-06, + "loss": 0.0227, + "step": 2413 + }, + { + "epoch": 1.6332882273342353, + "grad_norm": 0.3413028792115987, + "learning_rate": 5.098414466811504e-06, + "loss": 0.0374, + "step": 2414 + }, + { + "epoch": 1.6339648173207038, + "grad_norm": 0.45352921659674916, + "learning_rate": 5.094478366483604e-06, + "loss": 0.0339, + "step": 2415 + }, + { + "epoch": 1.6346414073071718, + "grad_norm": 0.4903068175504176, + "learning_rate": 5.090542207584207e-06, + "loss": 0.0321, + "step": 2416 + }, + { + "epoch": 1.63531799729364, + "grad_norm": 0.4244606937624204, + "learning_rate": 5.086605992553524e-06, + "loss": 0.0297, + "step": 2417 + }, + { + "epoch": 1.6359945872801083, + "grad_norm": 0.3013416344160587, + "learning_rate": 5.082669723831793e-06, + "loss": 0.0287, + "step": 2418 + }, + { + "epoch": 1.6366711772665763, + "grad_norm": 0.3393425034746203, + "learning_rate": 5.07873340385929e-06, + "loss": 0.0302, + "step": 2419 + }, + { + "epoch": 1.6373477672530448, + "grad_norm": 0.3068218935542081, + "learning_rate": 5.074797035076319e-06, + "loss": 0.0412, + "step": 2420 + }, + { + "epoch": 1.6380243572395128, + "grad_norm": 0.3287251477257142, + "learning_rate": 5.070860619923218e-06, + "loss": 0.0348, + "step": 2421 + }, + { + "epoch": 1.638700947225981, + "grad_norm": 0.2956531549869574, + "learning_rate": 5.066924160840353e-06, + "loss": 0.0291, + "step": 2422 + }, + { + "epoch": 1.6393775372124493, + "grad_norm": 0.40633426779896475, + "learning_rate": 5.062987660268114e-06, + "loss": 0.0399, + "step": 2423 + }, + { + "epoch": 1.6400541271989173, + "grad_norm": 0.35961004553091297, + "learning_rate": 5.059051120646924e-06, + "loss": 0.0349, + "step": 2424 + }, + { + "epoch": 1.6407307171853858, + "grad_norm": 0.4275448811460775, + "learning_rate": 5.055114544417219e-06, + "loss": 0.0324, + "step": 2425 + }, + { + "epoch": 1.6414073071718538, + "grad_norm": 0.2749068378521967, + "learning_rate": 5.051177934019468e-06, + "loss": 0.0265, + "step": 2426 + }, + { + "epoch": 1.642083897158322, + "grad_norm": 0.31688921004024967, + "learning_rate": 5.047241291894156e-06, + "loss": 0.0257, + "step": 2427 + }, + { + "epoch": 1.6427604871447903, + "grad_norm": 0.3297456720984391, + "learning_rate": 5.043304620481791e-06, + "loss": 0.0245, + "step": 2428 + }, + { + "epoch": 1.6434370771312583, + "grad_norm": 0.32117853754405856, + "learning_rate": 5.039367922222894e-06, + "loss": 0.0293, + "step": 2429 + }, + { + "epoch": 1.6441136671177268, + "grad_norm": 0.2742748087180567, + "learning_rate": 5.035431199558008e-06, + "loss": 0.0262, + "step": 2430 + }, + { + "epoch": 1.6447902571041948, + "grad_norm": 0.39990732911072946, + "learning_rate": 5.031494454927688e-06, + "loss": 0.0286, + "step": 2431 + }, + { + "epoch": 1.645466847090663, + "grad_norm": 0.3086953831666762, + "learning_rate": 5.027557690772503e-06, + "loss": 0.0283, + "step": 2432 + }, + { + "epoch": 1.6461434370771313, + "grad_norm": 0.3869513723833396, + "learning_rate": 5.0236209095330344e-06, + "loss": 0.0304, + "step": 2433 + }, + { + "epoch": 1.6468200270635993, + "grad_norm": 0.363960957477567, + "learning_rate": 5.019684113649877e-06, + "loss": 0.0321, + "step": 2434 + }, + { + "epoch": 1.6474966170500678, + "grad_norm": 0.3834734462758225, + "learning_rate": 5.0157473055636285e-06, + "loss": 0.0402, + "step": 2435 + }, + { + "epoch": 1.6481732070365358, + "grad_norm": 0.3305860583863713, + "learning_rate": 5.011810487714901e-06, + "loss": 0.0333, + "step": 2436 + }, + { + "epoch": 1.648849797023004, + "grad_norm": 0.3971169135342817, + "learning_rate": 5.007873662544306e-06, + "loss": 0.0371, + "step": 2437 + }, + { + "epoch": 1.6495263870094723, + "grad_norm": 0.3796118208418505, + "learning_rate": 5.003936832492465e-06, + "loss": 0.028, + "step": 2438 + }, + { + "epoch": 1.6502029769959403, + "grad_norm": 0.40048953027734274, + "learning_rate": 5e-06, + "loss": 0.0352, + "step": 2439 + }, + { + "epoch": 1.6508795669824088, + "grad_norm": 0.26909899622698136, + "learning_rate": 4.9960631675075364e-06, + "loss": 0.0258, + "step": 2440 + }, + { + "epoch": 1.6515561569688768, + "grad_norm": 0.30799193045806944, + "learning_rate": 4.9921263374556946e-06, + "loss": 0.0316, + "step": 2441 + }, + { + "epoch": 1.652232746955345, + "grad_norm": 0.28341985680868104, + "learning_rate": 4.988189512285101e-06, + "loss": 0.0318, + "step": 2442 + }, + { + "epoch": 1.6529093369418133, + "grad_norm": 0.3149542562422749, + "learning_rate": 4.984252694436373e-06, + "loss": 0.0318, + "step": 2443 + }, + { + "epoch": 1.6535859269282813, + "grad_norm": 0.4672164364042736, + "learning_rate": 4.980315886350125e-06, + "loss": 0.0281, + "step": 2444 + }, + { + "epoch": 1.6542625169147498, + "grad_norm": 0.39497733540158936, + "learning_rate": 4.976379090466966e-06, + "loss": 0.0305, + "step": 2445 + }, + { + "epoch": 1.6549391069012178, + "grad_norm": 0.3048033655998791, + "learning_rate": 4.972442309227498e-06, + "loss": 0.0326, + "step": 2446 + }, + { + "epoch": 1.655615696887686, + "grad_norm": 0.3438137923735947, + "learning_rate": 4.968505545072314e-06, + "loss": 0.0252, + "step": 2447 + }, + { + "epoch": 1.6562922868741543, + "grad_norm": 0.4526020897772555, + "learning_rate": 4.964568800441993e-06, + "loss": 0.0319, + "step": 2448 + }, + { + "epoch": 1.6569688768606223, + "grad_norm": 0.4090532831167456, + "learning_rate": 4.960632077777107e-06, + "loss": 0.0374, + "step": 2449 + }, + { + "epoch": 1.6576454668470908, + "grad_norm": 0.284886721313467, + "learning_rate": 4.956695379518211e-06, + "loss": 0.0255, + "step": 2450 + }, + { + "epoch": 1.6583220568335588, + "grad_norm": 0.5135484250527513, + "learning_rate": 4.952758708105845e-06, + "loss": 0.0271, + "step": 2451 + }, + { + "epoch": 1.658998646820027, + "grad_norm": 0.32544962255843, + "learning_rate": 4.948822065980533e-06, + "loss": 0.0293, + "step": 2452 + }, + { + "epoch": 1.6596752368064953, + "grad_norm": 0.36630927223611487, + "learning_rate": 4.944885455582783e-06, + "loss": 0.0339, + "step": 2453 + }, + { + "epoch": 1.6603518267929633, + "grad_norm": 0.4938733572333338, + "learning_rate": 4.940948879353078e-06, + "loss": 0.0393, + "step": 2454 + }, + { + "epoch": 1.6610284167794318, + "grad_norm": 0.28527794194948547, + "learning_rate": 4.937012339731886e-06, + "loss": 0.0239, + "step": 2455 + }, + { + "epoch": 1.6617050067658998, + "grad_norm": 0.35862330240430557, + "learning_rate": 4.933075839159649e-06, + "loss": 0.0367, + "step": 2456 + }, + { + "epoch": 1.662381596752368, + "grad_norm": 0.31645960995043676, + "learning_rate": 4.929139380076784e-06, + "loss": 0.0296, + "step": 2457 + }, + { + "epoch": 1.6630581867388363, + "grad_norm": 0.3838919728468458, + "learning_rate": 4.9252029649236835e-06, + "loss": 0.0344, + "step": 2458 + }, + { + "epoch": 1.6637347767253043, + "grad_norm": 0.35149806363624375, + "learning_rate": 4.921266596140712e-06, + "loss": 0.0325, + "step": 2459 + }, + { + "epoch": 1.6644113667117728, + "grad_norm": 0.31589055620712414, + "learning_rate": 4.917330276168208e-06, + "loss": 0.0258, + "step": 2460 + }, + { + "epoch": 1.6650879566982408, + "grad_norm": 0.3752516646182594, + "learning_rate": 4.913394007446477e-06, + "loss": 0.0335, + "step": 2461 + }, + { + "epoch": 1.665764546684709, + "grad_norm": 0.440045427122306, + "learning_rate": 4.909457792415793e-06, + "loss": 0.0373, + "step": 2462 + }, + { + "epoch": 1.6664411366711773, + "grad_norm": 0.3143434042022228, + "learning_rate": 4.905521633516399e-06, + "loss": 0.027, + "step": 2463 + }, + { + "epoch": 1.6671177266576453, + "grad_norm": 0.49174735617574794, + "learning_rate": 4.9015855331884984e-06, + "loss": 0.0441, + "step": 2464 + }, + { + "epoch": 1.6677943166441138, + "grad_norm": 0.2501179762617151, + "learning_rate": 4.897649493872262e-06, + "loss": 0.0213, + "step": 2465 + }, + { + "epoch": 1.6684709066305818, + "grad_norm": 0.26357079947156115, + "learning_rate": 4.8937135180078236e-06, + "loss": 0.0281, + "step": 2466 + }, + { + "epoch": 1.66914749661705, + "grad_norm": 0.5002463762113306, + "learning_rate": 4.889777608035273e-06, + "loss": 0.0473, + "step": 2467 + }, + { + "epoch": 1.6698240866035183, + "grad_norm": 0.2597923656039043, + "learning_rate": 4.8858417663946665e-06, + "loss": 0.0304, + "step": 2468 + }, + { + "epoch": 1.6705006765899864, + "grad_norm": 0.30532394499123533, + "learning_rate": 4.8819059955260105e-06, + "loss": 0.0282, + "step": 2469 + }, + { + "epoch": 1.6711772665764548, + "grad_norm": 0.3669725059102211, + "learning_rate": 4.877970297869273e-06, + "loss": 0.028, + "step": 2470 + }, + { + "epoch": 1.6718538565629228, + "grad_norm": 0.2958032207690955, + "learning_rate": 4.874034675864373e-06, + "loss": 0.0308, + "step": 2471 + }, + { + "epoch": 1.672530446549391, + "grad_norm": 0.29591575842976964, + "learning_rate": 4.870099131951185e-06, + "loss": 0.0261, + "step": 2472 + }, + { + "epoch": 1.6732070365358593, + "grad_norm": 0.444414438652638, + "learning_rate": 4.866163668569531e-06, + "loss": 0.0356, + "step": 2473 + }, + { + "epoch": 1.6738836265223274, + "grad_norm": 0.2284144790986997, + "learning_rate": 4.862228288159191e-06, + "loss": 0.0203, + "step": 2474 + }, + { + "epoch": 1.6745602165087958, + "grad_norm": 0.3168089399181314, + "learning_rate": 4.858292993159884e-06, + "loss": 0.0266, + "step": 2475 + }, + { + "epoch": 1.6752368064952639, + "grad_norm": 0.45337324058368567, + "learning_rate": 4.854357786011286e-06, + "loss": 0.0304, + "step": 2476 + }, + { + "epoch": 1.675913396481732, + "grad_norm": 0.30255064608622795, + "learning_rate": 4.850422669153009e-06, + "loss": 0.0219, + "step": 2477 + }, + { + "epoch": 1.6765899864682003, + "grad_norm": 0.550612196678239, + "learning_rate": 4.846487645024614e-06, + "loss": 0.037, + "step": 2478 + }, + { + "epoch": 1.6772665764546684, + "grad_norm": 0.4686831388655964, + "learning_rate": 4.842552716065605e-06, + "loss": 0.0306, + "step": 2479 + }, + { + "epoch": 1.6779431664411368, + "grad_norm": 0.4019281797042012, + "learning_rate": 4.838617884715425e-06, + "loss": 0.0258, + "step": 2480 + }, + { + "epoch": 1.6786197564276049, + "grad_norm": 0.34616581680942493, + "learning_rate": 4.8346831534134595e-06, + "loss": 0.0266, + "step": 2481 + }, + { + "epoch": 1.679296346414073, + "grad_norm": 1.1356553221371501, + "learning_rate": 4.830748524599026e-06, + "loss": 0.0457, + "step": 2482 + }, + { + "epoch": 1.6799729364005414, + "grad_norm": 0.301710670880077, + "learning_rate": 4.826814000711388e-06, + "loss": 0.0288, + "step": 2483 + }, + { + "epoch": 1.6806495263870094, + "grad_norm": 0.37289610939268875, + "learning_rate": 4.822879584189732e-06, + "loss": 0.0272, + "step": 2484 + }, + { + "epoch": 1.6813261163734776, + "grad_norm": 0.34094199109719636, + "learning_rate": 4.818945277473187e-06, + "loss": 0.0299, + "step": 2485 + }, + { + "epoch": 1.6820027063599459, + "grad_norm": 0.3537092429111352, + "learning_rate": 4.81501108300081e-06, + "loss": 0.0241, + "step": 2486 + }, + { + "epoch": 1.682679296346414, + "grad_norm": 0.33125130972563754, + "learning_rate": 4.811077003211592e-06, + "loss": 0.0313, + "step": 2487 + }, + { + "epoch": 1.6833558863328824, + "grad_norm": 0.3773197200871978, + "learning_rate": 4.807143040544448e-06, + "loss": 0.0367, + "step": 2488 + }, + { + "epoch": 1.6840324763193504, + "grad_norm": 0.5071851311613988, + "learning_rate": 4.803209197438224e-06, + "loss": 0.0287, + "step": 2489 + }, + { + "epoch": 1.6847090663058186, + "grad_norm": 0.3337540398275131, + "learning_rate": 4.799275476331692e-06, + "loss": 0.0325, + "step": 2490 + }, + { + "epoch": 1.6853856562922869, + "grad_norm": 0.31481810707745517, + "learning_rate": 4.795341879663543e-06, + "loss": 0.0371, + "step": 2491 + }, + { + "epoch": 1.6860622462787551, + "grad_norm": 0.26514726928104737, + "learning_rate": 4.791408409872398e-06, + "loss": 0.0332, + "step": 2492 + }, + { + "epoch": 1.6867388362652234, + "grad_norm": 0.3757558973504405, + "learning_rate": 4.787475069396796e-06, + "loss": 0.0326, + "step": 2493 + }, + { + "epoch": 1.6874154262516914, + "grad_norm": 0.31743216087205844, + "learning_rate": 4.783541860675195e-06, + "loss": 0.0301, + "step": 2494 + }, + { + "epoch": 1.6880920162381596, + "grad_norm": 0.3045483562125322, + "learning_rate": 4.779608786145974e-06, + "loss": 0.032, + "step": 2495 + }, + { + "epoch": 1.6887686062246279, + "grad_norm": 0.4876288542970148, + "learning_rate": 4.775675848247427e-06, + "loss": 0.0398, + "step": 2496 + }, + { + "epoch": 1.6894451962110961, + "grad_norm": 0.33917181742610175, + "learning_rate": 4.771743049417761e-06, + "loss": 0.0266, + "step": 2497 + }, + { + "epoch": 1.6901217861975644, + "grad_norm": 0.4500498165699519, + "learning_rate": 4.767810392095102e-06, + "loss": 0.0352, + "step": 2498 + }, + { + "epoch": 1.6907983761840324, + "grad_norm": 0.46955843946063675, + "learning_rate": 4.763877878717484e-06, + "loss": 0.0293, + "step": 2499 + }, + { + "epoch": 1.6914749661705006, + "grad_norm": 0.3647226957252124, + "learning_rate": 4.759945511722854e-06, + "loss": 0.0319, + "step": 2500 + }, + { + "epoch": 1.6921515561569689, + "grad_norm": 0.3249651773966415, + "learning_rate": 4.756013293549067e-06, + "loss": 0.0327, + "step": 2501 + }, + { + "epoch": 1.6928281461434371, + "grad_norm": 0.4342956740924432, + "learning_rate": 4.752081226633888e-06, + "loss": 0.0346, + "step": 2502 + }, + { + "epoch": 1.6935047361299054, + "grad_norm": 0.314857610946973, + "learning_rate": 4.748149313414987e-06, + "loss": 0.0318, + "step": 2503 + }, + { + "epoch": 1.6941813261163734, + "grad_norm": 0.3860453440238996, + "learning_rate": 4.744217556329935e-06, + "loss": 0.0307, + "step": 2504 + }, + { + "epoch": 1.6948579161028416, + "grad_norm": 0.29012309026341493, + "learning_rate": 4.740285957816211e-06, + "loss": 0.0266, + "step": 2505 + }, + { + "epoch": 1.69553450608931, + "grad_norm": 0.24644734015822517, + "learning_rate": 4.736354520311194e-06, + "loss": 0.0264, + "step": 2506 + }, + { + "epoch": 1.696211096075778, + "grad_norm": 0.27533726904417705, + "learning_rate": 4.732423246252164e-06, + "loss": 0.0241, + "step": 2507 + }, + { + "epoch": 1.6968876860622464, + "grad_norm": 0.3284741190072217, + "learning_rate": 4.728492138076299e-06, + "loss": 0.025, + "step": 2508 + }, + { + "epoch": 1.6975642760487144, + "grad_norm": 0.3313886100791312, + "learning_rate": 4.724561198220672e-06, + "loss": 0.0277, + "step": 2509 + }, + { + "epoch": 1.6982408660351827, + "grad_norm": 0.40421821706479716, + "learning_rate": 4.7206304291222585e-06, + "loss": 0.0374, + "step": 2510 + }, + { + "epoch": 1.698917456021651, + "grad_norm": 0.27182154889405613, + "learning_rate": 4.71669983321792e-06, + "loss": 0.0259, + "step": 2511 + }, + { + "epoch": 1.699594046008119, + "grad_norm": 0.3896546706705147, + "learning_rate": 4.712769412944413e-06, + "loss": 0.0481, + "step": 2512 + }, + { + "epoch": 1.7002706359945874, + "grad_norm": 0.30186878962276914, + "learning_rate": 4.70883917073839e-06, + "loss": 0.0338, + "step": 2513 + }, + { + "epoch": 1.7009472259810554, + "grad_norm": 0.3036270775035042, + "learning_rate": 4.704909109036387e-06, + "loss": 0.0314, + "step": 2514 + }, + { + "epoch": 1.7016238159675237, + "grad_norm": 0.33079428571205244, + "learning_rate": 4.700979230274829e-06, + "loss": 0.031, + "step": 2515 + }, + { + "epoch": 1.702300405953992, + "grad_norm": 0.3306106553601701, + "learning_rate": 4.697049536890033e-06, + "loss": 0.0244, + "step": 2516 + }, + { + "epoch": 1.70297699594046, + "grad_norm": 0.2869525173971686, + "learning_rate": 4.693120031318194e-06, + "loss": 0.0245, + "step": 2517 + }, + { + "epoch": 1.7036535859269284, + "grad_norm": 0.2852776126066731, + "learning_rate": 4.6891907159953935e-06, + "loss": 0.0333, + "step": 2518 + }, + { + "epoch": 1.7043301759133964, + "grad_norm": 0.3109897284406666, + "learning_rate": 4.685261593357598e-06, + "loss": 0.0339, + "step": 2519 + }, + { + "epoch": 1.7050067658998647, + "grad_norm": 0.33832212060387673, + "learning_rate": 4.681332665840647e-06, + "loss": 0.0316, + "step": 2520 + }, + { + "epoch": 1.705683355886333, + "grad_norm": 0.48143311083384405, + "learning_rate": 4.677403935880269e-06, + "loss": 0.0435, + "step": 2521 + }, + { + "epoch": 1.706359945872801, + "grad_norm": 0.3352376146112779, + "learning_rate": 4.673475405912061e-06, + "loss": 0.0347, + "step": 2522 + }, + { + "epoch": 1.7070365358592694, + "grad_norm": 0.26621326568856285, + "learning_rate": 4.669547078371503e-06, + "loss": 0.0217, + "step": 2523 + }, + { + "epoch": 1.7077131258457374, + "grad_norm": 0.27771554687799066, + "learning_rate": 4.6656189556939446e-06, + "loss": 0.0228, + "step": 2524 + }, + { + "epoch": 1.7083897158322057, + "grad_norm": 0.3584084542308951, + "learning_rate": 4.6616910403146095e-06, + "loss": 0.0359, + "step": 2525 + }, + { + "epoch": 1.709066305818674, + "grad_norm": 0.405356741402472, + "learning_rate": 4.657763334668594e-06, + "loss": 0.0341, + "step": 2526 + }, + { + "epoch": 1.709742895805142, + "grad_norm": 0.2899337703771637, + "learning_rate": 4.653835841190865e-06, + "loss": 0.0319, + "step": 2527 + }, + { + "epoch": 1.7104194857916104, + "grad_norm": 0.4226805581452788, + "learning_rate": 4.649908562316255e-06, + "loss": 0.0254, + "step": 2528 + }, + { + "epoch": 1.7110960757780784, + "grad_norm": 0.3731275190776583, + "learning_rate": 4.645981500479466e-06, + "loss": 0.0272, + "step": 2529 + }, + { + "epoch": 1.7117726657645467, + "grad_norm": 0.26488359407055706, + "learning_rate": 4.6420546581150665e-06, + "loss": 0.0254, + "step": 2530 + }, + { + "epoch": 1.712449255751015, + "grad_norm": 0.2624951430826583, + "learning_rate": 4.6381280376574836e-06, + "loss": 0.0279, + "step": 2531 + }, + { + "epoch": 1.713125845737483, + "grad_norm": 0.2714821116235528, + "learning_rate": 4.634201641541013e-06, + "loss": 0.029, + "step": 2532 + }, + { + "epoch": 1.7138024357239514, + "grad_norm": 0.28853122708498474, + "learning_rate": 4.630275472199805e-06, + "loss": 0.0304, + "step": 2533 + }, + { + "epoch": 1.7144790257104194, + "grad_norm": 0.3895490198463758, + "learning_rate": 4.626349532067879e-06, + "loss": 0.041, + "step": 2534 + }, + { + "epoch": 1.7151556156968877, + "grad_norm": 0.35021800573512446, + "learning_rate": 4.622423823579102e-06, + "loss": 0.0298, + "step": 2535 + }, + { + "epoch": 1.715832205683356, + "grad_norm": 0.2758281987995898, + "learning_rate": 4.618498349167204e-06, + "loss": 0.0335, + "step": 2536 + }, + { + "epoch": 1.716508795669824, + "grad_norm": 0.5134424381152581, + "learning_rate": 4.6145731112657644e-06, + "loss": 0.0554, + "step": 2537 + }, + { + "epoch": 1.7171853856562924, + "grad_norm": 0.3357813889948677, + "learning_rate": 4.610648112308221e-06, + "loss": 0.036, + "step": 2538 + }, + { + "epoch": 1.7178619756427604, + "grad_norm": 0.28051604079149534, + "learning_rate": 4.6067233547278614e-06, + "loss": 0.0266, + "step": 2539 + }, + { + "epoch": 1.7185385656292287, + "grad_norm": 0.2503535805027952, + "learning_rate": 4.602798840957825e-06, + "loss": 0.0236, + "step": 2540 + }, + { + "epoch": 1.719215155615697, + "grad_norm": 0.29342839889218436, + "learning_rate": 4.598874573431097e-06, + "loss": 0.0304, + "step": 2541 + }, + { + "epoch": 1.719891745602165, + "grad_norm": 0.35057222844367764, + "learning_rate": 4.594950554580512e-06, + "loss": 0.0336, + "step": 2542 + }, + { + "epoch": 1.7205683355886334, + "grad_norm": 0.32832034013834693, + "learning_rate": 4.5910267868387525e-06, + "loss": 0.034, + "step": 2543 + }, + { + "epoch": 1.7212449255751014, + "grad_norm": 0.33188864379037686, + "learning_rate": 4.587103272638339e-06, + "loss": 0.029, + "step": 2544 + }, + { + "epoch": 1.7219215155615697, + "grad_norm": 0.2816727963933415, + "learning_rate": 4.583180014411642e-06, + "loss": 0.0289, + "step": 2545 + }, + { + "epoch": 1.722598105548038, + "grad_norm": 0.23776351515810717, + "learning_rate": 4.579257014590869e-06, + "loss": 0.0211, + "step": 2546 + }, + { + "epoch": 1.723274695534506, + "grad_norm": 0.3457653307991375, + "learning_rate": 4.575334275608067e-06, + "loss": 0.0274, + "step": 2547 + }, + { + "epoch": 1.7239512855209744, + "grad_norm": 0.458375658403468, + "learning_rate": 4.571411799895126e-06, + "loss": 0.0322, + "step": 2548 + }, + { + "epoch": 1.7246278755074425, + "grad_norm": 0.21555292570133547, + "learning_rate": 4.567489589883766e-06, + "loss": 0.0224, + "step": 2549 + }, + { + "epoch": 1.7253044654939107, + "grad_norm": 0.49539941506089014, + "learning_rate": 4.563567648005551e-06, + "loss": 0.0461, + "step": 2550 + }, + { + "epoch": 1.725981055480379, + "grad_norm": 0.23672890710910027, + "learning_rate": 4.559645976691868e-06, + "loss": 0.0205, + "step": 2551 + }, + { + "epoch": 1.726657645466847, + "grad_norm": 0.3477761464178626, + "learning_rate": 4.5557245783739425e-06, + "loss": 0.0265, + "step": 2552 + }, + { + "epoch": 1.7273342354533154, + "grad_norm": 0.4018184710552279, + "learning_rate": 4.551803455482833e-06, + "loss": 0.0391, + "step": 2553 + }, + { + "epoch": 1.7280108254397835, + "grad_norm": 0.368434955098215, + "learning_rate": 4.5478826104494225e-06, + "loss": 0.0336, + "step": 2554 + }, + { + "epoch": 1.7286874154262517, + "grad_norm": 0.21197450185826905, + "learning_rate": 4.543962045704424e-06, + "loss": 0.0146, + "step": 2555 + }, + { + "epoch": 1.72936400541272, + "grad_norm": 0.31679353577409447, + "learning_rate": 4.540041763678377e-06, + "loss": 0.026, + "step": 2556 + }, + { + "epoch": 1.730040595399188, + "grad_norm": 0.4608580369354584, + "learning_rate": 4.536121766801645e-06, + "loss": 0.0389, + "step": 2557 + }, + { + "epoch": 1.7307171853856564, + "grad_norm": 0.35660547773058215, + "learning_rate": 4.532202057504412e-06, + "loss": 0.038, + "step": 2558 + }, + { + "epoch": 1.7313937753721245, + "grad_norm": 0.2979235390461743, + "learning_rate": 4.528282638216689e-06, + "loss": 0.0282, + "step": 2559 + }, + { + "epoch": 1.7320703653585927, + "grad_norm": 0.28853628014305954, + "learning_rate": 4.524363511368304e-06, + "loss": 0.0264, + "step": 2560 + }, + { + "epoch": 1.732746955345061, + "grad_norm": 0.3837334521930021, + "learning_rate": 4.520444679388906e-06, + "loss": 0.0328, + "step": 2561 + }, + { + "epoch": 1.733423545331529, + "grad_norm": 0.3310512307525053, + "learning_rate": 4.516526144707957e-06, + "loss": 0.0244, + "step": 2562 + }, + { + "epoch": 1.7341001353179974, + "grad_norm": 0.3076839943784699, + "learning_rate": 4.512607909754741e-06, + "loss": 0.0277, + "step": 2563 + }, + { + "epoch": 1.7347767253044655, + "grad_norm": 0.45622340126657446, + "learning_rate": 4.508689976958348e-06, + "loss": 0.0392, + "step": 2564 + }, + { + "epoch": 1.7354533152909337, + "grad_norm": 0.28131755964827976, + "learning_rate": 4.504772348747687e-06, + "loss": 0.0286, + "step": 2565 + }, + { + "epoch": 1.736129905277402, + "grad_norm": 0.30147650400360737, + "learning_rate": 4.500855027551477e-06, + "loss": 0.0293, + "step": 2566 + }, + { + "epoch": 1.73680649526387, + "grad_norm": 0.33851406100999554, + "learning_rate": 4.496938015798246e-06, + "loss": 0.0263, + "step": 2567 + }, + { + "epoch": 1.7374830852503385, + "grad_norm": 0.3561164566438635, + "learning_rate": 4.493021315916328e-06, + "loss": 0.033, + "step": 2568 + }, + { + "epoch": 1.7381596752368065, + "grad_norm": 0.36434014933441217, + "learning_rate": 4.48910493033387e-06, + "loss": 0.0185, + "step": 2569 + }, + { + "epoch": 1.7388362652232747, + "grad_norm": 0.3673289035810428, + "learning_rate": 4.485188861478817e-06, + "loss": 0.0292, + "step": 2570 + }, + { + "epoch": 1.739512855209743, + "grad_norm": 0.28756688532278163, + "learning_rate": 4.481273111778919e-06, + "loss": 0.0239, + "step": 2571 + }, + { + "epoch": 1.740189445196211, + "grad_norm": 0.36322743263883556, + "learning_rate": 4.477357683661734e-06, + "loss": 0.0414, + "step": 2572 + }, + { + "epoch": 1.7408660351826795, + "grad_norm": 0.48899289689185416, + "learning_rate": 4.473442579554612e-06, + "loss": 0.0379, + "step": 2573 + }, + { + "epoch": 1.7415426251691475, + "grad_norm": 0.4448440873212462, + "learning_rate": 4.46952780188471e-06, + "loss": 0.0259, + "step": 2574 + }, + { + "epoch": 1.7422192151556157, + "grad_norm": 0.26910046156525524, + "learning_rate": 4.465613353078978e-06, + "loss": 0.0292, + "step": 2575 + }, + { + "epoch": 1.742895805142084, + "grad_norm": 0.4618472054959546, + "learning_rate": 4.461699235564164e-06, + "loss": 0.0298, + "step": 2576 + }, + { + "epoch": 1.743572395128552, + "grad_norm": 0.34108507079214767, + "learning_rate": 4.457785451766808e-06, + "loss": 0.035, + "step": 2577 + }, + { + "epoch": 1.7442489851150202, + "grad_norm": 0.5639311474799743, + "learning_rate": 4.453872004113247e-06, + "loss": 0.0358, + "step": 2578 + }, + { + "epoch": 1.7449255751014885, + "grad_norm": 0.3426306890416558, + "learning_rate": 4.449958895029604e-06, + "loss": 0.0332, + "step": 2579 + }, + { + "epoch": 1.7456021650879567, + "grad_norm": 0.4145944361184108, + "learning_rate": 4.446046126941801e-06, + "loss": 0.0388, + "step": 2580 + }, + { + "epoch": 1.746278755074425, + "grad_norm": 0.3062466633155877, + "learning_rate": 4.442133702275539e-06, + "loss": 0.0242, + "step": 2581 + }, + { + "epoch": 1.746955345060893, + "grad_norm": 0.37681131182332217, + "learning_rate": 4.438221623456312e-06, + "loss": 0.0299, + "step": 2582 + }, + { + "epoch": 1.7476319350473613, + "grad_norm": 0.35980152188751374, + "learning_rate": 4.4343098929094e-06, + "loss": 0.0279, + "step": 2583 + }, + { + "epoch": 1.7483085250338295, + "grad_norm": 0.3527994633461267, + "learning_rate": 4.4303985130598615e-06, + "loss": 0.0335, + "step": 2584 + }, + { + "epoch": 1.7489851150202977, + "grad_norm": 0.34657169139864935, + "learning_rate": 4.426487486332544e-06, + "loss": 0.0281, + "step": 2585 + }, + { + "epoch": 1.749661705006766, + "grad_norm": 0.38271880544219666, + "learning_rate": 4.42257681515207e-06, + "loss": 0.0404, + "step": 2586 + }, + { + "epoch": 1.750338294993234, + "grad_norm": 0.3493929775948048, + "learning_rate": 4.4186665019428485e-06, + "loss": 0.0351, + "step": 2587 + }, + { + "epoch": 1.7510148849797023, + "grad_norm": 0.3887216055934382, + "learning_rate": 4.41475654912906e-06, + "loss": 0.0347, + "step": 2588 + }, + { + "epoch": 1.7516914749661705, + "grad_norm": 0.36373045467764825, + "learning_rate": 4.410846959134667e-06, + "loss": 0.034, + "step": 2589 + }, + { + "epoch": 1.7523680649526387, + "grad_norm": 0.3585495419331542, + "learning_rate": 4.406937734383405e-06, + "loss": 0.0213, + "step": 2590 + }, + { + "epoch": 1.753044654939107, + "grad_norm": 0.26978918420242576, + "learning_rate": 4.4030288772987795e-06, + "loss": 0.0218, + "step": 2591 + }, + { + "epoch": 1.753721244925575, + "grad_norm": 0.4913247264596912, + "learning_rate": 4.399120390304072e-06, + "loss": 0.0353, + "step": 2592 + }, + { + "epoch": 1.7543978349120433, + "grad_norm": 0.3640852256705213, + "learning_rate": 4.395212275822336e-06, + "loss": 0.0316, + "step": 2593 + }, + { + "epoch": 1.7550744248985115, + "grad_norm": 0.34787757904616445, + "learning_rate": 4.391304536276389e-06, + "loss": 0.029, + "step": 2594 + }, + { + "epoch": 1.7557510148849798, + "grad_norm": 0.39433208435920863, + "learning_rate": 4.3873971740888205e-06, + "loss": 0.0253, + "step": 2595 + }, + { + "epoch": 1.756427604871448, + "grad_norm": 0.3232978320311612, + "learning_rate": 4.383490191681985e-06, + "loss": 0.0307, + "step": 2596 + }, + { + "epoch": 1.757104194857916, + "grad_norm": 0.31679177199140235, + "learning_rate": 4.379583591477999e-06, + "loss": 0.0266, + "step": 2597 + }, + { + "epoch": 1.7577807848443843, + "grad_norm": 0.34946238304926175, + "learning_rate": 4.375677375898746e-06, + "loss": 0.0308, + "step": 2598 + }, + { + "epoch": 1.7584573748308525, + "grad_norm": 0.44043705194334837, + "learning_rate": 4.371771547365869e-06, + "loss": 0.0416, + "step": 2599 + }, + { + "epoch": 1.7591339648173205, + "grad_norm": 0.23837244376575895, + "learning_rate": 4.367866108300769e-06, + "loss": 0.027, + "step": 2600 + }, + { + "epoch": 1.759810554803789, + "grad_norm": 0.3711163901861945, + "learning_rate": 4.3639610611246106e-06, + "loss": 0.0305, + "step": 2601 + }, + { + "epoch": 1.760487144790257, + "grad_norm": 0.229365775241179, + "learning_rate": 4.36005640825831e-06, + "loss": 0.0279, + "step": 2602 + }, + { + "epoch": 1.7611637347767253, + "grad_norm": 0.4369561044549791, + "learning_rate": 4.3561521521225445e-06, + "loss": 0.0331, + "step": 2603 + }, + { + "epoch": 1.7618403247631935, + "grad_norm": 0.27636344202214114, + "learning_rate": 4.352248295137739e-06, + "loss": 0.0265, + "step": 2604 + }, + { + "epoch": 1.7625169147496615, + "grad_norm": 0.29708739553259333, + "learning_rate": 4.348344839724076e-06, + "loss": 0.0218, + "step": 2605 + }, + { + "epoch": 1.76319350473613, + "grad_norm": 0.30476715927779024, + "learning_rate": 4.3444417883014885e-06, + "loss": 0.0336, + "step": 2606 + }, + { + "epoch": 1.763870094722598, + "grad_norm": 0.32606089081914547, + "learning_rate": 4.340539143289655e-06, + "loss": 0.0386, + "step": 2607 + }, + { + "epoch": 1.7645466847090663, + "grad_norm": 0.6625495622679105, + "learning_rate": 4.33663690710801e-06, + "loss": 0.0269, + "step": 2608 + }, + { + "epoch": 1.7652232746955345, + "grad_norm": 0.2261189032678472, + "learning_rate": 4.332735082175724e-06, + "loss": 0.021, + "step": 2609 + }, + { + "epoch": 1.7658998646820026, + "grad_norm": 0.3677337458826156, + "learning_rate": 4.3288336709117246e-06, + "loss": 0.0277, + "step": 2610 + }, + { + "epoch": 1.766576454668471, + "grad_norm": 0.28741855426393353, + "learning_rate": 4.32493267573467e-06, + "loss": 0.0334, + "step": 2611 + }, + { + "epoch": 1.767253044654939, + "grad_norm": 0.30175491897174606, + "learning_rate": 4.3210320990629696e-06, + "loss": 0.0306, + "step": 2612 + }, + { + "epoch": 1.7679296346414073, + "grad_norm": 0.4459154430617385, + "learning_rate": 4.31713194331477e-06, + "loss": 0.0367, + "step": 2613 + }, + { + "epoch": 1.7686062246278755, + "grad_norm": 0.3301191373871344, + "learning_rate": 4.313232210907959e-06, + "loss": 0.0278, + "step": 2614 + }, + { + "epoch": 1.7692828146143436, + "grad_norm": 0.3423866029784998, + "learning_rate": 4.30933290426016e-06, + "loss": 0.0349, + "step": 2615 + }, + { + "epoch": 1.769959404600812, + "grad_norm": 0.2764470334730458, + "learning_rate": 4.305434025788735e-06, + "loss": 0.0279, + "step": 2616 + }, + { + "epoch": 1.77063599458728, + "grad_norm": 0.33679865197336717, + "learning_rate": 4.301535577910774e-06, + "loss": 0.0312, + "step": 2617 + }, + { + "epoch": 1.7713125845737483, + "grad_norm": 0.4093251697612486, + "learning_rate": 4.297637563043106e-06, + "loss": 0.0367, + "step": 2618 + }, + { + "epoch": 1.7719891745602165, + "grad_norm": 0.35941913077867427, + "learning_rate": 4.293739983602292e-06, + "loss": 0.0345, + "step": 2619 + }, + { + "epoch": 1.7726657645466846, + "grad_norm": 0.269367285779406, + "learning_rate": 4.28984284200462e-06, + "loss": 0.0323, + "step": 2620 + }, + { + "epoch": 1.773342354533153, + "grad_norm": 0.2676923665995716, + "learning_rate": 4.285946140666107e-06, + "loss": 0.0258, + "step": 2621 + }, + { + "epoch": 1.774018944519621, + "grad_norm": 0.24123311226995156, + "learning_rate": 4.282049882002499e-06, + "loss": 0.0234, + "step": 2622 + }, + { + "epoch": 1.7746955345060893, + "grad_norm": 0.29092215210195627, + "learning_rate": 4.278154068429268e-06, + "loss": 0.0252, + "step": 2623 + }, + { + "epoch": 1.7753721244925575, + "grad_norm": 0.2611541302096314, + "learning_rate": 4.274258702361604e-06, + "loss": 0.0301, + "step": 2624 + }, + { + "epoch": 1.7760487144790256, + "grad_norm": 0.2877502258563499, + "learning_rate": 4.270363786214427e-06, + "loss": 0.0333, + "step": 2625 + }, + { + "epoch": 1.776725304465494, + "grad_norm": 0.3961002448448291, + "learning_rate": 4.266469322402374e-06, + "loss": 0.0268, + "step": 2626 + }, + { + "epoch": 1.777401894451962, + "grad_norm": 0.38927950641745135, + "learning_rate": 4.2625753133398036e-06, + "loss": 0.0284, + "step": 2627 + }, + { + "epoch": 1.7780784844384303, + "grad_norm": 0.6194414898579439, + "learning_rate": 4.25868176144079e-06, + "loss": 0.0397, + "step": 2628 + }, + { + "epoch": 1.7787550744248986, + "grad_norm": 0.342068842517698, + "learning_rate": 4.254788669119127e-06, + "loss": 0.0307, + "step": 2629 + }, + { + "epoch": 1.7794316644113666, + "grad_norm": 0.27806995435172194, + "learning_rate": 4.250896038788324e-06, + "loss": 0.028, + "step": 2630 + }, + { + "epoch": 1.780108254397835, + "grad_norm": 0.34842401607345286, + "learning_rate": 4.247003872861598e-06, + "loss": 0.0284, + "step": 2631 + }, + { + "epoch": 1.780784844384303, + "grad_norm": 0.3546096166196037, + "learning_rate": 4.2431121737518824e-06, + "loss": 0.0253, + "step": 2632 + }, + { + "epoch": 1.7814614343707713, + "grad_norm": 0.3522986221138865, + "learning_rate": 4.239220943871823e-06, + "loss": 0.0303, + "step": 2633 + }, + { + "epoch": 1.7821380243572396, + "grad_norm": 0.30079475954802903, + "learning_rate": 4.23533018563377e-06, + "loss": 0.0221, + "step": 2634 + }, + { + "epoch": 1.7828146143437076, + "grad_norm": 0.5347889112997443, + "learning_rate": 4.231439901449788e-06, + "loss": 0.0457, + "step": 2635 + }, + { + "epoch": 1.783491204330176, + "grad_norm": 0.3461976124986228, + "learning_rate": 4.227550093731641e-06, + "loss": 0.0309, + "step": 2636 + }, + { + "epoch": 1.784167794316644, + "grad_norm": 0.27407351545678477, + "learning_rate": 4.223660764890799e-06, + "loss": 0.0254, + "step": 2637 + }, + { + "epoch": 1.7848443843031123, + "grad_norm": 0.30508100978344044, + "learning_rate": 4.2197719173384374e-06, + "loss": 0.0324, + "step": 2638 + }, + { + "epoch": 1.7855209742895806, + "grad_norm": 0.3924144964947684, + "learning_rate": 4.215883553485431e-06, + "loss": 0.0297, + "step": 2639 + }, + { + "epoch": 1.7861975642760486, + "grad_norm": 0.7441042381740874, + "learning_rate": 4.211995675742358e-06, + "loss": 0.0399, + "step": 2640 + }, + { + "epoch": 1.786874154262517, + "grad_norm": 0.2705361268974501, + "learning_rate": 4.208108286519491e-06, + "loss": 0.0293, + "step": 2641 + }, + { + "epoch": 1.787550744248985, + "grad_norm": 0.30342958290381394, + "learning_rate": 4.204221388226803e-06, + "loss": 0.0331, + "step": 2642 + }, + { + "epoch": 1.7882273342354533, + "grad_norm": 0.26675480220215986, + "learning_rate": 4.2003349832739624e-06, + "loss": 0.0314, + "step": 2643 + }, + { + "epoch": 1.7889039242219216, + "grad_norm": 0.3637247014588533, + "learning_rate": 4.196449074070329e-06, + "loss": 0.0275, + "step": 2644 + }, + { + "epoch": 1.7895805142083896, + "grad_norm": 0.3042441928119871, + "learning_rate": 4.1925636630249565e-06, + "loss": 0.0357, + "step": 2645 + }, + { + "epoch": 1.790257104194858, + "grad_norm": 0.29898440201684634, + "learning_rate": 4.1886787525465914e-06, + "loss": 0.0277, + "step": 2646 + }, + { + "epoch": 1.790933694181326, + "grad_norm": 0.3115332185043352, + "learning_rate": 4.184794345043668e-06, + "loss": 0.0288, + "step": 2647 + }, + { + "epoch": 1.7916102841677943, + "grad_norm": 0.3192481722771207, + "learning_rate": 4.180910442924312e-06, + "loss": 0.032, + "step": 2648 + }, + { + "epoch": 1.7922868741542626, + "grad_norm": 0.35266106657956, + "learning_rate": 4.17702704859633e-06, + "loss": 0.0323, + "step": 2649 + }, + { + "epoch": 1.7929634641407306, + "grad_norm": 0.2733674404274185, + "learning_rate": 4.173144164467221e-06, + "loss": 0.0311, + "step": 2650 + }, + { + "epoch": 1.793640054127199, + "grad_norm": 0.3901746916827659, + "learning_rate": 4.169261792944161e-06, + "loss": 0.0256, + "step": 2651 + }, + { + "epoch": 1.794316644113667, + "grad_norm": 0.4484340132928396, + "learning_rate": 4.165379936434011e-06, + "loss": 0.0371, + "step": 2652 + }, + { + "epoch": 1.7949932341001353, + "grad_norm": 0.27622663319068086, + "learning_rate": 4.161498597343313e-06, + "loss": 0.026, + "step": 2653 + }, + { + "epoch": 1.7956698240866036, + "grad_norm": 0.3073889438853535, + "learning_rate": 4.15761777807829e-06, + "loss": 0.0248, + "step": 2654 + }, + { + "epoch": 1.7963464140730716, + "grad_norm": 0.4537979707068046, + "learning_rate": 4.153737481044838e-06, + "loss": 0.0308, + "step": 2655 + }, + { + "epoch": 1.79702300405954, + "grad_norm": 0.2937261921157818, + "learning_rate": 4.149857708648536e-06, + "loss": 0.0251, + "step": 2656 + }, + { + "epoch": 1.797699594046008, + "grad_norm": 0.3435845967720961, + "learning_rate": 4.1459784632946295e-06, + "loss": 0.0296, + "step": 2657 + }, + { + "epoch": 1.7983761840324763, + "grad_norm": 0.22005209060832764, + "learning_rate": 4.142099747388042e-06, + "loss": 0.0198, + "step": 2658 + }, + { + "epoch": 1.7990527740189446, + "grad_norm": 0.31058586661156673, + "learning_rate": 4.138221563333371e-06, + "loss": 0.0331, + "step": 2659 + }, + { + "epoch": 1.7997293640054126, + "grad_norm": 0.37392366255520676, + "learning_rate": 4.134343913534879e-06, + "loss": 0.0329, + "step": 2660 + }, + { + "epoch": 1.800405953991881, + "grad_norm": 0.3481303544278709, + "learning_rate": 4.1304668003965016e-06, + "loss": 0.027, + "step": 2661 + }, + { + "epoch": 1.801082543978349, + "grad_norm": 0.32759244786674374, + "learning_rate": 4.126590226321838e-06, + "loss": 0.0313, + "step": 2662 + }, + { + "epoch": 1.8017591339648173, + "grad_norm": 0.32188468006032117, + "learning_rate": 4.12271419371416e-06, + "loss": 0.0284, + "step": 2663 + }, + { + "epoch": 1.8024357239512856, + "grad_norm": 0.24912883260939556, + "learning_rate": 4.118838704976392e-06, + "loss": 0.0196, + "step": 2664 + }, + { + "epoch": 1.8031123139377536, + "grad_norm": 0.3189010629371992, + "learning_rate": 4.114963762511134e-06, + "loss": 0.0365, + "step": 2665 + }, + { + "epoch": 1.803788903924222, + "grad_norm": 0.43688902084727205, + "learning_rate": 4.111089368720635e-06, + "loss": 0.0452, + "step": 2666 + }, + { + "epoch": 1.80446549391069, + "grad_norm": 0.4638381936567443, + "learning_rate": 4.107215526006818e-06, + "loss": 0.0571, + "step": 2667 + }, + { + "epoch": 1.8051420838971584, + "grad_norm": 0.4125074637965231, + "learning_rate": 4.10334223677125e-06, + "loss": 0.0309, + "step": 2668 + }, + { + "epoch": 1.8058186738836266, + "grad_norm": 0.32727486615921647, + "learning_rate": 4.099469503415167e-06, + "loss": 0.0432, + "step": 2669 + }, + { + "epoch": 1.8064952638700946, + "grad_norm": 0.3548546321761083, + "learning_rate": 4.0955973283394525e-06, + "loss": 0.0279, + "step": 2670 + }, + { + "epoch": 1.8071718538565629, + "grad_norm": 0.5084283120306401, + "learning_rate": 4.091725713944644e-06, + "loss": 0.0303, + "step": 2671 + }, + { + "epoch": 1.8078484438430311, + "grad_norm": 0.35906172546652476, + "learning_rate": 4.087854662630937e-06, + "loss": 0.0318, + "step": 2672 + }, + { + "epoch": 1.8085250338294994, + "grad_norm": 0.3569372561861631, + "learning_rate": 4.083984176798175e-06, + "loss": 0.0356, + "step": 2673 + }, + { + "epoch": 1.8092016238159676, + "grad_norm": 0.33727519351352203, + "learning_rate": 4.080114258845846e-06, + "loss": 0.0349, + "step": 2674 + }, + { + "epoch": 1.8098782138024356, + "grad_norm": 0.3902204599412901, + "learning_rate": 4.076244911173097e-06, + "loss": 0.0229, + "step": 2675 + }, + { + "epoch": 1.8105548037889039, + "grad_norm": 0.34510078431419167, + "learning_rate": 4.072376136178712e-06, + "loss": 0.0307, + "step": 2676 + }, + { + "epoch": 1.8112313937753721, + "grad_norm": 0.5011029538347361, + "learning_rate": 4.06850793626112e-06, + "loss": 0.0421, + "step": 2677 + }, + { + "epoch": 1.8119079837618404, + "grad_norm": 0.28226630423543714, + "learning_rate": 4.064640313818401e-06, + "loss": 0.028, + "step": 2678 + }, + { + "epoch": 1.8125845737483086, + "grad_norm": 0.345911021189506, + "learning_rate": 4.06077327124827e-06, + "loss": 0.0375, + "step": 2679 + }, + { + "epoch": 1.8132611637347766, + "grad_norm": 0.24967775445369783, + "learning_rate": 4.056906810948086e-06, + "loss": 0.022, + "step": 2680 + }, + { + "epoch": 1.8139377537212449, + "grad_norm": 0.36561200139382116, + "learning_rate": 4.053040935314845e-06, + "loss": 0.0326, + "step": 2681 + }, + { + "epoch": 1.8146143437077131, + "grad_norm": 0.44180192512844846, + "learning_rate": 4.049175646745182e-06, + "loss": 0.0397, + "step": 2682 + }, + { + "epoch": 1.8152909336941814, + "grad_norm": 0.45165061132232887, + "learning_rate": 4.045310947635369e-06, + "loss": 0.0323, + "step": 2683 + }, + { + "epoch": 1.8159675236806496, + "grad_norm": 0.27801313243001463, + "learning_rate": 4.041446840381309e-06, + "loss": 0.031, + "step": 2684 + }, + { + "epoch": 1.8166441136671176, + "grad_norm": 0.3419275007101939, + "learning_rate": 4.03758332737854e-06, + "loss": 0.026, + "step": 2685 + }, + { + "epoch": 1.817320703653586, + "grad_norm": 0.3191810698329368, + "learning_rate": 4.033720411022235e-06, + "loss": 0.0302, + "step": 2686 + }, + { + "epoch": 1.8179972936400541, + "grad_norm": 0.37863581732633167, + "learning_rate": 4.02985809370719e-06, + "loss": 0.0316, + "step": 2687 + }, + { + "epoch": 1.8186738836265224, + "grad_norm": 0.4903037856694656, + "learning_rate": 4.025996377827836e-06, + "loss": 0.0261, + "step": 2688 + }, + { + "epoch": 1.8193504736129906, + "grad_norm": 0.33236139853862945, + "learning_rate": 4.022135265778226e-06, + "loss": 0.025, + "step": 2689 + }, + { + "epoch": 1.8200270635994586, + "grad_norm": 0.36896052672555035, + "learning_rate": 4.018274759952047e-06, + "loss": 0.0312, + "step": 2690 + }, + { + "epoch": 1.820703653585927, + "grad_norm": 0.38812391088835496, + "learning_rate": 4.0144148627426e-06, + "loss": 0.036, + "step": 2691 + }, + { + "epoch": 1.8213802435723951, + "grad_norm": 0.415530489904815, + "learning_rate": 4.010555576542812e-06, + "loss": 0.03, + "step": 2692 + }, + { + "epoch": 1.8220568335588632, + "grad_norm": 0.29735609643700006, + "learning_rate": 4.006696903745236e-06, + "loss": 0.0307, + "step": 2693 + }, + { + "epoch": 1.8227334235453316, + "grad_norm": 0.3116141150028271, + "learning_rate": 4.002838846742039e-06, + "loss": 0.0264, + "step": 2694 + }, + { + "epoch": 1.8234100135317997, + "grad_norm": 0.3288701080055487, + "learning_rate": 3.998981407925009e-06, + "loss": 0.0287, + "step": 2695 + }, + { + "epoch": 1.824086603518268, + "grad_norm": 0.30263041578993677, + "learning_rate": 3.995124589685552e-06, + "loss": 0.0308, + "step": 2696 + }, + { + "epoch": 1.8247631935047361, + "grad_norm": 0.286767356415873, + "learning_rate": 3.991268394414685e-06, + "loss": 0.0294, + "step": 2697 + }, + { + "epoch": 1.8254397834912042, + "grad_norm": 0.39894473368693867, + "learning_rate": 3.987412824503041e-06, + "loss": 0.037, + "step": 2698 + }, + { + "epoch": 1.8261163734776726, + "grad_norm": 0.3609337918371107, + "learning_rate": 3.983557882340866e-06, + "loss": 0.0274, + "step": 2699 + }, + { + "epoch": 1.8267929634641407, + "grad_norm": 0.2664186103399177, + "learning_rate": 3.979703570318017e-06, + "loss": 0.0217, + "step": 2700 + }, + { + "epoch": 1.827469553450609, + "grad_norm": 0.29757603731982274, + "learning_rate": 3.97584989082396e-06, + "loss": 0.0251, + "step": 2701 + }, + { + "epoch": 1.8281461434370772, + "grad_norm": 0.26038998187107976, + "learning_rate": 3.971996846247767e-06, + "loss": 0.0247, + "step": 2702 + }, + { + "epoch": 1.8288227334235452, + "grad_norm": 0.28279378785952597, + "learning_rate": 3.968144438978121e-06, + "loss": 0.0324, + "step": 2703 + }, + { + "epoch": 1.8294993234100136, + "grad_norm": 0.24411594025384967, + "learning_rate": 3.964292671403303e-06, + "loss": 0.0197, + "step": 2704 + }, + { + "epoch": 1.8301759133964817, + "grad_norm": 0.2704535430184035, + "learning_rate": 3.960441545911205e-06, + "loss": 0.0287, + "step": 2705 + }, + { + "epoch": 1.83085250338295, + "grad_norm": 0.39949011410609037, + "learning_rate": 3.956591064889313e-06, + "loss": 0.035, + "step": 2706 + }, + { + "epoch": 1.8315290933694182, + "grad_norm": 0.2856086640643192, + "learning_rate": 3.952741230724721e-06, + "loss": 0.0268, + "step": 2707 + }, + { + "epoch": 1.8322056833558862, + "grad_norm": 0.5079320070658421, + "learning_rate": 3.948892045804117e-06, + "loss": 0.036, + "step": 2708 + }, + { + "epoch": 1.8328822733423547, + "grad_norm": 0.4322514043741924, + "learning_rate": 3.94504351251379e-06, + "loss": 0.0265, + "step": 2709 + }, + { + "epoch": 1.8335588633288227, + "grad_norm": 0.35313776553967785, + "learning_rate": 3.9411956332396224e-06, + "loss": 0.0194, + "step": 2710 + }, + { + "epoch": 1.834235453315291, + "grad_norm": 0.8542069999554189, + "learning_rate": 3.937348410367091e-06, + "loss": 0.05, + "step": 2711 + }, + { + "epoch": 1.8349120433017592, + "grad_norm": 0.5028097463837327, + "learning_rate": 3.9335018462812664e-06, + "loss": 0.0432, + "step": 2712 + }, + { + "epoch": 1.8355886332882272, + "grad_norm": 0.5854402146375531, + "learning_rate": 3.929655943366812e-06, + "loss": 0.0277, + "step": 2713 + }, + { + "epoch": 1.8362652232746957, + "grad_norm": 0.2546232565257687, + "learning_rate": 3.92581070400798e-06, + "loss": 0.0238, + "step": 2714 + }, + { + "epoch": 1.8369418132611637, + "grad_norm": 0.35642062171429384, + "learning_rate": 3.921966130588612e-06, + "loss": 0.0276, + "step": 2715 + }, + { + "epoch": 1.837618403247632, + "grad_norm": 0.29038713260225246, + "learning_rate": 3.918122225492139e-06, + "loss": 0.0255, + "step": 2716 + }, + { + "epoch": 1.8382949932341002, + "grad_norm": 0.37323932909437657, + "learning_rate": 3.914278991101568e-06, + "loss": 0.0422, + "step": 2717 + }, + { + "epoch": 1.8389715832205682, + "grad_norm": 0.31474906581008993, + "learning_rate": 3.910436429799503e-06, + "loss": 0.0232, + "step": 2718 + }, + { + "epoch": 1.8396481732070367, + "grad_norm": 0.2768259932365074, + "learning_rate": 3.906594543968122e-06, + "loss": 0.0231, + "step": 2719 + }, + { + "epoch": 1.8403247631935047, + "grad_norm": 0.3194073338438866, + "learning_rate": 3.902753335989188e-06, + "loss": 0.026, + "step": 2720 + }, + { + "epoch": 1.841001353179973, + "grad_norm": 0.34944793044078193, + "learning_rate": 3.898912808244043e-06, + "loss": 0.0255, + "step": 2721 + }, + { + "epoch": 1.8416779431664412, + "grad_norm": 0.39429096759598886, + "learning_rate": 3.895072963113607e-06, + "loss": 0.0378, + "step": 2722 + }, + { + "epoch": 1.8423545331529092, + "grad_norm": 0.27068499815451585, + "learning_rate": 3.89123380297838e-06, + "loss": 0.0221, + "step": 2723 + }, + { + "epoch": 1.8430311231393777, + "grad_norm": 0.2915361354998442, + "learning_rate": 3.887395330218429e-06, + "loss": 0.032, + "step": 2724 + }, + { + "epoch": 1.8437077131258457, + "grad_norm": 0.2229913450537913, + "learning_rate": 3.883557547213404e-06, + "loss": 0.0243, + "step": 2725 + }, + { + "epoch": 1.844384303112314, + "grad_norm": 0.3006017740086933, + "learning_rate": 3.8797204563425215e-06, + "loss": 0.026, + "step": 2726 + }, + { + "epoch": 1.8450608930987822, + "grad_norm": 0.40672761974402877, + "learning_rate": 3.875884059984571e-06, + "loss": 0.0395, + "step": 2727 + }, + { + "epoch": 1.8457374830852502, + "grad_norm": 0.5611133117859646, + "learning_rate": 3.872048360517914e-06, + "loss": 0.032, + "step": 2728 + }, + { + "epoch": 1.8464140730717187, + "grad_norm": 0.30084265326422077, + "learning_rate": 3.868213360320474e-06, + "loss": 0.029, + "step": 2729 + }, + { + "epoch": 1.8470906630581867, + "grad_norm": 0.29063238857425405, + "learning_rate": 3.864379061769749e-06, + "loss": 0.0272, + "step": 2730 + }, + { + "epoch": 1.847767253044655, + "grad_norm": 0.3234239508261535, + "learning_rate": 3.860545467242793e-06, + "loss": 0.0295, + "step": 2731 + }, + { + "epoch": 1.8484438430311232, + "grad_norm": 0.2660123733234617, + "learning_rate": 3.856712579116229e-06, + "loss": 0.0278, + "step": 2732 + }, + { + "epoch": 1.8491204330175912, + "grad_norm": 0.2748480970316418, + "learning_rate": 3.852880399766243e-06, + "loss": 0.026, + "step": 2733 + }, + { + "epoch": 1.8497970230040597, + "grad_norm": 0.4317617381556935, + "learning_rate": 3.8490489315685764e-06, + "loss": 0.0266, + "step": 2734 + }, + { + "epoch": 1.8504736129905277, + "grad_norm": 0.256923637429559, + "learning_rate": 3.845218176898537e-06, + "loss": 0.0221, + "step": 2735 + }, + { + "epoch": 1.851150202976996, + "grad_norm": 0.312933981334049, + "learning_rate": 3.8413881381309845e-06, + "loss": 0.0334, + "step": 2736 + }, + { + "epoch": 1.8518267929634642, + "grad_norm": 0.33124520546911174, + "learning_rate": 3.837558817640334e-06, + "loss": 0.0262, + "step": 2737 + }, + { + "epoch": 1.8525033829499322, + "grad_norm": 0.3169719756976648, + "learning_rate": 3.8337302178005605e-06, + "loss": 0.0314, + "step": 2738 + }, + { + "epoch": 1.8531799729364007, + "grad_norm": 0.316587761312504, + "learning_rate": 3.829902340985189e-06, + "loss": 0.0308, + "step": 2739 + }, + { + "epoch": 1.8538565629228687, + "grad_norm": 0.42645154956655024, + "learning_rate": 3.826075189567296e-06, + "loss": 0.0412, + "step": 2740 + }, + { + "epoch": 1.854533152909337, + "grad_norm": 0.37969987986458503, + "learning_rate": 3.82224876591951e-06, + "loss": 0.0426, + "step": 2741 + }, + { + "epoch": 1.8552097428958052, + "grad_norm": 0.36015108092892856, + "learning_rate": 3.818423072414007e-06, + "loss": 0.0285, + "step": 2742 + }, + { + "epoch": 1.8558863328822732, + "grad_norm": 0.4391809591592842, + "learning_rate": 3.8145981114225135e-06, + "loss": 0.0357, + "step": 2743 + }, + { + "epoch": 1.8565629228687417, + "grad_norm": 0.4010653251507355, + "learning_rate": 3.8107738853162953e-06, + "loss": 0.0308, + "step": 2744 + }, + { + "epoch": 1.8572395128552097, + "grad_norm": 0.3286902490342025, + "learning_rate": 3.8069503964661656e-06, + "loss": 0.0282, + "step": 2745 + }, + { + "epoch": 1.857916102841678, + "grad_norm": 0.26128621160738486, + "learning_rate": 3.803127647242486e-06, + "loss": 0.025, + "step": 2746 + }, + { + "epoch": 1.8585926928281462, + "grad_norm": 0.339093432402349, + "learning_rate": 3.7993056400151516e-06, + "loss": 0.0307, + "step": 2747 + }, + { + "epoch": 1.8592692828146142, + "grad_norm": 0.38196393377361754, + "learning_rate": 3.795484377153601e-06, + "loss": 0.0323, + "step": 2748 + }, + { + "epoch": 1.8599458728010827, + "grad_norm": 0.35884647469612424, + "learning_rate": 3.791663861026814e-06, + "loss": 0.0308, + "step": 2749 + }, + { + "epoch": 1.8606224627875507, + "grad_norm": 0.305251079044217, + "learning_rate": 3.787844094003302e-06, + "loss": 0.0261, + "step": 2750 + }, + { + "epoch": 1.861299052774019, + "grad_norm": 0.41917420647087306, + "learning_rate": 3.7840250784511147e-06, + "loss": 0.0346, + "step": 2751 + }, + { + "epoch": 1.8619756427604872, + "grad_norm": 0.3114860675137204, + "learning_rate": 3.780206816737837e-06, + "loss": 0.0293, + "step": 2752 + }, + { + "epoch": 1.8626522327469552, + "grad_norm": 0.3543280815249511, + "learning_rate": 3.776389311230584e-06, + "loss": 0.0384, + "step": 2753 + }, + { + "epoch": 1.8633288227334237, + "grad_norm": 0.37800835051382997, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.0351, + "step": 2754 + }, + { + "epoch": 1.8640054127198917, + "grad_norm": 0.3372101682646651, + "learning_rate": 3.7687565783002754e-06, + "loss": 0.0251, + "step": 2755 + }, + { + "epoch": 1.86468200270636, + "grad_norm": 0.2936300964126187, + "learning_rate": 3.7649413556091047e-06, + "loss": 0.0255, + "step": 2756 + }, + { + "epoch": 1.8653585926928282, + "grad_norm": 0.35035127791984333, + "learning_rate": 3.7611268985877213e-06, + "loss": 0.0296, + "step": 2757 + }, + { + "epoch": 1.8660351826792962, + "grad_norm": 0.28936593925166243, + "learning_rate": 3.7573132096008843e-06, + "loss": 0.0287, + "step": 2758 + }, + { + "epoch": 1.8667117726657647, + "grad_norm": 0.23329604171618087, + "learning_rate": 3.753500291012874e-06, + "loss": 0.029, + "step": 2759 + }, + { + "epoch": 1.8673883626522327, + "grad_norm": 0.8916747746121758, + "learning_rate": 3.749688145187497e-06, + "loss": 0.0318, + "step": 2760 + }, + { + "epoch": 1.868064952638701, + "grad_norm": 0.5408879210119105, + "learning_rate": 3.7458767744880763e-06, + "loss": 0.0365, + "step": 2761 + }, + { + "epoch": 1.8687415426251692, + "grad_norm": 0.27138970699536663, + "learning_rate": 3.7420661812774577e-06, + "loss": 0.0247, + "step": 2762 + }, + { + "epoch": 1.8694181326116373, + "grad_norm": 0.2976193614642239, + "learning_rate": 3.738256367918004e-06, + "loss": 0.0287, + "step": 2763 + }, + { + "epoch": 1.8700947225981055, + "grad_norm": 0.373410928693832, + "learning_rate": 3.734447336771591e-06, + "loss": 0.0387, + "step": 2764 + }, + { + "epoch": 1.8707713125845737, + "grad_norm": 0.31411327834361735, + "learning_rate": 3.730639090199616e-06, + "loss": 0.0335, + "step": 2765 + }, + { + "epoch": 1.871447902571042, + "grad_norm": 0.33230149170239837, + "learning_rate": 3.7268316305629836e-06, + "loss": 0.0302, + "step": 2766 + }, + { + "epoch": 1.8721244925575102, + "grad_norm": 0.33934420407494387, + "learning_rate": 3.7230249602221163e-06, + "loss": 0.0309, + "step": 2767 + }, + { + "epoch": 1.8728010825439783, + "grad_norm": 0.29769907147717395, + "learning_rate": 3.719219081536942e-06, + "loss": 0.0259, + "step": 2768 + }, + { + "epoch": 1.8734776725304465, + "grad_norm": 0.37091430711250334, + "learning_rate": 3.7154139968669043e-06, + "loss": 0.0359, + "step": 2769 + }, + { + "epoch": 1.8741542625169147, + "grad_norm": 0.35193881203090227, + "learning_rate": 3.711609708570948e-06, + "loss": 0.0302, + "step": 2770 + }, + { + "epoch": 1.874830852503383, + "grad_norm": 0.3674154203085046, + "learning_rate": 3.7078062190075264e-06, + "loss": 0.0277, + "step": 2771 + }, + { + "epoch": 1.8755074424898512, + "grad_norm": 0.3715442409426697, + "learning_rate": 3.704003530534597e-06, + "loss": 0.0263, + "step": 2772 + }, + { + "epoch": 1.8761840324763193, + "grad_norm": 0.33092732279836434, + "learning_rate": 3.7002016455096247e-06, + "loss": 0.0361, + "step": 2773 + }, + { + "epoch": 1.8768606224627875, + "grad_norm": 0.23898592217454684, + "learning_rate": 3.696400566289571e-06, + "loss": 0.0223, + "step": 2774 + }, + { + "epoch": 1.8775372124492558, + "grad_norm": 0.22496138181328645, + "learning_rate": 3.6926002952309015e-06, + "loss": 0.0219, + "step": 2775 + }, + { + "epoch": 1.878213802435724, + "grad_norm": 0.27929651919297416, + "learning_rate": 3.6888008346895797e-06, + "loss": 0.0251, + "step": 2776 + }, + { + "epoch": 1.8788903924221922, + "grad_norm": 0.33097525483542106, + "learning_rate": 3.685002187021064e-06, + "loss": 0.0232, + "step": 2777 + }, + { + "epoch": 1.8795669824086603, + "grad_norm": 0.45521315284837427, + "learning_rate": 3.681204354580313e-06, + "loss": 0.031, + "step": 2778 + }, + { + "epoch": 1.8802435723951285, + "grad_norm": 0.3521683025846162, + "learning_rate": 3.6774073397217786e-06, + "loss": 0.034, + "step": 2779 + }, + { + "epoch": 1.8809201623815968, + "grad_norm": 0.29050614400709096, + "learning_rate": 3.6736111447994026e-06, + "loss": 0.0195, + "step": 2780 + }, + { + "epoch": 1.881596752368065, + "grad_norm": 0.2268436847521333, + "learning_rate": 3.669815772166625e-06, + "loss": 0.0196, + "step": 2781 + }, + { + "epoch": 1.8822733423545333, + "grad_norm": 0.32915281484908304, + "learning_rate": 3.6660212241763692e-06, + "loss": 0.0279, + "step": 2782 + }, + { + "epoch": 1.8829499323410013, + "grad_norm": 0.34819830031525706, + "learning_rate": 3.662227503181054e-06, + "loss": 0.0357, + "step": 2783 + }, + { + "epoch": 1.8836265223274695, + "grad_norm": 0.5626748289559241, + "learning_rate": 3.658434611532578e-06, + "loss": 0.0392, + "step": 2784 + }, + { + "epoch": 1.8843031123139378, + "grad_norm": 0.2884633592876496, + "learning_rate": 3.65464255158233e-06, + "loss": 0.0256, + "step": 2785 + }, + { + "epoch": 1.8849797023004058, + "grad_norm": 0.20175821768168112, + "learning_rate": 3.6508513256811856e-06, + "loss": 0.0187, + "step": 2786 + }, + { + "epoch": 1.8856562922868743, + "grad_norm": 0.31282576530748646, + "learning_rate": 3.6470609361794972e-06, + "loss": 0.0255, + "step": 2787 + }, + { + "epoch": 1.8863328822733423, + "grad_norm": 0.29362490198931823, + "learning_rate": 3.643271385427105e-06, + "loss": 0.0234, + "step": 2788 + }, + { + "epoch": 1.8870094722598105, + "grad_norm": 0.37555876152309803, + "learning_rate": 3.639482675773324e-06, + "loss": 0.0352, + "step": 2789 + }, + { + "epoch": 1.8876860622462788, + "grad_norm": 0.27726655452882504, + "learning_rate": 3.635694809566954e-06, + "loss": 0.0273, + "step": 2790 + }, + { + "epoch": 1.8883626522327468, + "grad_norm": 0.335796705296422, + "learning_rate": 3.6319077891562616e-06, + "loss": 0.0294, + "step": 2791 + }, + { + "epoch": 1.8890392422192153, + "grad_norm": 0.4186741035926321, + "learning_rate": 3.6281216168889993e-06, + "loss": 0.0417, + "step": 2792 + }, + { + "epoch": 1.8897158322056833, + "grad_norm": 0.2734011889483481, + "learning_rate": 3.624336295112388e-06, + "loss": 0.0249, + "step": 2793 + }, + { + "epoch": 1.8903924221921515, + "grad_norm": 0.2893345296623684, + "learning_rate": 3.6205518261731247e-06, + "loss": 0.0318, + "step": 2794 + }, + { + "epoch": 1.8910690121786198, + "grad_norm": 0.2862134610686118, + "learning_rate": 3.616768212417375e-06, + "loss": 0.0244, + "step": 2795 + }, + { + "epoch": 1.8917456021650878, + "grad_norm": 0.3173863870232804, + "learning_rate": 3.6129854561907786e-06, + "loss": 0.025, + "step": 2796 + }, + { + "epoch": 1.8924221921515563, + "grad_norm": 0.43928982539968053, + "learning_rate": 3.6092035598384356e-06, + "loss": 0.0317, + "step": 2797 + }, + { + "epoch": 1.8930987821380243, + "grad_norm": 0.43430236350593565, + "learning_rate": 3.6054225257049204e-06, + "loss": 0.0355, + "step": 2798 + }, + { + "epoch": 1.8937753721244925, + "grad_norm": 0.3224927637106134, + "learning_rate": 3.6016423561342707e-06, + "loss": 0.0339, + "step": 2799 + }, + { + "epoch": 1.8944519621109608, + "grad_norm": 0.38319398271183497, + "learning_rate": 3.5978630534699873e-06, + "loss": 0.0316, + "step": 2800 + }, + { + "epoch": 1.8951285520974288, + "grad_norm": 0.2640394799838839, + "learning_rate": 3.5940846200550327e-06, + "loss": 0.024, + "step": 2801 + }, + { + "epoch": 1.8958051420838973, + "grad_norm": 0.547423435207649, + "learning_rate": 3.5903070582318356e-06, + "loss": 0.0406, + "step": 2802 + }, + { + "epoch": 1.8964817320703653, + "grad_norm": 0.2993570275064508, + "learning_rate": 3.5865303703422794e-06, + "loss": 0.0255, + "step": 2803 + }, + { + "epoch": 1.8971583220568335, + "grad_norm": 0.6498621705762274, + "learning_rate": 3.5827545587277033e-06, + "loss": 0.0463, + "step": 2804 + }, + { + "epoch": 1.8978349120433018, + "grad_norm": 0.28677708356869375, + "learning_rate": 3.5789796257289117e-06, + "loss": 0.0268, + "step": 2805 + }, + { + "epoch": 1.8985115020297698, + "grad_norm": 0.3740110206219344, + "learning_rate": 3.5752055736861567e-06, + "loss": 0.0399, + "step": 2806 + }, + { + "epoch": 1.8991880920162383, + "grad_norm": 0.36112072535398315, + "learning_rate": 3.571432404939149e-06, + "loss": 0.0253, + "step": 2807 + }, + { + "epoch": 1.8998646820027063, + "grad_norm": 0.33223983356346587, + "learning_rate": 3.567660121827048e-06, + "loss": 0.0308, + "step": 2808 + }, + { + "epoch": 1.9005412719891746, + "grad_norm": 0.33198553490346355, + "learning_rate": 3.5638887266884682e-06, + "loss": 0.0261, + "step": 2809 + }, + { + "epoch": 1.9012178619756428, + "grad_norm": 0.3097804507949422, + "learning_rate": 3.5601182218614706e-06, + "loss": 0.0275, + "step": 2810 + }, + { + "epoch": 1.9018944519621108, + "grad_norm": 0.24213424466380748, + "learning_rate": 3.5563486096835643e-06, + "loss": 0.0221, + "step": 2811 + }, + { + "epoch": 1.9025710419485793, + "grad_norm": 0.4353504298756395, + "learning_rate": 3.552579892491704e-06, + "loss": 0.0289, + "step": 2812 + }, + { + "epoch": 1.9032476319350473, + "grad_norm": 0.5192967445604084, + "learning_rate": 3.548812072622294e-06, + "loss": 0.0276, + "step": 2813 + }, + { + "epoch": 1.9039242219215156, + "grad_norm": 0.4643938534061608, + "learning_rate": 3.545045152411178e-06, + "loss": 0.0385, + "step": 2814 + }, + { + "epoch": 1.9046008119079838, + "grad_norm": 0.2651575826675033, + "learning_rate": 3.5412791341936446e-06, + "loss": 0.0208, + "step": 2815 + }, + { + "epoch": 1.9052774018944518, + "grad_norm": 0.35986295997323603, + "learning_rate": 3.5375140203044233e-06, + "loss": 0.0269, + "step": 2816 + }, + { + "epoch": 1.9059539918809203, + "grad_norm": 0.25351053252441075, + "learning_rate": 3.533749813077677e-06, + "loss": 0.0236, + "step": 2817 + }, + { + "epoch": 1.9066305818673883, + "grad_norm": 0.255312732213536, + "learning_rate": 3.5299865148470157e-06, + "loss": 0.0263, + "step": 2818 + }, + { + "epoch": 1.9073071718538566, + "grad_norm": 0.31432772507999474, + "learning_rate": 3.526224127945479e-06, + "loss": 0.0349, + "step": 2819 + }, + { + "epoch": 1.9079837618403248, + "grad_norm": 0.3648883357223518, + "learning_rate": 3.5224626547055463e-06, + "loss": 0.031, + "step": 2820 + }, + { + "epoch": 1.9086603518267928, + "grad_norm": 0.2595013606780327, + "learning_rate": 3.518702097459126e-06, + "loss": 0.0321, + "step": 2821 + }, + { + "epoch": 1.9093369418132613, + "grad_norm": 0.33185174904071235, + "learning_rate": 3.5149424585375623e-06, + "loss": 0.0346, + "step": 2822 + }, + { + "epoch": 1.9100135317997293, + "grad_norm": 0.28682765527430787, + "learning_rate": 3.5111837402716297e-06, + "loss": 0.0303, + "step": 2823 + }, + { + "epoch": 1.9106901217861976, + "grad_norm": 0.3847310611875437, + "learning_rate": 3.507425944991529e-06, + "loss": 0.0251, + "step": 2824 + }, + { + "epoch": 1.9113667117726658, + "grad_norm": 0.3072857451242148, + "learning_rate": 3.5036690750268897e-06, + "loss": 0.0279, + "step": 2825 + }, + { + "epoch": 1.9120433017591338, + "grad_norm": 0.23586619817907892, + "learning_rate": 3.499913132706771e-06, + "loss": 0.0209, + "step": 2826 + }, + { + "epoch": 1.9127198917456023, + "grad_norm": 0.32330068896389064, + "learning_rate": 3.496158120359653e-06, + "loss": 0.0332, + "step": 2827 + }, + { + "epoch": 1.9133964817320703, + "grad_norm": 0.2751689962619378, + "learning_rate": 3.492404040313443e-06, + "loss": 0.0323, + "step": 2828 + }, + { + "epoch": 1.9140730717185386, + "grad_norm": 0.38804992328882687, + "learning_rate": 3.4886508948954656e-06, + "loss": 0.0273, + "step": 2829 + }, + { + "epoch": 1.9147496617050068, + "grad_norm": 0.24126696684207583, + "learning_rate": 3.484898686432473e-06, + "loss": 0.0221, + "step": 2830 + }, + { + "epoch": 1.9154262516914748, + "grad_norm": 0.4072994040840315, + "learning_rate": 3.4811474172506277e-06, + "loss": 0.0323, + "step": 2831 + }, + { + "epoch": 1.9161028416779433, + "grad_norm": 0.30978967865564594, + "learning_rate": 3.4773970896755167e-06, + "loss": 0.03, + "step": 2832 + }, + { + "epoch": 1.9167794316644113, + "grad_norm": 0.3871956545025896, + "learning_rate": 3.4736477060321387e-06, + "loss": 0.0334, + "step": 2833 + }, + { + "epoch": 1.9174560216508796, + "grad_norm": 0.32370998680942925, + "learning_rate": 3.469899268644913e-06, + "loss": 0.0262, + "step": 2834 + }, + { + "epoch": 1.9181326116373478, + "grad_norm": 0.28469184318348056, + "learning_rate": 3.466151779837665e-06, + "loss": 0.0309, + "step": 2835 + }, + { + "epoch": 1.9188092016238159, + "grad_norm": 0.2795111004603757, + "learning_rate": 3.4624052419336395e-06, + "loss": 0.0249, + "step": 2836 + }, + { + "epoch": 1.9194857916102843, + "grad_norm": 0.2683034552931114, + "learning_rate": 3.458659657255486e-06, + "loss": 0.0219, + "step": 2837 + }, + { + "epoch": 1.9201623815967523, + "grad_norm": 0.256257738319039, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.0277, + "step": 2838 + }, + { + "epoch": 1.9208389715832206, + "grad_norm": 0.33001994167723386, + "learning_rate": 3.4511713568644432e-06, + "loss": 0.0242, + "step": 2839 + }, + { + "epoch": 1.9215155615696888, + "grad_norm": 0.3366863569402713, + "learning_rate": 3.4474286457938976e-06, + "loss": 0.0329, + "step": 2840 + }, + { + "epoch": 1.9221921515561569, + "grad_norm": 0.3280153443728017, + "learning_rate": 3.4436868972339073e-06, + "loss": 0.0281, + "step": 2841 + }, + { + "epoch": 1.9228687415426253, + "grad_norm": 0.3350255705132479, + "learning_rate": 3.4399461135041525e-06, + "loss": 0.0202, + "step": 2842 + }, + { + "epoch": 1.9235453315290933, + "grad_norm": 0.29081080464471826, + "learning_rate": 3.4362062969237227e-06, + "loss": 0.0235, + "step": 2843 + }, + { + "epoch": 1.9242219215155616, + "grad_norm": 0.2621212734470656, + "learning_rate": 3.4324674498110956e-06, + "loss": 0.0235, + "step": 2844 + }, + { + "epoch": 1.9248985115020298, + "grad_norm": 0.3380242772080527, + "learning_rate": 3.4287295744841588e-06, + "loss": 0.0262, + "step": 2845 + }, + { + "epoch": 1.9255751014884979, + "grad_norm": 0.43964945789010224, + "learning_rate": 3.4249926732601914e-06, + "loss": 0.0238, + "step": 2846 + }, + { + "epoch": 1.9262516914749663, + "grad_norm": 0.2710127173580096, + "learning_rate": 3.4212567484558735e-06, + "loss": 0.0263, + "step": 2847 + }, + { + "epoch": 1.9269282814614344, + "grad_norm": 0.26758662188394866, + "learning_rate": 3.4175218023872753e-06, + "loss": 0.0287, + "step": 2848 + }, + { + "epoch": 1.9276048714479026, + "grad_norm": 0.3432446968148877, + "learning_rate": 3.413787837369863e-06, + "loss": 0.0278, + "step": 2849 + }, + { + "epoch": 1.9282814614343708, + "grad_norm": 0.2539660225248182, + "learning_rate": 3.4100548557184944e-06, + "loss": 0.0216, + "step": 2850 + }, + { + "epoch": 1.9289580514208389, + "grad_norm": 0.4090591753487882, + "learning_rate": 3.4063228597474133e-06, + "loss": 0.0286, + "step": 2851 + }, + { + "epoch": 1.9296346414073073, + "grad_norm": 0.2759084303466922, + "learning_rate": 3.40259185177026e-06, + "loss": 0.0275, + "step": 2852 + }, + { + "epoch": 1.9303112313937754, + "grad_norm": 0.2875694538711192, + "learning_rate": 3.3988618341000566e-06, + "loss": 0.0255, + "step": 2853 + }, + { + "epoch": 1.9309878213802436, + "grad_norm": 0.23398353942622174, + "learning_rate": 3.395132809049212e-06, + "loss": 0.0188, + "step": 2854 + }, + { + "epoch": 1.9316644113667119, + "grad_norm": 0.3082346577153797, + "learning_rate": 3.391404778929523e-06, + "loss": 0.0274, + "step": 2855 + }, + { + "epoch": 1.9323410013531799, + "grad_norm": 0.36266806858388334, + "learning_rate": 3.3876777460521647e-06, + "loss": 0.0279, + "step": 2856 + }, + { + "epoch": 1.9330175913396481, + "grad_norm": 0.32350208627996513, + "learning_rate": 3.383951712727701e-06, + "loss": 0.0338, + "step": 2857 + }, + { + "epoch": 1.9336941813261164, + "grad_norm": 0.2780696818957692, + "learning_rate": 3.3802266812660674e-06, + "loss": 0.0262, + "step": 2858 + }, + { + "epoch": 1.9343707713125846, + "grad_norm": 0.24737711764911177, + "learning_rate": 3.3765026539765832e-06, + "loss": 0.0233, + "step": 2859 + }, + { + "epoch": 1.9350473612990529, + "grad_norm": 0.29716412290164596, + "learning_rate": 3.372779633167946e-06, + "loss": 0.0263, + "step": 2860 + }, + { + "epoch": 1.9357239512855209, + "grad_norm": 0.3110587006998367, + "learning_rate": 3.369057621148227e-06, + "loss": 0.0202, + "step": 2861 + }, + { + "epoch": 1.9364005412719891, + "grad_norm": 0.30215191701365823, + "learning_rate": 3.3653366202248738e-06, + "loss": 0.039, + "step": 2862 + }, + { + "epoch": 1.9370771312584574, + "grad_norm": 0.29661925810829787, + "learning_rate": 3.3616166327047084e-06, + "loss": 0.0262, + "step": 2863 + }, + { + "epoch": 1.9377537212449256, + "grad_norm": 0.3110558330578675, + "learning_rate": 3.3578976608939184e-06, + "loss": 0.0325, + "step": 2864 + }, + { + "epoch": 1.9384303112313939, + "grad_norm": 0.49275140021791575, + "learning_rate": 3.3541797070980663e-06, + "loss": 0.044, + "step": 2865 + }, + { + "epoch": 1.939106901217862, + "grad_norm": 0.23297219224990126, + "learning_rate": 3.3504627736220863e-06, + "loss": 0.0215, + "step": 2866 + }, + { + "epoch": 1.9397834912043301, + "grad_norm": 0.3354038043048891, + "learning_rate": 3.3467468627702736e-06, + "loss": 0.0285, + "step": 2867 + }, + { + "epoch": 1.9404600811907984, + "grad_norm": 0.3270926549379668, + "learning_rate": 3.3430319768462956e-06, + "loss": 0.0383, + "step": 2868 + }, + { + "epoch": 1.9411366711772666, + "grad_norm": 0.3071772486584624, + "learning_rate": 3.3393181181531785e-06, + "loss": 0.034, + "step": 2869 + }, + { + "epoch": 1.9418132611637349, + "grad_norm": 0.29458655876061124, + "learning_rate": 3.3356052889933177e-06, + "loss": 0.0254, + "step": 2870 + }, + { + "epoch": 1.942489851150203, + "grad_norm": 0.4383006372306806, + "learning_rate": 3.331893491668464e-06, + "loss": 0.0392, + "step": 2871 + }, + { + "epoch": 1.9431664411366711, + "grad_norm": 0.21682665553500347, + "learning_rate": 3.3281827284797317e-06, + "loss": 0.0137, + "step": 2872 + }, + { + "epoch": 1.9438430311231394, + "grad_norm": 0.4158094120952702, + "learning_rate": 3.3244730017275974e-06, + "loss": 0.0338, + "step": 2873 + }, + { + "epoch": 1.9445196211096076, + "grad_norm": 0.2673568657738758, + "learning_rate": 3.3207643137118872e-06, + "loss": 0.0227, + "step": 2874 + }, + { + "epoch": 1.9451962110960759, + "grad_norm": 0.334600763745565, + "learning_rate": 3.3170566667317917e-06, + "loss": 0.0269, + "step": 2875 + }, + { + "epoch": 1.945872801082544, + "grad_norm": 0.34411316409786435, + "learning_rate": 3.3133500630858507e-06, + "loss": 0.0298, + "step": 2876 + }, + { + "epoch": 1.9465493910690121, + "grad_norm": 0.24721970581413663, + "learning_rate": 3.309644505071959e-06, + "loss": 0.026, + "step": 2877 + }, + { + "epoch": 1.9472259810554804, + "grad_norm": 0.3337237396309158, + "learning_rate": 3.3059399949873605e-06, + "loss": 0.0441, + "step": 2878 + }, + { + "epoch": 1.9479025710419484, + "grad_norm": 0.46310487928044103, + "learning_rate": 3.3022365351286545e-06, + "loss": 0.0216, + "step": 2879 + }, + { + "epoch": 1.9485791610284169, + "grad_norm": 0.27732618646127893, + "learning_rate": 3.298534127791785e-06, + "loss": 0.0295, + "step": 2880 + }, + { + "epoch": 1.949255751014885, + "grad_norm": 0.4301121159037394, + "learning_rate": 3.2948327752720464e-06, + "loss": 0.0357, + "step": 2881 + }, + { + "epoch": 1.9499323410013532, + "grad_norm": 0.2607109714269544, + "learning_rate": 3.2911324798640764e-06, + "loss": 0.0311, + "step": 2882 + }, + { + "epoch": 1.9506089309878214, + "grad_norm": 0.3760848305883796, + "learning_rate": 3.2874332438618607e-06, + "loss": 0.0364, + "step": 2883 + }, + { + "epoch": 1.9512855209742894, + "grad_norm": 0.5506064272337056, + "learning_rate": 3.2837350695587237e-06, + "loss": 0.0199, + "step": 2884 + }, + { + "epoch": 1.951962110960758, + "grad_norm": 0.341082574308908, + "learning_rate": 3.280037959247336e-06, + "loss": 0.0252, + "step": 2885 + }, + { + "epoch": 1.952638700947226, + "grad_norm": 0.3719396780015257, + "learning_rate": 3.276341915219704e-06, + "loss": 0.035, + "step": 2886 + }, + { + "epoch": 1.9533152909336942, + "grad_norm": 0.3037992621206681, + "learning_rate": 3.2726469397671797e-06, + "loss": 0.026, + "step": 2887 + }, + { + "epoch": 1.9539918809201624, + "grad_norm": 0.44635049308627206, + "learning_rate": 3.268953035180445e-06, + "loss": 0.0486, + "step": 2888 + }, + { + "epoch": 1.9546684709066304, + "grad_norm": 0.282533040163484, + "learning_rate": 3.2652602037495247e-06, + "loss": 0.0274, + "step": 2889 + }, + { + "epoch": 1.955345060893099, + "grad_norm": 0.3544760189574648, + "learning_rate": 3.261568447763775e-06, + "loss": 0.0349, + "step": 2890 + }, + { + "epoch": 1.956021650879567, + "grad_norm": 0.3731113037171858, + "learning_rate": 3.2578777695118822e-06, + "loss": 0.0425, + "step": 2891 + }, + { + "epoch": 1.9566982408660352, + "grad_norm": 0.29749706884463234, + "learning_rate": 3.254188171281871e-06, + "loss": 0.0156, + "step": 2892 + }, + { + "epoch": 1.9573748308525034, + "grad_norm": 0.3173142864726359, + "learning_rate": 3.2504996553610924e-06, + "loss": 0.0307, + "step": 2893 + }, + { + "epoch": 1.9580514208389714, + "grad_norm": 0.35033591623661653, + "learning_rate": 3.2468122240362287e-06, + "loss": 0.0344, + "step": 2894 + }, + { + "epoch": 1.95872801082544, + "grad_norm": 0.3322956626141054, + "learning_rate": 3.2431258795932863e-06, + "loss": 0.0314, + "step": 2895 + }, + { + "epoch": 1.959404600811908, + "grad_norm": 0.3749911133312623, + "learning_rate": 3.2394406243176025e-06, + "loss": 0.032, + "step": 2896 + }, + { + "epoch": 1.9600811907983762, + "grad_norm": 0.2466760191045161, + "learning_rate": 3.2357564604938363e-06, + "loss": 0.026, + "step": 2897 + }, + { + "epoch": 1.9607577807848444, + "grad_norm": 0.31893554908880567, + "learning_rate": 3.232073390405969e-06, + "loss": 0.0239, + "step": 2898 + }, + { + "epoch": 1.9614343707713124, + "grad_norm": 0.33148467587806907, + "learning_rate": 3.2283914163373064e-06, + "loss": 0.0338, + "step": 2899 + }, + { + "epoch": 1.962110960757781, + "grad_norm": 0.4047833615238191, + "learning_rate": 3.224710540570475e-06, + "loss": 0.0395, + "step": 2900 + }, + { + "epoch": 1.962787550744249, + "grad_norm": 0.381552121459052, + "learning_rate": 3.2210307653874175e-06, + "loss": 0.0232, + "step": 2901 + }, + { + "epoch": 1.9634641407307172, + "grad_norm": 0.3282149561077376, + "learning_rate": 3.2173520930693987e-06, + "loss": 0.0301, + "step": 2902 + }, + { + "epoch": 1.9641407307171854, + "grad_norm": 0.2952940794290675, + "learning_rate": 3.2136745258969965e-06, + "loss": 0.0308, + "step": 2903 + }, + { + "epoch": 1.9648173207036534, + "grad_norm": 0.6005180166323237, + "learning_rate": 3.2099980661501016e-06, + "loss": 0.0263, + "step": 2904 + }, + { + "epoch": 1.965493910690122, + "grad_norm": 0.34264443079170587, + "learning_rate": 3.2063227161079234e-06, + "loss": 0.0282, + "step": 2905 + }, + { + "epoch": 1.96617050067659, + "grad_norm": 0.3382328779946316, + "learning_rate": 3.202648478048981e-06, + "loss": 0.0365, + "step": 2906 + }, + { + "epoch": 1.9668470906630582, + "grad_norm": 0.2934545968682589, + "learning_rate": 3.1989753542511016e-06, + "loss": 0.0282, + "step": 2907 + }, + { + "epoch": 1.9675236806495264, + "grad_norm": 0.39804762599738625, + "learning_rate": 3.1953033469914273e-06, + "loss": 0.0296, + "step": 2908 + }, + { + "epoch": 1.9682002706359945, + "grad_norm": 0.23323274395145344, + "learning_rate": 3.191632458546401e-06, + "loss": 0.0244, + "step": 2909 + }, + { + "epoch": 1.968876860622463, + "grad_norm": 0.2560444335549138, + "learning_rate": 3.1879626911917806e-06, + "loss": 0.0226, + "step": 2910 + }, + { + "epoch": 1.969553450608931, + "grad_norm": 0.472277980688637, + "learning_rate": 3.1842940472026194e-06, + "loss": 0.0295, + "step": 2911 + }, + { + "epoch": 1.9702300405953992, + "grad_norm": 0.28628681285349195, + "learning_rate": 3.18062652885328e-06, + "loss": 0.0278, + "step": 2912 + }, + { + "epoch": 1.9709066305818674, + "grad_norm": 0.3220856641535677, + "learning_rate": 3.1769601384174274e-06, + "loss": 0.0318, + "step": 2913 + }, + { + "epoch": 1.9715832205683355, + "grad_norm": 0.17608460434720677, + "learning_rate": 3.173294878168025e-06, + "loss": 0.015, + "step": 2914 + }, + { + "epoch": 1.972259810554804, + "grad_norm": 0.348395752132745, + "learning_rate": 3.169630750377337e-06, + "loss": 0.0246, + "step": 2915 + }, + { + "epoch": 1.972936400541272, + "grad_norm": 1.1154799791550307, + "learning_rate": 3.165967757316925e-06, + "loss": 0.0393, + "step": 2916 + }, + { + "epoch": 1.9736129905277402, + "grad_norm": 0.4427655176412636, + "learning_rate": 3.16230590125765e-06, + "loss": 0.0457, + "step": 2917 + }, + { + "epoch": 1.9742895805142084, + "grad_norm": 0.279935305564462, + "learning_rate": 3.1586451844696596e-06, + "loss": 0.0263, + "step": 2918 + }, + { + "epoch": 1.9749661705006765, + "grad_norm": 0.38887833912929604, + "learning_rate": 3.154985609222405e-06, + "loss": 0.0591, + "step": 2919 + }, + { + "epoch": 1.975642760487145, + "grad_norm": 0.40384933616899205, + "learning_rate": 3.1513271777846244e-06, + "loss": 0.033, + "step": 2920 + }, + { + "epoch": 1.976319350473613, + "grad_norm": 0.7291599951587879, + "learning_rate": 3.1476698924243487e-06, + "loss": 0.0249, + "step": 2921 + }, + { + "epoch": 1.9769959404600812, + "grad_norm": 0.28898467345484863, + "learning_rate": 3.1440137554088957e-06, + "loss": 0.0312, + "step": 2922 + }, + { + "epoch": 1.9776725304465494, + "grad_norm": 0.36265665243672346, + "learning_rate": 3.1403587690048775e-06, + "loss": 0.0266, + "step": 2923 + }, + { + "epoch": 1.9783491204330175, + "grad_norm": 0.3259886252116135, + "learning_rate": 3.1367049354781854e-06, + "loss": 0.0314, + "step": 2924 + }, + { + "epoch": 1.979025710419486, + "grad_norm": 0.3582763584476965, + "learning_rate": 3.1330522570939987e-06, + "loss": 0.0232, + "step": 2925 + }, + { + "epoch": 1.979702300405954, + "grad_norm": 0.2935990823159862, + "learning_rate": 3.129400736116783e-06, + "loss": 0.0269, + "step": 2926 + }, + { + "epoch": 1.9803788903924222, + "grad_norm": 0.3274565880793247, + "learning_rate": 3.125750374810283e-06, + "loss": 0.0259, + "step": 2927 + }, + { + "epoch": 1.9810554803788905, + "grad_norm": 0.2658822310608906, + "learning_rate": 3.1221011754375275e-06, + "loss": 0.0283, + "step": 2928 + }, + { + "epoch": 1.9817320703653585, + "grad_norm": 0.3847013296697889, + "learning_rate": 3.118453140260823e-06, + "loss": 0.0298, + "step": 2929 + }, + { + "epoch": 1.982408660351827, + "grad_norm": 0.7130181415800749, + "learning_rate": 3.1148062715417553e-06, + "loss": 0.0294, + "step": 2930 + }, + { + "epoch": 1.983085250338295, + "grad_norm": 0.4373450083423354, + "learning_rate": 3.111160571541183e-06, + "loss": 0.0429, + "step": 2931 + }, + { + "epoch": 1.9837618403247632, + "grad_norm": 0.3110882682758484, + "learning_rate": 3.107516042519248e-06, + "loss": 0.0248, + "step": 2932 + }, + { + "epoch": 1.9844384303112315, + "grad_norm": 0.36071841806590565, + "learning_rate": 3.1038726867353587e-06, + "loss": 0.0384, + "step": 2933 + }, + { + "epoch": 1.9851150202976995, + "grad_norm": 0.28256102100177616, + "learning_rate": 3.1002305064482006e-06, + "loss": 0.0232, + "step": 2934 + }, + { + "epoch": 1.985791610284168, + "grad_norm": 0.2988723458507057, + "learning_rate": 3.096589503915729e-06, + "loss": 0.0323, + "step": 2935 + }, + { + "epoch": 1.986468200270636, + "grad_norm": 0.2731757881803187, + "learning_rate": 3.09294968139517e-06, + "loss": 0.0308, + "step": 2936 + }, + { + "epoch": 1.9871447902571042, + "grad_norm": 0.39227075933969013, + "learning_rate": 3.089311041143017e-06, + "loss": 0.0353, + "step": 2937 + }, + { + "epoch": 1.9878213802435725, + "grad_norm": 0.2858002786161629, + "learning_rate": 3.085673585415031e-06, + "loss": 0.0235, + "step": 2938 + }, + { + "epoch": 1.9884979702300405, + "grad_norm": 0.2443499835741995, + "learning_rate": 3.082037316466236e-06, + "loss": 0.0231, + "step": 2939 + }, + { + "epoch": 1.989174560216509, + "grad_norm": 0.320468895425486, + "learning_rate": 3.078402236550926e-06, + "loss": 0.0297, + "step": 2940 + }, + { + "epoch": 1.989851150202977, + "grad_norm": 0.3353473200075891, + "learning_rate": 3.074768347922652e-06, + "loss": 0.0346, + "step": 2941 + }, + { + "epoch": 1.9905277401894452, + "grad_norm": 0.9569463985916461, + "learning_rate": 3.0711356528342316e-06, + "loss": 0.0252, + "step": 2942 + }, + { + "epoch": 1.9912043301759135, + "grad_norm": 0.25127605328321234, + "learning_rate": 3.06750415353774e-06, + "loss": 0.0246, + "step": 2943 + }, + { + "epoch": 1.9918809201623815, + "grad_norm": 0.37107075055984906, + "learning_rate": 3.063873852284508e-06, + "loss": 0.0337, + "step": 2944 + }, + { + "epoch": 1.9925575101488497, + "grad_norm": 0.2612803381010322, + "learning_rate": 3.0602447513251287e-06, + "loss": 0.0233, + "step": 2945 + }, + { + "epoch": 1.993234100135318, + "grad_norm": 0.6852125222550688, + "learning_rate": 3.0566168529094485e-06, + "loss": 0.0322, + "step": 2946 + }, + { + "epoch": 1.9939106901217862, + "grad_norm": 0.4500098880831753, + "learning_rate": 3.0529901592865705e-06, + "loss": 0.0288, + "step": 2947 + }, + { + "epoch": 1.9945872801082545, + "grad_norm": 0.3593941089907377, + "learning_rate": 3.0493646727048463e-06, + "loss": 0.0306, + "step": 2948 + }, + { + "epoch": 1.9952638700947225, + "grad_norm": 0.4390521539617445, + "learning_rate": 3.045740395411886e-06, + "loss": 0.0417, + "step": 2949 + }, + { + "epoch": 1.9959404600811907, + "grad_norm": 0.3389952295118972, + "learning_rate": 3.042117329654544e-06, + "loss": 0.0286, + "step": 2950 + }, + { + "epoch": 1.996617050067659, + "grad_norm": 0.266164907803794, + "learning_rate": 3.0384954776789255e-06, + "loss": 0.0176, + "step": 2951 + }, + { + "epoch": 1.9972936400541272, + "grad_norm": 0.3084683429644584, + "learning_rate": 3.0348748417303826e-06, + "loss": 0.0262, + "step": 2952 + }, + { + "epoch": 1.9979702300405955, + "grad_norm": 0.4066343568805604, + "learning_rate": 3.0312554240535166e-06, + "loss": 0.0308, + "step": 2953 + }, + { + "epoch": 1.9986468200270635, + "grad_norm": 0.2967193506919042, + "learning_rate": 3.0276372268921694e-06, + "loss": 0.0266, + "step": 2954 + }, + { + "epoch": 1.9993234100135318, + "grad_norm": 0.4838567587953983, + "learning_rate": 3.0240202524894304e-06, + "loss": 0.0378, + "step": 2955 + }, + { + "epoch": 2.0, + "grad_norm": 0.24361262369124725, + "learning_rate": 3.0204045030876267e-06, + "loss": 0.0229, + "step": 2956 + }, + { + "epoch": 2.0, + "eval_loss": 0.03232486918568611, + "eval_runtime": 234.0891, + "eval_samples_per_second": 42.527, + "eval_steps_per_second": 1.333, + "step": 2956 + }, + { + "epoch": 2.000676589986468, + "grad_norm": 0.2863479162677051, + "learning_rate": 3.016789980928331e-06, + "loss": 0.0242, + "step": 2957 + }, + { + "epoch": 2.0013531799729365, + "grad_norm": 0.2446970841308206, + "learning_rate": 3.013176688252349e-06, + "loss": 0.0267, + "step": 2958 + }, + { + "epoch": 2.0020297699594045, + "grad_norm": 0.27546559067848164, + "learning_rate": 3.009564627299728e-06, + "loss": 0.0277, + "step": 2959 + }, + { + "epoch": 2.002706359945873, + "grad_norm": 0.25041569161875626, + "learning_rate": 3.005953800309752e-06, + "loss": 0.0178, + "step": 2960 + }, + { + "epoch": 2.003382949932341, + "grad_norm": 0.2936556310346428, + "learning_rate": 3.0023442095209386e-06, + "loss": 0.0208, + "step": 2961 + }, + { + "epoch": 2.004059539918809, + "grad_norm": 0.2900960504196555, + "learning_rate": 2.9987358571710394e-06, + "loss": 0.0203, + "step": 2962 + }, + { + "epoch": 2.0047361299052775, + "grad_norm": 0.23453405387146745, + "learning_rate": 2.9951287454970405e-06, + "loss": 0.0166, + "step": 2963 + }, + { + "epoch": 2.0054127198917455, + "grad_norm": 0.24436914995189996, + "learning_rate": 2.991522876735154e-06, + "loss": 0.0166, + "step": 2964 + }, + { + "epoch": 2.006089309878214, + "grad_norm": 0.30259742460555106, + "learning_rate": 2.987918253120824e-06, + "loss": 0.0199, + "step": 2965 + }, + { + "epoch": 2.006765899864682, + "grad_norm": 0.3949464234058585, + "learning_rate": 2.984314876888725e-06, + "loss": 0.0201, + "step": 2966 + }, + { + "epoch": 2.00744248985115, + "grad_norm": 0.3185317265892623, + "learning_rate": 2.980712750272754e-06, + "loss": 0.0173, + "step": 2967 + }, + { + "epoch": 2.0081190798376185, + "grad_norm": 0.28231839208490533, + "learning_rate": 2.9771118755060368e-06, + "loss": 0.0168, + "step": 2968 + }, + { + "epoch": 2.0087956698240865, + "grad_norm": 0.3113576343695042, + "learning_rate": 2.9735122548209204e-06, + "loss": 0.0259, + "step": 2969 + }, + { + "epoch": 2.009472259810555, + "grad_norm": 0.2661010558709563, + "learning_rate": 2.96991389044898e-06, + "loss": 0.0205, + "step": 2970 + }, + { + "epoch": 2.010148849797023, + "grad_norm": 0.360734775155085, + "learning_rate": 2.966316784621e-06, + "loss": 0.0285, + "step": 2971 + }, + { + "epoch": 2.010825439783491, + "grad_norm": 0.3333442128135005, + "learning_rate": 2.9627209395669978e-06, + "loss": 0.0215, + "step": 2972 + }, + { + "epoch": 2.0115020297699595, + "grad_norm": 0.3111768204221705, + "learning_rate": 2.9591263575162e-06, + "loss": 0.0191, + "step": 2973 + }, + { + "epoch": 2.0121786197564275, + "grad_norm": 0.3302427223756702, + "learning_rate": 2.9555330406970568e-06, + "loss": 0.0199, + "step": 2974 + }, + { + "epoch": 2.012855209742896, + "grad_norm": 0.29770343162182134, + "learning_rate": 2.9519409913372286e-06, + "loss": 0.0181, + "step": 2975 + }, + { + "epoch": 2.013531799729364, + "grad_norm": 0.3314595406050617, + "learning_rate": 2.9483502116635943e-06, + "loss": 0.0253, + "step": 2976 + }, + { + "epoch": 2.014208389715832, + "grad_norm": 0.2927950786090539, + "learning_rate": 2.9447607039022443e-06, + "loss": 0.0165, + "step": 2977 + }, + { + "epoch": 2.0148849797023005, + "grad_norm": 0.36899655053084923, + "learning_rate": 2.9411724702784762e-06, + "loss": 0.031, + "step": 2978 + }, + { + "epoch": 2.0155615696887685, + "grad_norm": 0.3289829198876795, + "learning_rate": 2.9375855130168046e-06, + "loss": 0.0197, + "step": 2979 + }, + { + "epoch": 2.016238159675237, + "grad_norm": 0.440796677024592, + "learning_rate": 2.9339998343409484e-06, + "loss": 0.0317, + "step": 2980 + }, + { + "epoch": 2.016914749661705, + "grad_norm": 0.29986528125333717, + "learning_rate": 2.9304154364738358e-06, + "loss": 0.0167, + "step": 2981 + }, + { + "epoch": 2.017591339648173, + "grad_norm": 0.5231906812809761, + "learning_rate": 2.9268323216375997e-06, + "loss": 0.0209, + "step": 2982 + }, + { + "epoch": 2.0182679296346415, + "grad_norm": 0.49567420473066737, + "learning_rate": 2.92325049205358e-06, + "loss": 0.0232, + "step": 2983 + }, + { + "epoch": 2.0189445196211095, + "grad_norm": 0.3374261257695268, + "learning_rate": 2.9196699499423143e-06, + "loss": 0.0183, + "step": 2984 + }, + { + "epoch": 2.019621109607578, + "grad_norm": 0.3234196639187083, + "learning_rate": 2.9160906975235493e-06, + "loss": 0.0186, + "step": 2985 + }, + { + "epoch": 2.020297699594046, + "grad_norm": 0.3633058512930501, + "learning_rate": 2.9125127370162253e-06, + "loss": 0.0266, + "step": 2986 + }, + { + "epoch": 2.020974289580514, + "grad_norm": 0.3182453062676391, + "learning_rate": 2.908936070638487e-06, + "loss": 0.0214, + "step": 2987 + }, + { + "epoch": 2.0216508795669825, + "grad_norm": 0.5490919136205034, + "learning_rate": 2.9053607006076766e-06, + "loss": 0.0194, + "step": 2988 + }, + { + "epoch": 2.0223274695534506, + "grad_norm": 0.28036413832885343, + "learning_rate": 2.9017866291403275e-06, + "loss": 0.0169, + "step": 2989 + }, + { + "epoch": 2.023004059539919, + "grad_norm": 0.3326998051076093, + "learning_rate": 2.8982138584521734e-06, + "loss": 0.0208, + "step": 2990 + }, + { + "epoch": 2.023680649526387, + "grad_norm": 0.24252640345277615, + "learning_rate": 2.8946423907581377e-06, + "loss": 0.0168, + "step": 2991 + }, + { + "epoch": 2.024357239512855, + "grad_norm": 0.26774010808471116, + "learning_rate": 2.8910722282723404e-06, + "loss": 0.0136, + "step": 2992 + }, + { + "epoch": 2.0250338294993235, + "grad_norm": 0.3893872262951025, + "learning_rate": 2.8875033732080865e-06, + "loss": 0.0254, + "step": 2993 + }, + { + "epoch": 2.0257104194857916, + "grad_norm": 0.36350093252970944, + "learning_rate": 2.8839358277778758e-06, + "loss": 0.0232, + "step": 2994 + }, + { + "epoch": 2.02638700947226, + "grad_norm": 0.4113375425303043, + "learning_rate": 2.8803695941933933e-06, + "loss": 0.021, + "step": 2995 + }, + { + "epoch": 2.027063599458728, + "grad_norm": 0.3424460948782749, + "learning_rate": 2.876804674665515e-06, + "loss": 0.0166, + "step": 2996 + }, + { + "epoch": 2.027740189445196, + "grad_norm": 0.34633947375349966, + "learning_rate": 2.873241071404296e-06, + "loss": 0.0261, + "step": 2997 + }, + { + "epoch": 2.0284167794316645, + "grad_norm": 0.31695482248283474, + "learning_rate": 2.869678786618976e-06, + "loss": 0.0258, + "step": 2998 + }, + { + "epoch": 2.0290933694181326, + "grad_norm": 0.4051790685017009, + "learning_rate": 2.866117822517982e-06, + "loss": 0.0288, + "step": 2999 + }, + { + "epoch": 2.029769959404601, + "grad_norm": 0.297721583264592, + "learning_rate": 2.86255818130892e-06, + "loss": 0.0151, + "step": 3000 + }, + { + "epoch": 2.030446549391069, + "grad_norm": 0.3263110566690717, + "learning_rate": 2.8589998651985775e-06, + "loss": 0.0264, + "step": 3001 + }, + { + "epoch": 2.031123139377537, + "grad_norm": 0.36359138307229244, + "learning_rate": 2.855442876392914e-06, + "loss": 0.0225, + "step": 3002 + }, + { + "epoch": 2.0317997293640055, + "grad_norm": 0.38974921943148677, + "learning_rate": 2.8518872170970758e-06, + "loss": 0.0204, + "step": 3003 + }, + { + "epoch": 2.0324763193504736, + "grad_norm": 0.3246598767415876, + "learning_rate": 2.848332889515375e-06, + "loss": 0.0159, + "step": 3004 + }, + { + "epoch": 2.033152909336942, + "grad_norm": 0.2715035243485544, + "learning_rate": 2.8447798958513082e-06, + "loss": 0.0199, + "step": 3005 + }, + { + "epoch": 2.03382949932341, + "grad_norm": 0.30416151930942176, + "learning_rate": 2.8412282383075362e-06, + "loss": 0.0195, + "step": 3006 + }, + { + "epoch": 2.034506089309878, + "grad_norm": 0.3174930394310743, + "learning_rate": 2.837677919085896e-06, + "loss": 0.0239, + "step": 3007 + }, + { + "epoch": 2.0351826792963466, + "grad_norm": 0.2872144907614338, + "learning_rate": 2.8341289403873952e-06, + "loss": 0.0171, + "step": 3008 + }, + { + "epoch": 2.0358592692828146, + "grad_norm": 0.3420128841586019, + "learning_rate": 2.83058130441221e-06, + "loss": 0.017, + "step": 3009 + }, + { + "epoch": 2.0365358592692826, + "grad_norm": 0.3087533550670056, + "learning_rate": 2.8270350133596824e-06, + "loss": 0.0164, + "step": 3010 + }, + { + "epoch": 2.037212449255751, + "grad_norm": 0.36783863784836157, + "learning_rate": 2.82349006942832e-06, + "loss": 0.0246, + "step": 3011 + }, + { + "epoch": 2.037889039242219, + "grad_norm": 0.3192419287887565, + "learning_rate": 2.8199464748157983e-06, + "loss": 0.0188, + "step": 3012 + }, + { + "epoch": 2.0385656292286876, + "grad_norm": 0.3207256988137708, + "learning_rate": 2.816404231718958e-06, + "loss": 0.0225, + "step": 3013 + }, + { + "epoch": 2.0392422192151556, + "grad_norm": 0.2848406159498065, + "learning_rate": 2.8128633423337932e-06, + "loss": 0.0209, + "step": 3014 + }, + { + "epoch": 2.0399188092016236, + "grad_norm": 0.3398180925374401, + "learning_rate": 2.8093238088554676e-06, + "loss": 0.0273, + "step": 3015 + }, + { + "epoch": 2.040595399188092, + "grad_norm": 0.2995605132938713, + "learning_rate": 2.8057856334783006e-06, + "loss": 0.0168, + "step": 3016 + }, + { + "epoch": 2.04127198917456, + "grad_norm": 0.44030263845454437, + "learning_rate": 2.802248818395773e-06, + "loss": 0.0195, + "step": 3017 + }, + { + "epoch": 2.0419485791610286, + "grad_norm": 0.2993737804960732, + "learning_rate": 2.7987133658005174e-06, + "loss": 0.0191, + "step": 3018 + }, + { + "epoch": 2.0426251691474966, + "grad_norm": 0.2954601996897239, + "learning_rate": 2.795179277884321e-06, + "loss": 0.0203, + "step": 3019 + }, + { + "epoch": 2.0433017591339646, + "grad_norm": 0.2733113064635633, + "learning_rate": 2.79164655683813e-06, + "loss": 0.0165, + "step": 3020 + }, + { + "epoch": 2.043978349120433, + "grad_norm": 0.25405037608310793, + "learning_rate": 2.788115204852042e-06, + "loss": 0.0145, + "step": 3021 + }, + { + "epoch": 2.044654939106901, + "grad_norm": 0.3284269913514382, + "learning_rate": 2.7845852241153063e-06, + "loss": 0.0222, + "step": 3022 + }, + { + "epoch": 2.0453315290933696, + "grad_norm": 0.3107430259541557, + "learning_rate": 2.781056616816319e-06, + "loss": 0.0219, + "step": 3023 + }, + { + "epoch": 2.0460081190798376, + "grad_norm": 0.2697386714777377, + "learning_rate": 2.7775293851426233e-06, + "loss": 0.0151, + "step": 3024 + }, + { + "epoch": 2.0466847090663056, + "grad_norm": 0.3617580629923284, + "learning_rate": 2.7740035312809153e-06, + "loss": 0.0179, + "step": 3025 + }, + { + "epoch": 2.047361299052774, + "grad_norm": 0.33728690392370864, + "learning_rate": 2.7704790574170372e-06, + "loss": 0.0181, + "step": 3026 + }, + { + "epoch": 2.048037889039242, + "grad_norm": 0.35894504724803605, + "learning_rate": 2.766955965735968e-06, + "loss": 0.0177, + "step": 3027 + }, + { + "epoch": 2.0487144790257106, + "grad_norm": 0.3998741942334366, + "learning_rate": 2.7634342584218364e-06, + "loss": 0.0228, + "step": 3028 + }, + { + "epoch": 2.0493910690121786, + "grad_norm": 0.3556964497640884, + "learning_rate": 2.759913937657912e-06, + "loss": 0.023, + "step": 3029 + }, + { + "epoch": 2.0500676589986466, + "grad_norm": 0.24363937450340487, + "learning_rate": 2.7563950056266053e-06, + "loss": 0.0117, + "step": 3030 + }, + { + "epoch": 2.050744248985115, + "grad_norm": 0.4135397355331111, + "learning_rate": 2.752877464509463e-06, + "loss": 0.0261, + "step": 3031 + }, + { + "epoch": 2.051420838971583, + "grad_norm": 0.29385288459782477, + "learning_rate": 2.7493613164871678e-06, + "loss": 0.0145, + "step": 3032 + }, + { + "epoch": 2.0520974289580516, + "grad_norm": 0.8182621401967292, + "learning_rate": 2.745846563739546e-06, + "loss": 0.0411, + "step": 3033 + }, + { + "epoch": 2.0527740189445196, + "grad_norm": 0.3541771665927555, + "learning_rate": 2.7423332084455543e-06, + "loss": 0.0184, + "step": 3034 + }, + { + "epoch": 2.0534506089309876, + "grad_norm": 0.30746052699659804, + "learning_rate": 2.7388212527832814e-06, + "loss": 0.0162, + "step": 3035 + }, + { + "epoch": 2.054127198917456, + "grad_norm": 0.3681810925856484, + "learning_rate": 2.7353106989299528e-06, + "loss": 0.0233, + "step": 3036 + }, + { + "epoch": 2.054803788903924, + "grad_norm": 0.3064550317174914, + "learning_rate": 2.731801549061923e-06, + "loss": 0.0162, + "step": 3037 + }, + { + "epoch": 2.0554803788903926, + "grad_norm": 0.2513444814612412, + "learning_rate": 2.7282938053546727e-06, + "loss": 0.0154, + "step": 3038 + }, + { + "epoch": 2.0561569688768606, + "grad_norm": 0.397932289746068, + "learning_rate": 2.7247874699828186e-06, + "loss": 0.0204, + "step": 3039 + }, + { + "epoch": 2.0568335588633286, + "grad_norm": 0.2790640626372702, + "learning_rate": 2.7212825451200942e-06, + "loss": 0.0208, + "step": 3040 + }, + { + "epoch": 2.057510148849797, + "grad_norm": 0.32071415577478357, + "learning_rate": 2.7177790329393674e-06, + "loss": 0.0204, + "step": 3041 + }, + { + "epoch": 2.058186738836265, + "grad_norm": 0.31070051610266236, + "learning_rate": 2.7142769356126258e-06, + "loss": 0.0186, + "step": 3042 + }, + { + "epoch": 2.0588633288227336, + "grad_norm": 0.3340397252026217, + "learning_rate": 2.710776255310984e-06, + "loss": 0.0204, + "step": 3043 + }, + { + "epoch": 2.0595399188092016, + "grad_norm": 0.4273394375208807, + "learning_rate": 2.7072769942046716e-06, + "loss": 0.0282, + "step": 3044 + }, + { + "epoch": 2.0602165087956696, + "grad_norm": 0.2965104082661981, + "learning_rate": 2.7037791544630414e-06, + "loss": 0.0188, + "step": 3045 + }, + { + "epoch": 2.060893098782138, + "grad_norm": 0.35599459837318065, + "learning_rate": 2.700282738254567e-06, + "loss": 0.0268, + "step": 3046 + }, + { + "epoch": 2.061569688768606, + "grad_norm": 0.27228564751494283, + "learning_rate": 2.6967877477468394e-06, + "loss": 0.0167, + "step": 3047 + }, + { + "epoch": 2.0622462787550746, + "grad_norm": 0.2715035449675256, + "learning_rate": 2.693294185106562e-06, + "loss": 0.017, + "step": 3048 + }, + { + "epoch": 2.0629228687415426, + "grad_norm": 0.2897796173533462, + "learning_rate": 2.689802052499555e-06, + "loss": 0.0167, + "step": 3049 + }, + { + "epoch": 2.0635994587280106, + "grad_norm": 0.21111631517365298, + "learning_rate": 2.686311352090756e-06, + "loss": 0.0109, + "step": 3050 + }, + { + "epoch": 2.064276048714479, + "grad_norm": 0.3062700790169697, + "learning_rate": 2.682822086044206e-06, + "loss": 0.0174, + "step": 3051 + }, + { + "epoch": 2.064952638700947, + "grad_norm": 0.28150179689863614, + "learning_rate": 2.6793342565230675e-06, + "loss": 0.0156, + "step": 3052 + }, + { + "epoch": 2.0656292286874156, + "grad_norm": 0.2448549983091798, + "learning_rate": 2.6758478656896015e-06, + "loss": 0.0136, + "step": 3053 + }, + { + "epoch": 2.0663058186738836, + "grad_norm": 0.26608039737433065, + "learning_rate": 2.6723629157051844e-06, + "loss": 0.0164, + "step": 3054 + }, + { + "epoch": 2.0669824086603517, + "grad_norm": 0.2764342270796369, + "learning_rate": 2.6688794087302993e-06, + "loss": 0.02, + "step": 3055 + }, + { + "epoch": 2.06765899864682, + "grad_norm": 0.25363760014574555, + "learning_rate": 2.66539734692453e-06, + "loss": 0.0177, + "step": 3056 + }, + { + "epoch": 2.068335588633288, + "grad_norm": 0.4076233031632493, + "learning_rate": 2.66191673244657e-06, + "loss": 0.0225, + "step": 3057 + }, + { + "epoch": 2.0690121786197566, + "grad_norm": 0.3626422278457513, + "learning_rate": 2.658437567454209e-06, + "loss": 0.0245, + "step": 3058 + }, + { + "epoch": 2.0696887686062246, + "grad_norm": 0.32647866554382204, + "learning_rate": 2.6549598541043433e-06, + "loss": 0.024, + "step": 3059 + }, + { + "epoch": 2.0703653585926927, + "grad_norm": 0.44525429835360186, + "learning_rate": 2.6514835945529706e-06, + "loss": 0.025, + "step": 3060 + }, + { + "epoch": 2.071041948579161, + "grad_norm": 0.4043225721957029, + "learning_rate": 2.64800879095518e-06, + "loss": 0.022, + "step": 3061 + }, + { + "epoch": 2.071718538565629, + "grad_norm": 0.28299749687391407, + "learning_rate": 2.644535445465164e-06, + "loss": 0.0166, + "step": 3062 + }, + { + "epoch": 2.0723951285520976, + "grad_norm": 0.5222510352469268, + "learning_rate": 2.641063560236212e-06, + "loss": 0.0325, + "step": 3063 + }, + { + "epoch": 2.0730717185385656, + "grad_norm": 0.2704778046181561, + "learning_rate": 2.637593137420702e-06, + "loss": 0.0129, + "step": 3064 + }, + { + "epoch": 2.0737483085250337, + "grad_norm": 0.3198939038082152, + "learning_rate": 2.6341241791701126e-06, + "loss": 0.0184, + "step": 3065 + }, + { + "epoch": 2.074424898511502, + "grad_norm": 0.29084713587711386, + "learning_rate": 2.6306566876350072e-06, + "loss": 0.0164, + "step": 3066 + }, + { + "epoch": 2.07510148849797, + "grad_norm": 0.360979655960413, + "learning_rate": 2.627190664965046e-06, + "loss": 0.0236, + "step": 3067 + }, + { + "epoch": 2.0757780784844386, + "grad_norm": 0.36181318513778404, + "learning_rate": 2.623726113308977e-06, + "loss": 0.0206, + "step": 3068 + }, + { + "epoch": 2.0764546684709067, + "grad_norm": 0.322372342952916, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.0218, + "step": 3069 + }, + { + "epoch": 2.0771312584573747, + "grad_norm": 0.3414598551719795, + "learning_rate": 2.616801431628938e-06, + "loss": 0.017, + "step": 3070 + }, + { + "epoch": 2.077807848443843, + "grad_norm": 0.4051066445113148, + "learning_rate": 2.613341305897898e-06, + "loss": 0.0177, + "step": 3071 + }, + { + "epoch": 2.078484438430311, + "grad_norm": 0.3209181294474512, + "learning_rate": 2.609882659766605e-06, + "loss": 0.0224, + "step": 3072 + }, + { + "epoch": 2.0791610284167796, + "grad_norm": 0.28578895357623013, + "learning_rate": 2.6064254953792344e-06, + "loss": 0.0165, + "step": 3073 + }, + { + "epoch": 2.0798376184032477, + "grad_norm": 0.6075318475491459, + "learning_rate": 2.6029698148790392e-06, + "loss": 0.0381, + "step": 3074 + }, + { + "epoch": 2.0805142083897157, + "grad_norm": 0.30737186625660634, + "learning_rate": 2.5995156204083573e-06, + "loss": 0.0289, + "step": 3075 + }, + { + "epoch": 2.081190798376184, + "grad_norm": 0.31925899766813914, + "learning_rate": 2.5960629141086014e-06, + "loss": 0.0273, + "step": 3076 + }, + { + "epoch": 2.081867388362652, + "grad_norm": 0.3806833163468387, + "learning_rate": 2.5926116981202688e-06, + "loss": 0.0211, + "step": 3077 + }, + { + "epoch": 2.0825439783491206, + "grad_norm": 0.43544524307895166, + "learning_rate": 2.5891619745829184e-06, + "loss": 0.0184, + "step": 3078 + }, + { + "epoch": 2.0832205683355887, + "grad_norm": 0.3352435432551951, + "learning_rate": 2.585713745635197e-06, + "loss": 0.0248, + "step": 3079 + }, + { + "epoch": 2.0838971583220567, + "grad_norm": 0.2718929984125413, + "learning_rate": 2.5822670134148216e-06, + "loss": 0.0178, + "step": 3080 + }, + { + "epoch": 2.084573748308525, + "grad_norm": 0.3430436391679866, + "learning_rate": 2.5788217800585812e-06, + "loss": 0.0217, + "step": 3081 + }, + { + "epoch": 2.085250338294993, + "grad_norm": 0.395822758178957, + "learning_rate": 2.5753780477023314e-06, + "loss": 0.0258, + "step": 3082 + }, + { + "epoch": 2.0859269282814616, + "grad_norm": 0.2922371050137746, + "learning_rate": 2.571935818481005e-06, + "loss": 0.0197, + "step": 3083 + }, + { + "epoch": 2.0866035182679297, + "grad_norm": 0.34230089275034653, + "learning_rate": 2.5684950945285937e-06, + "loss": 0.0228, + "step": 3084 + }, + { + "epoch": 2.0872801082543977, + "grad_norm": 0.307148243647793, + "learning_rate": 2.5650558779781635e-06, + "loss": 0.0208, + "step": 3085 + }, + { + "epoch": 2.087956698240866, + "grad_norm": 0.3059716364571522, + "learning_rate": 2.5616181709618447e-06, + "loss": 0.0216, + "step": 3086 + }, + { + "epoch": 2.088633288227334, + "grad_norm": 0.32743146079087154, + "learning_rate": 2.558181975610827e-06, + "loss": 0.017, + "step": 3087 + }, + { + "epoch": 2.089309878213802, + "grad_norm": 0.3360812753346708, + "learning_rate": 2.5547472940553685e-06, + "loss": 0.0192, + "step": 3088 + }, + { + "epoch": 2.0899864682002707, + "grad_norm": 0.28990393329614506, + "learning_rate": 2.551314128424788e-06, + "loss": 0.0165, + "step": 3089 + }, + { + "epoch": 2.0906630581867387, + "grad_norm": 0.33940305663292186, + "learning_rate": 2.5478824808474613e-06, + "loss": 0.0241, + "step": 3090 + }, + { + "epoch": 2.091339648173207, + "grad_norm": 0.3011730856791906, + "learning_rate": 2.5444523534508225e-06, + "loss": 0.0188, + "step": 3091 + }, + { + "epoch": 2.092016238159675, + "grad_norm": 0.43797419013313976, + "learning_rate": 2.5410237483613685e-06, + "loss": 0.0456, + "step": 3092 + }, + { + "epoch": 2.092692828146143, + "grad_norm": 0.36926319415432013, + "learning_rate": 2.53759666770465e-06, + "loss": 0.0201, + "step": 3093 + }, + { + "epoch": 2.0933694181326117, + "grad_norm": 0.3753754161314381, + "learning_rate": 2.5341711136052728e-06, + "loss": 0.0241, + "step": 3094 + }, + { + "epoch": 2.0940460081190797, + "grad_norm": 0.36140481303337674, + "learning_rate": 2.530747088186893e-06, + "loss": 0.0202, + "step": 3095 + }, + { + "epoch": 2.094722598105548, + "grad_norm": 0.2812130611026342, + "learning_rate": 2.527324593572223e-06, + "loss": 0.0182, + "step": 3096 + }, + { + "epoch": 2.095399188092016, + "grad_norm": 0.32819748804745974, + "learning_rate": 2.523903631883028e-06, + "loss": 0.0141, + "step": 3097 + }, + { + "epoch": 2.096075778078484, + "grad_norm": 0.23769268205062138, + "learning_rate": 2.520484205240116e-06, + "loss": 0.0145, + "step": 3098 + }, + { + "epoch": 2.0967523680649527, + "grad_norm": 0.25897294118421665, + "learning_rate": 2.517066315763348e-06, + "loss": 0.0129, + "step": 3099 + }, + { + "epoch": 2.0974289580514207, + "grad_norm": 0.2811692980432601, + "learning_rate": 2.5136499655716306e-06, + "loss": 0.0241, + "step": 3100 + }, + { + "epoch": 2.098105548037889, + "grad_norm": 0.30983498713273716, + "learning_rate": 2.5102351567829187e-06, + "loss": 0.018, + "step": 3101 + }, + { + "epoch": 2.098782138024357, + "grad_norm": 0.3454829575212401, + "learning_rate": 2.5068218915142093e-06, + "loss": 0.0295, + "step": 3102 + }, + { + "epoch": 2.0994587280108252, + "grad_norm": 0.3549707454350051, + "learning_rate": 2.503410171881544e-06, + "loss": 0.0242, + "step": 3103 + }, + { + "epoch": 2.1001353179972937, + "grad_norm": 0.29752709096247937, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0206, + "step": 3104 + }, + { + "epoch": 2.1008119079837617, + "grad_norm": 0.29067978319376647, + "learning_rate": 2.496591377983706e-06, + "loss": 0.0146, + "step": 3105 + }, + { + "epoch": 2.10148849797023, + "grad_norm": 0.31437900460949353, + "learning_rate": 2.49318430794582e-06, + "loss": 0.0174, + "step": 3106 + }, + { + "epoch": 2.102165087956698, + "grad_norm": 0.4734757269035232, + "learning_rate": 2.4897787919985457e-06, + "loss": 0.0315, + "step": 3107 + }, + { + "epoch": 2.1028416779431662, + "grad_norm": 0.9767500849091907, + "learning_rate": 2.4863748322531144e-06, + "loss": 0.0856, + "step": 3108 + }, + { + "epoch": 2.1035182679296347, + "grad_norm": 0.2395614894247743, + "learning_rate": 2.4829724308198003e-06, + "loss": 0.012, + "step": 3109 + }, + { + "epoch": 2.1041948579161027, + "grad_norm": 0.2639550064345917, + "learning_rate": 2.4795715898079116e-06, + "loss": 0.0144, + "step": 3110 + }, + { + "epoch": 2.104871447902571, + "grad_norm": 0.33505158691687015, + "learning_rate": 2.476172311325783e-06, + "loss": 0.0258, + "step": 3111 + }, + { + "epoch": 2.105548037889039, + "grad_norm": 0.4326205521958161, + "learning_rate": 2.472774597480783e-06, + "loss": 0.0199, + "step": 3112 + }, + { + "epoch": 2.1062246278755072, + "grad_norm": 0.33457247400374934, + "learning_rate": 2.4693784503793128e-06, + "loss": 0.0162, + "step": 3113 + }, + { + "epoch": 2.1069012178619757, + "grad_norm": 0.26248819364761294, + "learning_rate": 2.4659838721268005e-06, + "loss": 0.0159, + "step": 3114 + }, + { + "epoch": 2.1075778078484437, + "grad_norm": 0.29913466215394774, + "learning_rate": 2.462590864827703e-06, + "loss": 0.0175, + "step": 3115 + }, + { + "epoch": 2.108254397834912, + "grad_norm": 0.32815787935026647, + "learning_rate": 2.4591994305854988e-06, + "loss": 0.0206, + "step": 3116 + }, + { + "epoch": 2.10893098782138, + "grad_norm": 0.34816461794815484, + "learning_rate": 2.4558095715026975e-06, + "loss": 0.0254, + "step": 3117 + }, + { + "epoch": 2.1096075778078482, + "grad_norm": 0.3432385006117984, + "learning_rate": 2.4524212896808265e-06, + "loss": 0.0138, + "step": 3118 + }, + { + "epoch": 2.1102841677943167, + "grad_norm": 0.42969136301244204, + "learning_rate": 2.4490345872204403e-06, + "loss": 0.0182, + "step": 3119 + }, + { + "epoch": 2.1109607577807847, + "grad_norm": 0.3903531142555779, + "learning_rate": 2.4456494662211082e-06, + "loss": 0.0189, + "step": 3120 + }, + { + "epoch": 2.111637347767253, + "grad_norm": 0.38684894804978254, + "learning_rate": 2.442265928781426e-06, + "loss": 0.0193, + "step": 3121 + }, + { + "epoch": 2.1123139377537212, + "grad_norm": 0.31597492188614235, + "learning_rate": 2.438883976999003e-06, + "loss": 0.0177, + "step": 3122 + }, + { + "epoch": 2.1129905277401893, + "grad_norm": 0.2959327742325448, + "learning_rate": 2.43550361297047e-06, + "loss": 0.0192, + "step": 3123 + }, + { + "epoch": 2.1136671177266577, + "grad_norm": 0.7283818726336723, + "learning_rate": 2.4321248387914677e-06, + "loss": 0.0219, + "step": 3124 + }, + { + "epoch": 2.1143437077131257, + "grad_norm": 0.2040244513760422, + "learning_rate": 2.4287476565566525e-06, + "loss": 0.0117, + "step": 3125 + }, + { + "epoch": 2.115020297699594, + "grad_norm": 0.274361357398989, + "learning_rate": 2.4253720683596976e-06, + "loss": 0.0173, + "step": 3126 + }, + { + "epoch": 2.1156968876860622, + "grad_norm": 0.2942109825610358, + "learning_rate": 2.421998076293285e-06, + "loss": 0.0213, + "step": 3127 + }, + { + "epoch": 2.1163734776725303, + "grad_norm": 0.34853093582183314, + "learning_rate": 2.4186256824491106e-06, + "loss": 0.0178, + "step": 3128 + }, + { + "epoch": 2.1170500676589987, + "grad_norm": 0.4101569612373172, + "learning_rate": 2.4152548889178722e-06, + "loss": 0.0293, + "step": 3129 + }, + { + "epoch": 2.1177266576454667, + "grad_norm": 0.43937037555293795, + "learning_rate": 2.4118856977892846e-06, + "loss": 0.0193, + "step": 3130 + }, + { + "epoch": 2.118403247631935, + "grad_norm": 0.3322120577445713, + "learning_rate": 2.4085181111520607e-06, + "loss": 0.0216, + "step": 3131 + }, + { + "epoch": 2.1190798376184032, + "grad_norm": 0.29239964634064164, + "learning_rate": 2.4051521310939258e-06, + "loss": 0.0141, + "step": 3132 + }, + { + "epoch": 2.1197564276048713, + "grad_norm": 0.37022345024811043, + "learning_rate": 2.401787759701603e-06, + "loss": 0.0271, + "step": 3133 + }, + { + "epoch": 2.1204330175913397, + "grad_norm": 0.24897850304443564, + "learning_rate": 2.3984249990608237e-06, + "loss": 0.0129, + "step": 3134 + }, + { + "epoch": 2.1211096075778078, + "grad_norm": 0.4290963757508825, + "learning_rate": 2.3950638512563173e-06, + "loss": 0.0144, + "step": 3135 + }, + { + "epoch": 2.121786197564276, + "grad_norm": 0.36143080362684216, + "learning_rate": 2.3917043183718162e-06, + "loss": 0.0151, + "step": 3136 + }, + { + "epoch": 2.1224627875507442, + "grad_norm": 0.4450079992651522, + "learning_rate": 2.3883464024900484e-06, + "loss": 0.0217, + "step": 3137 + }, + { + "epoch": 2.1231393775372123, + "grad_norm": 0.3034654070729038, + "learning_rate": 2.3849901056927383e-06, + "loss": 0.02, + "step": 3138 + }, + { + "epoch": 2.1238159675236807, + "grad_norm": 0.3380960064178022, + "learning_rate": 2.381635430060611e-06, + "loss": 0.0218, + "step": 3139 + }, + { + "epoch": 2.1244925575101488, + "grad_norm": 0.3219803482217954, + "learning_rate": 2.3782823776733866e-06, + "loss": 0.0167, + "step": 3140 + }, + { + "epoch": 2.1251691474966172, + "grad_norm": 0.3350566290144601, + "learning_rate": 2.374930950609773e-06, + "loss": 0.0244, + "step": 3141 + }, + { + "epoch": 2.1258457374830853, + "grad_norm": 0.3168758393416301, + "learning_rate": 2.371581150947476e-06, + "loss": 0.0191, + "step": 3142 + }, + { + "epoch": 2.1265223274695533, + "grad_norm": 0.25228403321014414, + "learning_rate": 2.368232980763194e-06, + "loss": 0.0123, + "step": 3143 + }, + { + "epoch": 2.1271989174560217, + "grad_norm": 0.3507719167433762, + "learning_rate": 2.364886442132606e-06, + "loss": 0.0181, + "step": 3144 + }, + { + "epoch": 2.1278755074424898, + "grad_norm": 0.4039840478946242, + "learning_rate": 2.361541537130392e-06, + "loss": 0.0255, + "step": 3145 + }, + { + "epoch": 2.1285520974289582, + "grad_norm": 0.337061361222386, + "learning_rate": 2.358198267830206e-06, + "loss": 0.0203, + "step": 3146 + }, + { + "epoch": 2.1292286874154263, + "grad_norm": 0.3906657092238292, + "learning_rate": 2.3548566363046993e-06, + "loss": 0.0223, + "step": 3147 + }, + { + "epoch": 2.1299052774018943, + "grad_norm": 0.35249780243469786, + "learning_rate": 2.351516644625502e-06, + "loss": 0.018, + "step": 3148 + }, + { + "epoch": 2.1305818673883627, + "grad_norm": 0.46141388142612605, + "learning_rate": 2.3481782948632317e-06, + "loss": 0.0265, + "step": 3149 + }, + { + "epoch": 2.1312584573748308, + "grad_norm": 0.2826702497054701, + "learning_rate": 2.344841589087482e-06, + "loss": 0.018, + "step": 3150 + }, + { + "epoch": 2.1319350473612992, + "grad_norm": 0.3455073675603878, + "learning_rate": 2.34150652936683e-06, + "loss": 0.0185, + "step": 3151 + }, + { + "epoch": 2.1326116373477673, + "grad_norm": 0.38621961715633646, + "learning_rate": 2.3381731177688346e-06, + "loss": 0.0265, + "step": 3152 + }, + { + "epoch": 2.1332882273342353, + "grad_norm": 0.323081885914747, + "learning_rate": 2.3348413563600324e-06, + "loss": 0.0181, + "step": 3153 + }, + { + "epoch": 2.1339648173207038, + "grad_norm": 0.3681896960518674, + "learning_rate": 2.331511247205933e-06, + "loss": 0.0286, + "step": 3154 + }, + { + "epoch": 2.134641407307172, + "grad_norm": 0.3662293584547147, + "learning_rate": 2.3281827923710265e-06, + "loss": 0.0248, + "step": 3155 + }, + { + "epoch": 2.1353179972936402, + "grad_norm": 0.31740753045573344, + "learning_rate": 2.324855993918775e-06, + "loss": 0.0206, + "step": 3156 + }, + { + "epoch": 2.1359945872801083, + "grad_norm": 0.31070838197533174, + "learning_rate": 2.321530853911616e-06, + "loss": 0.0222, + "step": 3157 + }, + { + "epoch": 2.1366711772665763, + "grad_norm": 0.22895039731281813, + "learning_rate": 2.318207374410956e-06, + "loss": 0.0141, + "step": 3158 + }, + { + "epoch": 2.1373477672530448, + "grad_norm": 0.31200385704446354, + "learning_rate": 2.3148855574771706e-06, + "loss": 0.0163, + "step": 3159 + }, + { + "epoch": 2.138024357239513, + "grad_norm": 0.3070878337888588, + "learning_rate": 2.3115654051696097e-06, + "loss": 0.0194, + "step": 3160 + }, + { + "epoch": 2.1387009472259813, + "grad_norm": 0.3616286300226413, + "learning_rate": 2.3082469195465893e-06, + "loss": 0.0206, + "step": 3161 + }, + { + "epoch": 2.1393775372124493, + "grad_norm": 0.24900051851747546, + "learning_rate": 2.304930102665389e-06, + "loss": 0.0131, + "step": 3162 + }, + { + "epoch": 2.1400541271989173, + "grad_norm": 0.2791204943878726, + "learning_rate": 2.3016149565822608e-06, + "loss": 0.0188, + "step": 3163 + }, + { + "epoch": 2.1407307171853858, + "grad_norm": 0.34760553752948525, + "learning_rate": 2.2983014833524115e-06, + "loss": 0.0195, + "step": 3164 + }, + { + "epoch": 2.141407307171854, + "grad_norm": 0.24185930008667683, + "learning_rate": 2.2949896850300186e-06, + "loss": 0.0145, + "step": 3165 + }, + { + "epoch": 2.1420838971583223, + "grad_norm": 0.2506993012488286, + "learning_rate": 2.2916795636682197e-06, + "loss": 0.0126, + "step": 3166 + }, + { + "epoch": 2.1427604871447903, + "grad_norm": 0.32172613565753005, + "learning_rate": 2.288371121319109e-06, + "loss": 0.0205, + "step": 3167 + }, + { + "epoch": 2.1434370771312583, + "grad_norm": 0.4611798141323994, + "learning_rate": 2.2850643600337435e-06, + "loss": 0.0351, + "step": 3168 + }, + { + "epoch": 2.1441136671177268, + "grad_norm": 0.3419789356555703, + "learning_rate": 2.281759281862137e-06, + "loss": 0.0216, + "step": 3169 + }, + { + "epoch": 2.144790257104195, + "grad_norm": 0.24546682359372127, + "learning_rate": 2.278455888853262e-06, + "loss": 0.0116, + "step": 3170 + }, + { + "epoch": 2.1454668470906633, + "grad_norm": 0.35225760769126374, + "learning_rate": 2.2751541830550417e-06, + "loss": 0.0253, + "step": 3171 + }, + { + "epoch": 2.1461434370771313, + "grad_norm": 0.3203742931779411, + "learning_rate": 2.2718541665143546e-06, + "loss": 0.0211, + "step": 3172 + }, + { + "epoch": 2.1468200270635993, + "grad_norm": 0.2889708131145228, + "learning_rate": 2.2685558412770344e-06, + "loss": 0.0179, + "step": 3173 + }, + { + "epoch": 2.147496617050068, + "grad_norm": 0.3505469783998427, + "learning_rate": 2.265259209387867e-06, + "loss": 0.0233, + "step": 3174 + }, + { + "epoch": 2.148173207036536, + "grad_norm": 0.32734335525877833, + "learning_rate": 2.261964272890582e-06, + "loss": 0.0213, + "step": 3175 + }, + { + "epoch": 2.1488497970230043, + "grad_norm": 0.3017664556990426, + "learning_rate": 2.258671033827866e-06, + "loss": 0.0169, + "step": 3176 + }, + { + "epoch": 2.1495263870094723, + "grad_norm": 0.3256373908456404, + "learning_rate": 2.2553794942413506e-06, + "loss": 0.016, + "step": 3177 + }, + { + "epoch": 2.1502029769959403, + "grad_norm": 0.31744304201815216, + "learning_rate": 2.2520896561716086e-06, + "loss": 0.0212, + "step": 3178 + }, + { + "epoch": 2.150879566982409, + "grad_norm": 0.38536951168032857, + "learning_rate": 2.248801521658167e-06, + "loss": 0.0188, + "step": 3179 + }, + { + "epoch": 2.151556156968877, + "grad_norm": 0.28270267223725937, + "learning_rate": 2.245515092739488e-06, + "loss": 0.0163, + "step": 3180 + }, + { + "epoch": 2.1522327469553453, + "grad_norm": 0.224035320442952, + "learning_rate": 2.242230371452982e-06, + "loss": 0.0138, + "step": 3181 + }, + { + "epoch": 2.1529093369418133, + "grad_norm": 0.3133768757718769, + "learning_rate": 2.2389473598349994e-06, + "loss": 0.0175, + "step": 3182 + }, + { + "epoch": 2.1535859269282813, + "grad_norm": 0.30665383612985975, + "learning_rate": 2.2356660599208335e-06, + "loss": 0.0227, + "step": 3183 + }, + { + "epoch": 2.15426251691475, + "grad_norm": 0.3078336811363443, + "learning_rate": 2.2323864737447067e-06, + "loss": 0.0215, + "step": 3184 + }, + { + "epoch": 2.154939106901218, + "grad_norm": 0.33968603058332386, + "learning_rate": 2.229108603339789e-06, + "loss": 0.0223, + "step": 3185 + }, + { + "epoch": 2.1556156968876863, + "grad_norm": 0.3595968077075736, + "learning_rate": 2.2258324507381834e-06, + "loss": 0.0188, + "step": 3186 + }, + { + "epoch": 2.1562922868741543, + "grad_norm": 0.2904430152947272, + "learning_rate": 2.2225580179709303e-06, + "loss": 0.0153, + "step": 3187 + }, + { + "epoch": 2.1569688768606223, + "grad_norm": 0.3394738888015632, + "learning_rate": 2.219285307067997e-06, + "loss": 0.0188, + "step": 3188 + }, + { + "epoch": 2.157645466847091, + "grad_norm": 0.3501521027907099, + "learning_rate": 2.2160143200582906e-06, + "loss": 0.0248, + "step": 3189 + }, + { + "epoch": 2.158322056833559, + "grad_norm": 0.3040340349490807, + "learning_rate": 2.2127450589696475e-06, + "loss": 0.0182, + "step": 3190 + }, + { + "epoch": 2.1589986468200273, + "grad_norm": 0.31513527812824993, + "learning_rate": 2.209477525828831e-06, + "loss": 0.019, + "step": 3191 + }, + { + "epoch": 2.1596752368064953, + "grad_norm": 0.64752223556514, + "learning_rate": 2.2062117226615375e-06, + "loss": 0.0232, + "step": 3192 + }, + { + "epoch": 2.1603518267929633, + "grad_norm": 0.3599564774739713, + "learning_rate": 2.202947651492387e-06, + "loss": 0.0207, + "step": 3193 + }, + { + "epoch": 2.161028416779432, + "grad_norm": 0.29667305394672316, + "learning_rate": 2.1996853143449285e-06, + "loss": 0.0146, + "step": 3194 + }, + { + "epoch": 2.1617050067659, + "grad_norm": 0.2850668544793242, + "learning_rate": 2.1964247132416373e-06, + "loss": 0.0158, + "step": 3195 + }, + { + "epoch": 2.1623815967523683, + "grad_norm": 0.2922406175180509, + "learning_rate": 2.1931658502039067e-06, + "loss": 0.017, + "step": 3196 + }, + { + "epoch": 2.1630581867388363, + "grad_norm": 0.2814097867526757, + "learning_rate": 2.1899087272520596e-06, + "loss": 0.0179, + "step": 3197 + }, + { + "epoch": 2.1637347767253043, + "grad_norm": 0.3060937668295674, + "learning_rate": 2.186653346405333e-06, + "loss": 0.0204, + "step": 3198 + }, + { + "epoch": 2.164411366711773, + "grad_norm": 0.36995219678170427, + "learning_rate": 2.1833997096818897e-06, + "loss": 0.0206, + "step": 3199 + }, + { + "epoch": 2.165087956698241, + "grad_norm": 0.32436797225167857, + "learning_rate": 2.1801478190988107e-06, + "loss": 0.0194, + "step": 3200 + }, + { + "epoch": 2.1657645466847093, + "grad_norm": 0.3338826377427646, + "learning_rate": 2.1768976766720896e-06, + "loss": 0.0206, + "step": 3201 + }, + { + "epoch": 2.1664411366711773, + "grad_norm": 0.32769878792295004, + "learning_rate": 2.1736492844166406e-06, + "loss": 0.0213, + "step": 3202 + }, + { + "epoch": 2.1671177266576453, + "grad_norm": 0.32818583449722943, + "learning_rate": 2.170402644346294e-06, + "loss": 0.0208, + "step": 3203 + }, + { + "epoch": 2.167794316644114, + "grad_norm": 0.2628270966287541, + "learning_rate": 2.16715775847379e-06, + "loss": 0.0099, + "step": 3204 + }, + { + "epoch": 2.168470906630582, + "grad_norm": 0.3491051999150974, + "learning_rate": 2.163914628810781e-06, + "loss": 0.0208, + "step": 3205 + }, + { + "epoch": 2.16914749661705, + "grad_norm": 0.27665429608391007, + "learning_rate": 2.1606732573678344e-06, + "loss": 0.0159, + "step": 3206 + }, + { + "epoch": 2.1698240866035183, + "grad_norm": 0.3409165219663345, + "learning_rate": 2.157433646154426e-06, + "loss": 0.021, + "step": 3207 + }, + { + "epoch": 2.1705006765899864, + "grad_norm": 0.48716337299657286, + "learning_rate": 2.154195797178941e-06, + "loss": 0.0173, + "step": 3208 + }, + { + "epoch": 2.171177266576455, + "grad_norm": 0.36184449937916396, + "learning_rate": 2.1509597124486693e-06, + "loss": 0.0244, + "step": 3209 + }, + { + "epoch": 2.171853856562923, + "grad_norm": 0.38203593083620024, + "learning_rate": 2.147725393969811e-06, + "loss": 0.0315, + "step": 3210 + }, + { + "epoch": 2.172530446549391, + "grad_norm": 0.29355225838059074, + "learning_rate": 2.1444928437474667e-06, + "loss": 0.02, + "step": 3211 + }, + { + "epoch": 2.1732070365358593, + "grad_norm": 0.320653509069443, + "learning_rate": 2.1412620637856445e-06, + "loss": 0.0147, + "step": 3212 + }, + { + "epoch": 2.1738836265223274, + "grad_norm": 0.2894634971415182, + "learning_rate": 2.138033056087256e-06, + "loss": 0.0157, + "step": 3213 + }, + { + "epoch": 2.174560216508796, + "grad_norm": 0.3086742816786467, + "learning_rate": 2.1348058226541072e-06, + "loss": 0.0173, + "step": 3214 + }, + { + "epoch": 2.175236806495264, + "grad_norm": 0.38367991285926245, + "learning_rate": 2.1315803654869125e-06, + "loss": 0.0251, + "step": 3215 + }, + { + "epoch": 2.175913396481732, + "grad_norm": 0.33939296708239935, + "learning_rate": 2.1283566865852824e-06, + "loss": 0.0189, + "step": 3216 + }, + { + "epoch": 2.1765899864682003, + "grad_norm": 0.3613853776427313, + "learning_rate": 2.1251347879477217e-06, + "loss": 0.0213, + "step": 3217 + }, + { + "epoch": 2.1772665764546684, + "grad_norm": 0.388599815378119, + "learning_rate": 2.1219146715716332e-06, + "loss": 0.03, + "step": 3218 + }, + { + "epoch": 2.177943166441137, + "grad_norm": 0.3012896606065024, + "learning_rate": 2.1186963394533165e-06, + "loss": 0.0162, + "step": 3219 + }, + { + "epoch": 2.178619756427605, + "grad_norm": 0.32609586136770297, + "learning_rate": 2.1154797935879647e-06, + "loss": 0.0175, + "step": 3220 + }, + { + "epoch": 2.179296346414073, + "grad_norm": 0.32685332068870293, + "learning_rate": 2.112265035969664e-06, + "loss": 0.0158, + "step": 3221 + }, + { + "epoch": 2.1799729364005414, + "grad_norm": 0.3908665491980813, + "learning_rate": 2.1090520685913874e-06, + "loss": 0.0214, + "step": 3222 + }, + { + "epoch": 2.1806495263870094, + "grad_norm": 0.30518133848043216, + "learning_rate": 2.1058408934450055e-06, + "loss": 0.0131, + "step": 3223 + }, + { + "epoch": 2.181326116373478, + "grad_norm": 0.2627082568803207, + "learning_rate": 2.102631512521269e-06, + "loss": 0.0163, + "step": 3224 + }, + { + "epoch": 2.182002706359946, + "grad_norm": 0.29533056788281364, + "learning_rate": 2.099423927809826e-06, + "loss": 0.0198, + "step": 3225 + }, + { + "epoch": 2.182679296346414, + "grad_norm": 0.30782129434951133, + "learning_rate": 2.096218141299203e-06, + "loss": 0.0182, + "step": 3226 + }, + { + "epoch": 2.1833558863328824, + "grad_norm": 0.3685495402218638, + "learning_rate": 2.0930141549768145e-06, + "loss": 0.0246, + "step": 3227 + }, + { + "epoch": 2.1840324763193504, + "grad_norm": 0.3730644700321736, + "learning_rate": 2.089811970828961e-06, + "loss": 0.0232, + "step": 3228 + }, + { + "epoch": 2.184709066305819, + "grad_norm": 0.26012433544422686, + "learning_rate": 2.086611590840826e-06, + "loss": 0.0129, + "step": 3229 + }, + { + "epoch": 2.185385656292287, + "grad_norm": 0.3506394183014197, + "learning_rate": 2.0834130169964695e-06, + "loss": 0.0206, + "step": 3230 + }, + { + "epoch": 2.186062246278755, + "grad_norm": 0.3047006489407835, + "learning_rate": 2.0802162512788337e-06, + "loss": 0.0155, + "step": 3231 + }, + { + "epoch": 2.1867388362652234, + "grad_norm": 0.2975742048756299, + "learning_rate": 2.0770212956697435e-06, + "loss": 0.019, + "step": 3232 + }, + { + "epoch": 2.1874154262516914, + "grad_norm": 0.42409559410941233, + "learning_rate": 2.073828152149898e-06, + "loss": 0.0404, + "step": 3233 + }, + { + "epoch": 2.18809201623816, + "grad_norm": 0.29268843453271054, + "learning_rate": 2.0706368226988772e-06, + "loss": 0.0167, + "step": 3234 + }, + { + "epoch": 2.188768606224628, + "grad_norm": 0.30444783746080334, + "learning_rate": 2.0674473092951286e-06, + "loss": 0.0162, + "step": 3235 + }, + { + "epoch": 2.189445196211096, + "grad_norm": 0.41625888111181386, + "learning_rate": 2.064259613915981e-06, + "loss": 0.0267, + "step": 3236 + }, + { + "epoch": 2.1901217861975644, + "grad_norm": 0.3221104298750319, + "learning_rate": 2.061073738537635e-06, + "loss": 0.0166, + "step": 3237 + }, + { + "epoch": 2.1907983761840324, + "grad_norm": 0.23311987996484307, + "learning_rate": 2.0578896851351606e-06, + "loss": 0.0101, + "step": 3238 + }, + { + "epoch": 2.191474966170501, + "grad_norm": 0.2947568327994009, + "learning_rate": 2.0547074556824964e-06, + "loss": 0.0189, + "step": 3239 + }, + { + "epoch": 2.192151556156969, + "grad_norm": 0.2907318768196047, + "learning_rate": 2.0515270521524562e-06, + "loss": 0.0195, + "step": 3240 + }, + { + "epoch": 2.192828146143437, + "grad_norm": 0.295783654449266, + "learning_rate": 2.0483484765167172e-06, + "loss": 0.021, + "step": 3241 + }, + { + "epoch": 2.1935047361299054, + "grad_norm": 0.27507509133129837, + "learning_rate": 2.0451717307458287e-06, + "loss": 0.0155, + "step": 3242 + }, + { + "epoch": 2.1941813261163734, + "grad_norm": 0.34791367861363576, + "learning_rate": 2.041996816809197e-06, + "loss": 0.0178, + "step": 3243 + }, + { + "epoch": 2.194857916102842, + "grad_norm": 0.33254911691864697, + "learning_rate": 2.0388237366751005e-06, + "loss": 0.02, + "step": 3244 + }, + { + "epoch": 2.19553450608931, + "grad_norm": 0.31488586876258784, + "learning_rate": 2.0356524923106763e-06, + "loss": 0.0198, + "step": 3245 + }, + { + "epoch": 2.196211096075778, + "grad_norm": 0.39976937951504854, + "learning_rate": 2.032483085681927e-06, + "loss": 0.0225, + "step": 3246 + }, + { + "epoch": 2.1968876860622464, + "grad_norm": 0.31041717262696583, + "learning_rate": 2.029315518753711e-06, + "loss": 0.0291, + "step": 3247 + }, + { + "epoch": 2.1975642760487144, + "grad_norm": 0.2528109532127451, + "learning_rate": 2.0261497934897507e-06, + "loss": 0.0191, + "step": 3248 + }, + { + "epoch": 2.198240866035183, + "grad_norm": 0.30236700353663026, + "learning_rate": 2.0229859118526244e-06, + "loss": 0.0255, + "step": 3249 + }, + { + "epoch": 2.198917456021651, + "grad_norm": 0.35257299480078474, + "learning_rate": 2.019823875803771e-06, + "loss": 0.0167, + "step": 3250 + }, + { + "epoch": 2.199594046008119, + "grad_norm": 0.31437589864559173, + "learning_rate": 2.0166636873034807e-06, + "loss": 0.0183, + "step": 3251 + }, + { + "epoch": 2.2002706359945874, + "grad_norm": 0.349225535822057, + "learning_rate": 2.0135053483108973e-06, + "loss": 0.0186, + "step": 3252 + }, + { + "epoch": 2.2009472259810554, + "grad_norm": 0.46514667041805957, + "learning_rate": 2.0103488607840233e-06, + "loss": 0.018, + "step": 3253 + }, + { + "epoch": 2.201623815967524, + "grad_norm": 0.342102432161444, + "learning_rate": 2.00719422667971e-06, + "loss": 0.0192, + "step": 3254 + }, + { + "epoch": 2.202300405953992, + "grad_norm": 0.3930160959618195, + "learning_rate": 2.004041447953663e-06, + "loss": 0.0206, + "step": 3255 + }, + { + "epoch": 2.20297699594046, + "grad_norm": 0.3251662407712104, + "learning_rate": 2.0008905265604316e-06, + "loss": 0.0155, + "step": 3256 + }, + { + "epoch": 2.2036535859269284, + "grad_norm": 0.3090610137339622, + "learning_rate": 1.9977414644534206e-06, + "loss": 0.0145, + "step": 3257 + }, + { + "epoch": 2.2043301759133964, + "grad_norm": 0.26289042865730033, + "learning_rate": 1.9945942635848745e-06, + "loss": 0.0152, + "step": 3258 + }, + { + "epoch": 2.205006765899865, + "grad_norm": 0.5642809810025953, + "learning_rate": 1.9914489259058933e-06, + "loss": 0.0259, + "step": 3259 + }, + { + "epoch": 2.205683355886333, + "grad_norm": 0.32713953201831053, + "learning_rate": 1.9883054533664128e-06, + "loss": 0.0259, + "step": 3260 + }, + { + "epoch": 2.206359945872801, + "grad_norm": 0.3162273027872668, + "learning_rate": 1.985163847915217e-06, + "loss": 0.0147, + "step": 3261 + }, + { + "epoch": 2.2070365358592694, + "grad_norm": 0.37714029508563507, + "learning_rate": 1.9820241114999334e-06, + "loss": 0.0174, + "step": 3262 + }, + { + "epoch": 2.2077131258457374, + "grad_norm": 0.46112262827519634, + "learning_rate": 1.9788862460670305e-06, + "loss": 0.0162, + "step": 3263 + }, + { + "epoch": 2.208389715832206, + "grad_norm": 0.3185607785282328, + "learning_rate": 1.9757502535618137e-06, + "loss": 0.0199, + "step": 3264 + }, + { + "epoch": 2.209066305818674, + "grad_norm": 0.2971899984451772, + "learning_rate": 1.9726161359284283e-06, + "loss": 0.0159, + "step": 3265 + }, + { + "epoch": 2.209742895805142, + "grad_norm": 0.362931081049006, + "learning_rate": 1.96948389510986e-06, + "loss": 0.0202, + "step": 3266 + }, + { + "epoch": 2.2104194857916104, + "grad_norm": 0.2878952502965713, + "learning_rate": 1.9663535330479305e-06, + "loss": 0.015, + "step": 3267 + }, + { + "epoch": 2.2110960757780784, + "grad_norm": 0.40934712757172603, + "learning_rate": 1.963225051683292e-06, + "loss": 0.0175, + "step": 3268 + }, + { + "epoch": 2.2117726657645465, + "grad_norm": 0.32443670045538464, + "learning_rate": 1.9600984529554366e-06, + "loss": 0.0143, + "step": 3269 + }, + { + "epoch": 2.212449255751015, + "grad_norm": 0.3538126115344034, + "learning_rate": 1.956973738802689e-06, + "loss": 0.0147, + "step": 3270 + }, + { + "epoch": 2.213125845737483, + "grad_norm": 0.29986686834136383, + "learning_rate": 1.953850911162199e-06, + "loss": 0.0164, + "step": 3271 + }, + { + "epoch": 2.2138024357239514, + "grad_norm": 0.6028385981170276, + "learning_rate": 1.950729971969955e-06, + "loss": 0.0236, + "step": 3272 + }, + { + "epoch": 2.2144790257104194, + "grad_norm": 0.36524417102212586, + "learning_rate": 1.9476109231607687e-06, + "loss": 0.0214, + "step": 3273 + }, + { + "epoch": 2.2151556156968875, + "grad_norm": 0.2795522132726754, + "learning_rate": 1.9444937666682834e-06, + "loss": 0.0199, + "step": 3274 + }, + { + "epoch": 2.215832205683356, + "grad_norm": 0.2694536250827638, + "learning_rate": 1.941378504424968e-06, + "loss": 0.0164, + "step": 3275 + }, + { + "epoch": 2.216508795669824, + "grad_norm": 0.3112459841930046, + "learning_rate": 1.938265138362118e-06, + "loss": 0.0189, + "step": 3276 + }, + { + "epoch": 2.2171853856562924, + "grad_norm": 0.3207034243512326, + "learning_rate": 1.935153670409853e-06, + "loss": 0.018, + "step": 3277 + }, + { + "epoch": 2.2178619756427604, + "grad_norm": 0.31740722486130024, + "learning_rate": 1.9320441024971113e-06, + "loss": 0.0138, + "step": 3278 + }, + { + "epoch": 2.2185385656292285, + "grad_norm": 0.3036380412383107, + "learning_rate": 1.928936436551661e-06, + "loss": 0.0137, + "step": 3279 + }, + { + "epoch": 2.219215155615697, + "grad_norm": 0.39343985298433704, + "learning_rate": 1.925830674500088e-06, + "loss": 0.0289, + "step": 3280 + }, + { + "epoch": 2.219891745602165, + "grad_norm": 0.3290770852233371, + "learning_rate": 1.922726818267795e-06, + "loss": 0.019, + "step": 3281 + }, + { + "epoch": 2.2205683355886334, + "grad_norm": 0.3092438350443896, + "learning_rate": 1.9196248697790066e-06, + "loss": 0.0164, + "step": 3282 + }, + { + "epoch": 2.2212449255751014, + "grad_norm": 0.4131225849760436, + "learning_rate": 1.916524830956763e-06, + "loss": 0.0231, + "step": 3283 + }, + { + "epoch": 2.2219215155615695, + "grad_norm": 0.3571328444470763, + "learning_rate": 1.913426703722924e-06, + "loss": 0.0188, + "step": 3284 + }, + { + "epoch": 2.222598105548038, + "grad_norm": 0.3010393276105099, + "learning_rate": 1.9103304899981603e-06, + "loss": 0.019, + "step": 3285 + }, + { + "epoch": 2.223274695534506, + "grad_norm": 0.39026975137224207, + "learning_rate": 1.9072361917019538e-06, + "loss": 0.0219, + "step": 3286 + }, + { + "epoch": 2.2239512855209744, + "grad_norm": 0.36305577116596943, + "learning_rate": 1.9041438107526055e-06, + "loss": 0.0197, + "step": 3287 + }, + { + "epoch": 2.2246278755074425, + "grad_norm": 0.3174543111115988, + "learning_rate": 1.901053349067225e-06, + "loss": 0.0169, + "step": 3288 + }, + { + "epoch": 2.2253044654939105, + "grad_norm": 0.36631910025895825, + "learning_rate": 1.8979648085617342e-06, + "loss": 0.02, + "step": 3289 + }, + { + "epoch": 2.225981055480379, + "grad_norm": 0.45650184645697806, + "learning_rate": 1.894878191150859e-06, + "loss": 0.027, + "step": 3290 + }, + { + "epoch": 2.226657645466847, + "grad_norm": 0.36677429686660684, + "learning_rate": 1.891793498748134e-06, + "loss": 0.021, + "step": 3291 + }, + { + "epoch": 2.2273342354533154, + "grad_norm": 0.25755487214056577, + "learning_rate": 1.888710733265905e-06, + "loss": 0.0158, + "step": 3292 + }, + { + "epoch": 2.2280108254397835, + "grad_norm": 0.3702697622738798, + "learning_rate": 1.8856298966153214e-06, + "loss": 0.0205, + "step": 3293 + }, + { + "epoch": 2.2286874154262515, + "grad_norm": 0.3684115091572002, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.0251, + "step": 3294 + }, + { + "epoch": 2.22936400541272, + "grad_norm": 0.33524908870469344, + "learning_rate": 1.8794740174476966e-06, + "loss": 0.0172, + "step": 3295 + }, + { + "epoch": 2.230040595399188, + "grad_norm": 0.27152983687193905, + "learning_rate": 1.87639897874697e-06, + "loss": 0.0172, + "step": 3296 + }, + { + "epoch": 2.2307171853856564, + "grad_norm": 0.3108717034521195, + "learning_rate": 1.8733258765105129e-06, + "loss": 0.0178, + "step": 3297 + }, + { + "epoch": 2.2313937753721245, + "grad_norm": 0.3713682052869584, + "learning_rate": 1.8702547126434818e-06, + "loss": 0.0279, + "step": 3298 + }, + { + "epoch": 2.2320703653585925, + "grad_norm": 0.3713718158434182, + "learning_rate": 1.8671854890498308e-06, + "loss": 0.0209, + "step": 3299 + }, + { + "epoch": 2.232746955345061, + "grad_norm": 0.3262102498384552, + "learning_rate": 1.864118207632315e-06, + "loss": 0.0227, + "step": 3300 + }, + { + "epoch": 2.233423545331529, + "grad_norm": 0.3359589824418625, + "learning_rate": 1.8610528702924851e-06, + "loss": 0.0212, + "step": 3301 + }, + { + "epoch": 2.2341001353179974, + "grad_norm": 0.300309902695362, + "learning_rate": 1.8579894789306813e-06, + "loss": 0.0161, + "step": 3302 + }, + { + "epoch": 2.2347767253044655, + "grad_norm": 0.2912282101498163, + "learning_rate": 1.8549280354460437e-06, + "loss": 0.0178, + "step": 3303 + }, + { + "epoch": 2.2354533152909335, + "grad_norm": 0.3718179520180523, + "learning_rate": 1.851868541736503e-06, + "loss": 0.0179, + "step": 3304 + }, + { + "epoch": 2.236129905277402, + "grad_norm": 0.3315459690239705, + "learning_rate": 1.8488109996987774e-06, + "loss": 0.0193, + "step": 3305 + }, + { + "epoch": 2.23680649526387, + "grad_norm": 0.3404284163203787, + "learning_rate": 1.845755411228382e-06, + "loss": 0.0252, + "step": 3306 + }, + { + "epoch": 2.2374830852503385, + "grad_norm": 0.3214098997727977, + "learning_rate": 1.8427017782196126e-06, + "loss": 0.016, + "step": 3307 + }, + { + "epoch": 2.2381596752368065, + "grad_norm": 0.28455585200848127, + "learning_rate": 1.8396501025655594e-06, + "loss": 0.0182, + "step": 3308 + }, + { + "epoch": 2.2388362652232745, + "grad_norm": 0.2405288163158476, + "learning_rate": 1.8366003861580966e-06, + "loss": 0.0137, + "step": 3309 + }, + { + "epoch": 2.239512855209743, + "grad_norm": 0.32872578102522326, + "learning_rate": 1.8335526308878877e-06, + "loss": 0.0166, + "step": 3310 + }, + { + "epoch": 2.240189445196211, + "grad_norm": 0.29057845428120543, + "learning_rate": 1.8305068386443696e-06, + "loss": 0.0204, + "step": 3311 + }, + { + "epoch": 2.2408660351826795, + "grad_norm": 0.34077445036319737, + "learning_rate": 1.8274630113157727e-06, + "loss": 0.0242, + "step": 3312 + }, + { + "epoch": 2.2415426251691475, + "grad_norm": 0.3095720384911149, + "learning_rate": 1.8244211507891064e-06, + "loss": 0.0177, + "step": 3313 + }, + { + "epoch": 2.2422192151556155, + "grad_norm": 0.364990408897162, + "learning_rate": 1.8213812589501611e-06, + "loss": 0.0227, + "step": 3314 + }, + { + "epoch": 2.242895805142084, + "grad_norm": 0.32634833013060577, + "learning_rate": 1.818343337683503e-06, + "loss": 0.0231, + "step": 3315 + }, + { + "epoch": 2.243572395128552, + "grad_norm": 0.27229027873812145, + "learning_rate": 1.815307388872481e-06, + "loss": 0.0175, + "step": 3316 + }, + { + "epoch": 2.2442489851150205, + "grad_norm": 0.2409021564066747, + "learning_rate": 1.8122734143992216e-06, + "loss": 0.0138, + "step": 3317 + }, + { + "epoch": 2.2449255751014885, + "grad_norm": 0.28672227986611876, + "learning_rate": 1.8092414161446225e-06, + "loss": 0.0149, + "step": 3318 + }, + { + "epoch": 2.2456021650879565, + "grad_norm": 0.32337583923203034, + "learning_rate": 1.8062113959883616e-06, + "loss": 0.0264, + "step": 3319 + }, + { + "epoch": 2.246278755074425, + "grad_norm": 0.3304537785642103, + "learning_rate": 1.8031833558088858e-06, + "loss": 0.0278, + "step": 3320 + }, + { + "epoch": 2.246955345060893, + "grad_norm": 0.3545009666826769, + "learning_rate": 1.8001572974834169e-06, + "loss": 0.0206, + "step": 3321 + }, + { + "epoch": 2.2476319350473615, + "grad_norm": 0.32211374285095357, + "learning_rate": 1.7971332228879518e-06, + "loss": 0.0194, + "step": 3322 + }, + { + "epoch": 2.2483085250338295, + "grad_norm": 0.3635517758902661, + "learning_rate": 1.7941111338972484e-06, + "loss": 0.017, + "step": 3323 + }, + { + "epoch": 2.2489851150202975, + "grad_norm": 0.2504823537719474, + "learning_rate": 1.7910910323848435e-06, + "loss": 0.0147, + "step": 3324 + }, + { + "epoch": 2.249661705006766, + "grad_norm": 0.33661846951747487, + "learning_rate": 1.7880729202230334e-06, + "loss": 0.0257, + "step": 3325 + }, + { + "epoch": 2.250338294993234, + "grad_norm": 0.3957781612156996, + "learning_rate": 1.7850567992828865e-06, + "loss": 0.02, + "step": 3326 + }, + { + "epoch": 2.2510148849797025, + "grad_norm": 0.3159214464212701, + "learning_rate": 1.7820426714342375e-06, + "loss": 0.0174, + "step": 3327 + }, + { + "epoch": 2.2516914749661705, + "grad_norm": 0.33062886479922, + "learning_rate": 1.7790305385456797e-06, + "loss": 0.0194, + "step": 3328 + }, + { + "epoch": 2.2523680649526385, + "grad_norm": 0.4406893847042817, + "learning_rate": 1.7760204024845745e-06, + "loss": 0.0232, + "step": 3329 + }, + { + "epoch": 2.253044654939107, + "grad_norm": 0.3160202959529048, + "learning_rate": 1.7730122651170457e-06, + "loss": 0.0224, + "step": 3330 + }, + { + "epoch": 2.253721244925575, + "grad_norm": 0.37480327409660913, + "learning_rate": 1.7700061283079744e-06, + "loss": 0.0174, + "step": 3331 + }, + { + "epoch": 2.2543978349120435, + "grad_norm": 0.4228752435563574, + "learning_rate": 1.7670019939210025e-06, + "loss": 0.0315, + "step": 3332 + }, + { + "epoch": 2.2550744248985115, + "grad_norm": 0.2693928943056749, + "learning_rate": 1.763999863818533e-06, + "loss": 0.0196, + "step": 3333 + }, + { + "epoch": 2.2557510148849795, + "grad_norm": 0.3366616168555836, + "learning_rate": 1.760999739861724e-06, + "loss": 0.023, + "step": 3334 + }, + { + "epoch": 2.256427604871448, + "grad_norm": 0.4049835032094548, + "learning_rate": 1.7580016239104924e-06, + "loss": 0.0178, + "step": 3335 + }, + { + "epoch": 2.257104194857916, + "grad_norm": 0.28705145507234686, + "learning_rate": 1.755005517823506e-06, + "loss": 0.0193, + "step": 3336 + }, + { + "epoch": 2.2577807848443845, + "grad_norm": 0.3626690760858157, + "learning_rate": 1.7520114234581914e-06, + "loss": 0.027, + "step": 3337 + }, + { + "epoch": 2.2584573748308525, + "grad_norm": 0.2852905675647844, + "learning_rate": 1.7490193426707236e-06, + "loss": 0.016, + "step": 3338 + }, + { + "epoch": 2.2591339648173205, + "grad_norm": 0.36799778032248737, + "learning_rate": 1.7460292773160315e-06, + "loss": 0.0176, + "step": 3339 + }, + { + "epoch": 2.259810554803789, + "grad_norm": 0.34353736716847627, + "learning_rate": 1.7430412292477978e-06, + "loss": 0.0161, + "step": 3340 + }, + { + "epoch": 2.260487144790257, + "grad_norm": 0.4684909137468032, + "learning_rate": 1.7400552003184463e-06, + "loss": 0.0164, + "step": 3341 + }, + { + "epoch": 2.2611637347767255, + "grad_norm": 0.3139210344528339, + "learning_rate": 1.7370711923791567e-06, + "loss": 0.0181, + "step": 3342 + }, + { + "epoch": 2.2618403247631935, + "grad_norm": 0.39734702734172855, + "learning_rate": 1.7340892072798544e-06, + "loss": 0.0164, + "step": 3343 + }, + { + "epoch": 2.2625169147496615, + "grad_norm": 0.331187064414431, + "learning_rate": 1.7311092468692082e-06, + "loss": 0.0258, + "step": 3344 + }, + { + "epoch": 2.26319350473613, + "grad_norm": 0.3470376153450806, + "learning_rate": 1.7281313129946302e-06, + "loss": 0.0182, + "step": 3345 + }, + { + "epoch": 2.263870094722598, + "grad_norm": 0.2907836320723048, + "learning_rate": 1.725155407502282e-06, + "loss": 0.0123, + "step": 3346 + }, + { + "epoch": 2.2645466847090665, + "grad_norm": 0.38607837864990185, + "learning_rate": 1.7221815322370633e-06, + "loss": 0.0196, + "step": 3347 + }, + { + "epoch": 2.2652232746955345, + "grad_norm": 0.28422494135273507, + "learning_rate": 1.7192096890426192e-06, + "loss": 0.0166, + "step": 3348 + }, + { + "epoch": 2.2658998646820026, + "grad_norm": 0.2641311829510012, + "learning_rate": 1.7162398797613284e-06, + "loss": 0.0135, + "step": 3349 + }, + { + "epoch": 2.266576454668471, + "grad_norm": 0.3203543867591377, + "learning_rate": 1.7132721062343156e-06, + "loss": 0.016, + "step": 3350 + }, + { + "epoch": 2.267253044654939, + "grad_norm": 0.457206063763029, + "learning_rate": 1.7103063703014372e-06, + "loss": 0.0171, + "step": 3351 + }, + { + "epoch": 2.2679296346414075, + "grad_norm": 0.29120129204884165, + "learning_rate": 1.7073426738012939e-06, + "loss": 0.0138, + "step": 3352 + }, + { + "epoch": 2.2686062246278755, + "grad_norm": 0.32769236145685815, + "learning_rate": 1.7043810185712135e-06, + "loss": 0.0207, + "step": 3353 + }, + { + "epoch": 2.2692828146143436, + "grad_norm": 0.3115450139158369, + "learning_rate": 1.7014214064472646e-06, + "loss": 0.0164, + "step": 3354 + }, + { + "epoch": 2.269959404600812, + "grad_norm": 0.31194935580325733, + "learning_rate": 1.6984638392642467e-06, + "loss": 0.0211, + "step": 3355 + }, + { + "epoch": 2.27063599458728, + "grad_norm": 0.36413969011600267, + "learning_rate": 1.6955083188556947e-06, + "loss": 0.0188, + "step": 3356 + }, + { + "epoch": 2.2713125845737485, + "grad_norm": 0.36765580778539764, + "learning_rate": 1.6925548470538695e-06, + "loss": 0.0202, + "step": 3357 + }, + { + "epoch": 2.2719891745602165, + "grad_norm": 0.41977518152765175, + "learning_rate": 1.6896034256897626e-06, + "loss": 0.0205, + "step": 3358 + }, + { + "epoch": 2.2726657645466846, + "grad_norm": 0.30153968052530306, + "learning_rate": 1.686654056593099e-06, + "loss": 0.0167, + "step": 3359 + }, + { + "epoch": 2.273342354533153, + "grad_norm": 0.2897073649002921, + "learning_rate": 1.683706741592327e-06, + "loss": 0.0152, + "step": 3360 + }, + { + "epoch": 2.274018944519621, + "grad_norm": 0.3040463950675685, + "learning_rate": 1.6807614825146258e-06, + "loss": 0.0189, + "step": 3361 + }, + { + "epoch": 2.2746955345060895, + "grad_norm": 0.29797419596106484, + "learning_rate": 1.6778182811858934e-06, + "loss": 0.0174, + "step": 3362 + }, + { + "epoch": 2.2753721244925575, + "grad_norm": 0.2961945532484389, + "learning_rate": 1.6748771394307584e-06, + "loss": 0.0165, + "step": 3363 + }, + { + "epoch": 2.2760487144790256, + "grad_norm": 0.3990668565554466, + "learning_rate": 1.671938059072571e-06, + "loss": 0.02, + "step": 3364 + }, + { + "epoch": 2.276725304465494, + "grad_norm": 0.4480022991768558, + "learning_rate": 1.6690010419334008e-06, + "loss": 0.0438, + "step": 3365 + }, + { + "epoch": 2.277401894451962, + "grad_norm": 0.337434658168982, + "learning_rate": 1.6660660898340392e-06, + "loss": 0.0206, + "step": 3366 + }, + { + "epoch": 2.2780784844384305, + "grad_norm": 0.35270920413483253, + "learning_rate": 1.6631332045939996e-06, + "loss": 0.0209, + "step": 3367 + }, + { + "epoch": 2.2787550744248986, + "grad_norm": 0.31489615453735553, + "learning_rate": 1.6602023880315126e-06, + "loss": 0.0192, + "step": 3368 + }, + { + "epoch": 2.2794316644113666, + "grad_norm": 0.3605404755993179, + "learning_rate": 1.6572736419635288e-06, + "loss": 0.0177, + "step": 3369 + }, + { + "epoch": 2.280108254397835, + "grad_norm": 0.30101785860707747, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.0142, + "step": 3370 + }, + { + "epoch": 2.280784844384303, + "grad_norm": 0.2753648508252713, + "learning_rate": 1.651422368572436e-06, + "loss": 0.018, + "step": 3371 + }, + { + "epoch": 2.2814614343707715, + "grad_norm": 0.3273227895585344, + "learning_rate": 1.648499844876802e-06, + "loss": 0.0182, + "step": 3372 + }, + { + "epoch": 2.2821380243572396, + "grad_norm": 0.29784328791567416, + "learning_rate": 1.6455793989306169e-06, + "loss": 0.0136, + "step": 3373 + }, + { + "epoch": 2.2828146143437076, + "grad_norm": 0.33450496584038497, + "learning_rate": 1.642661032544396e-06, + "loss": 0.0172, + "step": 3374 + }, + { + "epoch": 2.283491204330176, + "grad_norm": 0.28921021704318484, + "learning_rate": 1.639744747527371e-06, + "loss": 0.0141, + "step": 3375 + }, + { + "epoch": 2.284167794316644, + "grad_norm": 0.29539931758206955, + "learning_rate": 1.636830545687481e-06, + "loss": 0.0144, + "step": 3376 + }, + { + "epoch": 2.2848443843031125, + "grad_norm": 0.3591264355407285, + "learning_rate": 1.6339184288313769e-06, + "loss": 0.028, + "step": 3377 + }, + { + "epoch": 2.2855209742895806, + "grad_norm": 0.44566592922721643, + "learning_rate": 1.631008398764412e-06, + "loss": 0.0288, + "step": 3378 + }, + { + "epoch": 2.2861975642760486, + "grad_norm": 0.5247644817268662, + "learning_rate": 1.6281004572906462e-06, + "loss": 0.0337, + "step": 3379 + }, + { + "epoch": 2.286874154262517, + "grad_norm": 0.3956964017910272, + "learning_rate": 1.6251946062128482e-06, + "loss": 0.0168, + "step": 3380 + }, + { + "epoch": 2.287550744248985, + "grad_norm": 0.3002357813842024, + "learning_rate": 1.6222908473324889e-06, + "loss": 0.0177, + "step": 3381 + }, + { + "epoch": 2.2882273342354535, + "grad_norm": 0.27279860216353286, + "learning_rate": 1.6193891824497438e-06, + "loss": 0.0188, + "step": 3382 + }, + { + "epoch": 2.2889039242219216, + "grad_norm": 0.4217966814469358, + "learning_rate": 1.616489613363486e-06, + "loss": 0.0271, + "step": 3383 + }, + { + "epoch": 2.2895805142083896, + "grad_norm": 0.28064319219119543, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.0157, + "step": 3384 + }, + { + "epoch": 2.290257104194858, + "grad_norm": 0.27570323148005854, + "learning_rate": 1.6106967697694442e-06, + "loss": 0.0171, + "step": 3385 + }, + { + "epoch": 2.290933694181326, + "grad_norm": 0.35091574079144383, + "learning_rate": 1.6078034988529112e-06, + "loss": 0.0188, + "step": 3386 + }, + { + "epoch": 2.2916102841677946, + "grad_norm": 0.3721782944501329, + "learning_rate": 1.604912330915364e-06, + "loss": 0.0253, + "step": 3387 + }, + { + "epoch": 2.2922868741542626, + "grad_norm": 0.29111923911747417, + "learning_rate": 1.6020232677491732e-06, + "loss": 0.0166, + "step": 3388 + }, + { + "epoch": 2.2929634641407306, + "grad_norm": 0.29045324372586967, + "learning_rate": 1.5991363111454023e-06, + "loss": 0.0164, + "step": 3389 + }, + { + "epoch": 2.293640054127199, + "grad_norm": 0.3534138112428688, + "learning_rate": 1.5962514628938103e-06, + "loss": 0.0221, + "step": 3390 + }, + { + "epoch": 2.294316644113667, + "grad_norm": 0.32072838866259135, + "learning_rate": 1.5933687247828462e-06, + "loss": 0.0195, + "step": 3391 + }, + { + "epoch": 2.2949932341001356, + "grad_norm": 0.3098815018278398, + "learning_rate": 1.59048809859965e-06, + "loss": 0.0176, + "step": 3392 + }, + { + "epoch": 2.2956698240866036, + "grad_norm": 0.3926479778143456, + "learning_rate": 1.5876095861300567e-06, + "loss": 0.0215, + "step": 3393 + }, + { + "epoch": 2.2963464140730716, + "grad_norm": 0.38504918431993057, + "learning_rate": 1.5847331891585888e-06, + "loss": 0.0336, + "step": 3394 + }, + { + "epoch": 2.29702300405954, + "grad_norm": 0.35860940232195915, + "learning_rate": 1.5818589094684594e-06, + "loss": 0.0187, + "step": 3395 + }, + { + "epoch": 2.297699594046008, + "grad_norm": 0.36040946508834704, + "learning_rate": 1.5789867488415633e-06, + "loss": 0.0238, + "step": 3396 + }, + { + "epoch": 2.2983761840324766, + "grad_norm": 0.5018342132225554, + "learning_rate": 1.5761167090584885e-06, + "loss": 0.018, + "step": 3397 + }, + { + "epoch": 2.2990527740189446, + "grad_norm": 0.2834935241222944, + "learning_rate": 1.5732487918985017e-06, + "loss": 0.0146, + "step": 3398 + }, + { + "epoch": 2.2997293640054126, + "grad_norm": 0.28017211065018244, + "learning_rate": 1.5703829991395602e-06, + "loss": 0.0126, + "step": 3399 + }, + { + "epoch": 2.300405953991881, + "grad_norm": 0.3812149184324965, + "learning_rate": 1.5675193325582983e-06, + "loss": 0.0145, + "step": 3400 + }, + { + "epoch": 2.301082543978349, + "grad_norm": 0.33818481581544535, + "learning_rate": 1.5646577939300362e-06, + "loss": 0.0166, + "step": 3401 + }, + { + "epoch": 2.301759133964817, + "grad_norm": 0.3891537744780298, + "learning_rate": 1.5617983850287737e-06, + "loss": 0.0264, + "step": 3402 + }, + { + "epoch": 2.3024357239512856, + "grad_norm": 0.3547798750552972, + "learning_rate": 1.5589411076271916e-06, + "loss": 0.0293, + "step": 3403 + }, + { + "epoch": 2.3031123139377536, + "grad_norm": 0.344589393233328, + "learning_rate": 1.5560859634966457e-06, + "loss": 0.0206, + "step": 3404 + }, + { + "epoch": 2.303788903924222, + "grad_norm": 0.31333696680940976, + "learning_rate": 1.5532329544071712e-06, + "loss": 0.0195, + "step": 3405 + }, + { + "epoch": 2.30446549391069, + "grad_norm": 0.27836311682266196, + "learning_rate": 1.5503820821274812e-06, + "loss": 0.016, + "step": 3406 + }, + { + "epoch": 2.305142083897158, + "grad_norm": 0.3281966681069725, + "learning_rate": 1.5475333484249633e-06, + "loss": 0.0199, + "step": 3407 + }, + { + "epoch": 2.3058186738836266, + "grad_norm": 0.31803083744014177, + "learning_rate": 1.544686755065677e-06, + "loss": 0.019, + "step": 3408 + }, + { + "epoch": 2.3064952638700946, + "grad_norm": 0.2976133648194919, + "learning_rate": 1.5418423038143576e-06, + "loss": 0.0103, + "step": 3409 + }, + { + "epoch": 2.307171853856563, + "grad_norm": 0.5054667403002192, + "learning_rate": 1.5389999964344138e-06, + "loss": 0.0289, + "step": 3410 + }, + { + "epoch": 2.307848443843031, + "grad_norm": 0.27112112428549096, + "learning_rate": 1.5361598346879193e-06, + "loss": 0.0122, + "step": 3411 + }, + { + "epoch": 2.308525033829499, + "grad_norm": 0.3464400771438308, + "learning_rate": 1.5333218203356243e-06, + "loss": 0.0204, + "step": 3412 + }, + { + "epoch": 2.3092016238159676, + "grad_norm": 0.3975492838739599, + "learning_rate": 1.5304859551369417e-06, + "loss": 0.0249, + "step": 3413 + }, + { + "epoch": 2.3098782138024356, + "grad_norm": 0.2738991600856642, + "learning_rate": 1.5276522408499567e-06, + "loss": 0.0144, + "step": 3414 + }, + { + "epoch": 2.310554803788904, + "grad_norm": 0.32361113515357953, + "learning_rate": 1.5248206792314197e-06, + "loss": 0.0244, + "step": 3415 + }, + { + "epoch": 2.311231393775372, + "grad_norm": 0.3037149226711114, + "learning_rate": 1.5219912720367474e-06, + "loss": 0.0194, + "step": 3416 + }, + { + "epoch": 2.31190798376184, + "grad_norm": 0.2594839056926496, + "learning_rate": 1.5191640210200186e-06, + "loss": 0.0151, + "step": 3417 + }, + { + "epoch": 2.3125845737483086, + "grad_norm": 0.3193148236143407, + "learning_rate": 1.5163389279339746e-06, + "loss": 0.0161, + "step": 3418 + }, + { + "epoch": 2.3132611637347766, + "grad_norm": 0.3299916254958434, + "learning_rate": 1.5135159945300232e-06, + "loss": 0.0232, + "step": 3419 + }, + { + "epoch": 2.313937753721245, + "grad_norm": 0.356079585830143, + "learning_rate": 1.5106952225582312e-06, + "loss": 0.0228, + "step": 3420 + }, + { + "epoch": 2.314614343707713, + "grad_norm": 0.346548967899213, + "learning_rate": 1.5078766137673229e-06, + "loss": 0.0286, + "step": 3421 + }, + { + "epoch": 2.315290933694181, + "grad_norm": 0.31734678333429894, + "learning_rate": 1.5050601699046852e-06, + "loss": 0.0176, + "step": 3422 + }, + { + "epoch": 2.3159675236806496, + "grad_norm": 0.3138444770545622, + "learning_rate": 1.5022458927163618e-06, + "loss": 0.0185, + "step": 3423 + }, + { + "epoch": 2.3166441136671176, + "grad_norm": 0.3872281981192431, + "learning_rate": 1.499433783947054e-06, + "loss": 0.04, + "step": 3424 + }, + { + "epoch": 2.317320703653586, + "grad_norm": 0.3047666004806965, + "learning_rate": 1.4966238453401161e-06, + "loss": 0.0159, + "step": 3425 + }, + { + "epoch": 2.317997293640054, + "grad_norm": 0.28401856592096936, + "learning_rate": 1.4938160786375571e-06, + "loss": 0.012, + "step": 3426 + }, + { + "epoch": 2.318673883626522, + "grad_norm": 0.37776503020377644, + "learning_rate": 1.4910104855800429e-06, + "loss": 0.0177, + "step": 3427 + }, + { + "epoch": 2.3193504736129906, + "grad_norm": 0.31668460822759414, + "learning_rate": 1.488207067906891e-06, + "loss": 0.0142, + "step": 3428 + }, + { + "epoch": 2.3200270635994586, + "grad_norm": 0.2910053986557296, + "learning_rate": 1.4854058273560667e-06, + "loss": 0.0178, + "step": 3429 + }, + { + "epoch": 2.320703653585927, + "grad_norm": 0.31869749440623857, + "learning_rate": 1.4826067656641912e-06, + "loss": 0.0165, + "step": 3430 + }, + { + "epoch": 2.321380243572395, + "grad_norm": 0.38881098890086335, + "learning_rate": 1.479809884566528e-06, + "loss": 0.0207, + "step": 3431 + }, + { + "epoch": 2.322056833558863, + "grad_norm": 0.36727104723757226, + "learning_rate": 1.477015185796995e-06, + "loss": 0.0238, + "step": 3432 + }, + { + "epoch": 2.3227334235453316, + "grad_norm": 0.29700678786934914, + "learning_rate": 1.4742226710881558e-06, + "loss": 0.0156, + "step": 3433 + }, + { + "epoch": 2.3234100135317997, + "grad_norm": 0.34989861500261144, + "learning_rate": 1.4714323421712163e-06, + "loss": 0.0194, + "step": 3434 + }, + { + "epoch": 2.324086603518268, + "grad_norm": 0.3807261903374077, + "learning_rate": 1.4686442007760315e-06, + "loss": 0.0238, + "step": 3435 + }, + { + "epoch": 2.324763193504736, + "grad_norm": 0.32828486518153954, + "learning_rate": 1.465858248631099e-06, + "loss": 0.0205, + "step": 3436 + }, + { + "epoch": 2.325439783491204, + "grad_norm": 0.3854799403276748, + "learning_rate": 1.4630744874635611e-06, + "loss": 0.0241, + "step": 3437 + }, + { + "epoch": 2.3261163734776726, + "grad_norm": 0.3381987589387271, + "learning_rate": 1.460292918999195e-06, + "loss": 0.0232, + "step": 3438 + }, + { + "epoch": 2.3267929634641407, + "grad_norm": 0.28866416084621654, + "learning_rate": 1.4575135449624251e-06, + "loss": 0.0141, + "step": 3439 + }, + { + "epoch": 2.3274695534506087, + "grad_norm": 0.3468131314919482, + "learning_rate": 1.4547363670763138e-06, + "loss": 0.0179, + "step": 3440 + }, + { + "epoch": 2.328146143437077, + "grad_norm": 0.27370334148235237, + "learning_rate": 1.4519613870625632e-06, + "loss": 0.0159, + "step": 3441 + }, + { + "epoch": 2.328822733423545, + "grad_norm": 0.3461188549124858, + "learning_rate": 1.4491886066415084e-06, + "loss": 0.0127, + "step": 3442 + }, + { + "epoch": 2.3294993234100136, + "grad_norm": 0.2953882693563509, + "learning_rate": 1.4464180275321255e-06, + "loss": 0.0186, + "step": 3443 + }, + { + "epoch": 2.3301759133964817, + "grad_norm": 0.3380921062470368, + "learning_rate": 1.4436496514520253e-06, + "loss": 0.0141, + "step": 3444 + }, + { + "epoch": 2.3308525033829497, + "grad_norm": 0.3157462679666608, + "learning_rate": 1.4408834801174492e-06, + "loss": 0.0215, + "step": 3445 + }, + { + "epoch": 2.331529093369418, + "grad_norm": 0.2838315171078676, + "learning_rate": 1.438119515243277e-06, + "loss": 0.019, + "step": 3446 + }, + { + "epoch": 2.332205683355886, + "grad_norm": 0.29102971186807003, + "learning_rate": 1.4353577585430152e-06, + "loss": 0.0185, + "step": 3447 + }, + { + "epoch": 2.3328822733423547, + "grad_norm": 0.36108657223021884, + "learning_rate": 1.4325982117288052e-06, + "loss": 0.0232, + "step": 3448 + }, + { + "epoch": 2.3335588633288227, + "grad_norm": 0.34128339886805575, + "learning_rate": 1.4298408765114191e-06, + "loss": 0.0179, + "step": 3449 + }, + { + "epoch": 2.3342354533152907, + "grad_norm": 0.29909188182042723, + "learning_rate": 1.4270857546002548e-06, + "loss": 0.0168, + "step": 3450 + }, + { + "epoch": 2.334912043301759, + "grad_norm": 0.3854950782607295, + "learning_rate": 1.4243328477033369e-06, + "loss": 0.0262, + "step": 3451 + }, + { + "epoch": 2.335588633288227, + "grad_norm": 0.35774118889171214, + "learning_rate": 1.4215821575273219e-06, + "loss": 0.0216, + "step": 3452 + }, + { + "epoch": 2.3362652232746957, + "grad_norm": 0.406041101629836, + "learning_rate": 1.4188336857774892e-06, + "loss": 0.0293, + "step": 3453 + }, + { + "epoch": 2.3369418132611637, + "grad_norm": 0.36314789122100627, + "learning_rate": 1.4160874341577447e-06, + "loss": 0.02, + "step": 3454 + }, + { + "epoch": 2.3376184032476317, + "grad_norm": 0.3202102887868807, + "learning_rate": 1.413343404370613e-06, + "loss": 0.02, + "step": 3455 + }, + { + "epoch": 2.3382949932341, + "grad_norm": 0.39385977245190484, + "learning_rate": 1.410601598117246e-06, + "loss": 0.0222, + "step": 3456 + }, + { + "epoch": 2.338971583220568, + "grad_norm": 0.33812379984256224, + "learning_rate": 1.4078620170974178e-06, + "loss": 0.0127, + "step": 3457 + }, + { + "epoch": 2.3396481732070367, + "grad_norm": 0.3835284152767019, + "learning_rate": 1.4051246630095195e-06, + "loss": 0.0219, + "step": 3458 + }, + { + "epoch": 2.3403247631935047, + "grad_norm": 0.45913432753312217, + "learning_rate": 1.4023895375505608e-06, + "loss": 0.0225, + "step": 3459 + }, + { + "epoch": 2.3410013531799727, + "grad_norm": 0.2930001985549042, + "learning_rate": 1.3996566424161746e-06, + "loss": 0.0145, + "step": 3460 + }, + { + "epoch": 2.341677943166441, + "grad_norm": 0.29618365578774825, + "learning_rate": 1.396925979300608e-06, + "loss": 0.0189, + "step": 3461 + }, + { + "epoch": 2.342354533152909, + "grad_norm": 0.3156416431072414, + "learning_rate": 1.3941975498967265e-06, + "loss": 0.0146, + "step": 3462 + }, + { + "epoch": 2.3430311231393777, + "grad_norm": 0.34450896180533125, + "learning_rate": 1.3914713558960064e-06, + "loss": 0.0222, + "step": 3463 + }, + { + "epoch": 2.3437077131258457, + "grad_norm": 0.37433044639671526, + "learning_rate": 1.3887473989885441e-06, + "loss": 0.0227, + "step": 3464 + }, + { + "epoch": 2.3443843031123137, + "grad_norm": 0.2967831896285163, + "learning_rate": 1.3860256808630429e-06, + "loss": 0.0198, + "step": 3465 + }, + { + "epoch": 2.345060893098782, + "grad_norm": 0.35596070996685975, + "learning_rate": 1.383306203206823e-06, + "loss": 0.0192, + "step": 3466 + }, + { + "epoch": 2.34573748308525, + "grad_norm": 0.27285421060885284, + "learning_rate": 1.3805889677058148e-06, + "loss": 0.015, + "step": 3467 + }, + { + "epoch": 2.3464140730717187, + "grad_norm": 0.3379030178564435, + "learning_rate": 1.3778739760445552e-06, + "loss": 0.0182, + "step": 3468 + }, + { + "epoch": 2.3470906630581867, + "grad_norm": 0.37442555210690887, + "learning_rate": 1.375161229906195e-06, + "loss": 0.0176, + "step": 3469 + }, + { + "epoch": 2.3477672530446547, + "grad_norm": 0.3659478507841638, + "learning_rate": 1.372450730972491e-06, + "loss": 0.0216, + "step": 3470 + }, + { + "epoch": 2.348443843031123, + "grad_norm": 0.6884523305018001, + "learning_rate": 1.3697424809238058e-06, + "loss": 0.0239, + "step": 3471 + }, + { + "epoch": 2.349120433017591, + "grad_norm": 0.2773584734882262, + "learning_rate": 1.3670364814391062e-06, + "loss": 0.0199, + "step": 3472 + }, + { + "epoch": 2.3497970230040597, + "grad_norm": 0.3683873083754821, + "learning_rate": 1.3643327341959684e-06, + "loss": 0.0236, + "step": 3473 + }, + { + "epoch": 2.3504736129905277, + "grad_norm": 0.3465351617126101, + "learning_rate": 1.361631240870569e-06, + "loss": 0.0248, + "step": 3474 + }, + { + "epoch": 2.3511502029769957, + "grad_norm": 0.673446290767098, + "learning_rate": 1.35893200313769e-06, + "loss": 0.0232, + "step": 3475 + }, + { + "epoch": 2.351826792963464, + "grad_norm": 0.453653251164762, + "learning_rate": 1.3562350226707106e-06, + "loss": 0.0345, + "step": 3476 + }, + { + "epoch": 2.352503382949932, + "grad_norm": 0.3427863808370553, + "learning_rate": 1.3535403011416159e-06, + "loss": 0.0267, + "step": 3477 + }, + { + "epoch": 2.3531799729364007, + "grad_norm": 0.2961253295322023, + "learning_rate": 1.3508478402209858e-06, + "loss": 0.02, + "step": 3478 + }, + { + "epoch": 2.3538565629228687, + "grad_norm": 0.33042714299973824, + "learning_rate": 1.3481576415780035e-06, + "loss": 0.02, + "step": 3479 + }, + { + "epoch": 2.3545331529093367, + "grad_norm": 0.2886625596054307, + "learning_rate": 1.3454697068804434e-06, + "loss": 0.0145, + "step": 3480 + }, + { + "epoch": 2.355209742895805, + "grad_norm": 0.5073975258555212, + "learning_rate": 1.3427840377946826e-06, + "loss": 0.0278, + "step": 3481 + }, + { + "epoch": 2.3558863328822732, + "grad_norm": 0.3175605735846637, + "learning_rate": 1.3401006359856916e-06, + "loss": 0.0157, + "step": 3482 + }, + { + "epoch": 2.3565629228687417, + "grad_norm": 0.4040167872806406, + "learning_rate": 1.337419503117035e-06, + "loss": 0.0426, + "step": 3483 + }, + { + "epoch": 2.3572395128552097, + "grad_norm": 0.22903622237412308, + "learning_rate": 1.3347406408508695e-06, + "loss": 0.0121, + "step": 3484 + }, + { + "epoch": 2.3579161028416777, + "grad_norm": 0.4586465906503626, + "learning_rate": 1.332064050847945e-06, + "loss": 0.0163, + "step": 3485 + }, + { + "epoch": 2.358592692828146, + "grad_norm": 0.37767507286954305, + "learning_rate": 1.3293897347676032e-06, + "loss": 0.0218, + "step": 3486 + }, + { + "epoch": 2.3592692828146142, + "grad_norm": 0.5212575672701087, + "learning_rate": 1.3267176942677763e-06, + "loss": 0.0185, + "step": 3487 + }, + { + "epoch": 2.3599458728010827, + "grad_norm": 0.3895583593845578, + "learning_rate": 1.324047931004987e-06, + "loss": 0.0242, + "step": 3488 + }, + { + "epoch": 2.3606224627875507, + "grad_norm": 0.30814132888671697, + "learning_rate": 1.321380446634342e-06, + "loss": 0.0144, + "step": 3489 + }, + { + "epoch": 2.3612990527740187, + "grad_norm": 0.29548396843448144, + "learning_rate": 1.31871524280954e-06, + "loss": 0.0159, + "step": 3490 + }, + { + "epoch": 2.361975642760487, + "grad_norm": 0.29620029474076864, + "learning_rate": 1.3160523211828612e-06, + "loss": 0.0154, + "step": 3491 + }, + { + "epoch": 2.3626522327469552, + "grad_norm": 0.3089293933425394, + "learning_rate": 1.313391683405177e-06, + "loss": 0.0183, + "step": 3492 + }, + { + "epoch": 2.3633288227334237, + "grad_norm": 0.29458168000264845, + "learning_rate": 1.310733331125935e-06, + "loss": 0.0135, + "step": 3493 + }, + { + "epoch": 2.3640054127198917, + "grad_norm": 0.5333665226371411, + "learning_rate": 1.3080772659931728e-06, + "loss": 0.0216, + "step": 3494 + }, + { + "epoch": 2.3646820027063598, + "grad_norm": 0.4694395097376205, + "learning_rate": 1.305423489653508e-06, + "loss": 0.0268, + "step": 3495 + }, + { + "epoch": 2.365358592692828, + "grad_norm": 0.26610297140918115, + "learning_rate": 1.3027720037521397e-06, + "loss": 0.0129, + "step": 3496 + }, + { + "epoch": 2.3660351826792962, + "grad_norm": 0.3216074320309018, + "learning_rate": 1.3001228099328445e-06, + "loss": 0.0158, + "step": 3497 + }, + { + "epoch": 2.3667117726657647, + "grad_norm": 0.3675644364816066, + "learning_rate": 1.297475909837979e-06, + "loss": 0.0207, + "step": 3498 + }, + { + "epoch": 2.3673883626522327, + "grad_norm": 0.4496383494220818, + "learning_rate": 1.29483130510848e-06, + "loss": 0.0239, + "step": 3499 + }, + { + "epoch": 2.3680649526387008, + "grad_norm": 0.38480410347240224, + "learning_rate": 1.2921889973838591e-06, + "loss": 0.0254, + "step": 3500 + }, + { + "epoch": 2.3687415426251692, + "grad_norm": 0.3474437991994648, + "learning_rate": 1.289548988302207e-06, + "loss": 0.016, + "step": 3501 + }, + { + "epoch": 2.3694181326116373, + "grad_norm": 0.3643345445652335, + "learning_rate": 1.2869112795001836e-06, + "loss": 0.0168, + "step": 3502 + }, + { + "epoch": 2.3700947225981057, + "grad_norm": 0.3653840433304237, + "learning_rate": 1.2842758726130283e-06, + "loss": 0.0201, + "step": 3503 + }, + { + "epoch": 2.3707713125845737, + "grad_norm": 0.4239909885433043, + "learning_rate": 1.281642769274552e-06, + "loss": 0.0176, + "step": 3504 + }, + { + "epoch": 2.3714479025710418, + "grad_norm": 0.372786400830861, + "learning_rate": 1.2790119711171356e-06, + "loss": 0.0255, + "step": 3505 + }, + { + "epoch": 2.3721244925575102, + "grad_norm": 0.5484580889192269, + "learning_rate": 1.2763834797717312e-06, + "loss": 0.0256, + "step": 3506 + }, + { + "epoch": 2.3728010825439783, + "grad_norm": 0.327930154927057, + "learning_rate": 1.2737572968678624e-06, + "loss": 0.0196, + "step": 3507 + }, + { + "epoch": 2.3734776725304467, + "grad_norm": 0.45904627217132027, + "learning_rate": 1.2711334240336216e-06, + "loss": 0.0225, + "step": 3508 + }, + { + "epoch": 2.3741542625169147, + "grad_norm": 0.35917177934795713, + "learning_rate": 1.26851186289567e-06, + "loss": 0.019, + "step": 3509 + }, + { + "epoch": 2.3748308525033828, + "grad_norm": 0.847389695305474, + "learning_rate": 1.2658926150792321e-06, + "loss": 0.0275, + "step": 3510 + }, + { + "epoch": 2.3755074424898512, + "grad_norm": 0.43979032251488054, + "learning_rate": 1.2632756822081e-06, + "loss": 0.0242, + "step": 3511 + }, + { + "epoch": 2.3761840324763193, + "grad_norm": 0.4016908873869214, + "learning_rate": 1.2606610659046314e-06, + "loss": 0.0223, + "step": 3512 + }, + { + "epoch": 2.3768606224627877, + "grad_norm": 0.3964610840879998, + "learning_rate": 1.2580487677897496e-06, + "loss": 0.0294, + "step": 3513 + }, + { + "epoch": 2.3775372124492558, + "grad_norm": 0.32040431574055594, + "learning_rate": 1.255438789482935e-06, + "loss": 0.0209, + "step": 3514 + }, + { + "epoch": 2.378213802435724, + "grad_norm": 0.2733499958799494, + "learning_rate": 1.2528311326022364e-06, + "loss": 0.0175, + "step": 3515 + }, + { + "epoch": 2.3788903924221922, + "grad_norm": 0.41348718080267, + "learning_rate": 1.250225798764259e-06, + "loss": 0.0193, + "step": 3516 + }, + { + "epoch": 2.3795669824086603, + "grad_norm": 0.3065936874861683, + "learning_rate": 1.2476227895841714e-06, + "loss": 0.0196, + "step": 3517 + }, + { + "epoch": 2.3802435723951287, + "grad_norm": 0.26321713581050965, + "learning_rate": 1.2450221066756973e-06, + "loss": 0.0164, + "step": 3518 + }, + { + "epoch": 2.3809201623815968, + "grad_norm": 0.24304563264346346, + "learning_rate": 1.242423751651119e-06, + "loss": 0.0161, + "step": 3519 + }, + { + "epoch": 2.381596752368065, + "grad_norm": 0.2887832946381938, + "learning_rate": 1.2398277261212777e-06, + "loss": 0.0141, + "step": 3520 + }, + { + "epoch": 2.3822733423545333, + "grad_norm": 0.3201531185077651, + "learning_rate": 1.2372340316955694e-06, + "loss": 0.0224, + "step": 3521 + }, + { + "epoch": 2.3829499323410013, + "grad_norm": 0.2673111245201406, + "learning_rate": 1.234642669981946e-06, + "loss": 0.0151, + "step": 3522 + }, + { + "epoch": 2.3836265223274697, + "grad_norm": 0.361959365189735, + "learning_rate": 1.232053642586909e-06, + "loss": 0.0255, + "step": 3523 + }, + { + "epoch": 2.3843031123139378, + "grad_norm": 0.23907565095316533, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.0149, + "step": 3524 + }, + { + "epoch": 2.384979702300406, + "grad_norm": 0.2761112499672755, + "learning_rate": 1.2268825971713833e-06, + "loss": 0.017, + "step": 3525 + }, + { + "epoch": 2.3856562922868743, + "grad_norm": 0.32362861719233066, + "learning_rate": 1.2243005823566638e-06, + "loss": 0.0188, + "step": 3526 + }, + { + "epoch": 2.3863328822733423, + "grad_norm": 0.2613132688502163, + "learning_rate": 1.2217209082720677e-06, + "loss": 0.0147, + "step": 3527 + }, + { + "epoch": 2.3870094722598107, + "grad_norm": 0.3284213473585238, + "learning_rate": 1.2191435765168557e-06, + "loss": 0.0188, + "step": 3528 + }, + { + "epoch": 2.3876860622462788, + "grad_norm": 0.3256730328988581, + "learning_rate": 1.2165685886888346e-06, + "loss": 0.0333, + "step": 3529 + }, + { + "epoch": 2.388362652232747, + "grad_norm": 0.3677415875121975, + "learning_rate": 1.2139959463843593e-06, + "loss": 0.0198, + "step": 3530 + }, + { + "epoch": 2.3890392422192153, + "grad_norm": 0.4418753233954852, + "learning_rate": 1.2114256511983274e-06, + "loss": 0.025, + "step": 3531 + }, + { + "epoch": 2.3897158322056833, + "grad_norm": 0.34847431989760974, + "learning_rate": 1.2088577047241834e-06, + "loss": 0.0185, + "step": 3532 + }, + { + "epoch": 2.3903924221921518, + "grad_norm": 0.37733463812285334, + "learning_rate": 1.2062921085539152e-06, + "loss": 0.0214, + "step": 3533 + }, + { + "epoch": 2.39106901217862, + "grad_norm": 0.45738770036944465, + "learning_rate": 1.2037288642780575e-06, + "loss": 0.0152, + "step": 3534 + }, + { + "epoch": 2.391745602165088, + "grad_norm": 0.3755377152715656, + "learning_rate": 1.2011679734856796e-06, + "loss": 0.0214, + "step": 3535 + }, + { + "epoch": 2.3924221921515563, + "grad_norm": 0.40400668220835156, + "learning_rate": 1.1986094377643976e-06, + "loss": 0.0183, + "step": 3536 + }, + { + "epoch": 2.3930987821380243, + "grad_norm": 0.31864548279603405, + "learning_rate": 1.1960532587003666e-06, + "loss": 0.019, + "step": 3537 + }, + { + "epoch": 2.3937753721244928, + "grad_norm": 0.3939370885379173, + "learning_rate": 1.193499437878277e-06, + "loss": 0.0196, + "step": 3538 + }, + { + "epoch": 2.394451962110961, + "grad_norm": 0.23943780206307763, + "learning_rate": 1.1909479768813641e-06, + "loss": 0.0107, + "step": 3539 + }, + { + "epoch": 2.395128552097429, + "grad_norm": 0.28387392170040904, + "learning_rate": 1.1883988772913924e-06, + "loss": 0.0203, + "step": 3540 + }, + { + "epoch": 2.3958051420838973, + "grad_norm": 0.3548361942597856, + "learning_rate": 1.1858521406886674e-06, + "loss": 0.0177, + "step": 3541 + }, + { + "epoch": 2.3964817320703653, + "grad_norm": 0.2809746690444741, + "learning_rate": 1.183307768652029e-06, + "loss": 0.0133, + "step": 3542 + }, + { + "epoch": 2.3971583220568338, + "grad_norm": 0.24331556416682687, + "learning_rate": 1.180765762758852e-06, + "loss": 0.0125, + "step": 3543 + }, + { + "epoch": 2.397834912043302, + "grad_norm": 0.36627104128099164, + "learning_rate": 1.1782261245850417e-06, + "loss": 0.0166, + "step": 3544 + }, + { + "epoch": 2.39851150202977, + "grad_norm": 0.3347947577313801, + "learning_rate": 1.1756888557050356e-06, + "loss": 0.0242, + "step": 3545 + }, + { + "epoch": 2.3991880920162383, + "grad_norm": 0.4153508901482643, + "learning_rate": 1.173153957691805e-06, + "loss": 0.0384, + "step": 3546 + }, + { + "epoch": 2.3998646820027063, + "grad_norm": 0.3022178539899148, + "learning_rate": 1.1706214321168513e-06, + "loss": 0.0203, + "step": 3547 + }, + { + "epoch": 2.4005412719891748, + "grad_norm": 0.38753508758662086, + "learning_rate": 1.1680912805502008e-06, + "loss": 0.0238, + "step": 3548 + }, + { + "epoch": 2.401217861975643, + "grad_norm": 0.3144206326333019, + "learning_rate": 1.165563504560413e-06, + "loss": 0.0158, + "step": 3549 + }, + { + "epoch": 2.401894451962111, + "grad_norm": 0.3268120250622495, + "learning_rate": 1.1630381057145735e-06, + "loss": 0.0108, + "step": 3550 + }, + { + "epoch": 2.4025710419485793, + "grad_norm": 0.30599363281197867, + "learning_rate": 1.1605150855782916e-06, + "loss": 0.021, + "step": 3551 + }, + { + "epoch": 2.4032476319350473, + "grad_norm": 0.3216079548933299, + "learning_rate": 1.157994445715706e-06, + "loss": 0.0166, + "step": 3552 + }, + { + "epoch": 2.403924221921516, + "grad_norm": 0.32487055741332094, + "learning_rate": 1.155476187689475e-06, + "loss": 0.0227, + "step": 3553 + }, + { + "epoch": 2.404600811907984, + "grad_norm": 0.2592134713163622, + "learning_rate": 1.1529603130607837e-06, + "loss": 0.0151, + "step": 3554 + }, + { + "epoch": 2.405277401894452, + "grad_norm": 0.5691157660691436, + "learning_rate": 1.1504468233893408e-06, + "loss": 0.0286, + "step": 3555 + }, + { + "epoch": 2.4059539918809203, + "grad_norm": 0.35597353966299355, + "learning_rate": 1.1479357202333707e-06, + "loss": 0.0188, + "step": 3556 + }, + { + "epoch": 2.4066305818673883, + "grad_norm": 0.24157505857099631, + "learning_rate": 1.1454270051496264e-06, + "loss": 0.012, + "step": 3557 + }, + { + "epoch": 2.407307171853857, + "grad_norm": 0.4523490673598951, + "learning_rate": 1.1429206796933717e-06, + "loss": 0.0225, + "step": 3558 + }, + { + "epoch": 2.407983761840325, + "grad_norm": 0.27794856013467706, + "learning_rate": 1.1404167454183957e-06, + "loss": 0.0155, + "step": 3559 + }, + { + "epoch": 2.408660351826793, + "grad_norm": 0.3850251532065914, + "learning_rate": 1.137915203877003e-06, + "loss": 0.0253, + "step": 3560 + }, + { + "epoch": 2.4093369418132613, + "grad_norm": 0.32746507418285364, + "learning_rate": 1.1354160566200128e-06, + "loss": 0.0183, + "step": 3561 + }, + { + "epoch": 2.4100135317997293, + "grad_norm": 0.3644506235132886, + "learning_rate": 1.132919305196763e-06, + "loss": 0.0208, + "step": 3562 + }, + { + "epoch": 2.410690121786198, + "grad_norm": 0.3174355500605888, + "learning_rate": 1.130424951155104e-06, + "loss": 0.0215, + "step": 3563 + }, + { + "epoch": 2.411366711772666, + "grad_norm": 0.31997648625186803, + "learning_rate": 1.1279329960414047e-06, + "loss": 0.0176, + "step": 3564 + }, + { + "epoch": 2.412043301759134, + "grad_norm": 0.3186085921629601, + "learning_rate": 1.1254434414005367e-06, + "loss": 0.0164, + "step": 3565 + }, + { + "epoch": 2.4127198917456023, + "grad_norm": 0.32475947283866646, + "learning_rate": 1.1229562887758927e-06, + "loss": 0.0188, + "step": 3566 + }, + { + "epoch": 2.4133964817320703, + "grad_norm": 0.31312475654368, + "learning_rate": 1.1204715397093735e-06, + "loss": 0.0173, + "step": 3567 + }, + { + "epoch": 2.414073071718539, + "grad_norm": 0.33532270460929536, + "learning_rate": 1.1179891957413908e-06, + "loss": 0.0238, + "step": 3568 + }, + { + "epoch": 2.414749661705007, + "grad_norm": 0.6103818765626172, + "learning_rate": 1.1155092584108606e-06, + "loss": 0.0301, + "step": 3569 + }, + { + "epoch": 2.415426251691475, + "grad_norm": 0.28215322588991454, + "learning_rate": 1.113031729255214e-06, + "loss": 0.015, + "step": 3570 + }, + { + "epoch": 2.4161028416779433, + "grad_norm": 0.3777151223630397, + "learning_rate": 1.1105566098103825e-06, + "loss": 0.0227, + "step": 3571 + }, + { + "epoch": 2.4167794316644113, + "grad_norm": 0.29649804984965666, + "learning_rate": 1.1080839016108086e-06, + "loss": 0.0194, + "step": 3572 + }, + { + "epoch": 2.41745602165088, + "grad_norm": 0.33306378870190073, + "learning_rate": 1.1056136061894386e-06, + "loss": 0.0182, + "step": 3573 + }, + { + "epoch": 2.418132611637348, + "grad_norm": 0.38621214047041735, + "learning_rate": 1.1031457250777206e-06, + "loss": 0.0258, + "step": 3574 + }, + { + "epoch": 2.418809201623816, + "grad_norm": 0.32207222992722523, + "learning_rate": 1.1006802598056081e-06, + "loss": 0.0152, + "step": 3575 + }, + { + "epoch": 2.4194857916102843, + "grad_norm": 0.3316487167521873, + "learning_rate": 1.0982172119015594e-06, + "loss": 0.0222, + "step": 3576 + }, + { + "epoch": 2.4201623815967523, + "grad_norm": 0.4281193426466968, + "learning_rate": 1.0957565828925292e-06, + "loss": 0.026, + "step": 3577 + }, + { + "epoch": 2.420838971583221, + "grad_norm": 0.3457078644941504, + "learning_rate": 1.0932983743039739e-06, + "loss": 0.0193, + "step": 3578 + }, + { + "epoch": 2.421515561569689, + "grad_norm": 0.35555742541029767, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.0203, + "step": 3579 + }, + { + "epoch": 2.422192151556157, + "grad_norm": 0.26932840661341484, + "learning_rate": 1.0883892244826173e-06, + "loss": 0.0151, + "step": 3580 + }, + { + "epoch": 2.4228687415426253, + "grad_norm": 0.3705681130606943, + "learning_rate": 1.0859382862932255e-06, + "loss": 0.027, + "step": 3581 + }, + { + "epoch": 2.4235453315290933, + "grad_norm": 0.38953695957431494, + "learning_rate": 1.0834897746111233e-06, + "loss": 0.0198, + "step": 3582 + }, + { + "epoch": 2.424221921515562, + "grad_norm": 0.3317674407950328, + "learning_rate": 1.0810436909542571e-06, + "loss": 0.0187, + "step": 3583 + }, + { + "epoch": 2.42489851150203, + "grad_norm": 0.3219899912954504, + "learning_rate": 1.0786000368390686e-06, + "loss": 0.0189, + "step": 3584 + }, + { + "epoch": 2.425575101488498, + "grad_norm": 0.3255756865159592, + "learning_rate": 1.0761588137804896e-06, + "loss": 0.0207, + "step": 3585 + }, + { + "epoch": 2.4262516914749663, + "grad_norm": 0.3572568166615696, + "learning_rate": 1.0737200232919465e-06, + "loss": 0.0179, + "step": 3586 + }, + { + "epoch": 2.4269282814614344, + "grad_norm": 0.4304802398588593, + "learning_rate": 1.0712836668853583e-06, + "loss": 0.0197, + "step": 3587 + }, + { + "epoch": 2.4276048714479024, + "grad_norm": 0.2816174167407849, + "learning_rate": 1.0688497460711345e-06, + "loss": 0.0146, + "step": 3588 + }, + { + "epoch": 2.428281461434371, + "grad_norm": 0.3331761050155744, + "learning_rate": 1.0664182623581777e-06, + "loss": 0.0215, + "step": 3589 + }, + { + "epoch": 2.428958051420839, + "grad_norm": 0.3461907404987719, + "learning_rate": 1.0639892172538734e-06, + "loss": 0.02, + "step": 3590 + }, + { + "epoch": 2.4296346414073073, + "grad_norm": 0.3791631041733398, + "learning_rate": 1.0615626122640988e-06, + "loss": 0.0186, + "step": 3591 + }, + { + "epoch": 2.4303112313937754, + "grad_norm": 0.3245515999284233, + "learning_rate": 1.0591384488932188e-06, + "loss": 0.0251, + "step": 3592 + }, + { + "epoch": 2.4309878213802434, + "grad_norm": 0.2593365509195044, + "learning_rate": 1.0567167286440844e-06, + "loss": 0.0158, + "step": 3593 + }, + { + "epoch": 2.431664411366712, + "grad_norm": 0.4416984677568257, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.0231, + "step": 3594 + }, + { + "epoch": 2.43234100135318, + "grad_norm": 0.2752010486638857, + "learning_rate": 1.0518806235148814e-06, + "loss": 0.0161, + "step": 3595 + }, + { + "epoch": 2.4330175913396483, + "grad_norm": 0.2913899193060892, + "learning_rate": 1.0494662416329366e-06, + "loss": 0.0157, + "step": 3596 + }, + { + "epoch": 2.4336941813261164, + "grad_norm": 0.3917553111418661, + "learning_rate": 1.0470543088689855e-06, + "loss": 0.022, + "step": 3597 + }, + { + "epoch": 2.4343707713125844, + "grad_norm": 0.3034249763115742, + "learning_rate": 1.044644826718295e-06, + "loss": 0.0154, + "step": 3598 + }, + { + "epoch": 2.435047361299053, + "grad_norm": 0.3145332035277517, + "learning_rate": 1.0422377966746133e-06, + "loss": 0.0182, + "step": 3599 + }, + { + "epoch": 2.435723951285521, + "grad_norm": 0.2589253114587557, + "learning_rate": 1.0398332202301708e-06, + "loss": 0.0123, + "step": 3600 + }, + { + "epoch": 2.4364005412719894, + "grad_norm": 0.24622442813531342, + "learning_rate": 1.0374310988756747e-06, + "loss": 0.0118, + "step": 3601 + }, + { + "epoch": 2.4370771312584574, + "grad_norm": 0.2982017403070333, + "learning_rate": 1.0350314341003121e-06, + "loss": 0.0161, + "step": 3602 + }, + { + "epoch": 2.4377537212449254, + "grad_norm": 0.2828273007905014, + "learning_rate": 1.0326342273917432e-06, + "loss": 0.0192, + "step": 3603 + }, + { + "epoch": 2.438430311231394, + "grad_norm": 0.2905337340739197, + "learning_rate": 1.0302394802361104e-06, + "loss": 0.0115, + "step": 3604 + }, + { + "epoch": 2.439106901217862, + "grad_norm": 0.3221955757275716, + "learning_rate": 1.0278471941180245e-06, + "loss": 0.0189, + "step": 3605 + }, + { + "epoch": 2.4397834912043304, + "grad_norm": 0.39995015244508103, + "learning_rate": 1.0254573705205751e-06, + "loss": 0.0201, + "step": 3606 + }, + { + "epoch": 2.4404600811907984, + "grad_norm": 0.27357873960222684, + "learning_rate": 1.0230700109253255e-06, + "loss": 0.0167, + "step": 3607 + }, + { + "epoch": 2.4411366711772664, + "grad_norm": 0.2724211804132972, + "learning_rate": 1.0206851168123078e-06, + "loss": 0.0137, + "step": 3608 + }, + { + "epoch": 2.441813261163735, + "grad_norm": 0.3582863886440724, + "learning_rate": 1.0183026896600284e-06, + "loss": 0.0184, + "step": 3609 + }, + { + "epoch": 2.442489851150203, + "grad_norm": 0.3258502584036862, + "learning_rate": 1.0159227309454662e-06, + "loss": 0.016, + "step": 3610 + }, + { + "epoch": 2.4431664411366714, + "grad_norm": 0.42611940152469097, + "learning_rate": 1.0135452421440645e-06, + "loss": 0.0225, + "step": 3611 + }, + { + "epoch": 2.4438430311231394, + "grad_norm": 0.27375167470137807, + "learning_rate": 1.0111702247297372e-06, + "loss": 0.0149, + "step": 3612 + }, + { + "epoch": 2.4445196211096074, + "grad_norm": 0.3884996950545533, + "learning_rate": 1.0087976801748694e-06, + "loss": 0.0224, + "step": 3613 + }, + { + "epoch": 2.445196211096076, + "grad_norm": 0.3679433733704196, + "learning_rate": 1.00642760995031e-06, + "loss": 0.0166, + "step": 3614 + }, + { + "epoch": 2.445872801082544, + "grad_norm": 0.3342586133317336, + "learning_rate": 1.0040600155253766e-06, + "loss": 0.0194, + "step": 3615 + }, + { + "epoch": 2.4465493910690124, + "grad_norm": 0.3673869880514802, + "learning_rate": 1.0016948983678471e-06, + "loss": 0.0236, + "step": 3616 + }, + { + "epoch": 2.4472259810554804, + "grad_norm": 0.35815533304459596, + "learning_rate": 9.993322599439692e-07, + "loss": 0.0176, + "step": 3617 + }, + { + "epoch": 2.4479025710419484, + "grad_norm": 0.38776650522245165, + "learning_rate": 9.969721017184492e-07, + "loss": 0.0218, + "step": 3618 + }, + { + "epoch": 2.448579161028417, + "grad_norm": 0.33733432858135887, + "learning_rate": 9.946144251544604e-07, + "loss": 0.018, + "step": 3619 + }, + { + "epoch": 2.449255751014885, + "grad_norm": 0.42845636603426923, + "learning_rate": 9.92259231713632e-07, + "loss": 0.0224, + "step": 3620 + }, + { + "epoch": 2.449932341001353, + "grad_norm": 0.31609882237347636, + "learning_rate": 9.899065228560596e-07, + "loss": 0.0191, + "step": 3621 + }, + { + "epoch": 2.4506089309878214, + "grad_norm": 0.32348897993986947, + "learning_rate": 9.87556300040295e-07, + "loss": 0.0167, + "step": 3622 + }, + { + "epoch": 2.4512855209742894, + "grad_norm": 0.36727952481851095, + "learning_rate": 9.852085647233505e-07, + "loss": 0.0218, + "step": 3623 + }, + { + "epoch": 2.451962110960758, + "grad_norm": 0.3426880024563584, + "learning_rate": 9.82863318360695e-07, + "loss": 0.0209, + "step": 3624 + }, + { + "epoch": 2.452638700947226, + "grad_norm": 0.29728264085109457, + "learning_rate": 9.805205624062535e-07, + "loss": 0.0175, + "step": 3625 + }, + { + "epoch": 2.453315290933694, + "grad_norm": 0.31534817697654494, + "learning_rate": 9.781802983124094e-07, + "loss": 0.0167, + "step": 3626 + }, + { + "epoch": 2.4539918809201624, + "grad_norm": 0.3601449965736396, + "learning_rate": 9.758425275299998e-07, + "loss": 0.0242, + "step": 3627 + }, + { + "epoch": 2.4546684709066304, + "grad_norm": 0.2857567366562343, + "learning_rate": 9.735072515083193e-07, + "loss": 0.0176, + "step": 3628 + }, + { + "epoch": 2.455345060893099, + "grad_norm": 0.25617022661644473, + "learning_rate": 9.711744716951093e-07, + "loss": 0.0135, + "step": 3629 + }, + { + "epoch": 2.456021650879567, + "grad_norm": 0.3529573135201078, + "learning_rate": 9.688441895365708e-07, + "loss": 0.0207, + "step": 3630 + }, + { + "epoch": 2.456698240866035, + "grad_norm": 0.33674943059847223, + "learning_rate": 9.665164064773496e-07, + "loss": 0.0175, + "step": 3631 + }, + { + "epoch": 2.4573748308525034, + "grad_norm": 0.37782206613329106, + "learning_rate": 9.641911239605494e-07, + "loss": 0.0196, + "step": 3632 + }, + { + "epoch": 2.4580514208389714, + "grad_norm": 0.3862462874137182, + "learning_rate": 9.618683434277176e-07, + "loss": 0.0228, + "step": 3633 + }, + { + "epoch": 2.45872801082544, + "grad_norm": 0.34999219455681013, + "learning_rate": 9.595480663188528e-07, + "loss": 0.0195, + "step": 3634 + }, + { + "epoch": 2.459404600811908, + "grad_norm": 0.4626169830326315, + "learning_rate": 9.572302940724032e-07, + "loss": 0.0393, + "step": 3635 + }, + { + "epoch": 2.460081190798376, + "grad_norm": 0.2913626246404951, + "learning_rate": 9.549150281252633e-07, + "loss": 0.0158, + "step": 3636 + }, + { + "epoch": 2.4607577807848444, + "grad_norm": 0.3279146939460105, + "learning_rate": 9.526022699127718e-07, + "loss": 0.0169, + "step": 3637 + }, + { + "epoch": 2.4614343707713124, + "grad_norm": 0.2571865806121024, + "learning_rate": 9.502920208687133e-07, + "loss": 0.0166, + "step": 3638 + }, + { + "epoch": 2.462110960757781, + "grad_norm": 0.30294011483908945, + "learning_rate": 9.479842824253182e-07, + "loss": 0.0167, + "step": 3639 + }, + { + "epoch": 2.462787550744249, + "grad_norm": 0.39661258407298433, + "learning_rate": 9.456790560132617e-07, + "loss": 0.0195, + "step": 3640 + }, + { + "epoch": 2.463464140730717, + "grad_norm": 0.3158447434300703, + "learning_rate": 9.433763430616577e-07, + "loss": 0.0186, + "step": 3641 + }, + { + "epoch": 2.4641407307171854, + "grad_norm": 0.38141902736677236, + "learning_rate": 9.410761449980654e-07, + "loss": 0.0188, + "step": 3642 + }, + { + "epoch": 2.4648173207036534, + "grad_norm": 0.27162725093169054, + "learning_rate": 9.387784632484825e-07, + "loss": 0.016, + "step": 3643 + }, + { + "epoch": 2.465493910690122, + "grad_norm": 0.2828561253646193, + "learning_rate": 9.364832992373501e-07, + "loss": 0.0153, + "step": 3644 + }, + { + "epoch": 2.46617050067659, + "grad_norm": 0.3750875612411601, + "learning_rate": 9.341906543875451e-07, + "loss": 0.0243, + "step": 3645 + }, + { + "epoch": 2.466847090663058, + "grad_norm": 0.30320451689941996, + "learning_rate": 9.319005301203821e-07, + "loss": 0.0116, + "step": 3646 + }, + { + "epoch": 2.4675236806495264, + "grad_norm": 0.30771201182797003, + "learning_rate": 9.296129278556155e-07, + "loss": 0.0144, + "step": 3647 + }, + { + "epoch": 2.4682002706359945, + "grad_norm": 0.34404982430849945, + "learning_rate": 9.273278490114357e-07, + "loss": 0.0199, + "step": 3648 + }, + { + "epoch": 2.468876860622463, + "grad_norm": 0.27712231545755306, + "learning_rate": 9.250452950044702e-07, + "loss": 0.0151, + "step": 3649 + }, + { + "epoch": 2.469553450608931, + "grad_norm": 0.3061379277484997, + "learning_rate": 9.227652672497761e-07, + "loss": 0.0161, + "step": 3650 + }, + { + "epoch": 2.470230040595399, + "grad_norm": 0.2580160324430578, + "learning_rate": 9.204877671608515e-07, + "loss": 0.012, + "step": 3651 + }, + { + "epoch": 2.4709066305818674, + "grad_norm": 0.2458200032209575, + "learning_rate": 9.182127961496196e-07, + "loss": 0.0168, + "step": 3652 + }, + { + "epoch": 2.4715832205683355, + "grad_norm": 0.30992285070756337, + "learning_rate": 9.159403556264435e-07, + "loss": 0.0171, + "step": 3653 + }, + { + "epoch": 2.472259810554804, + "grad_norm": 0.27087369368753017, + "learning_rate": 9.136704470001101e-07, + "loss": 0.0144, + "step": 3654 + }, + { + "epoch": 2.472936400541272, + "grad_norm": 0.3955016414382731, + "learning_rate": 9.114030716778433e-07, + "loss": 0.0183, + "step": 3655 + }, + { + "epoch": 2.47361299052774, + "grad_norm": 0.2533859532558727, + "learning_rate": 9.091382310652925e-07, + "loss": 0.0115, + "step": 3656 + }, + { + "epoch": 2.4742895805142084, + "grad_norm": 0.3260767643275918, + "learning_rate": 9.068759265665384e-07, + "loss": 0.0223, + "step": 3657 + }, + { + "epoch": 2.4749661705006765, + "grad_norm": 0.288620760090034, + "learning_rate": 9.046161595840858e-07, + "loss": 0.0117, + "step": 3658 + }, + { + "epoch": 2.475642760487145, + "grad_norm": 0.3286027645711313, + "learning_rate": 9.023589315188686e-07, + "loss": 0.019, + "step": 3659 + }, + { + "epoch": 2.476319350473613, + "grad_norm": 0.32142937478643163, + "learning_rate": 9.001042437702468e-07, + "loss": 0.0172, + "step": 3660 + }, + { + "epoch": 2.476995940460081, + "grad_norm": 0.2941548716427655, + "learning_rate": 8.978520977360067e-07, + "loss": 0.0146, + "step": 3661 + }, + { + "epoch": 2.4776725304465494, + "grad_norm": 0.26511273894977394, + "learning_rate": 8.956024948123549e-07, + "loss": 0.0159, + "step": 3662 + }, + { + "epoch": 2.4783491204330175, + "grad_norm": 0.31953367946362177, + "learning_rate": 8.933554363939256e-07, + "loss": 0.0148, + "step": 3663 + }, + { + "epoch": 2.479025710419486, + "grad_norm": 0.31577564757791765, + "learning_rate": 8.911109238737748e-07, + "loss": 0.0213, + "step": 3664 + }, + { + "epoch": 2.479702300405954, + "grad_norm": 0.2829178437388904, + "learning_rate": 8.888689586433768e-07, + "loss": 0.0144, + "step": 3665 + }, + { + "epoch": 2.480378890392422, + "grad_norm": 0.2735767790253131, + "learning_rate": 8.866295420926319e-07, + "loss": 0.0179, + "step": 3666 + }, + { + "epoch": 2.4810554803788905, + "grad_norm": 0.31001672218423, + "learning_rate": 8.843926756098548e-07, + "loss": 0.0215, + "step": 3667 + }, + { + "epoch": 2.4817320703653585, + "grad_norm": 0.33414976016780795, + "learning_rate": 8.821583605817835e-07, + "loss": 0.02, + "step": 3668 + }, + { + "epoch": 2.482408660351827, + "grad_norm": 0.284854847309466, + "learning_rate": 8.799265983935734e-07, + "loss": 0.017, + "step": 3669 + }, + { + "epoch": 2.483085250338295, + "grad_norm": 0.26849249382240087, + "learning_rate": 8.776973904287972e-07, + "loss": 0.0144, + "step": 3670 + }, + { + "epoch": 2.483761840324763, + "grad_norm": 0.3337874131085429, + "learning_rate": 8.754707380694427e-07, + "loss": 0.0184, + "step": 3671 + }, + { + "epoch": 2.4844384303112315, + "grad_norm": 0.2735184717788136, + "learning_rate": 8.732466426959135e-07, + "loss": 0.016, + "step": 3672 + }, + { + "epoch": 2.4851150202976995, + "grad_norm": 0.3753345660246591, + "learning_rate": 8.7102510568703e-07, + "loss": 0.0264, + "step": 3673 + }, + { + "epoch": 2.485791610284168, + "grad_norm": 0.35112101487173825, + "learning_rate": 8.688061284200266e-07, + "loss": 0.0185, + "step": 3674 + }, + { + "epoch": 2.486468200270636, + "grad_norm": 0.48090583087705707, + "learning_rate": 8.665897122705463e-07, + "loss": 0.0211, + "step": 3675 + }, + { + "epoch": 2.487144790257104, + "grad_norm": 0.34084347866296383, + "learning_rate": 8.6437585861265e-07, + "loss": 0.0219, + "step": 3676 + }, + { + "epoch": 2.4878213802435725, + "grad_norm": 0.3247018053195929, + "learning_rate": 8.621645688188085e-07, + "loss": 0.0154, + "step": 3677 + }, + { + "epoch": 2.4884979702300405, + "grad_norm": 0.3268995129040612, + "learning_rate": 8.599558442598998e-07, + "loss": 0.014, + "step": 3678 + }, + { + "epoch": 2.489174560216509, + "grad_norm": 0.3043518421228281, + "learning_rate": 8.577496863052165e-07, + "loss": 0.0147, + "step": 3679 + }, + { + "epoch": 2.489851150202977, + "grad_norm": 0.34875701962468597, + "learning_rate": 8.555460963224549e-07, + "loss": 0.0194, + "step": 3680 + }, + { + "epoch": 2.490527740189445, + "grad_norm": 0.2949701090986178, + "learning_rate": 8.53345075677724e-07, + "loss": 0.0167, + "step": 3681 + }, + { + "epoch": 2.4912043301759135, + "grad_norm": 0.31695728671430934, + "learning_rate": 8.511466257355384e-07, + "loss": 0.0242, + "step": 3682 + }, + { + "epoch": 2.4918809201623815, + "grad_norm": 0.29890344422040294, + "learning_rate": 8.48950747858816e-07, + "loss": 0.017, + "step": 3683 + }, + { + "epoch": 2.49255751014885, + "grad_norm": 0.3273187322016261, + "learning_rate": 8.46757443408886e-07, + "loss": 0.0144, + "step": 3684 + }, + { + "epoch": 2.493234100135318, + "grad_norm": 0.3509080983687528, + "learning_rate": 8.44566713745476e-07, + "loss": 0.0186, + "step": 3685 + }, + { + "epoch": 2.493910690121786, + "grad_norm": 0.36309102824738165, + "learning_rate": 8.42378560226722e-07, + "loss": 0.0262, + "step": 3686 + }, + { + "epoch": 2.4945872801082545, + "grad_norm": 0.3209157502033997, + "learning_rate": 8.401929842091616e-07, + "loss": 0.0203, + "step": 3687 + }, + { + "epoch": 2.4952638700947225, + "grad_norm": 0.33224393104659095, + "learning_rate": 8.380099870477321e-07, + "loss": 0.0241, + "step": 3688 + }, + { + "epoch": 2.495940460081191, + "grad_norm": 0.2850066777570733, + "learning_rate": 8.358295700957753e-07, + "loss": 0.0188, + "step": 3689 + }, + { + "epoch": 2.496617050067659, + "grad_norm": 0.45238747277186764, + "learning_rate": 8.336517347050327e-07, + "loss": 0.036, + "step": 3690 + }, + { + "epoch": 2.497293640054127, + "grad_norm": 0.31987116564388407, + "learning_rate": 8.314764822256465e-07, + "loss": 0.0178, + "step": 3691 + }, + { + "epoch": 2.4979702300405955, + "grad_norm": 0.29664999198584385, + "learning_rate": 8.293038140061516e-07, + "loss": 0.0207, + "step": 3692 + }, + { + "epoch": 2.4986468200270635, + "grad_norm": 0.29991950939417333, + "learning_rate": 8.271337313934869e-07, + "loss": 0.023, + "step": 3693 + }, + { + "epoch": 2.499323410013532, + "grad_norm": 0.3175573012633243, + "learning_rate": 8.24966235732988e-07, + "loss": 0.0228, + "step": 3694 + }, + { + "epoch": 2.5, + "grad_norm": 0.2579720120488598, + "learning_rate": 8.22801328368385e-07, + "loss": 0.0133, + "step": 3695 + }, + { + "epoch": 2.500676589986468, + "grad_norm": 0.41447195630641864, + "learning_rate": 8.206390106418028e-07, + "loss": 0.0287, + "step": 3696 + }, + { + "epoch": 2.5013531799729365, + "grad_norm": 0.26294665783938886, + "learning_rate": 8.184792838937633e-07, + "loss": 0.013, + "step": 3697 + }, + { + "epoch": 2.5020297699594045, + "grad_norm": 0.3487304987438679, + "learning_rate": 8.163221494631785e-07, + "loss": 0.0169, + "step": 3698 + }, + { + "epoch": 2.502706359945873, + "grad_norm": 0.3363029598186698, + "learning_rate": 8.141676086873574e-07, + "loss": 0.0138, + "step": 3699 + }, + { + "epoch": 2.503382949932341, + "grad_norm": 0.33537386953847986, + "learning_rate": 8.120156629019987e-07, + "loss": 0.0162, + "step": 3700 + }, + { + "epoch": 2.504059539918809, + "grad_norm": 0.30018568109620863, + "learning_rate": 8.098663134411922e-07, + "loss": 0.0219, + "step": 3701 + }, + { + "epoch": 2.5047361299052775, + "grad_norm": 0.2736818891809169, + "learning_rate": 8.077195616374184e-07, + "loss": 0.014, + "step": 3702 + }, + { + "epoch": 2.5054127198917455, + "grad_norm": 0.2916938971910028, + "learning_rate": 8.055754088215501e-07, + "loss": 0.0167, + "step": 3703 + }, + { + "epoch": 2.506089309878214, + "grad_norm": 0.3742103765871252, + "learning_rate": 8.03433856322845e-07, + "loss": 0.0221, + "step": 3704 + }, + { + "epoch": 2.506765899864682, + "grad_norm": 0.2853029902243529, + "learning_rate": 8.012949054689484e-07, + "loss": 0.017, + "step": 3705 + }, + { + "epoch": 2.50744248985115, + "grad_norm": 0.2780046433962602, + "learning_rate": 7.991585575858962e-07, + "loss": 0.0129, + "step": 3706 + }, + { + "epoch": 2.5081190798376185, + "grad_norm": 0.34726562537663525, + "learning_rate": 7.970248139981091e-07, + "loss": 0.0229, + "step": 3707 + }, + { + "epoch": 2.5087956698240865, + "grad_norm": 0.3124849687652771, + "learning_rate": 7.948936760283937e-07, + "loss": 0.0166, + "step": 3708 + }, + { + "epoch": 2.509472259810555, + "grad_norm": 0.37500088438852763, + "learning_rate": 7.92765144997939e-07, + "loss": 0.0177, + "step": 3709 + }, + { + "epoch": 2.510148849797023, + "grad_norm": 0.3044971725677902, + "learning_rate": 7.906392222263199e-07, + "loss": 0.0166, + "step": 3710 + }, + { + "epoch": 2.510825439783491, + "grad_norm": 0.2617148716258878, + "learning_rate": 7.885159090314959e-07, + "loss": 0.0117, + "step": 3711 + }, + { + "epoch": 2.5115020297699595, + "grad_norm": 0.34720228744816334, + "learning_rate": 7.863952067298042e-07, + "loss": 0.0178, + "step": 3712 + }, + { + "epoch": 2.5121786197564275, + "grad_norm": 0.31964201336305254, + "learning_rate": 7.842771166359681e-07, + "loss": 0.0148, + "step": 3713 + }, + { + "epoch": 2.512855209742896, + "grad_norm": 0.30802601534510443, + "learning_rate": 7.821616400630866e-07, + "loss": 0.0169, + "step": 3714 + }, + { + "epoch": 2.513531799729364, + "grad_norm": 0.2899769362573107, + "learning_rate": 7.80048778322643e-07, + "loss": 0.0147, + "step": 3715 + }, + { + "epoch": 2.514208389715832, + "grad_norm": 0.3276382558417513, + "learning_rate": 7.779385327244987e-07, + "loss": 0.0155, + "step": 3716 + }, + { + "epoch": 2.5148849797023005, + "grad_norm": 0.41759341270109446, + "learning_rate": 7.758309045768908e-07, + "loss": 0.0216, + "step": 3717 + }, + { + "epoch": 2.5155615696887685, + "grad_norm": 0.38468041223551724, + "learning_rate": 7.737258951864341e-07, + "loss": 0.0257, + "step": 3718 + }, + { + "epoch": 2.516238159675237, + "grad_norm": 0.3883977011172923, + "learning_rate": 7.716235058581218e-07, + "loss": 0.0194, + "step": 3719 + }, + { + "epoch": 2.516914749661705, + "grad_norm": 0.35220631420106524, + "learning_rate": 7.695237378953224e-07, + "loss": 0.0149, + "step": 3720 + }, + { + "epoch": 2.517591339648173, + "grad_norm": 0.3119448090893277, + "learning_rate": 7.674265925997804e-07, + "loss": 0.0256, + "step": 3721 + }, + { + "epoch": 2.5182679296346415, + "grad_norm": 0.31443390805581484, + "learning_rate": 7.653320712716095e-07, + "loss": 0.0208, + "step": 3722 + }, + { + "epoch": 2.5189445196211095, + "grad_norm": 0.41807724934513535, + "learning_rate": 7.632401752093016e-07, + "loss": 0.0214, + "step": 3723 + }, + { + "epoch": 2.519621109607578, + "grad_norm": 0.28393848318928605, + "learning_rate": 7.611509057097211e-07, + "loss": 0.0161, + "step": 3724 + }, + { + "epoch": 2.520297699594046, + "grad_norm": 0.3403491041134632, + "learning_rate": 7.590642640681012e-07, + "loss": 0.0177, + "step": 3725 + }, + { + "epoch": 2.520974289580514, + "grad_norm": 0.3331428333939334, + "learning_rate": 7.569802515780455e-07, + "loss": 0.0146, + "step": 3726 + }, + { + "epoch": 2.5216508795669825, + "grad_norm": 0.3760862041467032, + "learning_rate": 7.548988695315313e-07, + "loss": 0.0306, + "step": 3727 + }, + { + "epoch": 2.5223274695534506, + "grad_norm": 0.3027211125719161, + "learning_rate": 7.528201192189028e-07, + "loss": 0.0175, + "step": 3728 + }, + { + "epoch": 2.523004059539919, + "grad_norm": 0.3191198087320616, + "learning_rate": 7.507440019288742e-07, + "loss": 0.0195, + "step": 3729 + }, + { + "epoch": 2.523680649526387, + "grad_norm": 0.41889847322413687, + "learning_rate": 7.486705189485243e-07, + "loss": 0.021, + "step": 3730 + }, + { + "epoch": 2.524357239512855, + "grad_norm": 0.4182492998268176, + "learning_rate": 7.465996715633028e-07, + "loss": 0.0211, + "step": 3731 + }, + { + "epoch": 2.5250338294993235, + "grad_norm": 0.347004273713614, + "learning_rate": 7.44531461057022e-07, + "loss": 0.0205, + "step": 3732 + }, + { + "epoch": 2.5257104194857916, + "grad_norm": 0.34847214537558846, + "learning_rate": 7.424658887118613e-07, + "loss": 0.0182, + "step": 3733 + }, + { + "epoch": 2.52638700947226, + "grad_norm": 0.34440728730388054, + "learning_rate": 7.404029558083653e-07, + "loss": 0.0172, + "step": 3734 + }, + { + "epoch": 2.527063599458728, + "grad_norm": 0.2578896878374381, + "learning_rate": 7.383426636254392e-07, + "loss": 0.015, + "step": 3735 + }, + { + "epoch": 2.527740189445196, + "grad_norm": 0.33215378511702953, + "learning_rate": 7.362850134403543e-07, + "loss": 0.0127, + "step": 3736 + }, + { + "epoch": 2.5284167794316645, + "grad_norm": 0.3640017273800162, + "learning_rate": 7.342300065287439e-07, + "loss": 0.0205, + "step": 3737 + }, + { + "epoch": 2.5290933694181326, + "grad_norm": 0.4096276659255588, + "learning_rate": 7.321776441646001e-07, + "loss": 0.0352, + "step": 3738 + }, + { + "epoch": 2.529769959404601, + "grad_norm": 0.2871912767415467, + "learning_rate": 7.301279276202761e-07, + "loss": 0.018, + "step": 3739 + }, + { + "epoch": 2.530446549391069, + "grad_norm": 0.3251713139436743, + "learning_rate": 7.280808581664866e-07, + "loss": 0.0243, + "step": 3740 + }, + { + "epoch": 2.531123139377537, + "grad_norm": 0.3780638753045574, + "learning_rate": 7.260364370723044e-07, + "loss": 0.0198, + "step": 3741 + }, + { + "epoch": 2.5317997293640055, + "grad_norm": 0.3513956004069086, + "learning_rate": 7.239946656051622e-07, + "loss": 0.0224, + "step": 3742 + }, + { + "epoch": 2.5324763193504736, + "grad_norm": 0.34038329609714074, + "learning_rate": 7.219555450308446e-07, + "loss": 0.0158, + "step": 3743 + }, + { + "epoch": 2.533152909336942, + "grad_norm": 0.34755681876113453, + "learning_rate": 7.199190766135001e-07, + "loss": 0.0169, + "step": 3744 + }, + { + "epoch": 2.53382949932341, + "grad_norm": 0.29591709662706916, + "learning_rate": 7.178852616156262e-07, + "loss": 0.0223, + "step": 3745 + }, + { + "epoch": 2.534506089309878, + "grad_norm": 0.38133263415756624, + "learning_rate": 7.158541012980813e-07, + "loss": 0.0189, + "step": 3746 + }, + { + "epoch": 2.5351826792963466, + "grad_norm": 0.4194238768991091, + "learning_rate": 7.138255969200724e-07, + "loss": 0.0273, + "step": 3747 + }, + { + "epoch": 2.5358592692828146, + "grad_norm": 0.27822039690481304, + "learning_rate": 7.117997497391648e-07, + "loss": 0.0168, + "step": 3748 + }, + { + "epoch": 2.536535859269283, + "grad_norm": 0.39759140555950817, + "learning_rate": 7.097765610112745e-07, + "loss": 0.023, + "step": 3749 + }, + { + "epoch": 2.537212449255751, + "grad_norm": 0.4016877400509333, + "learning_rate": 7.077560319906696e-07, + "loss": 0.0283, + "step": 3750 + }, + { + "epoch": 2.537889039242219, + "grad_norm": 0.3758635653520881, + "learning_rate": 7.057381639299693e-07, + "loss": 0.0157, + "step": 3751 + }, + { + "epoch": 2.5385656292286876, + "grad_norm": 0.2847335457232802, + "learning_rate": 7.037229580801414e-07, + "loss": 0.0133, + "step": 3752 + }, + { + "epoch": 2.5392422192151556, + "grad_norm": 0.38036657443586286, + "learning_rate": 7.017104156905058e-07, + "loss": 0.0249, + "step": 3753 + }, + { + "epoch": 2.539918809201624, + "grad_norm": 0.3057178712858743, + "learning_rate": 6.997005380087301e-07, + "loss": 0.0214, + "step": 3754 + }, + { + "epoch": 2.540595399188092, + "grad_norm": 0.35050630154278756, + "learning_rate": 6.976933262808322e-07, + "loss": 0.0186, + "step": 3755 + }, + { + "epoch": 2.54127198917456, + "grad_norm": 0.24373961201498015, + "learning_rate": 6.95688781751172e-07, + "loss": 0.0117, + "step": 3756 + }, + { + "epoch": 2.5419485791610286, + "grad_norm": 0.3235327242981872, + "learning_rate": 6.936869056624623e-07, + "loss": 0.0157, + "step": 3757 + }, + { + "epoch": 2.5426251691474966, + "grad_norm": 0.30609144696792684, + "learning_rate": 6.916876992557553e-07, + "loss": 0.017, + "step": 3758 + }, + { + "epoch": 2.543301759133965, + "grad_norm": 0.22594102548472556, + "learning_rate": 6.896911637704534e-07, + "loss": 0.0132, + "step": 3759 + }, + { + "epoch": 2.543978349120433, + "grad_norm": 0.5191390673623143, + "learning_rate": 6.876973004442988e-07, + "loss": 0.0216, + "step": 3760 + }, + { + "epoch": 2.544654939106901, + "grad_norm": 0.2796887288244487, + "learning_rate": 6.85706110513381e-07, + "loss": 0.0206, + "step": 3761 + }, + { + "epoch": 2.5453315290933696, + "grad_norm": 0.3677716090530408, + "learning_rate": 6.837175952121305e-07, + "loss": 0.0214, + "step": 3762 + }, + { + "epoch": 2.5460081190798376, + "grad_norm": 0.32068769101337385, + "learning_rate": 6.8173175577332e-07, + "loss": 0.0186, + "step": 3763 + }, + { + "epoch": 2.546684709066306, + "grad_norm": 0.41819358878408824, + "learning_rate": 6.797485934280618e-07, + "loss": 0.0238, + "step": 3764 + }, + { + "epoch": 2.547361299052774, + "grad_norm": 0.3896390142288087, + "learning_rate": 6.777681094058087e-07, + "loss": 0.0214, + "step": 3765 + }, + { + "epoch": 2.548037889039242, + "grad_norm": 0.30481597134225336, + "learning_rate": 6.757903049343556e-07, + "loss": 0.0136, + "step": 3766 + }, + { + "epoch": 2.5487144790257106, + "grad_norm": 0.30709409095919515, + "learning_rate": 6.738151812398353e-07, + "loss": 0.0186, + "step": 3767 + }, + { + "epoch": 2.5493910690121786, + "grad_norm": 0.2751161121270378, + "learning_rate": 6.718427395467165e-07, + "loss": 0.0136, + "step": 3768 + }, + { + "epoch": 2.550067658998647, + "grad_norm": 0.36730811831170196, + "learning_rate": 6.698729810778065e-07, + "loss": 0.0229, + "step": 3769 + }, + { + "epoch": 2.550744248985115, + "grad_norm": 0.3901367916758139, + "learning_rate": 6.67905907054251e-07, + "loss": 0.0196, + "step": 3770 + }, + { + "epoch": 2.551420838971583, + "grad_norm": 0.3218881770971386, + "learning_rate": 6.659415186955298e-07, + "loss": 0.0166, + "step": 3771 + }, + { + "epoch": 2.5520974289580516, + "grad_norm": 0.2959047106763243, + "learning_rate": 6.639798172194567e-07, + "loss": 0.0152, + "step": 3772 + }, + { + "epoch": 2.5527740189445196, + "grad_norm": 0.41578956702986747, + "learning_rate": 6.620208038421805e-07, + "loss": 0.0217, + "step": 3773 + }, + { + "epoch": 2.553450608930988, + "grad_norm": 0.29811809542579076, + "learning_rate": 6.600644797781847e-07, + "loss": 0.0214, + "step": 3774 + }, + { + "epoch": 2.554127198917456, + "grad_norm": 0.364721312291806, + "learning_rate": 6.581108462402847e-07, + "loss": 0.0303, + "step": 3775 + }, + { + "epoch": 2.554803788903924, + "grad_norm": 0.3529172118128871, + "learning_rate": 6.561599044396288e-07, + "loss": 0.0184, + "step": 3776 + }, + { + "epoch": 2.555480378890392, + "grad_norm": 0.3152404408119672, + "learning_rate": 6.542116555856953e-07, + "loss": 0.0189, + "step": 3777 + }, + { + "epoch": 2.5561569688768606, + "grad_norm": 0.37995639828497735, + "learning_rate": 6.522661008862918e-07, + "loss": 0.0258, + "step": 3778 + }, + { + "epoch": 2.556833558863329, + "grad_norm": 0.34963033118812814, + "learning_rate": 6.503232415475591e-07, + "loss": 0.0241, + "step": 3779 + }, + { + "epoch": 2.557510148849797, + "grad_norm": 0.26705100509969, + "learning_rate": 6.483830787739659e-07, + "loss": 0.0154, + "step": 3780 + }, + { + "epoch": 2.558186738836265, + "grad_norm": 0.3153615336651123, + "learning_rate": 6.464456137683061e-07, + "loss": 0.019, + "step": 3781 + }, + { + "epoch": 2.558863328822733, + "grad_norm": 0.32949469173911516, + "learning_rate": 6.445108477317046e-07, + "loss": 0.0228, + "step": 3782 + }, + { + "epoch": 2.5595399188092016, + "grad_norm": 0.299929681374004, + "learning_rate": 6.425787818636131e-07, + "loss": 0.0133, + "step": 3783 + }, + { + "epoch": 2.56021650879567, + "grad_norm": 0.34799625123696276, + "learning_rate": 6.406494173618083e-07, + "loss": 0.0209, + "step": 3784 + }, + { + "epoch": 2.560893098782138, + "grad_norm": 0.33894449678548877, + "learning_rate": 6.387227554223918e-07, + "loss": 0.0203, + "step": 3785 + }, + { + "epoch": 2.561569688768606, + "grad_norm": 0.3136589437938693, + "learning_rate": 6.367987972397887e-07, + "loss": 0.0157, + "step": 3786 + }, + { + "epoch": 2.562246278755074, + "grad_norm": 0.3658814669623945, + "learning_rate": 6.348775440067507e-07, + "loss": 0.0233, + "step": 3787 + }, + { + "epoch": 2.5629228687415426, + "grad_norm": 0.302550822795212, + "learning_rate": 6.329589969143518e-07, + "loss": 0.0167, + "step": 3788 + }, + { + "epoch": 2.563599458728011, + "grad_norm": 0.44812161415334467, + "learning_rate": 6.310431571519865e-07, + "loss": 0.0174, + "step": 3789 + }, + { + "epoch": 2.564276048714479, + "grad_norm": 0.33489522778398995, + "learning_rate": 6.291300259073724e-07, + "loss": 0.0198, + "step": 3790 + }, + { + "epoch": 2.564952638700947, + "grad_norm": 0.39341720529773727, + "learning_rate": 6.27219604366549e-07, + "loss": 0.026, + "step": 3791 + }, + { + "epoch": 2.565629228687415, + "grad_norm": 0.30574329808242695, + "learning_rate": 6.25311893713873e-07, + "loss": 0.016, + "step": 3792 + }, + { + "epoch": 2.5663058186738836, + "grad_norm": 0.23251544683637546, + "learning_rate": 6.234068951320243e-07, + "loss": 0.0107, + "step": 3793 + }, + { + "epoch": 2.566982408660352, + "grad_norm": 0.3135740981894931, + "learning_rate": 6.215046098019967e-07, + "loss": 0.016, + "step": 3794 + }, + { + "epoch": 2.56765899864682, + "grad_norm": 0.24331517761875804, + "learning_rate": 6.196050389031061e-07, + "loss": 0.0119, + "step": 3795 + }, + { + "epoch": 2.568335588633288, + "grad_norm": 0.25651413300338116, + "learning_rate": 6.177081836129833e-07, + "loss": 0.0171, + "step": 3796 + }, + { + "epoch": 2.569012178619756, + "grad_norm": 0.30903109530849543, + "learning_rate": 6.158140451075794e-07, + "loss": 0.0196, + "step": 3797 + }, + { + "epoch": 2.5696887686062246, + "grad_norm": 0.28212594949742376, + "learning_rate": 6.139226245611535e-07, + "loss": 0.0177, + "step": 3798 + }, + { + "epoch": 2.5703653585926927, + "grad_norm": 0.3398177304051992, + "learning_rate": 6.120339231462862e-07, + "loss": 0.0329, + "step": 3799 + }, + { + "epoch": 2.571041948579161, + "grad_norm": 0.35122524012297607, + "learning_rate": 6.101479420338713e-07, + "loss": 0.019, + "step": 3800 + }, + { + "epoch": 2.571718538565629, + "grad_norm": 0.30860813441030255, + "learning_rate": 6.082646823931165e-07, + "loss": 0.0191, + "step": 3801 + }, + { + "epoch": 2.572395128552097, + "grad_norm": 0.31340422768397574, + "learning_rate": 6.063841453915381e-07, + "loss": 0.0222, + "step": 3802 + }, + { + "epoch": 2.5730717185385656, + "grad_norm": 0.28261952362490333, + "learning_rate": 6.045063321949696e-07, + "loss": 0.018, + "step": 3803 + }, + { + "epoch": 2.5737483085250337, + "grad_norm": 0.3929177836615564, + "learning_rate": 6.026312439675553e-07, + "loss": 0.0268, + "step": 3804 + }, + { + "epoch": 2.574424898511502, + "grad_norm": 0.3149803959614211, + "learning_rate": 6.007588818717458e-07, + "loss": 0.016, + "step": 3805 + }, + { + "epoch": 2.57510148849797, + "grad_norm": 0.2584046503682228, + "learning_rate": 5.988892470683072e-07, + "loss": 0.0136, + "step": 3806 + }, + { + "epoch": 2.575778078484438, + "grad_norm": 0.24415284472503188, + "learning_rate": 5.9702234071631e-07, + "loss": 0.0111, + "step": 3807 + }, + { + "epoch": 2.5764546684709067, + "grad_norm": 0.3517556331948916, + "learning_rate": 5.951581639731374e-07, + "loss": 0.0215, + "step": 3808 + }, + { + "epoch": 2.5771312584573747, + "grad_norm": 0.37205934687573333, + "learning_rate": 5.932967179944788e-07, + "loss": 0.0147, + "step": 3809 + }, + { + "epoch": 2.577807848443843, + "grad_norm": 0.36745162922669494, + "learning_rate": 5.914380039343281e-07, + "loss": 0.0244, + "step": 3810 + }, + { + "epoch": 2.578484438430311, + "grad_norm": 0.3069121104010219, + "learning_rate": 5.895820229449906e-07, + "loss": 0.0162, + "step": 3811 + }, + { + "epoch": 2.579161028416779, + "grad_norm": 0.39617489864023986, + "learning_rate": 5.877287761770717e-07, + "loss": 0.0178, + "step": 3812 + }, + { + "epoch": 2.5798376184032477, + "grad_norm": 0.23492464813043765, + "learning_rate": 5.858782647794864e-07, + "loss": 0.0132, + "step": 3813 + }, + { + "epoch": 2.5805142083897157, + "grad_norm": 0.3715920389620766, + "learning_rate": 5.84030489899452e-07, + "loss": 0.021, + "step": 3814 + }, + { + "epoch": 2.581190798376184, + "grad_norm": 0.3444343101060036, + "learning_rate": 5.821854526824883e-07, + "loss": 0.0136, + "step": 3815 + }, + { + "epoch": 2.581867388362652, + "grad_norm": 0.32409626370097167, + "learning_rate": 5.803431542724192e-07, + "loss": 0.0211, + "step": 3816 + }, + { + "epoch": 2.58254397834912, + "grad_norm": 0.2916593852312832, + "learning_rate": 5.785035958113717e-07, + "loss": 0.019, + "step": 3817 + }, + { + "epoch": 2.5832205683355887, + "grad_norm": 0.3729373264065649, + "learning_rate": 5.766667784397706e-07, + "loss": 0.0155, + "step": 3818 + }, + { + "epoch": 2.5838971583220567, + "grad_norm": 0.30098753602743283, + "learning_rate": 5.748327032963464e-07, + "loss": 0.0118, + "step": 3819 + }, + { + "epoch": 2.584573748308525, + "grad_norm": 0.3989803538788793, + "learning_rate": 5.730013715181238e-07, + "loss": 0.0268, + "step": 3820 + }, + { + "epoch": 2.585250338294993, + "grad_norm": 0.40759638105888774, + "learning_rate": 5.711727842404319e-07, + "loss": 0.0216, + "step": 3821 + }, + { + "epoch": 2.585926928281461, + "grad_norm": 0.4623888636214892, + "learning_rate": 5.693469425968962e-07, + "loss": 0.0176, + "step": 3822 + }, + { + "epoch": 2.5866035182679297, + "grad_norm": 0.2533420398951006, + "learning_rate": 5.675238477194389e-07, + "loss": 0.0128, + "step": 3823 + }, + { + "epoch": 2.5872801082543977, + "grad_norm": 0.2778696966059566, + "learning_rate": 5.657035007382822e-07, + "loss": 0.0183, + "step": 3824 + }, + { + "epoch": 2.587956698240866, + "grad_norm": 0.2707644682041514, + "learning_rate": 5.63885902781941e-07, + "loss": 0.0145, + "step": 3825 + }, + { + "epoch": 2.588633288227334, + "grad_norm": 0.319753787554978, + "learning_rate": 5.620710549772295e-07, + "loss": 0.0153, + "step": 3826 + }, + { + "epoch": 2.589309878213802, + "grad_norm": 0.2499952247532181, + "learning_rate": 5.602589584492563e-07, + "loss": 0.0133, + "step": 3827 + }, + { + "epoch": 2.5899864682002707, + "grad_norm": 0.32337333650405664, + "learning_rate": 5.584496143214213e-07, + "loss": 0.016, + "step": 3828 + }, + { + "epoch": 2.5906630581867387, + "grad_norm": 0.3326756489389962, + "learning_rate": 5.566430237154219e-07, + "loss": 0.017, + "step": 3829 + }, + { + "epoch": 2.591339648173207, + "grad_norm": 0.30600092409383467, + "learning_rate": 5.548391877512471e-07, + "loss": 0.0146, + "step": 3830 + }, + { + "epoch": 2.592016238159675, + "grad_norm": 0.2859905917314146, + "learning_rate": 5.530381075471775e-07, + "loss": 0.0138, + "step": 3831 + }, + { + "epoch": 2.592692828146143, + "grad_norm": 0.43210430557751284, + "learning_rate": 5.512397842197847e-07, + "loss": 0.0199, + "step": 3832 + }, + { + "epoch": 2.5933694181326117, + "grad_norm": 0.2575695695965481, + "learning_rate": 5.494442188839333e-07, + "loss": 0.014, + "step": 3833 + }, + { + "epoch": 2.5940460081190797, + "grad_norm": 0.4279612494651425, + "learning_rate": 5.476514126527771e-07, + "loss": 0.0194, + "step": 3834 + }, + { + "epoch": 2.594722598105548, + "grad_norm": 0.30195321132602454, + "learning_rate": 5.458613666377599e-07, + "loss": 0.0209, + "step": 3835 + }, + { + "epoch": 2.595399188092016, + "grad_norm": 0.6498733864084568, + "learning_rate": 5.440740819486123e-07, + "loss": 0.0294, + "step": 3836 + }, + { + "epoch": 2.596075778078484, + "grad_norm": 0.2808968561263005, + "learning_rate": 5.422895596933559e-07, + "loss": 0.0137, + "step": 3837 + }, + { + "epoch": 2.5967523680649527, + "grad_norm": 0.3737479669148715, + "learning_rate": 5.405078009782966e-07, + "loss": 0.0169, + "step": 3838 + }, + { + "epoch": 2.5974289580514207, + "grad_norm": 0.4120158920436347, + "learning_rate": 5.387288069080298e-07, + "loss": 0.0254, + "step": 3839 + }, + { + "epoch": 2.598105548037889, + "grad_norm": 0.2961257258795956, + "learning_rate": 5.369525785854368e-07, + "loss": 0.0166, + "step": 3840 + }, + { + "epoch": 2.598782138024357, + "grad_norm": 0.32195093334073316, + "learning_rate": 5.351791171116815e-07, + "loss": 0.0178, + "step": 3841 + }, + { + "epoch": 2.5994587280108252, + "grad_norm": 0.36623583464451537, + "learning_rate": 5.334084235862158e-07, + "loss": 0.0213, + "step": 3842 + }, + { + "epoch": 2.6001353179972937, + "grad_norm": 0.30841867725387645, + "learning_rate": 5.316404991067747e-07, + "loss": 0.0191, + "step": 3843 + }, + { + "epoch": 2.6008119079837617, + "grad_norm": 0.4489018552982301, + "learning_rate": 5.29875344769375e-07, + "loss": 0.0266, + "step": 3844 + }, + { + "epoch": 2.60148849797023, + "grad_norm": 0.3270851138595341, + "learning_rate": 5.281129616683167e-07, + "loss": 0.0178, + "step": 3845 + }, + { + "epoch": 2.602165087956698, + "grad_norm": 0.26661414593490385, + "learning_rate": 5.263533508961827e-07, + "loss": 0.0123, + "step": 3846 + }, + { + "epoch": 2.6028416779431662, + "grad_norm": 0.4057171635880116, + "learning_rate": 5.24596513543838e-07, + "loss": 0.0269, + "step": 3847 + }, + { + "epoch": 2.6035182679296347, + "grad_norm": 0.24243674863329226, + "learning_rate": 5.228424507004265e-07, + "loss": 0.0125, + "step": 3848 + }, + { + "epoch": 2.6041948579161027, + "grad_norm": 0.2859036518229947, + "learning_rate": 5.210911634533722e-07, + "loss": 0.0176, + "step": 3849 + }, + { + "epoch": 2.604871447902571, + "grad_norm": 0.30940377964776217, + "learning_rate": 5.193426528883788e-07, + "loss": 0.0184, + "step": 3850 + }, + { + "epoch": 2.605548037889039, + "grad_norm": 0.41125207699366584, + "learning_rate": 5.175969200894293e-07, + "loss": 0.0283, + "step": 3851 + }, + { + "epoch": 2.6062246278755072, + "grad_norm": 0.24518511219965616, + "learning_rate": 5.15853966138784e-07, + "loss": 0.0111, + "step": 3852 + }, + { + "epoch": 2.6069012178619757, + "grad_norm": 0.28153827896785844, + "learning_rate": 5.141137921169792e-07, + "loss": 0.0165, + "step": 3853 + }, + { + "epoch": 2.6075778078484437, + "grad_norm": 0.5510704796068311, + "learning_rate": 5.123763991028291e-07, + "loss": 0.0177, + "step": 3854 + }, + { + "epoch": 2.608254397834912, + "grad_norm": 0.33342472568412335, + "learning_rate": 5.106417881734244e-07, + "loss": 0.0212, + "step": 3855 + }, + { + "epoch": 2.60893098782138, + "grad_norm": 0.3150637932340437, + "learning_rate": 5.089099604041314e-07, + "loss": 0.0154, + "step": 3856 + }, + { + "epoch": 2.6096075778078482, + "grad_norm": 0.30085535857761864, + "learning_rate": 5.071809168685887e-07, + "loss": 0.0185, + "step": 3857 + }, + { + "epoch": 2.6102841677943167, + "grad_norm": 0.28791112855982204, + "learning_rate": 5.054546586387093e-07, + "loss": 0.0176, + "step": 3858 + }, + { + "epoch": 2.6109607577807847, + "grad_norm": 0.34472367636173984, + "learning_rate": 5.037311867846817e-07, + "loss": 0.019, + "step": 3859 + }, + { + "epoch": 2.611637347767253, + "grad_norm": 0.33585988543813833, + "learning_rate": 5.020105023749644e-07, + "loss": 0.0203, + "step": 3860 + }, + { + "epoch": 2.6123139377537212, + "grad_norm": 0.34803229829542065, + "learning_rate": 5.002926064762908e-07, + "loss": 0.0181, + "step": 3861 + }, + { + "epoch": 2.6129905277401893, + "grad_norm": 0.3991988805851519, + "learning_rate": 4.985775001536619e-07, + "loss": 0.0209, + "step": 3862 + }, + { + "epoch": 2.6136671177266577, + "grad_norm": 0.40529106287499, + "learning_rate": 4.968651844703514e-07, + "loss": 0.0218, + "step": 3863 + }, + { + "epoch": 2.6143437077131257, + "grad_norm": 0.32193538848448844, + "learning_rate": 4.951556604879049e-07, + "loss": 0.0202, + "step": 3864 + }, + { + "epoch": 2.615020297699594, + "grad_norm": 0.30364844266341456, + "learning_rate": 4.934489292661326e-07, + "loss": 0.0178, + "step": 3865 + }, + { + "epoch": 2.6156968876860622, + "grad_norm": 0.29246687146878775, + "learning_rate": 4.917449918631162e-07, + "loss": 0.016, + "step": 3866 + }, + { + "epoch": 2.6163734776725303, + "grad_norm": 0.34378482284101136, + "learning_rate": 4.900438493352056e-07, + "loss": 0.0212, + "step": 3867 + }, + { + "epoch": 2.6170500676589987, + "grad_norm": 0.3659490933303389, + "learning_rate": 4.883455027370171e-07, + "loss": 0.0207, + "step": 3868 + }, + { + "epoch": 2.6177266576454667, + "grad_norm": 0.23131728722454328, + "learning_rate": 4.866499531214353e-07, + "loss": 0.01, + "step": 3869 + }, + { + "epoch": 2.618403247631935, + "grad_norm": 0.2980442039402375, + "learning_rate": 4.849572015396081e-07, + "loss": 0.0172, + "step": 3870 + }, + { + "epoch": 2.6190798376184032, + "grad_norm": 0.39373854818566745, + "learning_rate": 4.832672490409513e-07, + "loss": 0.0227, + "step": 3871 + }, + { + "epoch": 2.6197564276048713, + "grad_norm": 0.29211987408626305, + "learning_rate": 4.815800966731432e-07, + "loss": 0.0125, + "step": 3872 + }, + { + "epoch": 2.6204330175913397, + "grad_norm": 0.37663135841547213, + "learning_rate": 4.798957454821285e-07, + "loss": 0.018, + "step": 3873 + }, + { + "epoch": 2.6211096075778078, + "grad_norm": 0.3709047479395079, + "learning_rate": 4.782141965121129e-07, + "loss": 0.0223, + "step": 3874 + }, + { + "epoch": 2.621786197564276, + "grad_norm": 0.3376669800648237, + "learning_rate": 4.7653545080556694e-07, + "loss": 0.0216, + "step": 3875 + }, + { + "epoch": 2.6224627875507442, + "grad_norm": 0.40515608442775186, + "learning_rate": 4.748595094032221e-07, + "loss": 0.0141, + "step": 3876 + }, + { + "epoch": 2.6231393775372123, + "grad_norm": 0.3988322929438632, + "learning_rate": 4.7318637334407335e-07, + "loss": 0.018, + "step": 3877 + }, + { + "epoch": 2.6238159675236807, + "grad_norm": 0.3295221068990801, + "learning_rate": 4.715160436653732e-07, + "loss": 0.0205, + "step": 3878 + }, + { + "epoch": 2.6244925575101488, + "grad_norm": 0.3369310092609611, + "learning_rate": 4.698485214026349e-07, + "loss": 0.0233, + "step": 3879 + }, + { + "epoch": 2.6251691474966172, + "grad_norm": 0.4172184697329247, + "learning_rate": 4.6818380758963445e-07, + "loss": 0.0147, + "step": 3880 + }, + { + "epoch": 2.6258457374830853, + "grad_norm": 0.2769336173766099, + "learning_rate": 4.6652190325840396e-07, + "loss": 0.0141, + "step": 3881 + }, + { + "epoch": 2.6265223274695533, + "grad_norm": 0.34232015925707115, + "learning_rate": 4.6486280943923547e-07, + "loss": 0.0176, + "step": 3882 + }, + { + "epoch": 2.6271989174560217, + "grad_norm": 0.3316339782534317, + "learning_rate": 4.632065271606756e-07, + "loss": 0.0182, + "step": 3883 + }, + { + "epoch": 2.6278755074424898, + "grad_norm": 0.2813458272506469, + "learning_rate": 4.615530574495325e-07, + "loss": 0.0155, + "step": 3884 + }, + { + "epoch": 2.6285520974289582, + "grad_norm": 0.28417227930603656, + "learning_rate": 4.5990240133086617e-07, + "loss": 0.0151, + "step": 3885 + }, + { + "epoch": 2.6292286874154263, + "grad_norm": 0.2888775215342934, + "learning_rate": 4.582545598279964e-07, + "loss": 0.0132, + "step": 3886 + }, + { + "epoch": 2.6299052774018943, + "grad_norm": 0.33495339114295725, + "learning_rate": 4.566095339624943e-07, + "loss": 0.0135, + "step": 3887 + }, + { + "epoch": 2.6305818673883627, + "grad_norm": 0.3677816456633802, + "learning_rate": 4.549673247541875e-07, + "loss": 0.0213, + "step": 3888 + }, + { + "epoch": 2.6312584573748308, + "grad_norm": 0.2877152983626595, + "learning_rate": 4.533279332211582e-07, + "loss": 0.0147, + "step": 3889 + }, + { + "epoch": 2.6319350473612992, + "grad_norm": 0.28540870328855866, + "learning_rate": 4.516913603797407e-07, + "loss": 0.0197, + "step": 3890 + }, + { + "epoch": 2.6326116373477673, + "grad_norm": 0.5090477039481519, + "learning_rate": 4.5005760724452173e-07, + "loss": 0.0226, + "step": 3891 + }, + { + "epoch": 2.6332882273342353, + "grad_norm": 0.3120565437034527, + "learning_rate": 4.484266748283389e-07, + "loss": 0.0211, + "step": 3892 + }, + { + "epoch": 2.6339648173207038, + "grad_norm": 0.2577547985077275, + "learning_rate": 4.4679856414228394e-07, + "loss": 0.0113, + "step": 3893 + }, + { + "epoch": 2.634641407307172, + "grad_norm": 0.3646085325861183, + "learning_rate": 4.4517327619569784e-07, + "loss": 0.0221, + "step": 3894 + }, + { + "epoch": 2.6353179972936402, + "grad_norm": 0.3798157236233289, + "learning_rate": 4.435508119961701e-07, + "loss": 0.0196, + "step": 3895 + }, + { + "epoch": 2.6359945872801083, + "grad_norm": 0.44490105371403355, + "learning_rate": 4.4193117254954174e-07, + "loss": 0.0181, + "step": 3896 + }, + { + "epoch": 2.6366711772665763, + "grad_norm": 0.2954257625147546, + "learning_rate": 4.403143588599029e-07, + "loss": 0.0118, + "step": 3897 + }, + { + "epoch": 2.6373477672530448, + "grad_norm": 0.28004517966788645, + "learning_rate": 4.387003719295896e-07, + "loss": 0.0149, + "step": 3898 + }, + { + "epoch": 2.638024357239513, + "grad_norm": 0.33120734676245644, + "learning_rate": 4.37089212759188e-07, + "loss": 0.0235, + "step": 3899 + }, + { + "epoch": 2.6387009472259813, + "grad_norm": 0.3608183612599865, + "learning_rate": 4.3548088234752814e-07, + "loss": 0.02, + "step": 3900 + }, + { + "epoch": 2.6393775372124493, + "grad_norm": 0.32828570395822737, + "learning_rate": 4.3387538169168905e-07, + "loss": 0.0151, + "step": 3901 + }, + { + "epoch": 2.6400541271989173, + "grad_norm": 0.3096144338983367, + "learning_rate": 4.322727117869951e-07, + "loss": 0.0247, + "step": 3902 + }, + { + "epoch": 2.6407307171853858, + "grad_norm": 0.36095470417773273, + "learning_rate": 4.3067287362701606e-07, + "loss": 0.0225, + "step": 3903 + }, + { + "epoch": 2.641407307171854, + "grad_norm": 0.3319403506732052, + "learning_rate": 4.2907586820356337e-07, + "loss": 0.0246, + "step": 3904 + }, + { + "epoch": 2.6420838971583223, + "grad_norm": 0.32482972799388027, + "learning_rate": 4.2748169650669524e-07, + "loss": 0.0174, + "step": 3905 + }, + { + "epoch": 2.6427604871447903, + "grad_norm": 0.32913880251857697, + "learning_rate": 4.258903595247116e-07, + "loss": 0.0176, + "step": 3906 + }, + { + "epoch": 2.6434370771312583, + "grad_norm": 0.2908676502386541, + "learning_rate": 4.2430185824415717e-07, + "loss": 0.0155, + "step": 3907 + }, + { + "epoch": 2.6441136671177268, + "grad_norm": 0.3098953513324127, + "learning_rate": 4.2271619364981474e-07, + "loss": 0.0181, + "step": 3908 + }, + { + "epoch": 2.644790257104195, + "grad_norm": 0.27277513724138214, + "learning_rate": 4.211333667247125e-07, + "loss": 0.0159, + "step": 3909 + }, + { + "epoch": 2.6454668470906633, + "grad_norm": 0.33789192453975786, + "learning_rate": 4.195533784501177e-07, + "loss": 0.0203, + "step": 3910 + }, + { + "epoch": 2.6461434370771313, + "grad_norm": 0.35939308429289896, + "learning_rate": 4.179762298055384e-07, + "loss": 0.0211, + "step": 3911 + }, + { + "epoch": 2.6468200270635993, + "grad_norm": 0.320411301639234, + "learning_rate": 4.164019217687215e-07, + "loss": 0.0146, + "step": 3912 + }, + { + "epoch": 2.647496617050068, + "grad_norm": 0.2362825884117557, + "learning_rate": 4.1483045531565183e-07, + "loss": 0.0091, + "step": 3913 + }, + { + "epoch": 2.648173207036536, + "grad_norm": 0.28708167376221655, + "learning_rate": 4.132618314205544e-07, + "loss": 0.0143, + "step": 3914 + }, + { + "epoch": 2.6488497970230043, + "grad_norm": 0.38186143626136787, + "learning_rate": 4.1169605105589315e-07, + "loss": 0.0321, + "step": 3915 + }, + { + "epoch": 2.6495263870094723, + "grad_norm": 0.3484998324938573, + "learning_rate": 4.101331151923649e-07, + "loss": 0.019, + "step": 3916 + }, + { + "epoch": 2.6502029769959403, + "grad_norm": 0.3209783144879589, + "learning_rate": 4.085730247989078e-07, + "loss": 0.0159, + "step": 3917 + }, + { + "epoch": 2.650879566982409, + "grad_norm": 0.27431039425284287, + "learning_rate": 4.070157808426928e-07, + "loss": 0.0142, + "step": 3918 + }, + { + "epoch": 2.651556156968877, + "grad_norm": 0.3111461728880971, + "learning_rate": 4.0546138428912694e-07, + "loss": 0.0211, + "step": 3919 + }, + { + "epoch": 2.6522327469553453, + "grad_norm": 0.337765663300153, + "learning_rate": 4.039098361018534e-07, + "loss": 0.018, + "step": 3920 + }, + { + "epoch": 2.6529093369418133, + "grad_norm": 0.2841292113821926, + "learning_rate": 4.0236113724274716e-07, + "loss": 0.0143, + "step": 3921 + }, + { + "epoch": 2.6535859269282813, + "grad_norm": 0.2911658918869312, + "learning_rate": 4.0081528867191854e-07, + "loss": 0.0165, + "step": 3922 + }, + { + "epoch": 2.65426251691475, + "grad_norm": 0.27203809441422655, + "learning_rate": 3.992722913477104e-07, + "loss": 0.0136, + "step": 3923 + }, + { + "epoch": 2.654939106901218, + "grad_norm": 0.5587079487321519, + "learning_rate": 3.9773214622669974e-07, + "loss": 0.0282, + "step": 3924 + }, + { + "epoch": 2.6556156968876863, + "grad_norm": 0.322198990516875, + "learning_rate": 3.9619485426369007e-07, + "loss": 0.0163, + "step": 3925 + }, + { + "epoch": 2.6562922868741543, + "grad_norm": 0.31230145293933687, + "learning_rate": 3.9466041641172126e-07, + "loss": 0.0214, + "step": 3926 + }, + { + "epoch": 2.6569688768606223, + "grad_norm": 0.31695084783290833, + "learning_rate": 3.9312883362206177e-07, + "loss": 0.0203, + "step": 3927 + }, + { + "epoch": 2.657645466847091, + "grad_norm": 0.42114465466333423, + "learning_rate": 3.916001068442116e-07, + "loss": 0.0176, + "step": 3928 + }, + { + "epoch": 2.658322056833559, + "grad_norm": 0.3724269800045378, + "learning_rate": 3.90074237025897e-07, + "loss": 0.0244, + "step": 3929 + }, + { + "epoch": 2.6589986468200273, + "grad_norm": 0.34608745347403674, + "learning_rate": 3.885512251130763e-07, + "loss": 0.0187, + "step": 3930 + }, + { + "epoch": 2.6596752368064953, + "grad_norm": 0.28587571147229535, + "learning_rate": 3.870310720499354e-07, + "loss": 0.0137, + "step": 3931 + }, + { + "epoch": 2.6603518267929633, + "grad_norm": 0.33921847814139083, + "learning_rate": 3.8551377877888487e-07, + "loss": 0.0127, + "step": 3932 + }, + { + "epoch": 2.661028416779432, + "grad_norm": 0.267265988150043, + "learning_rate": 3.839993462405678e-07, + "loss": 0.0176, + "step": 3933 + }, + { + "epoch": 2.6617050067659, + "grad_norm": 0.3617449862708202, + "learning_rate": 3.8248777537384763e-07, + "loss": 0.0223, + "step": 3934 + }, + { + "epoch": 2.6623815967523683, + "grad_norm": 0.8014209907018301, + "learning_rate": 3.8097906711581864e-07, + "loss": 0.025, + "step": 3935 + }, + { + "epoch": 2.6630581867388363, + "grad_norm": 0.3736165538060828, + "learning_rate": 3.794732224017994e-07, + "loss": 0.0193, + "step": 3936 + }, + { + "epoch": 2.6637347767253043, + "grad_norm": 0.4260184161980021, + "learning_rate": 3.7797024216533143e-07, + "loss": 0.0242, + "step": 3937 + }, + { + "epoch": 2.664411366711773, + "grad_norm": 0.3472416873892331, + "learning_rate": 3.764701273381799e-07, + "loss": 0.0188, + "step": 3938 + }, + { + "epoch": 2.665087956698241, + "grad_norm": 0.35323929395202686, + "learning_rate": 3.7497287885033763e-07, + "loss": 0.0217, + "step": 3939 + }, + { + "epoch": 2.6657645466847093, + "grad_norm": 0.40594597062584553, + "learning_rate": 3.734784976300165e-07, + "loss": 0.0269, + "step": 3940 + }, + { + "epoch": 2.6664411366711773, + "grad_norm": 0.44665833491750784, + "learning_rate": 3.719869846036539e-07, + "loss": 0.0305, + "step": 3941 + }, + { + "epoch": 2.6671177266576453, + "grad_norm": 0.30024012099265956, + "learning_rate": 3.7049834069590507e-07, + "loss": 0.0182, + "step": 3942 + }, + { + "epoch": 2.667794316644114, + "grad_norm": 0.3269498330187841, + "learning_rate": 3.6901256682965123e-07, + "loss": 0.0171, + "step": 3943 + }, + { + "epoch": 2.668470906630582, + "grad_norm": 0.36432034507712474, + "learning_rate": 3.675296639259912e-07, + "loss": 0.0178, + "step": 3944 + }, + { + "epoch": 2.6691474966170503, + "grad_norm": 0.40697105264204897, + "learning_rate": 3.6604963290424453e-07, + "loss": 0.028, + "step": 3945 + }, + { + "epoch": 2.6698240866035183, + "grad_norm": 0.2819036122469473, + "learning_rate": 3.6457247468195233e-07, + "loss": 0.0144, + "step": 3946 + }, + { + "epoch": 2.6705006765899864, + "grad_norm": 0.3322224073131182, + "learning_rate": 3.6309819017487034e-07, + "loss": 0.0139, + "step": 3947 + }, + { + "epoch": 2.671177266576455, + "grad_norm": 0.2542359632629604, + "learning_rate": 3.6162678029697696e-07, + "loss": 0.0129, + "step": 3948 + }, + { + "epoch": 2.671853856562923, + "grad_norm": 0.3018835528992928, + "learning_rate": 3.60158245960468e-07, + "loss": 0.0153, + "step": 3949 + }, + { + "epoch": 2.6725304465493913, + "grad_norm": 0.31663925709287366, + "learning_rate": 3.5869258807575414e-07, + "loss": 0.0184, + "step": 3950 + }, + { + "epoch": 2.6732070365358593, + "grad_norm": 0.4249759802609871, + "learning_rate": 3.572298075514652e-07, + "loss": 0.0277, + "step": 3951 + }, + { + "epoch": 2.6738836265223274, + "grad_norm": 0.2824209969784279, + "learning_rate": 3.557699052944447e-07, + "loss": 0.0145, + "step": 3952 + }, + { + "epoch": 2.674560216508796, + "grad_norm": 0.32105438490120597, + "learning_rate": 3.5431288220975466e-07, + "loss": 0.0248, + "step": 3953 + }, + { + "epoch": 2.675236806495264, + "grad_norm": 0.3317638633088956, + "learning_rate": 3.528587392006716e-07, + "loss": 0.0146, + "step": 3954 + }, + { + "epoch": 2.6759133964817323, + "grad_norm": 0.3247865928920339, + "learning_rate": 3.5140747716868375e-07, + "loss": 0.0161, + "step": 3955 + }, + { + "epoch": 2.6765899864682003, + "grad_norm": 0.34873860460927325, + "learning_rate": 3.499590970134964e-07, + "loss": 0.0192, + "step": 3956 + }, + { + "epoch": 2.6772665764546684, + "grad_norm": 0.3032937689202816, + "learning_rate": 3.48513599633028e-07, + "loss": 0.0164, + "step": 3957 + }, + { + "epoch": 2.677943166441137, + "grad_norm": 0.3220608890504641, + "learning_rate": 3.470709859234084e-07, + "loss": 0.0145, + "step": 3958 + }, + { + "epoch": 2.678619756427605, + "grad_norm": 0.2921901563624883, + "learning_rate": 3.4563125677897936e-07, + "loss": 0.0131, + "step": 3959 + }, + { + "epoch": 2.6792963464140733, + "grad_norm": 0.35235928759794943, + "learning_rate": 3.4419441309229587e-07, + "loss": 0.021, + "step": 3960 + }, + { + "epoch": 2.6799729364005414, + "grad_norm": 0.3240001104687845, + "learning_rate": 3.427604557541242e-07, + "loss": 0.0215, + "step": 3961 + }, + { + "epoch": 2.6806495263870094, + "grad_norm": 0.36378012234467383, + "learning_rate": 3.4132938565344054e-07, + "loss": 0.0197, + "step": 3962 + }, + { + "epoch": 2.6813261163734774, + "grad_norm": 0.42436223450235305, + "learning_rate": 3.3990120367743074e-07, + "loss": 0.0289, + "step": 3963 + }, + { + "epoch": 2.682002706359946, + "grad_norm": 0.4797539008409651, + "learning_rate": 3.38475910711491e-07, + "loss": 0.0279, + "step": 3964 + }, + { + "epoch": 2.6826792963464143, + "grad_norm": 0.5036292250663403, + "learning_rate": 3.370535076392256e-07, + "loss": 0.028, + "step": 3965 + }, + { + "epoch": 2.6833558863328824, + "grad_norm": 0.3521912885566871, + "learning_rate": 3.356339953424481e-07, + "loss": 0.0175, + "step": 3966 + }, + { + "epoch": 2.6840324763193504, + "grad_norm": 0.34324717810799954, + "learning_rate": 3.342173747011801e-07, + "loss": 0.0148, + "step": 3967 + }, + { + "epoch": 2.6847090663058184, + "grad_norm": 0.2884494151212183, + "learning_rate": 3.3280364659364903e-07, + "loss": 0.0155, + "step": 3968 + }, + { + "epoch": 2.685385656292287, + "grad_norm": 0.27871209698815347, + "learning_rate": 3.313928118962906e-07, + "loss": 0.0185, + "step": 3969 + }, + { + "epoch": 2.6860622462787553, + "grad_norm": 0.31202318809896584, + "learning_rate": 3.299848714837473e-07, + "loss": 0.015, + "step": 3970 + }, + { + "epoch": 2.6867388362652234, + "grad_norm": 0.30238832288143436, + "learning_rate": 3.285798262288653e-07, + "loss": 0.019, + "step": 3971 + }, + { + "epoch": 2.6874154262516914, + "grad_norm": 0.31633936011316777, + "learning_rate": 3.271776770026963e-07, + "loss": 0.0196, + "step": 3972 + }, + { + "epoch": 2.6880920162381594, + "grad_norm": 0.2756780328798173, + "learning_rate": 3.2577842467449773e-07, + "loss": 0.0188, + "step": 3973 + }, + { + "epoch": 2.688768606224628, + "grad_norm": 0.27233306192939727, + "learning_rate": 3.243820701117306e-07, + "loss": 0.0146, + "step": 3974 + }, + { + "epoch": 2.6894451962110963, + "grad_norm": 0.2768605775656786, + "learning_rate": 3.229886141800609e-07, + "loss": 0.0196, + "step": 3975 + }, + { + "epoch": 2.6901217861975644, + "grad_norm": 0.3232662211606284, + "learning_rate": 3.2159805774335364e-07, + "loss": 0.0203, + "step": 3976 + }, + { + "epoch": 2.6907983761840324, + "grad_norm": 0.37823678373998143, + "learning_rate": 3.2021040166368145e-07, + "loss": 0.0366, + "step": 3977 + }, + { + "epoch": 2.6914749661705004, + "grad_norm": 0.40305361291705155, + "learning_rate": 3.18825646801314e-07, + "loss": 0.022, + "step": 3978 + }, + { + "epoch": 2.692151556156969, + "grad_norm": 0.4273099718053193, + "learning_rate": 3.174437940147268e-07, + "loss": 0.0329, + "step": 3979 + }, + { + "epoch": 2.6928281461434374, + "grad_norm": 0.29755498790040424, + "learning_rate": 3.160648441605918e-07, + "loss": 0.0162, + "step": 3980 + }, + { + "epoch": 2.6935047361299054, + "grad_norm": 0.2667826288793149, + "learning_rate": 3.146887980937852e-07, + "loss": 0.0191, + "step": 3981 + }, + { + "epoch": 2.6941813261163734, + "grad_norm": 0.2869237590396578, + "learning_rate": 3.133156566673806e-07, + "loss": 0.0138, + "step": 3982 + }, + { + "epoch": 2.6948579161028414, + "grad_norm": 0.31340279503148366, + "learning_rate": 3.119454207326533e-07, + "loss": 0.0187, + "step": 3983 + }, + { + "epoch": 2.69553450608931, + "grad_norm": 0.3644251122156195, + "learning_rate": 3.105780911390738e-07, + "loss": 0.022, + "step": 3984 + }, + { + "epoch": 2.696211096075778, + "grad_norm": 0.38640727503191297, + "learning_rate": 3.0921366873431337e-07, + "loss": 0.0213, + "step": 3985 + }, + { + "epoch": 2.6968876860622464, + "grad_norm": 0.4404023970252406, + "learning_rate": 3.0785215436423986e-07, + "loss": 0.0189, + "step": 3986 + }, + { + "epoch": 2.6975642760487144, + "grad_norm": 0.3390027575595125, + "learning_rate": 3.0649354887291927e-07, + "loss": 0.0184, + "step": 3987 + }, + { + "epoch": 2.6982408660351824, + "grad_norm": 0.34572300340973866, + "learning_rate": 3.05137853102615e-07, + "loss": 0.021, + "step": 3988 + }, + { + "epoch": 2.698917456021651, + "grad_norm": 0.33657469038263754, + "learning_rate": 3.037850678937831e-07, + "loss": 0.018, + "step": 3989 + }, + { + "epoch": 2.699594046008119, + "grad_norm": 0.3761462390392621, + "learning_rate": 3.0243519408507894e-07, + "loss": 0.0227, + "step": 3990 + }, + { + "epoch": 2.7002706359945874, + "grad_norm": 0.2847388699465433, + "learning_rate": 3.0108823251335183e-07, + "loss": 0.0178, + "step": 3991 + }, + { + "epoch": 2.7009472259810554, + "grad_norm": 0.32007823943767616, + "learning_rate": 2.997441840136445e-07, + "loss": 0.0142, + "step": 3992 + }, + { + "epoch": 2.7016238159675234, + "grad_norm": 0.4164781570430055, + "learning_rate": 2.984030494191942e-07, + "loss": 0.0179, + "step": 3993 + }, + { + "epoch": 2.702300405953992, + "grad_norm": 0.30710764810054547, + "learning_rate": 2.97064829561432e-07, + "loss": 0.0146, + "step": 3994 + }, + { + "epoch": 2.70297699594046, + "grad_norm": 0.3725245796358553, + "learning_rate": 2.957295252699832e-07, + "loss": 0.0255, + "step": 3995 + }, + { + "epoch": 2.7036535859269284, + "grad_norm": 0.31916801311574255, + "learning_rate": 2.9439713737266504e-07, + "loss": 0.021, + "step": 3996 + }, + { + "epoch": 2.7043301759133964, + "grad_norm": 0.34789644252839536, + "learning_rate": 2.930676666954846e-07, + "loss": 0.0186, + "step": 3997 + }, + { + "epoch": 2.7050067658998644, + "grad_norm": 0.3185141638540178, + "learning_rate": 2.917411140626425e-07, + "loss": 0.0202, + "step": 3998 + }, + { + "epoch": 2.705683355886333, + "grad_norm": 0.35599833079731513, + "learning_rate": 2.904174802965293e-07, + "loss": 0.0184, + "step": 3999 + }, + { + "epoch": 2.706359945872801, + "grad_norm": 0.3320771075498398, + "learning_rate": 2.8909676621772853e-07, + "loss": 0.0221, + "step": 4000 + }, + { + "epoch": 2.7070365358592694, + "grad_norm": 0.26379576526571435, + "learning_rate": 2.877789726450092e-07, + "loss": 0.0117, + "step": 4001 + }, + { + "epoch": 2.7077131258457374, + "grad_norm": 0.33358097165521583, + "learning_rate": 2.864641003953339e-07, + "loss": 0.0167, + "step": 4002 + }, + { + "epoch": 2.7083897158322054, + "grad_norm": 0.2729659485043568, + "learning_rate": 2.8515215028385223e-07, + "loss": 0.0145, + "step": 4003 + }, + { + "epoch": 2.709066305818674, + "grad_norm": 0.30138506793419817, + "learning_rate": 2.8384312312390306e-07, + "loss": 0.0179, + "step": 4004 + }, + { + "epoch": 2.709742895805142, + "grad_norm": 0.350962633376903, + "learning_rate": 2.8253701972701275e-07, + "loss": 0.0193, + "step": 4005 + }, + { + "epoch": 2.7104194857916104, + "grad_norm": 0.25997132874121065, + "learning_rate": 2.8123384090289307e-07, + "loss": 0.0164, + "step": 4006 + }, + { + "epoch": 2.7110960757780784, + "grad_norm": 0.3182245648724455, + "learning_rate": 2.799335874594461e-07, + "loss": 0.015, + "step": 4007 + }, + { + "epoch": 2.7117726657645465, + "grad_norm": 0.28796902178207934, + "learning_rate": 2.7863626020275867e-07, + "loss": 0.0169, + "step": 4008 + }, + { + "epoch": 2.712449255751015, + "grad_norm": 0.25975610658489845, + "learning_rate": 2.773418599371047e-07, + "loss": 0.0122, + "step": 4009 + }, + { + "epoch": 2.713125845737483, + "grad_norm": 0.47643332218771917, + "learning_rate": 2.7605038746494063e-07, + "loss": 0.0246, + "step": 4010 + }, + { + "epoch": 2.7138024357239514, + "grad_norm": 0.29446278438100165, + "learning_rate": 2.7476184358691206e-07, + "loss": 0.0175, + "step": 4011 + }, + { + "epoch": 2.7144790257104194, + "grad_norm": 0.27594452911980805, + "learning_rate": 2.7347622910184445e-07, + "loss": 0.0148, + "step": 4012 + }, + { + "epoch": 2.7151556156968875, + "grad_norm": 0.4369159147523556, + "learning_rate": 2.7219354480675144e-07, + "loss": 0.0315, + "step": 4013 + }, + { + "epoch": 2.715832205683356, + "grad_norm": 0.3572726531465084, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.0263, + "step": 4014 + }, + { + "epoch": 2.716508795669824, + "grad_norm": 0.3064147132424032, + "learning_rate": 2.696369699654489e-07, + "loss": 0.014, + "step": 4015 + }, + { + "epoch": 2.7171853856562924, + "grad_norm": 0.3252035525213185, + "learning_rate": 2.6836308100417874e-07, + "loss": 0.0201, + "step": 4016 + }, + { + "epoch": 2.7178619756427604, + "grad_norm": 0.36956768488070535, + "learning_rate": 2.670921254027592e-07, + "loss": 0.0278, + "step": 4017 + }, + { + "epoch": 2.7185385656292285, + "grad_norm": 0.33678750438956134, + "learning_rate": 2.6582410394911327e-07, + "loss": 0.0177, + "step": 4018 + }, + { + "epoch": 2.719215155615697, + "grad_norm": 0.3031224517435561, + "learning_rate": 2.6455901742934556e-07, + "loss": 0.0217, + "step": 4019 + }, + { + "epoch": 2.719891745602165, + "grad_norm": 0.3040656428382453, + "learning_rate": 2.6329686662774247e-07, + "loss": 0.016, + "step": 4020 + }, + { + "epoch": 2.7205683355886334, + "grad_norm": 0.3473243847101063, + "learning_rate": 2.620376523267698e-07, + "loss": 0.0206, + "step": 4021 + }, + { + "epoch": 2.7212449255751014, + "grad_norm": 0.39693332106447327, + "learning_rate": 2.6078137530707146e-07, + "loss": 0.0172, + "step": 4022 + }, + { + "epoch": 2.7219215155615695, + "grad_norm": 0.36349345616889783, + "learning_rate": 2.595280363474717e-07, + "loss": 0.0218, + "step": 4023 + }, + { + "epoch": 2.722598105548038, + "grad_norm": 0.3510368180734231, + "learning_rate": 2.582776362249739e-07, + "loss": 0.0173, + "step": 4024 + }, + { + "epoch": 2.723274695534506, + "grad_norm": 0.3009889770250133, + "learning_rate": 2.5703017571475755e-07, + "loss": 0.017, + "step": 4025 + }, + { + "epoch": 2.7239512855209744, + "grad_norm": 0.3231731108995585, + "learning_rate": 2.5578565559018276e-07, + "loss": 0.0182, + "step": 4026 + }, + { + "epoch": 2.7246278755074425, + "grad_norm": 0.425246576115556, + "learning_rate": 2.545440766227825e-07, + "loss": 0.0345, + "step": 4027 + }, + { + "epoch": 2.7253044654939105, + "grad_norm": 0.3699676156137622, + "learning_rate": 2.5330543958227036e-07, + "loss": 0.0195, + "step": 4028 + }, + { + "epoch": 2.725981055480379, + "grad_norm": 0.3979910221226708, + "learning_rate": 2.520697452365345e-07, + "loss": 0.0262, + "step": 4029 + }, + { + "epoch": 2.726657645466847, + "grad_norm": 0.3519156429332919, + "learning_rate": 2.508369943516387e-07, + "loss": 0.026, + "step": 4030 + }, + { + "epoch": 2.7273342354533154, + "grad_norm": 0.29848732385448223, + "learning_rate": 2.4960718769182214e-07, + "loss": 0.0186, + "step": 4031 + }, + { + "epoch": 2.7280108254397835, + "grad_norm": 0.47792407734513015, + "learning_rate": 2.483803260194978e-07, + "loss": 0.0191, + "step": 4032 + }, + { + "epoch": 2.7286874154262515, + "grad_norm": 0.4101167635055601, + "learning_rate": 2.4715641009525446e-07, + "loss": 0.0272, + "step": 4033 + }, + { + "epoch": 2.72936400541272, + "grad_norm": 0.28252063753987144, + "learning_rate": 2.459354406778547e-07, + "loss": 0.0147, + "step": 4034 + }, + { + "epoch": 2.730040595399188, + "grad_norm": 0.3063204855995186, + "learning_rate": 2.447174185242324e-07, + "loss": 0.0166, + "step": 4035 + }, + { + "epoch": 2.7307171853856564, + "grad_norm": 0.2621946755270409, + "learning_rate": 2.4350234438949625e-07, + "loss": 0.0139, + "step": 4036 + }, + { + "epoch": 2.7313937753721245, + "grad_norm": 0.6575602680490917, + "learning_rate": 2.4229021902692663e-07, + "loss": 0.0282, + "step": 4037 + }, + { + "epoch": 2.7320703653585925, + "grad_norm": 0.2779304144145073, + "learning_rate": 2.4108104318797674e-07, + "loss": 0.0153, + "step": 4038 + }, + { + "epoch": 2.732746955345061, + "grad_norm": 0.3329234905094126, + "learning_rate": 2.3987481762226984e-07, + "loss": 0.0161, + "step": 4039 + }, + { + "epoch": 2.733423545331529, + "grad_norm": 0.3040250237676759, + "learning_rate": 2.3867154307759986e-07, + "loss": 0.014, + "step": 4040 + }, + { + "epoch": 2.7341001353179974, + "grad_norm": 0.30396870746681326, + "learning_rate": 2.3747122029993296e-07, + "loss": 0.0195, + "step": 4041 + }, + { + "epoch": 2.7347767253044655, + "grad_norm": 0.30697060065287424, + "learning_rate": 2.3627385003340552e-07, + "loss": 0.0164, + "step": 4042 + }, + { + "epoch": 2.7354533152909335, + "grad_norm": 0.32791553905501825, + "learning_rate": 2.3507943302032045e-07, + "loss": 0.0163, + "step": 4043 + }, + { + "epoch": 2.736129905277402, + "grad_norm": 0.38213003884367064, + "learning_rate": 2.3388797000115427e-07, + "loss": 0.0187, + "step": 4044 + }, + { + "epoch": 2.73680649526387, + "grad_norm": 0.22295909082208143, + "learning_rate": 2.3269946171454727e-07, + "loss": 0.0104, + "step": 4045 + }, + { + "epoch": 2.7374830852503385, + "grad_norm": 0.34977044871127216, + "learning_rate": 2.3151390889731285e-07, + "loss": 0.02, + "step": 4046 + }, + { + "epoch": 2.7381596752368065, + "grad_norm": 0.3241616477307261, + "learning_rate": 2.3033131228442863e-07, + "loss": 0.0232, + "step": 4047 + }, + { + "epoch": 2.7388362652232745, + "grad_norm": 0.3648919069938753, + "learning_rate": 2.2915167260904092e-07, + "loss": 0.0242, + "step": 4048 + }, + { + "epoch": 2.739512855209743, + "grad_norm": 0.3734339835125443, + "learning_rate": 2.2797499060246253e-07, + "loss": 0.0214, + "step": 4049 + }, + { + "epoch": 2.740189445196211, + "grad_norm": 0.3434765124359413, + "learning_rate": 2.2680126699417383e-07, + "loss": 0.0235, + "step": 4050 + }, + { + "epoch": 2.7408660351826795, + "grad_norm": 0.32959240028144116, + "learning_rate": 2.256305025118194e-07, + "loss": 0.0213, + "step": 4051 + }, + { + "epoch": 2.7415426251691475, + "grad_norm": 0.3160533452250457, + "learning_rate": 2.244626978812109e-07, + "loss": 0.0169, + "step": 4052 + }, + { + "epoch": 2.7422192151556155, + "grad_norm": 0.8114519106917566, + "learning_rate": 2.2329785382632253e-07, + "loss": 0.0345, + "step": 4053 + }, + { + "epoch": 2.742895805142084, + "grad_norm": 0.3152431991332842, + "learning_rate": 2.2213597106929608e-07, + "loss": 0.0161, + "step": 4054 + }, + { + "epoch": 2.743572395128552, + "grad_norm": 0.3065261676246775, + "learning_rate": 2.2097705033043703e-07, + "loss": 0.0147, + "step": 4055 + }, + { + "epoch": 2.7442489851150205, + "grad_norm": 0.3535460570128464, + "learning_rate": 2.198210923282118e-07, + "loss": 0.0174, + "step": 4056 + }, + { + "epoch": 2.7449255751014885, + "grad_norm": 0.3520352465737316, + "learning_rate": 2.1866809777925323e-07, + "loss": 0.0177, + "step": 4057 + }, + { + "epoch": 2.7456021650879565, + "grad_norm": 0.33604005886941474, + "learning_rate": 2.1751806739835624e-07, + "loss": 0.0183, + "step": 4058 + }, + { + "epoch": 2.746278755074425, + "grad_norm": 0.26371486220756835, + "learning_rate": 2.163710018984766e-07, + "loss": 0.0132, + "step": 4059 + }, + { + "epoch": 2.746955345060893, + "grad_norm": 0.3041056847418416, + "learning_rate": 2.1522690199073382e-07, + "loss": 0.014, + "step": 4060 + }, + { + "epoch": 2.7476319350473615, + "grad_norm": 0.35208989016331793, + "learning_rate": 2.140857683844072e-07, + "loss": 0.0186, + "step": 4061 + }, + { + "epoch": 2.7483085250338295, + "grad_norm": 0.33364142321908025, + "learning_rate": 2.1294760178693918e-07, + "loss": 0.0176, + "step": 4062 + }, + { + "epoch": 2.7489851150202975, + "grad_norm": 0.2958162686569277, + "learning_rate": 2.118124029039309e-07, + "loss": 0.017, + "step": 4063 + }, + { + "epoch": 2.749661705006766, + "grad_norm": 0.2805006933993456, + "learning_rate": 2.1068017243914663e-07, + "loss": 0.0167, + "step": 4064 + }, + { + "epoch": 2.750338294993234, + "grad_norm": 0.30392576671577776, + "learning_rate": 2.0955091109450488e-07, + "loss": 0.0152, + "step": 4065 + }, + { + "epoch": 2.7510148849797025, + "grad_norm": 0.3334875369482922, + "learning_rate": 2.0842461957008841e-07, + "loss": 0.0226, + "step": 4066 + }, + { + "epoch": 2.7516914749661705, + "grad_norm": 0.262672594488508, + "learning_rate": 2.0730129856413705e-07, + "loss": 0.0178, + "step": 4067 + }, + { + "epoch": 2.7523680649526385, + "grad_norm": 0.3303580177380507, + "learning_rate": 2.061809487730504e-07, + "loss": 0.0224, + "step": 4068 + }, + { + "epoch": 2.753044654939107, + "grad_norm": 0.3283166808566351, + "learning_rate": 2.050635708913834e-07, + "loss": 0.0188, + "step": 4069 + }, + { + "epoch": 2.753721244925575, + "grad_norm": 0.31562572929572663, + "learning_rate": 2.0394916561185085e-07, + "loss": 0.0176, + "step": 4070 + }, + { + "epoch": 2.7543978349120435, + "grad_norm": 0.388767555169842, + "learning_rate": 2.0283773362532455e-07, + "loss": 0.0169, + "step": 4071 + }, + { + "epoch": 2.7550744248985115, + "grad_norm": 0.3048219874973848, + "learning_rate": 2.0172927562083056e-07, + "loss": 0.02, + "step": 4072 + }, + { + "epoch": 2.7557510148849795, + "grad_norm": 0.274060625739595, + "learning_rate": 2.006237922855553e-07, + "loss": 0.0133, + "step": 4073 + }, + { + "epoch": 2.756427604871448, + "grad_norm": 0.3099779178997045, + "learning_rate": 1.9952128430483718e-07, + "loss": 0.0218, + "step": 4074 + }, + { + "epoch": 2.757104194857916, + "grad_norm": 0.2725512711461539, + "learning_rate": 1.9842175236217176e-07, + "loss": 0.0143, + "step": 4075 + }, + { + "epoch": 2.7577807848443845, + "grad_norm": 0.39468192117410833, + "learning_rate": 1.973251971392115e-07, + "loss": 0.0181, + "step": 4076 + }, + { + "epoch": 2.7584573748308525, + "grad_norm": 0.39116902292312916, + "learning_rate": 1.962316193157593e-07, + "loss": 0.0205, + "step": 4077 + }, + { + "epoch": 2.7591339648173205, + "grad_norm": 0.33392762399766013, + "learning_rate": 1.9514101956977617e-07, + "loss": 0.0212, + "step": 4078 + }, + { + "epoch": 2.759810554803789, + "grad_norm": 0.3625373916301655, + "learning_rate": 1.9405339857737348e-07, + "loss": 0.021, + "step": 4079 + }, + { + "epoch": 2.760487144790257, + "grad_norm": 0.29297155837592603, + "learning_rate": 1.9296875701281858e-07, + "loss": 0.0142, + "step": 4080 + }, + { + "epoch": 2.7611637347767255, + "grad_norm": 0.37434360044098514, + "learning_rate": 1.9188709554853137e-07, + "loss": 0.0131, + "step": 4081 + }, + { + "epoch": 2.7618403247631935, + "grad_norm": 0.32840018770567087, + "learning_rate": 1.9080841485508205e-07, + "loss": 0.0178, + "step": 4082 + }, + { + "epoch": 2.7625169147496615, + "grad_norm": 0.2732008471951929, + "learning_rate": 1.8973271560119576e-07, + "loss": 0.0166, + "step": 4083 + }, + { + "epoch": 2.76319350473613, + "grad_norm": 0.304425593920917, + "learning_rate": 1.8865999845374794e-07, + "loss": 0.0173, + "step": 4084 + }, + { + "epoch": 2.763870094722598, + "grad_norm": 0.3357367881595279, + "learning_rate": 1.8759026407776605e-07, + "loss": 0.0189, + "step": 4085 + }, + { + "epoch": 2.7645466847090665, + "grad_norm": 0.37373448162813294, + "learning_rate": 1.8652351313642568e-07, + "loss": 0.0187, + "step": 4086 + }, + { + "epoch": 2.7652232746955345, + "grad_norm": 0.29380493140749553, + "learning_rate": 1.8545974629105624e-07, + "loss": 0.0184, + "step": 4087 + }, + { + "epoch": 2.7658998646820026, + "grad_norm": 0.36995227087745874, + "learning_rate": 1.8439896420113569e-07, + "loss": 0.0224, + "step": 4088 + }, + { + "epoch": 2.766576454668471, + "grad_norm": 0.3713994600818487, + "learning_rate": 1.8334116752429243e-07, + "loss": 0.0192, + "step": 4089 + }, + { + "epoch": 2.767253044654939, + "grad_norm": 0.29526245991345584, + "learning_rate": 1.8228635691630191e-07, + "loss": 0.0149, + "step": 4090 + }, + { + "epoch": 2.7679296346414075, + "grad_norm": 0.28427182057613726, + "learning_rate": 1.812345330310916e-07, + "loss": 0.0122, + "step": 4091 + }, + { + "epoch": 2.7686062246278755, + "grad_norm": 0.28069488963185774, + "learning_rate": 1.801856965207338e-07, + "loss": 0.0125, + "step": 4092 + }, + { + "epoch": 2.7692828146143436, + "grad_norm": 0.35868479336121434, + "learning_rate": 1.791398480354517e-07, + "loss": 0.0232, + "step": 4093 + }, + { + "epoch": 2.769959404600812, + "grad_norm": 0.3313312477344652, + "learning_rate": 1.78096988223615e-07, + "loss": 0.0186, + "step": 4094 + }, + { + "epoch": 2.77063599458728, + "grad_norm": 0.326954198352059, + "learning_rate": 1.770571177317404e-07, + "loss": 0.0166, + "step": 4095 + }, + { + "epoch": 2.7713125845737485, + "grad_norm": 0.38009409106856973, + "learning_rate": 1.7602023720449114e-07, + "loss": 0.0212, + "step": 4096 + }, + { + "epoch": 2.7719891745602165, + "grad_norm": 0.2556002151284434, + "learning_rate": 1.74986347284678e-07, + "loss": 0.0114, + "step": 4097 + }, + { + "epoch": 2.7726657645466846, + "grad_norm": 0.2925961625184005, + "learning_rate": 1.7395544861325718e-07, + "loss": 0.0212, + "step": 4098 + }, + { + "epoch": 2.773342354533153, + "grad_norm": 0.2905471849559168, + "learning_rate": 1.7292754182932914e-07, + "loss": 0.0136, + "step": 4099 + }, + { + "epoch": 2.774018944519621, + "grad_norm": 0.3106757077951478, + "learning_rate": 1.7190262757014076e-07, + "loss": 0.0144, + "step": 4100 + }, + { + "epoch": 2.7746955345060895, + "grad_norm": 0.3014654925650417, + "learning_rate": 1.7088070647108433e-07, + "loss": 0.0142, + "step": 4101 + }, + { + "epoch": 2.7753721244925575, + "grad_norm": 0.37528508507028097, + "learning_rate": 1.6986177916569646e-07, + "loss": 0.018, + "step": 4102 + }, + { + "epoch": 2.7760487144790256, + "grad_norm": 0.3515124685348501, + "learning_rate": 1.688458462856557e-07, + "loss": 0.0213, + "step": 4103 + }, + { + "epoch": 2.776725304465494, + "grad_norm": 0.32662458304207914, + "learning_rate": 1.6783290846078714e-07, + "loss": 0.0205, + "step": 4104 + }, + { + "epoch": 2.777401894451962, + "grad_norm": 0.23417014264730251, + "learning_rate": 1.6682296631905626e-07, + "loss": 0.0107, + "step": 4105 + }, + { + "epoch": 2.7780784844384305, + "grad_norm": 0.30065525384623853, + "learning_rate": 1.6581602048657387e-07, + "loss": 0.0172, + "step": 4106 + }, + { + "epoch": 2.7787550744248986, + "grad_norm": 0.3665657592493083, + "learning_rate": 1.648120715875906e-07, + "loss": 0.0203, + "step": 4107 + }, + { + "epoch": 2.7794316644113666, + "grad_norm": 0.37673042416747315, + "learning_rate": 1.6381112024450196e-07, + "loss": 0.0178, + "step": 4108 + }, + { + "epoch": 2.780108254397835, + "grad_norm": 0.34082547617953735, + "learning_rate": 1.6281316707784377e-07, + "loss": 0.0172, + "step": 4109 + }, + { + "epoch": 2.780784844384303, + "grad_norm": 0.3240317627049622, + "learning_rate": 1.618182127062934e-07, + "loss": 0.0225, + "step": 4110 + }, + { + "epoch": 2.7814614343707715, + "grad_norm": 0.3259841086009218, + "learning_rate": 1.6082625774666793e-07, + "loss": 0.0171, + "step": 4111 + }, + { + "epoch": 2.7821380243572396, + "grad_norm": 0.32598710617131105, + "learning_rate": 1.5983730281392663e-07, + "loss": 0.0198, + "step": 4112 + }, + { + "epoch": 2.7828146143437076, + "grad_norm": 0.2709223470740636, + "learning_rate": 1.588513485211679e-07, + "loss": 0.0164, + "step": 4113 + }, + { + "epoch": 2.783491204330176, + "grad_norm": 0.32928346289954025, + "learning_rate": 1.5786839547963008e-07, + "loss": 0.0189, + "step": 4114 + }, + { + "epoch": 2.784167794316644, + "grad_norm": 0.30672617000876923, + "learning_rate": 1.5688844429869232e-07, + "loss": 0.0132, + "step": 4115 + }, + { + "epoch": 2.7848443843031125, + "grad_norm": 0.25503069071354534, + "learning_rate": 1.5591149558587037e-07, + "loss": 0.0154, + "step": 4116 + }, + { + "epoch": 2.7855209742895806, + "grad_norm": 0.31228690440274604, + "learning_rate": 1.5493754994681977e-07, + "loss": 0.0154, + "step": 4117 + }, + { + "epoch": 2.7861975642760486, + "grad_norm": 0.35886890995941473, + "learning_rate": 1.539666079853358e-07, + "loss": 0.0189, + "step": 4118 + }, + { + "epoch": 2.786874154262517, + "grad_norm": 0.29012322934131934, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.0138, + "step": 4119 + }, + { + "epoch": 2.787550744248985, + "grad_norm": 0.29350351400257524, + "learning_rate": 1.5203373750092676e-07, + "loss": 0.018, + "step": 4120 + }, + { + "epoch": 2.7882273342354535, + "grad_norm": 0.3287251860029488, + "learning_rate": 1.5107181017627813e-07, + "loss": 0.0217, + "step": 4121 + }, + { + "epoch": 2.7889039242219216, + "grad_norm": 0.3509188190630282, + "learning_rate": 1.5011288892574526e-07, + "loss": 0.017, + "step": 4122 + }, + { + "epoch": 2.7895805142083896, + "grad_norm": 0.3343502518240095, + "learning_rate": 1.4915697434380816e-07, + "loss": 0.0125, + "step": 4123 + }, + { + "epoch": 2.790257104194858, + "grad_norm": 0.25046203213359464, + "learning_rate": 1.4820406702308165e-07, + "loss": 0.0138, + "step": 4124 + }, + { + "epoch": 2.790933694181326, + "grad_norm": 0.33263767132269406, + "learning_rate": 1.4725416755431655e-07, + "loss": 0.0247, + "step": 4125 + }, + { + "epoch": 2.7916102841677946, + "grad_norm": 0.3691855820101089, + "learning_rate": 1.463072765264001e-07, + "loss": 0.0264, + "step": 4126 + }, + { + "epoch": 2.7922868741542626, + "grad_norm": 0.2736780190443737, + "learning_rate": 1.4536339452635385e-07, + "loss": 0.0154, + "step": 4127 + }, + { + "epoch": 2.7929634641407306, + "grad_norm": 0.3061179819121896, + "learning_rate": 1.444225221393325e-07, + "loss": 0.0196, + "step": 4128 + }, + { + "epoch": 2.793640054127199, + "grad_norm": 0.36039938771301444, + "learning_rate": 1.4348465994862782e-07, + "loss": 0.0207, + "step": 4129 + }, + { + "epoch": 2.794316644113667, + "grad_norm": 0.4566606225699603, + "learning_rate": 1.4254980853566248e-07, + "loss": 0.0261, + "step": 4130 + }, + { + "epoch": 2.7949932341001356, + "grad_norm": 0.34136881554932963, + "learning_rate": 1.4161796847999566e-07, + "loss": 0.0238, + "step": 4131 + }, + { + "epoch": 2.7956698240866036, + "grad_norm": 0.3282136685011032, + "learning_rate": 1.4068914035931635e-07, + "loss": 0.0219, + "step": 4132 + }, + { + "epoch": 2.7963464140730716, + "grad_norm": 0.296902671355774, + "learning_rate": 1.3976332474944842e-07, + "loss": 0.0169, + "step": 4133 + }, + { + "epoch": 2.79702300405954, + "grad_norm": 0.3690960315227845, + "learning_rate": 1.388405222243472e-07, + "loss": 0.0298, + "step": 4134 + }, + { + "epoch": 2.797699594046008, + "grad_norm": 0.3073177677940477, + "learning_rate": 1.3792073335610111e-07, + "loss": 0.0179, + "step": 4135 + }, + { + "epoch": 2.7983761840324766, + "grad_norm": 0.3274661142774267, + "learning_rate": 1.3700395871493023e-07, + "loss": 0.0211, + "step": 4136 + }, + { + "epoch": 2.7990527740189446, + "grad_norm": 0.33622335014898314, + "learning_rate": 1.360901988691843e-07, + "loss": 0.021, + "step": 4137 + }, + { + "epoch": 2.7997293640054126, + "grad_norm": 0.33537010188096567, + "learning_rate": 1.3517945438534629e-07, + "loss": 0.0173, + "step": 4138 + }, + { + "epoch": 2.800405953991881, + "grad_norm": 0.32687748121447535, + "learning_rate": 1.342717258280274e-07, + "loss": 0.0186, + "step": 4139 + }, + { + "epoch": 2.801082543978349, + "grad_norm": 0.3690251150778055, + "learning_rate": 1.333670137599713e-07, + "loss": 0.0176, + "step": 4140 + }, + { + "epoch": 2.8017591339648176, + "grad_norm": 0.34519815737695225, + "learning_rate": 1.3246531874204994e-07, + "loss": 0.0257, + "step": 4141 + }, + { + "epoch": 2.8024357239512856, + "grad_norm": 0.24481152014945498, + "learning_rate": 1.3156664133326614e-07, + "loss": 0.0126, + "step": 4142 + }, + { + "epoch": 2.8031123139377536, + "grad_norm": 0.3739915472938706, + "learning_rate": 1.3067098209075202e-07, + "loss": 0.0175, + "step": 4143 + }, + { + "epoch": 2.803788903924222, + "grad_norm": 0.347442694922643, + "learning_rate": 1.2977834156976733e-07, + "loss": 0.0212, + "step": 4144 + }, + { + "epoch": 2.80446549391069, + "grad_norm": 0.2839483018207203, + "learning_rate": 1.2888872032370103e-07, + "loss": 0.0114, + "step": 4145 + }, + { + "epoch": 2.8051420838971586, + "grad_norm": 0.32521226637344836, + "learning_rate": 1.280021189040709e-07, + "loss": 0.0181, + "step": 4146 + }, + { + "epoch": 2.8058186738836266, + "grad_norm": 0.2842628100028989, + "learning_rate": 1.2711853786052108e-07, + "loss": 0.0131, + "step": 4147 + }, + { + "epoch": 2.8064952638700946, + "grad_norm": 0.5208399257736981, + "learning_rate": 1.2623797774082514e-07, + "loss": 0.019, + "step": 4148 + }, + { + "epoch": 2.8071718538565626, + "grad_norm": 0.3550062429672941, + "learning_rate": 1.253604390908819e-07, + "loss": 0.0207, + "step": 4149 + }, + { + "epoch": 2.807848443843031, + "grad_norm": 0.3540559434553578, + "learning_rate": 1.2448592245471903e-07, + "loss": 0.0212, + "step": 4150 + }, + { + "epoch": 2.8085250338294996, + "grad_norm": 0.3446182176537769, + "learning_rate": 1.2361442837449e-07, + "loss": 0.0199, + "step": 4151 + }, + { + "epoch": 2.8092016238159676, + "grad_norm": 0.3930861168869538, + "learning_rate": 1.2274595739047267e-07, + "loss": 0.02, + "step": 4152 + }, + { + "epoch": 2.8098782138024356, + "grad_norm": 0.331933688100313, + "learning_rate": 1.2188051004107305e-07, + "loss": 0.0169, + "step": 4153 + }, + { + "epoch": 2.8105548037889037, + "grad_norm": 0.2710769268242476, + "learning_rate": 1.210180868628219e-07, + "loss": 0.0146, + "step": 4154 + }, + { + "epoch": 2.811231393775372, + "grad_norm": 0.291195714945549, + "learning_rate": 1.2015868839037492e-07, + "loss": 0.0162, + "step": 4155 + }, + { + "epoch": 2.8119079837618406, + "grad_norm": 0.362413962982955, + "learning_rate": 1.1930231515651313e-07, + "loss": 0.0288, + "step": 4156 + }, + { + "epoch": 2.8125845737483086, + "grad_norm": 0.2807957681518928, + "learning_rate": 1.1844896769214187e-07, + "loss": 0.0128, + "step": 4157 + }, + { + "epoch": 2.8132611637347766, + "grad_norm": 0.3585190271177819, + "learning_rate": 1.1759864652629072e-07, + "loss": 0.0175, + "step": 4158 + }, + { + "epoch": 2.8139377537212447, + "grad_norm": 0.23158674845334967, + "learning_rate": 1.1675135218611188e-07, + "loss": 0.0126, + "step": 4159 + }, + { + "epoch": 2.814614343707713, + "grad_norm": 0.3380125058541794, + "learning_rate": 1.1590708519688243e-07, + "loss": 0.0185, + "step": 4160 + }, + { + "epoch": 2.8152909336941816, + "grad_norm": 0.35484980017854956, + "learning_rate": 1.1506584608200366e-07, + "loss": 0.0171, + "step": 4161 + }, + { + "epoch": 2.8159675236806496, + "grad_norm": 0.3041795775691457, + "learning_rate": 1.142276353629973e-07, + "loss": 0.0167, + "step": 4162 + }, + { + "epoch": 2.8166441136671176, + "grad_norm": 0.2883595137984694, + "learning_rate": 1.1339245355950934e-07, + "loss": 0.0152, + "step": 4163 + }, + { + "epoch": 2.8173207036535857, + "grad_norm": 0.3345296906089819, + "learning_rate": 1.1256030118930727e-07, + "loss": 0.0194, + "step": 4164 + }, + { + "epoch": 2.817997293640054, + "grad_norm": 0.35993936957584743, + "learning_rate": 1.1173117876828066e-07, + "loss": 0.0187, + "step": 4165 + }, + { + "epoch": 2.8186738836265226, + "grad_norm": 0.3673734361345342, + "learning_rate": 1.1090508681044055e-07, + "loss": 0.0238, + "step": 4166 + }, + { + "epoch": 2.8193504736129906, + "grad_norm": 0.40161920530572387, + "learning_rate": 1.1008202582792005e-07, + "loss": 0.0311, + "step": 4167 + }, + { + "epoch": 2.8200270635994586, + "grad_norm": 0.3802887760171123, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.0201, + "step": 4168 + }, + { + "epoch": 2.8207036535859267, + "grad_norm": 0.4511150041871571, + "learning_rate": 1.0844499882797011e-07, + "loss": 0.0235, + "step": 4169 + }, + { + "epoch": 2.821380243572395, + "grad_norm": 0.3747750908844225, + "learning_rate": 1.0763103382541052e-07, + "loss": 0.0179, + "step": 4170 + }, + { + "epoch": 2.822056833558863, + "grad_norm": 0.35174672555112185, + "learning_rate": 1.0682010182790637e-07, + "loss": 0.0196, + "step": 4171 + }, + { + "epoch": 2.8227334235453316, + "grad_norm": 0.27613717626728473, + "learning_rate": 1.0601220333819162e-07, + "loss": 0.0153, + "step": 4172 + }, + { + "epoch": 2.8234100135317997, + "grad_norm": 0.36129890907296824, + "learning_rate": 1.0520733885712008e-07, + "loss": 0.0148, + "step": 4173 + }, + { + "epoch": 2.8240866035182677, + "grad_norm": 0.3063260499050128, + "learning_rate": 1.0440550888366485e-07, + "loss": 0.0202, + "step": 4174 + }, + { + "epoch": 2.824763193504736, + "grad_norm": 0.26862879142559287, + "learning_rate": 1.0360671391491606e-07, + "loss": 0.0155, + "step": 4175 + }, + { + "epoch": 2.825439783491204, + "grad_norm": 0.3108379329412415, + "learning_rate": 1.0281095444608425e-07, + "loss": 0.0141, + "step": 4176 + }, + { + "epoch": 2.8261163734776726, + "grad_norm": 0.31685624074629615, + "learning_rate": 1.0201823097049812e-07, + "loss": 0.0209, + "step": 4177 + }, + { + "epoch": 2.8267929634641407, + "grad_norm": 0.2674887247006238, + "learning_rate": 1.0122854397960292e-07, + "loss": 0.0153, + "step": 4178 + }, + { + "epoch": 2.8274695534506087, + "grad_norm": 0.36815715043709984, + "learning_rate": 1.0044189396296144e-07, + "loss": 0.0195, + "step": 4179 + }, + { + "epoch": 2.828146143437077, + "grad_norm": 0.45243847055235403, + "learning_rate": 9.965828140825529e-08, + "loss": 0.0322, + "step": 4180 + }, + { + "epoch": 2.828822733423545, + "grad_norm": 0.32561370294205155, + "learning_rate": 9.887770680128083e-08, + "loss": 0.0201, + "step": 4181 + }, + { + "epoch": 2.8294993234100136, + "grad_norm": 0.378921279145927, + "learning_rate": 9.810017062595322e-08, + "loss": 0.0185, + "step": 4182 + }, + { + "epoch": 2.8301759133964817, + "grad_norm": 0.38997410309334873, + "learning_rate": 9.732567336430298e-08, + "loss": 0.0264, + "step": 4183 + }, + { + "epoch": 2.8308525033829497, + "grad_norm": 0.33019969569208485, + "learning_rate": 9.655421549647603e-08, + "loss": 0.0171, + "step": 4184 + }, + { + "epoch": 2.831529093369418, + "grad_norm": 0.32296767331333515, + "learning_rate": 9.57857975007348e-08, + "loss": 0.018, + "step": 4185 + }, + { + "epoch": 2.832205683355886, + "grad_norm": 0.24066758014289052, + "learning_rate": 9.502041985345766e-08, + "loss": 0.0141, + "step": 4186 + }, + { + "epoch": 2.8328822733423547, + "grad_norm": 0.4270692892252477, + "learning_rate": 9.42580830291373e-08, + "loss": 0.0303, + "step": 4187 + }, + { + "epoch": 2.8335588633288227, + "grad_norm": 0.46981352624557154, + "learning_rate": 9.349878750038067e-08, + "loss": 0.0166, + "step": 4188 + }, + { + "epoch": 2.8342354533152907, + "grad_norm": 0.32867190206042646, + "learning_rate": 9.274253373791064e-08, + "loss": 0.0172, + "step": 4189 + }, + { + "epoch": 2.834912043301759, + "grad_norm": 0.26081705893631607, + "learning_rate": 9.198932221056333e-08, + "loss": 0.0183, + "step": 4190 + }, + { + "epoch": 2.835588633288227, + "grad_norm": 0.3046085644654613, + "learning_rate": 9.123915338529132e-08, + "loss": 0.0188, + "step": 4191 + }, + { + "epoch": 2.8362652232746957, + "grad_norm": 0.45431772838139406, + "learning_rate": 9.049202772715593e-08, + "loss": 0.0327, + "step": 4192 + }, + { + "epoch": 2.8369418132611637, + "grad_norm": 0.2873985175221937, + "learning_rate": 8.974794569933609e-08, + "loss": 0.0159, + "step": 4193 + }, + { + "epoch": 2.8376184032476317, + "grad_norm": 0.31784620721086676, + "learning_rate": 8.900690776312282e-08, + "loss": 0.0197, + "step": 4194 + }, + { + "epoch": 2.8382949932341, + "grad_norm": 0.30041381746593987, + "learning_rate": 8.826891437791974e-08, + "loss": 0.0169, + "step": 4195 + }, + { + "epoch": 2.838971583220568, + "grad_norm": 0.39558605785565415, + "learning_rate": 8.753396600124254e-08, + "loss": 0.0289, + "step": 4196 + }, + { + "epoch": 2.8396481732070367, + "grad_norm": 0.37784200747162283, + "learning_rate": 8.680206308871953e-08, + "loss": 0.0231, + "step": 4197 + }, + { + "epoch": 2.8403247631935047, + "grad_norm": 0.3790069222464381, + "learning_rate": 8.607320609409165e-08, + "loss": 0.0219, + "step": 4198 + }, + { + "epoch": 2.8410013531799727, + "grad_norm": 0.3142251266132667, + "learning_rate": 8.534739546921023e-08, + "loss": 0.0165, + "step": 4199 + }, + { + "epoch": 2.841677943166441, + "grad_norm": 0.35237740062774736, + "learning_rate": 8.462463166403978e-08, + "loss": 0.0241, + "step": 4200 + }, + { + "epoch": 2.842354533152909, + "grad_norm": 0.2665480806466482, + "learning_rate": 8.390491512665355e-08, + "loss": 0.0169, + "step": 4201 + }, + { + "epoch": 2.8430311231393777, + "grad_norm": 0.3091381157886086, + "learning_rate": 8.318824630323741e-08, + "loss": 0.0182, + "step": 4202 + }, + { + "epoch": 2.8437077131258457, + "grad_norm": 0.3159771346997211, + "learning_rate": 8.247462563808816e-08, + "loss": 0.0147, + "step": 4203 + }, + { + "epoch": 2.8443843031123137, + "grad_norm": 0.25437533995979045, + "learning_rate": 8.176405357361194e-08, + "loss": 0.0145, + "step": 4204 + }, + { + "epoch": 2.845060893098782, + "grad_norm": 0.3153744422734466, + "learning_rate": 8.105653055032415e-08, + "loss": 0.0158, + "step": 4205 + }, + { + "epoch": 2.84573748308525, + "grad_norm": 0.4315609203813294, + "learning_rate": 8.035205700685167e-08, + "loss": 0.0321, + "step": 4206 + }, + { + "epoch": 2.8464140730717187, + "grad_norm": 0.29239149692314864, + "learning_rate": 7.965063337993018e-08, + "loss": 0.0147, + "step": 4207 + }, + { + "epoch": 2.8470906630581867, + "grad_norm": 0.32624058008096235, + "learning_rate": 7.89522601044046e-08, + "loss": 0.0229, + "step": 4208 + }, + { + "epoch": 2.8477672530446547, + "grad_norm": 0.30141679336917077, + "learning_rate": 7.825693761322861e-08, + "loss": 0.0154, + "step": 4209 + }, + { + "epoch": 2.848443843031123, + "grad_norm": 0.33653125023671926, + "learning_rate": 7.756466633746407e-08, + "loss": 0.0184, + "step": 4210 + }, + { + "epoch": 2.849120433017591, + "grad_norm": 0.3675320849851784, + "learning_rate": 7.687544670628267e-08, + "loss": 0.0181, + "step": 4211 + }, + { + "epoch": 2.8497970230040597, + "grad_norm": 0.30627731914656653, + "learning_rate": 7.618927914696372e-08, + "loss": 0.0151, + "step": 4212 + }, + { + "epoch": 2.8504736129905277, + "grad_norm": 0.5556851057427106, + "learning_rate": 7.550616408489253e-08, + "loss": 0.0264, + "step": 4213 + }, + { + "epoch": 2.8511502029769957, + "grad_norm": 0.25391006207421374, + "learning_rate": 7.482610194356477e-08, + "loss": 0.0126, + "step": 4214 + }, + { + "epoch": 2.851826792963464, + "grad_norm": 0.3823095849765143, + "learning_rate": 7.414909314458263e-08, + "loss": 0.0241, + "step": 4215 + }, + { + "epoch": 2.852503382949932, + "grad_norm": 0.3234817129922742, + "learning_rate": 7.347513810765427e-08, + "loss": 0.0222, + "step": 4216 + }, + { + "epoch": 2.8531799729364007, + "grad_norm": 0.3765176485465511, + "learning_rate": 7.280423725059604e-08, + "loss": 0.0166, + "step": 4217 + }, + { + "epoch": 2.8538565629228687, + "grad_norm": 0.30554992343241966, + "learning_rate": 7.213639098933022e-08, + "loss": 0.0158, + "step": 4218 + }, + { + "epoch": 2.8545331529093367, + "grad_norm": 0.46525063222991864, + "learning_rate": 7.147159973788508e-08, + "loss": 0.0394, + "step": 4219 + }, + { + "epoch": 2.855209742895805, + "grad_norm": 0.30322183391296076, + "learning_rate": 7.080986390839539e-08, + "loss": 0.0164, + "step": 4220 + }, + { + "epoch": 2.8558863328822732, + "grad_norm": 0.37710971981176267, + "learning_rate": 7.015118391110299e-08, + "loss": 0.0235, + "step": 4221 + }, + { + "epoch": 2.8565629228687417, + "grad_norm": 0.2656839806725427, + "learning_rate": 6.949556015435178e-08, + "loss": 0.0162, + "step": 4222 + }, + { + "epoch": 2.8572395128552097, + "grad_norm": 0.36277813108526075, + "learning_rate": 6.884299304459497e-08, + "loss": 0.017, + "step": 4223 + }, + { + "epoch": 2.8579161028416777, + "grad_norm": 0.39673098082738284, + "learning_rate": 6.819348298638839e-08, + "loss": 0.0302, + "step": 4224 + }, + { + "epoch": 2.858592692828146, + "grad_norm": 0.31371828741325075, + "learning_rate": 6.75470303823933e-08, + "loss": 0.0188, + "step": 4225 + }, + { + "epoch": 2.8592692828146142, + "grad_norm": 0.26473994958768376, + "learning_rate": 6.690363563337466e-08, + "loss": 0.018, + "step": 4226 + }, + { + "epoch": 2.8599458728010827, + "grad_norm": 0.2767737819584196, + "learning_rate": 6.626329913820339e-08, + "loss": 0.0152, + "step": 4227 + }, + { + "epoch": 2.8606224627875507, + "grad_norm": 0.273429670336525, + "learning_rate": 6.562602129385365e-08, + "loss": 0.0171, + "step": 4228 + }, + { + "epoch": 2.8612990527740187, + "grad_norm": 0.27272266057380484, + "learning_rate": 6.499180249540382e-08, + "loss": 0.0142, + "step": 4229 + }, + { + "epoch": 2.861975642760487, + "grad_norm": 0.32846334676597794, + "learning_rate": 6.436064313603385e-08, + "loss": 0.0159, + "step": 4230 + }, + { + "epoch": 2.8626522327469552, + "grad_norm": 0.3317592781453487, + "learning_rate": 6.373254360703019e-08, + "loss": 0.0175, + "step": 4231 + }, + { + "epoch": 2.8633288227334237, + "grad_norm": 0.3523062154902482, + "learning_rate": 6.310750429777912e-08, + "loss": 0.0279, + "step": 4232 + }, + { + "epoch": 2.8640054127198917, + "grad_norm": 0.33862087809046665, + "learning_rate": 6.248552559577292e-08, + "loss": 0.0155, + "step": 4233 + }, + { + "epoch": 2.8646820027063598, + "grad_norm": 0.30863870798026977, + "learning_rate": 6.186660788660315e-08, + "loss": 0.024, + "step": 4234 + }, + { + "epoch": 2.865358592692828, + "grad_norm": 0.43530468617613666, + "learning_rate": 6.125075155396675e-08, + "loss": 0.0249, + "step": 4235 + }, + { + "epoch": 2.8660351826792962, + "grad_norm": 0.33410857146880635, + "learning_rate": 6.063795697966057e-08, + "loss": 0.0182, + "step": 4236 + }, + { + "epoch": 2.8667117726657647, + "grad_norm": 0.2581141263823941, + "learning_rate": 6.00282245435857e-08, + "loss": 0.013, + "step": 4237 + }, + { + "epoch": 2.8673883626522327, + "grad_norm": 0.26336019085508233, + "learning_rate": 5.9421554623742e-08, + "loss": 0.0199, + "step": 4238 + }, + { + "epoch": 2.8680649526387008, + "grad_norm": 0.3068932902803712, + "learning_rate": 5.881794759623194e-08, + "loss": 0.019, + "step": 4239 + }, + { + "epoch": 2.8687415426251692, + "grad_norm": 0.30334602603987404, + "learning_rate": 5.8217403835260086e-08, + "loss": 0.017, + "step": 4240 + }, + { + "epoch": 2.8694181326116373, + "grad_norm": 0.36373523194783963, + "learning_rate": 5.7619923713130857e-08, + "loss": 0.0218, + "step": 4241 + }, + { + "epoch": 2.8700947225981057, + "grad_norm": 0.28776157462866214, + "learning_rate": 5.7025507600250165e-08, + "loss": 0.0168, + "step": 4242 + }, + { + "epoch": 2.8707713125845737, + "grad_norm": 0.3741713230544328, + "learning_rate": 5.643415586512324e-08, + "loss": 0.0183, + "step": 4243 + }, + { + "epoch": 2.8714479025710418, + "grad_norm": 0.35753259645318475, + "learning_rate": 5.584586887435739e-08, + "loss": 0.0165, + "step": 4244 + }, + { + "epoch": 2.8721244925575102, + "grad_norm": 0.4091639225039315, + "learning_rate": 5.526064699265754e-08, + "loss": 0.0221, + "step": 4245 + }, + { + "epoch": 2.8728010825439783, + "grad_norm": 0.4310266598775968, + "learning_rate": 5.4678490582830704e-08, + "loss": 0.0161, + "step": 4246 + }, + { + "epoch": 2.8734776725304467, + "grad_norm": 0.4589657330215928, + "learning_rate": 5.409940000578207e-08, + "loss": 0.025, + "step": 4247 + }, + { + "epoch": 2.8741542625169147, + "grad_norm": 0.39574053407186405, + "learning_rate": 5.352337562051613e-08, + "loss": 0.0205, + "step": 4248 + }, + { + "epoch": 2.8748308525033828, + "grad_norm": 0.31761749193169986, + "learning_rate": 5.2950417784137785e-08, + "loss": 0.0135, + "step": 4249 + }, + { + "epoch": 2.8755074424898512, + "grad_norm": 0.3288942339427781, + "learning_rate": 5.2380526851850135e-08, + "loss": 0.0119, + "step": 4250 + }, + { + "epoch": 2.8761840324763193, + "grad_norm": 0.2709022082033616, + "learning_rate": 5.181370317695389e-08, + "loss": 0.0117, + "step": 4251 + }, + { + "epoch": 2.8768606224627877, + "grad_norm": 0.33017093317214713, + "learning_rate": 5.124994711084963e-08, + "loss": 0.0164, + "step": 4252 + }, + { + "epoch": 2.8775372124492558, + "grad_norm": 0.41359223837025066, + "learning_rate": 5.0689259003035566e-08, + "loss": 0.0244, + "step": 4253 + }, + { + "epoch": 2.878213802435724, + "grad_norm": 0.33650748628730026, + "learning_rate": 5.013163920110864e-08, + "loss": 0.0211, + "step": 4254 + }, + { + "epoch": 2.8788903924221922, + "grad_norm": 0.34398603125699145, + "learning_rate": 4.9577088050762337e-08, + "loss": 0.0207, + "step": 4255 + }, + { + "epoch": 2.8795669824086603, + "grad_norm": 0.33198254647461567, + "learning_rate": 4.9025605895788867e-08, + "loss": 0.0218, + "step": 4256 + }, + { + "epoch": 2.8802435723951287, + "grad_norm": 0.35877004905417115, + "learning_rate": 4.847719307807752e-08, + "loss": 0.0189, + "step": 4257 + }, + { + "epoch": 2.8809201623815968, + "grad_norm": 0.3116332685026539, + "learning_rate": 4.793184993761468e-08, + "loss": 0.0156, + "step": 4258 + }, + { + "epoch": 2.881596752368065, + "grad_norm": 0.3082410344380756, + "learning_rate": 4.73895768124838e-08, + "loss": 0.0148, + "step": 4259 + }, + { + "epoch": 2.8822733423545333, + "grad_norm": 0.3041206173305893, + "learning_rate": 4.685037403886483e-08, + "loss": 0.02, + "step": 4260 + }, + { + "epoch": 2.8829499323410013, + "grad_norm": 0.2693770911865805, + "learning_rate": 4.631424195103373e-08, + "loss": 0.0117, + "step": 4261 + }, + { + "epoch": 2.8836265223274697, + "grad_norm": 0.3201209591829607, + "learning_rate": 4.578118088136463e-08, + "loss": 0.0166, + "step": 4262 + }, + { + "epoch": 2.8843031123139378, + "grad_norm": 0.36151687197068305, + "learning_rate": 4.52511911603265e-08, + "loss": 0.0237, + "step": 4263 + }, + { + "epoch": 2.884979702300406, + "grad_norm": 0.30576819613758666, + "learning_rate": 4.4724273116483754e-08, + "loss": 0.0175, + "step": 4264 + }, + { + "epoch": 2.8856562922868743, + "grad_norm": 0.4126017261557132, + "learning_rate": 4.42004270764973e-08, + "loss": 0.0232, + "step": 4265 + }, + { + "epoch": 2.8863328822733423, + "grad_norm": 0.28007083278233813, + "learning_rate": 4.367965336512403e-08, + "loss": 0.0172, + "step": 4266 + }, + { + "epoch": 2.8870094722598107, + "grad_norm": 0.29123182114067986, + "learning_rate": 4.316195230521514e-08, + "loss": 0.0172, + "step": 4267 + }, + { + "epoch": 2.8876860622462788, + "grad_norm": 0.41340741961638583, + "learning_rate": 4.264732421771722e-08, + "loss": 0.0241, + "step": 4268 + }, + { + "epoch": 2.888362652232747, + "grad_norm": 0.21205094059444654, + "learning_rate": 4.21357694216723e-08, + "loss": 0.0104, + "step": 4269 + }, + { + "epoch": 2.8890392422192153, + "grad_norm": 0.32319959609921306, + "learning_rate": 4.162728823421669e-08, + "loss": 0.015, + "step": 4270 + }, + { + "epoch": 2.8897158322056833, + "grad_norm": 0.3290901086365611, + "learning_rate": 4.112188097058156e-08, + "loss": 0.0235, + "step": 4271 + }, + { + "epoch": 2.8903924221921518, + "grad_norm": 0.25413269095569097, + "learning_rate": 4.061954794409184e-08, + "loss": 0.0127, + "step": 4272 + }, + { + "epoch": 2.89106901217862, + "grad_norm": 0.30256805738488934, + "learning_rate": 4.0120289466166754e-08, + "loss": 0.0164, + "step": 4273 + }, + { + "epoch": 2.891745602165088, + "grad_norm": 0.38270235641862727, + "learning_rate": 3.9624105846319813e-08, + "loss": 0.0171, + "step": 4274 + }, + { + "epoch": 2.8924221921515563, + "grad_norm": 0.3169592219433915, + "learning_rate": 3.9130997392157756e-08, + "loss": 0.0193, + "step": 4275 + }, + { + "epoch": 2.8930987821380243, + "grad_norm": 0.37530967308090235, + "learning_rate": 3.86409644093827e-08, + "loss": 0.0227, + "step": 4276 + }, + { + "epoch": 2.8937753721244928, + "grad_norm": 0.3727951648012234, + "learning_rate": 3.8154007201787194e-08, + "loss": 0.0213, + "step": 4277 + }, + { + "epoch": 2.894451962110961, + "grad_norm": 0.2400579021593019, + "learning_rate": 3.7670126071259194e-08, + "loss": 0.0119, + "step": 4278 + }, + { + "epoch": 2.895128552097429, + "grad_norm": 0.491117410386177, + "learning_rate": 3.718932131777819e-08, + "loss": 0.0221, + "step": 4279 + }, + { + "epoch": 2.8958051420838973, + "grad_norm": 0.3236535062948911, + "learning_rate": 3.6711593239417976e-08, + "loss": 0.0154, + "step": 4280 + }, + { + "epoch": 2.8964817320703653, + "grad_norm": 0.26659002110625496, + "learning_rate": 3.62369421323433e-08, + "loss": 0.0184, + "step": 4281 + }, + { + "epoch": 2.8971583220568338, + "grad_norm": 0.3384157344158936, + "learning_rate": 3.576536829081323e-08, + "loss": 0.0226, + "step": 4282 + }, + { + "epoch": 2.897834912043302, + "grad_norm": 0.4754787105293215, + "learning_rate": 3.52968720071778e-08, + "loss": 0.0222, + "step": 4283 + }, + { + "epoch": 2.89851150202977, + "grad_norm": 0.34755706930405583, + "learning_rate": 3.483145357187967e-08, + "loss": 0.0179, + "step": 4284 + }, + { + "epoch": 2.8991880920162383, + "grad_norm": 0.325935012514578, + "learning_rate": 3.436911327345305e-08, + "loss": 0.0144, + "step": 4285 + }, + { + "epoch": 2.8998646820027063, + "grad_norm": 0.3327741788248334, + "learning_rate": 3.3909851398523654e-08, + "loss": 0.0132, + "step": 4286 + }, + { + "epoch": 2.9005412719891748, + "grad_norm": 0.2861945720917734, + "learning_rate": 3.345366823180929e-08, + "loss": 0.0157, + "step": 4287 + }, + { + "epoch": 2.901217861975643, + "grad_norm": 0.3665978352277033, + "learning_rate": 3.300056405611873e-08, + "loss": 0.0154, + "step": 4288 + }, + { + "epoch": 2.901894451962111, + "grad_norm": 0.3339163588510717, + "learning_rate": 3.2550539152352845e-08, + "loss": 0.0156, + "step": 4289 + }, + { + "epoch": 2.9025710419485793, + "grad_norm": 0.4395250423334691, + "learning_rate": 3.2103593799501786e-08, + "loss": 0.0272, + "step": 4290 + }, + { + "epoch": 2.9032476319350473, + "grad_norm": 0.2962692895914033, + "learning_rate": 3.165972827464892e-08, + "loss": 0.0121, + "step": 4291 + }, + { + "epoch": 2.903924221921516, + "grad_norm": 0.30857554868165343, + "learning_rate": 3.1218942852965226e-08, + "loss": 0.0186, + "step": 4292 + }, + { + "epoch": 2.904600811907984, + "grad_norm": 0.3379736851327122, + "learning_rate": 3.078123780771602e-08, + "loss": 0.0178, + "step": 4293 + }, + { + "epoch": 2.905277401894452, + "grad_norm": 0.3347150338757051, + "learning_rate": 3.034661341025258e-08, + "loss": 0.0127, + "step": 4294 + }, + { + "epoch": 2.9059539918809203, + "grad_norm": 0.47547131888115907, + "learning_rate": 2.9915069930019914e-08, + "loss": 0.0239, + "step": 4295 + }, + { + "epoch": 2.9066305818673883, + "grad_norm": 0.34846448209053205, + "learning_rate": 2.94866076345518e-08, + "loss": 0.0203, + "step": 4296 + }, + { + "epoch": 2.907307171853857, + "grad_norm": 0.3472791435166466, + "learning_rate": 2.9061226789471874e-08, + "loss": 0.0217, + "step": 4297 + }, + { + "epoch": 2.907983761840325, + "grad_norm": 0.2908861983310066, + "learning_rate": 2.863892765849252e-08, + "loss": 0.0167, + "step": 4298 + }, + { + "epoch": 2.908660351826793, + "grad_norm": 0.27631325099648174, + "learning_rate": 2.8219710503416543e-08, + "loss": 0.015, + "step": 4299 + }, + { + "epoch": 2.9093369418132613, + "grad_norm": 0.4381992878386518, + "learning_rate": 2.78035755841366e-08, + "loss": 0.0182, + "step": 4300 + }, + { + "epoch": 2.9100135317997293, + "grad_norm": 0.3772522330202533, + "learning_rate": 2.7390523158633552e-08, + "loss": 0.03, + "step": 4301 + }, + { + "epoch": 2.910690121786198, + "grad_norm": 0.39301401342529896, + "learning_rate": 2.6980553482977566e-08, + "loss": 0.0215, + "step": 4302 + }, + { + "epoch": 2.911366711772666, + "grad_norm": 0.34538212284144815, + "learning_rate": 2.657366681132756e-08, + "loss": 0.0252, + "step": 4303 + }, + { + "epoch": 2.912043301759134, + "grad_norm": 0.3033583250639276, + "learning_rate": 2.6169863395932304e-08, + "loss": 0.0151, + "step": 4304 + }, + { + "epoch": 2.9127198917456023, + "grad_norm": 0.4348684049174322, + "learning_rate": 2.5769143487127113e-08, + "loss": 0.0244, + "step": 4305 + }, + { + "epoch": 2.9133964817320703, + "grad_norm": 0.357439540770848, + "learning_rate": 2.5371507333337153e-08, + "loss": 0.0199, + "step": 4306 + }, + { + "epoch": 2.914073071718539, + "grad_norm": 0.2885648400552308, + "learning_rate": 2.497695518107579e-08, + "loss": 0.0169, + "step": 4307 + }, + { + "epoch": 2.914749661705007, + "grad_norm": 0.25282593387606855, + "learning_rate": 2.4585487274942922e-08, + "loss": 0.0128, + "step": 4308 + }, + { + "epoch": 2.915426251691475, + "grad_norm": 0.2277673314759974, + "learning_rate": 2.4197103857628858e-08, + "loss": 0.0124, + "step": 4309 + }, + { + "epoch": 2.9161028416779433, + "grad_norm": 0.30799563707363364, + "learning_rate": 2.381180516990933e-08, + "loss": 0.0189, + "step": 4310 + }, + { + "epoch": 2.9167794316644113, + "grad_norm": 0.402521176810984, + "learning_rate": 2.3429591450649934e-08, + "loss": 0.0165, + "step": 4311 + }, + { + "epoch": 2.91745602165088, + "grad_norm": 0.30582298260297197, + "learning_rate": 2.305046293680113e-08, + "loss": 0.0158, + "step": 4312 + }, + { + "epoch": 2.918132611637348, + "grad_norm": 0.3068926391837632, + "learning_rate": 2.267441986340324e-08, + "loss": 0.0143, + "step": 4313 + }, + { + "epoch": 2.918809201623816, + "grad_norm": 0.30115489166458353, + "learning_rate": 2.230146246358256e-08, + "loss": 0.0157, + "step": 4314 + }, + { + "epoch": 2.9194857916102843, + "grad_norm": 0.42883998913216026, + "learning_rate": 2.193159096855191e-08, + "loss": 0.0289, + "step": 4315 + }, + { + "epoch": 2.9201623815967523, + "grad_norm": 0.5385956885172357, + "learning_rate": 2.1564805607612317e-08, + "loss": 0.0269, + "step": 4316 + }, + { + "epoch": 2.920838971583221, + "grad_norm": 0.26155103827421755, + "learning_rate": 2.120110660815078e-08, + "loss": 0.0151, + "step": 4317 + }, + { + "epoch": 2.921515561569689, + "grad_norm": 0.39640151390909856, + "learning_rate": 2.0840494195641382e-08, + "loss": 0.0193, + "step": 4318 + }, + { + "epoch": 2.922192151556157, + "grad_norm": 0.27912308164286453, + "learning_rate": 2.0482968593643625e-08, + "loss": 0.013, + "step": 4319 + }, + { + "epoch": 2.9228687415426253, + "grad_norm": 0.3229440425464994, + "learning_rate": 2.012853002380466e-08, + "loss": 0.0141, + "step": 4320 + }, + { + "epoch": 2.9235453315290933, + "grad_norm": 0.28732500317070214, + "learning_rate": 1.97771787058576e-08, + "loss": 0.02, + "step": 4321 + }, + { + "epoch": 2.924221921515562, + "grad_norm": 0.3558103778800274, + "learning_rate": 1.942891485762044e-08, + "loss": 0.0276, + "step": 4322 + }, + { + "epoch": 2.92489851150203, + "grad_norm": 0.40707033045057484, + "learning_rate": 1.9083738694998798e-08, + "loss": 0.0212, + "step": 4323 + }, + { + "epoch": 2.925575101488498, + "grad_norm": 0.2882245457459522, + "learning_rate": 1.8741650431982615e-08, + "loss": 0.0135, + "step": 4324 + }, + { + "epoch": 2.9262516914749663, + "grad_norm": 0.3716334418961407, + "learning_rate": 1.8402650280648916e-08, + "loss": 0.0215, + "step": 4325 + }, + { + "epoch": 2.9269282814614344, + "grad_norm": 0.3709921645244406, + "learning_rate": 1.8066738451159028e-08, + "loss": 0.0262, + "step": 4326 + }, + { + "epoch": 2.927604871447903, + "grad_norm": 0.3314299162218466, + "learning_rate": 1.773391515176026e-08, + "loss": 0.0166, + "step": 4327 + }, + { + "epoch": 2.928281461434371, + "grad_norm": 0.3393260454830498, + "learning_rate": 1.740418058878479e-08, + "loss": 0.0197, + "step": 4328 + }, + { + "epoch": 2.928958051420839, + "grad_norm": 0.3084975827429294, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.0202, + "step": 4329 + }, + { + "epoch": 2.9296346414073073, + "grad_norm": 0.3019682415559219, + "learning_rate": 1.6753978487860645e-08, + "loss": 0.0183, + "step": 4330 + }, + { + "epoch": 2.9303112313937754, + "grad_norm": 0.3057852451626844, + "learning_rate": 1.6433511353002863e-08, + "loss": 0.0149, + "step": 4331 + }, + { + "epoch": 2.930987821380244, + "grad_norm": 0.35440586101875543, + "learning_rate": 1.6116133760747944e-08, + "loss": 0.02, + "step": 4332 + }, + { + "epoch": 2.931664411366712, + "grad_norm": 0.396189232384318, + "learning_rate": 1.5801845907854606e-08, + "loss": 0.021, + "step": 4333 + }, + { + "epoch": 2.93234100135318, + "grad_norm": 0.383055252792876, + "learning_rate": 1.549064798916311e-08, + "loss": 0.0334, + "step": 4334 + }, + { + "epoch": 2.933017591339648, + "grad_norm": 0.5205310542121773, + "learning_rate": 1.5182540197600237e-08, + "loss": 0.0205, + "step": 4335 + }, + { + "epoch": 2.9336941813261164, + "grad_norm": 0.2664001445596687, + "learning_rate": 1.4877522724175974e-08, + "loss": 0.013, + "step": 4336 + }, + { + "epoch": 2.934370771312585, + "grad_norm": 0.3788167280191553, + "learning_rate": 1.4575595757985172e-08, + "loss": 0.0224, + "step": 4337 + }, + { + "epoch": 2.935047361299053, + "grad_norm": 0.3201633152950062, + "learning_rate": 1.4276759486205328e-08, + "loss": 0.0199, + "step": 4338 + }, + { + "epoch": 2.935723951285521, + "grad_norm": 0.5406506993061142, + "learning_rate": 1.3981014094099354e-08, + "loss": 0.017, + "step": 4339 + }, + { + "epoch": 2.936400541271989, + "grad_norm": 0.2973005758907856, + "learning_rate": 1.368835976501337e-08, + "loss": 0.0176, + "step": 4340 + }, + { + "epoch": 2.9370771312584574, + "grad_norm": 0.27892407613947634, + "learning_rate": 1.3398796680377245e-08, + "loss": 0.0142, + "step": 4341 + }, + { + "epoch": 2.937753721244926, + "grad_norm": 0.38503686951243543, + "learning_rate": 1.3112325019704608e-08, + "loss": 0.021, + "step": 4342 + }, + { + "epoch": 2.938430311231394, + "grad_norm": 0.3339655848594916, + "learning_rate": 1.2828944960592837e-08, + "loss": 0.0176, + "step": 4343 + }, + { + "epoch": 2.939106901217862, + "grad_norm": 0.316123728556445, + "learning_rate": 1.2548656678721404e-08, + "loss": 0.0164, + "step": 4344 + }, + { + "epoch": 2.93978349120433, + "grad_norm": 0.36367628136946084, + "learning_rate": 1.2271460347854091e-08, + "loss": 0.0179, + "step": 4345 + }, + { + "epoch": 2.9404600811907984, + "grad_norm": 0.3488339117202152, + "learning_rate": 1.1997356139838434e-08, + "loss": 0.0213, + "step": 4346 + }, + { + "epoch": 2.941136671177267, + "grad_norm": 0.35022374428722575, + "learning_rate": 1.1726344224603504e-08, + "loss": 0.02, + "step": 4347 + }, + { + "epoch": 2.941813261163735, + "grad_norm": 0.39506691018080775, + "learning_rate": 1.145842477016268e-08, + "loss": 0.0225, + "step": 4348 + }, + { + "epoch": 2.942489851150203, + "grad_norm": 0.3458942447882672, + "learning_rate": 1.119359794261088e-08, + "loss": 0.0203, + "step": 4349 + }, + { + "epoch": 2.943166441136671, + "grad_norm": 0.31112225633978385, + "learning_rate": 1.0931863906127327e-08, + "loss": 0.0179, + "step": 4350 + }, + { + "epoch": 2.9438430311231394, + "grad_norm": 0.3632591345807602, + "learning_rate": 1.0673222822972229e-08, + "loss": 0.021, + "step": 4351 + }, + { + "epoch": 2.944519621109608, + "grad_norm": 0.26889010243432787, + "learning_rate": 1.0417674853489545e-08, + "loss": 0.0134, + "step": 4352 + }, + { + "epoch": 2.945196211096076, + "grad_norm": 0.33497895902046243, + "learning_rate": 1.0165220156105326e-08, + "loss": 0.0233, + "step": 4353 + }, + { + "epoch": 2.945872801082544, + "grad_norm": 0.3273435086703643, + "learning_rate": 9.915858887327157e-09, + "loss": 0.0153, + "step": 4354 + }, + { + "epoch": 2.946549391069012, + "grad_norm": 0.3195513135185016, + "learning_rate": 9.669591201746375e-09, + "loss": 0.0209, + "step": 4355 + }, + { + "epoch": 2.9472259810554804, + "grad_norm": 0.3509855885167185, + "learning_rate": 9.426417252035858e-09, + "loss": 0.0196, + "step": 4356 + }, + { + "epoch": 2.9479025710419484, + "grad_norm": 0.3005686980471101, + "learning_rate": 9.186337188949456e-09, + "loss": 0.0136, + "step": 4357 + }, + { + "epoch": 2.948579161028417, + "grad_norm": 0.35888684219728445, + "learning_rate": 8.949351161324227e-09, + "loss": 0.0213, + "step": 4358 + }, + { + "epoch": 2.949255751014885, + "grad_norm": 0.2688442947169199, + "learning_rate": 8.715459316078756e-09, + "loss": 0.0163, + "step": 4359 + }, + { + "epoch": 2.949932341001353, + "grad_norm": 0.24857245333319675, + "learning_rate": 8.484661798213723e-09, + "loss": 0.013, + "step": 4360 + }, + { + "epoch": 2.9506089309878214, + "grad_norm": 0.35704831755136823, + "learning_rate": 8.256958750810784e-09, + "loss": 0.015, + "step": 4361 + }, + { + "epoch": 2.9512855209742894, + "grad_norm": 0.3010921743036329, + "learning_rate": 8.032350315033688e-09, + "loss": 0.0146, + "step": 4362 + }, + { + "epoch": 2.951962110960758, + "grad_norm": 0.3536846063831116, + "learning_rate": 7.810836630127717e-09, + "loss": 0.0157, + "step": 4363 + }, + { + "epoch": 2.952638700947226, + "grad_norm": 0.40101883380471337, + "learning_rate": 7.59241783341913e-09, + "loss": 0.0199, + "step": 4364 + }, + { + "epoch": 2.953315290933694, + "grad_norm": 0.40459086470998656, + "learning_rate": 7.377094060315726e-09, + "loss": 0.0202, + "step": 4365 + }, + { + "epoch": 2.9539918809201624, + "grad_norm": 0.4537988531466028, + "learning_rate": 7.164865444306834e-09, + "loss": 0.0203, + "step": 4366 + }, + { + "epoch": 2.9546684709066304, + "grad_norm": 0.32286109225660575, + "learning_rate": 6.9557321169622105e-09, + "loss": 0.0247, + "step": 4367 + }, + { + "epoch": 2.955345060893099, + "grad_norm": 1.0824472983870232, + "learning_rate": 6.7496942079342546e-09, + "loss": 0.0195, + "step": 4368 + }, + { + "epoch": 2.956021650879567, + "grad_norm": 0.5544590267783411, + "learning_rate": 6.546751844955235e-09, + "loss": 0.0225, + "step": 4369 + }, + { + "epoch": 2.956698240866035, + "grad_norm": 0.22849275168826153, + "learning_rate": 6.346905153837291e-09, + "loss": 0.0102, + "step": 4370 + }, + { + "epoch": 2.9573748308525034, + "grad_norm": 0.34317849125388905, + "learning_rate": 6.150154258476315e-09, + "loss": 0.017, + "step": 4371 + }, + { + "epoch": 2.9580514208389714, + "grad_norm": 0.29587388142693405, + "learning_rate": 5.956499280845851e-09, + "loss": 0.0148, + "step": 4372 + }, + { + "epoch": 2.95872801082544, + "grad_norm": 0.3199452788834128, + "learning_rate": 5.765940341002085e-09, + "loss": 0.0208, + "step": 4373 + }, + { + "epoch": 2.959404600811908, + "grad_norm": 0.28730714447598094, + "learning_rate": 5.578477557081074e-09, + "loss": 0.0106, + "step": 4374 + }, + { + "epoch": 2.960081190798376, + "grad_norm": 0.31094536156736957, + "learning_rate": 5.394111045299855e-09, + "loss": 0.0204, + "step": 4375 + }, + { + "epoch": 2.9607577807848444, + "grad_norm": 0.26448000549333334, + "learning_rate": 5.212840919955886e-09, + "loss": 0.0156, + "step": 4376 + }, + { + "epoch": 2.9614343707713124, + "grad_norm": 0.36121195162214426, + "learning_rate": 5.034667293427053e-09, + "loss": 0.0221, + "step": 4377 + }, + { + "epoch": 2.962110960757781, + "grad_norm": 0.3919679143121755, + "learning_rate": 4.859590276170556e-09, + "loss": 0.0236, + "step": 4378 + }, + { + "epoch": 2.962787550744249, + "grad_norm": 0.31466905088752206, + "learning_rate": 4.687609976725127e-09, + "loss": 0.0207, + "step": 4379 + }, + { + "epoch": 2.963464140730717, + "grad_norm": 0.276137358393101, + "learning_rate": 4.51872650170937e-09, + "loss": 0.0159, + "step": 4380 + }, + { + "epoch": 2.9641407307171854, + "grad_norm": 0.27536454716675796, + "learning_rate": 4.352939955822311e-09, + "loss": 0.0136, + "step": 4381 + }, + { + "epoch": 2.9648173207036534, + "grad_norm": 0.3095208723601742, + "learning_rate": 4.190250441841737e-09, + "loss": 0.0147, + "step": 4382 + }, + { + "epoch": 2.965493910690122, + "grad_norm": 0.36776435927075146, + "learning_rate": 4.030658060626969e-09, + "loss": 0.0191, + "step": 4383 + }, + { + "epoch": 2.96617050067659, + "grad_norm": 0.2839355716419772, + "learning_rate": 3.874162911117196e-09, + "loss": 0.013, + "step": 4384 + }, + { + "epoch": 2.966847090663058, + "grad_norm": 0.32540217483087935, + "learning_rate": 3.7207650903298143e-09, + "loss": 0.0149, + "step": 4385 + }, + { + "epoch": 2.9675236806495264, + "grad_norm": 0.3140991526101512, + "learning_rate": 3.570464693364306e-09, + "loss": 0.0215, + "step": 4386 + }, + { + "epoch": 2.9682002706359945, + "grad_norm": 0.30812042457101607, + "learning_rate": 3.4232618133978044e-09, + "loss": 0.0153, + "step": 4387 + }, + { + "epoch": 2.968876860622463, + "grad_norm": 0.33732578138585134, + "learning_rate": 3.279156541688422e-09, + "loss": 0.0229, + "step": 4388 + }, + { + "epoch": 2.969553450608931, + "grad_norm": 0.2377707525395033, + "learning_rate": 3.1381489675746946e-09, + "loss": 0.0087, + "step": 4389 + }, + { + "epoch": 2.970230040595399, + "grad_norm": 0.3067346004371049, + "learning_rate": 3.000239178472253e-09, + "loss": 0.0134, + "step": 4390 + }, + { + "epoch": 2.9709066305818674, + "grad_norm": 0.4933288385571862, + "learning_rate": 2.8654272598788167e-09, + "loss": 0.0411, + "step": 4391 + }, + { + "epoch": 2.9715832205683355, + "grad_norm": 0.37518135491918647, + "learning_rate": 2.7337132953697555e-09, + "loss": 0.0168, + "step": 4392 + }, + { + "epoch": 2.972259810554804, + "grad_norm": 0.300150884049802, + "learning_rate": 2.605097366601417e-09, + "loss": 0.0146, + "step": 4393 + }, + { + "epoch": 2.972936400541272, + "grad_norm": 0.2936469448510459, + "learning_rate": 2.479579553307798e-09, + "loss": 0.0167, + "step": 4394 + }, + { + "epoch": 2.97361299052774, + "grad_norm": 0.297790522107297, + "learning_rate": 2.3571599333038765e-09, + "loss": 0.0152, + "step": 4395 + }, + { + "epoch": 2.9742895805142084, + "grad_norm": 0.3821502282272499, + "learning_rate": 2.237838582483387e-09, + "loss": 0.0183, + "step": 4396 + }, + { + "epoch": 2.9749661705006765, + "grad_norm": 0.29273770886435896, + "learning_rate": 2.12161557481827e-09, + "loss": 0.0195, + "step": 4397 + }, + { + "epoch": 2.975642760487145, + "grad_norm": 0.2801009993689134, + "learning_rate": 2.008490982360889e-09, + "loss": 0.0114, + "step": 4398 + }, + { + "epoch": 2.976319350473613, + "grad_norm": 0.38089038697206073, + "learning_rate": 1.8984648752429222e-09, + "loss": 0.0179, + "step": 4399 + }, + { + "epoch": 2.976995940460081, + "grad_norm": 0.2877555876007096, + "learning_rate": 1.7915373216742527e-09, + "loss": 0.015, + "step": 4400 + }, + { + "epoch": 2.9776725304465494, + "grad_norm": 0.3755693582452172, + "learning_rate": 1.687708387944076e-09, + "loss": 0.0227, + "step": 4401 + }, + { + "epoch": 2.9783491204330175, + "grad_norm": 0.2785422941954074, + "learning_rate": 1.5869781384203475e-09, + "loss": 0.0128, + "step": 4402 + }, + { + "epoch": 2.979025710419486, + "grad_norm": 0.44204489971832406, + "learning_rate": 1.4893466355514474e-09, + "loss": 0.0175, + "step": 4403 + }, + { + "epoch": 2.979702300405954, + "grad_norm": 0.3323320010546501, + "learning_rate": 1.3948139398628492e-09, + "loss": 0.0181, + "step": 4404 + }, + { + "epoch": 2.980378890392422, + "grad_norm": 0.27961205538820144, + "learning_rate": 1.3033801099598954e-09, + "loss": 0.0131, + "step": 4405 + }, + { + "epoch": 2.9810554803788905, + "grad_norm": 0.33412975393517885, + "learning_rate": 1.215045202527243e-09, + "loss": 0.0171, + "step": 4406 + }, + { + "epoch": 2.9817320703653585, + "grad_norm": 0.2890974529256528, + "learning_rate": 1.1298092723266429e-09, + "loss": 0.0144, + "step": 4407 + }, + { + "epoch": 2.982408660351827, + "grad_norm": 0.33997638992605866, + "learning_rate": 1.0476723722002702e-09, + "loss": 0.0296, + "step": 4408 + }, + { + "epoch": 2.983085250338295, + "grad_norm": 0.281155159120715, + "learning_rate": 9.686345530690589e-10, + "loss": 0.0129, + "step": 4409 + }, + { + "epoch": 2.983761840324763, + "grad_norm": 0.433244080656716, + "learning_rate": 8.926958639315919e-10, + "loss": 0.0216, + "step": 4410 + }, + { + "epoch": 2.9844384303112315, + "grad_norm": 0.3665508858483157, + "learning_rate": 8.198563518657665e-10, + "loss": 0.0163, + "step": 4411 + }, + { + "epoch": 2.9851150202976995, + "grad_norm": 0.31428569622042574, + "learning_rate": 7.50116062028794e-10, + "loss": 0.0143, + "step": 4412 + }, + { + "epoch": 2.985791610284168, + "grad_norm": 0.339015937564257, + "learning_rate": 6.834750376549793e-10, + "loss": 0.0215, + "step": 4413 + }, + { + "epoch": 2.986468200270636, + "grad_norm": 0.3381678729181359, + "learning_rate": 6.199333200590519e-10, + "loss": 0.0161, + "step": 4414 + }, + { + "epoch": 2.987144790257104, + "grad_norm": 0.2771287078950449, + "learning_rate": 5.594909486328348e-10, + "loss": 0.0153, + "step": 4415 + }, + { + "epoch": 2.9878213802435725, + "grad_norm": 0.3394705935867139, + "learning_rate": 5.021479608474655e-10, + "loss": 0.0193, + "step": 4416 + }, + { + "epoch": 2.9884979702300405, + "grad_norm": 0.3644892330047543, + "learning_rate": 4.4790439225284034e-10, + "loss": 0.0242, + "step": 4417 + }, + { + "epoch": 2.989174560216509, + "grad_norm": 0.3136943948894721, + "learning_rate": 3.967602764770595e-10, + "loss": 0.0184, + "step": 4418 + }, + { + "epoch": 2.989851150202977, + "grad_norm": 0.387996779344426, + "learning_rate": 3.487156452258722e-10, + "loss": 0.0198, + "step": 4419 + }, + { + "epoch": 2.990527740189445, + "grad_norm": 0.29577891501208836, + "learning_rate": 3.0377052828489684e-10, + "loss": 0.0163, + "step": 4420 + }, + { + "epoch": 2.9912043301759135, + "grad_norm": 0.3113922493751291, + "learning_rate": 2.6192495351795576e-10, + "loss": 0.0143, + "step": 4421 + }, + { + "epoch": 2.9918809201623815, + "grad_norm": 0.329045522456078, + "learning_rate": 2.231789468670753e-10, + "loss": 0.0212, + "step": 4422 + }, + { + "epoch": 2.99255751014885, + "grad_norm": 0.3697015023651616, + "learning_rate": 1.8753253235248568e-10, + "loss": 0.0277, + "step": 4423 + }, + { + "epoch": 2.993234100135318, + "grad_norm": 0.3461954717802595, + "learning_rate": 1.5498573207262112e-10, + "loss": 0.0249, + "step": 4424 + }, + { + "epoch": 2.993910690121786, + "grad_norm": 0.33534009206015153, + "learning_rate": 1.2553856620522997e-10, + "loss": 0.0212, + "step": 4425 + }, + { + "epoch": 2.9945872801082545, + "grad_norm": 0.29711609164962155, + "learning_rate": 9.919105300570941e-11, + "loss": 0.0109, + "step": 4426 + }, + { + "epoch": 2.9952638700947225, + "grad_norm": 0.2793411841570855, + "learning_rate": 7.59432088082157e-11, + "loss": 0.0121, + "step": 4427 + }, + { + "epoch": 2.995940460081191, + "grad_norm": 0.3296633975783328, + "learning_rate": 5.579504802566416e-11, + "loss": 0.0213, + "step": 4428 + }, + { + "epoch": 2.996617050067659, + "grad_norm": 0.45326530102336376, + "learning_rate": 3.8746583148063786e-11, + "loss": 0.0231, + "step": 4429 + }, + { + "epoch": 2.997293640054127, + "grad_norm": 0.29331055135595124, + "learning_rate": 2.4797824744737797e-11, + "loss": 0.0153, + "step": 4430 + }, + { + "epoch": 2.9979702300405955, + "grad_norm": 0.2800545169223398, + "learning_rate": 1.3948781463213324e-11, + "loss": 0.0119, + "step": 4431 + }, + { + "epoch": 2.9986468200270635, + "grad_norm": 0.3171020771903546, + "learning_rate": 6.199460029221449e-12, + "loss": 0.0209, + "step": 4432 + }, + { + "epoch": 2.999323410013532, + "grad_norm": 0.6336304879901102, + "learning_rate": 1.549865247807425e-12, + "loss": 0.0254, + "step": 4433 + }, + { + "epoch": 3.0, + "grad_norm": 0.3215913587648698, + "learning_rate": 0.0, + "loss": 0.014, + "step": 4434 + }, + { + "epoch": 3.0, + "eval_loss": 0.03193313255906105, + "eval_runtime": 234.4462, + "eval_samples_per_second": 42.462, + "eval_steps_per_second": 1.331, + "step": 4434 + }, + { + "epoch": 3.0, + "step": 4434, + "total_flos": 1.381053238219899e+18, + "train_loss": 0.03611092418570863, + "train_runtime": 47401.0192, + "train_samples_per_second": 11.969, + "train_steps_per_second": 0.094 + } + ], + "logging_steps": 1, + "max_steps": 4434, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.381053238219899e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}